author     Andrea Bastoni <bastoni@cs.unc.edu>   2011-08-27 09:43:54 -0400
committer  Andrea Bastoni <bastoni@cs.unc.edu>   2011-08-27 10:06:11 -0400
commit     7b1bb388bc879ffcc6c69b567816d5c354afe42b (patch)
tree       5a217fdfb0b5e5a327bdcd624506337c1ae1fe32 /kernel
parent     7d754596756240fa918b94cd0c3011c77a638987 (diff)
parent     02f8c6aee8df3cdc935e9bdd4f2d020306035dbe (diff)
Merge 'Linux v3.0' into Litmus
Some notes:

 * The Litmus^RT scheduling class is the topmost scheduling class
   (above stop_sched_class); see the sketch below.
 * The scheduler_ipi() function (e.g., in smp_reschedule_interrupt())
   may increase IPI latencies.
 * Added a path into schedule() to quickly re-evaluate the scheduling
   decision without becoming preemptible again. This used to be a
   standard path before the removal of the BKL.

Conflicts:
    Makefile
    arch/arm/kernel/calls.S
    arch/arm/kernel/smp.c
    arch/x86/include/asm/unistd_32.h
    arch/x86/kernel/smp.c
    arch/x86/kernel/syscall_table_32.S
    include/linux/hrtimer.h
    kernel/printk.c
    kernel/sched.c
    kernel/sched_fair.c
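The first note above concerns where the Litmus^RT scheduling class sits in the scheduler's class chain. Below is a minimal userspace C sketch of that idea, not the actual kernel or LITMUS^RT code: pick_next_task() queries each class in priority order, so a class linked above stop_sched_class gets the first chance to supply a task. All type, function, and task names in the sketch are illustrative assumptions.

#include <stdio.h>
#include <stddef.h>

struct task {
	const char *comm;
};

struct sched_class {
	const char *name;
	const struct sched_class *next;        /* next lower-priority class */
	struct task *(*pick_next_task)(void);  /* NULL means nothing runnable */
};

static struct task an_rt_task = { "rt-task" };

static struct task *litmus_pick(void) { return NULL; }  /* no real-time job ready in this toy run */
static struct task *stop_pick(void)   { return NULL; }  /* no stopper work pending */
static struct task *rt_pick(void)     { return &an_rt_task; }

static const struct sched_class rt_class     = { "rt",     NULL,         rt_pick };
static const struct sched_class stop_class   = { "stop",   &rt_class,    stop_pick };
static const struct sched_class litmus_class = { "litmus", &stop_class,  litmus_pick };

/* In this model the Litmus^RT class is the head of the chain, i.e. the topmost class. */
#define sched_class_highest (&litmus_class)

static struct task *pick_next_task(void)
{
	const struct sched_class *class;

	for (class = sched_class_highest; class; class = class->next) {
		struct task *p = class->pick_next_task();
		if (p) {
			printf("class '%s' picked '%s'\n", class->name, p->comm);
			return p;
		}
	}
	return NULL;  /* in the real kernel, the idle class always returns a task */
}

int main(void)
{
	pick_next_task();
	return 0;
}

Run as-is, the sketch reports that the rt class picked 'rt-task', because the two classes linked above it return nothing runnable; a job offered by the topmost (litmus) class would be chosen before anything else.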
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 2
-rw-r--r--  kernel/Makefile | 19
-rw-r--r--  kernel/audit.c | 77
-rw-r--r--  kernel/audit.h | 5
-rw-r--r--  kernel/audit_tree.c | 11
-rw-r--r--  kernel/audit_watch.c | 89
-rw-r--r--  kernel/auditfilter.c | 18
-rw-r--r--  kernel/auditsc.c | 45
-rw-r--r--  kernel/bounds.c | 2
-rw-r--r--  kernel/capability.c | 112
-rw-r--r--  kernel/cgroup.c | 880
-rw-r--r--  kernel/cgroup_freezer.c | 88
-rw-r--r--  kernel/compat.c | 191
-rw-r--r--  kernel/configs.c | 1
-rw-r--r--  kernel/cpu.c | 42
-rw-r--r--  kernel/cpuset.c | 195
-rw-r--r--  kernel/crash_dump.c | 34
-rw-r--r--  kernel/cred.c | 36
-rw-r--r--  kernel/debug/debug_core.c | 155
-rw-r--r--  kernel/debug/debug_core.h | 1
-rw-r--r--  kernel/debug/gdbstub.c | 30
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 3
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 99
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 48
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 2
-rw-r--r--  kernel/early_res.c | 590
-rw-r--r--  kernel/events/Makefile | 6
-rw-r--r--  kernel/events/core.c (renamed from kernel/perf_event.c) | 4278
-rw-r--r--  kernel/events/hw_breakpoint.c (renamed from kernel/hw_breakpoint.c) | 78
-rw-r--r--  kernel/exit.c | 188
-rw-r--r--  kernel/extable.c | 18
-rw-r--r--  kernel/fork.c | 305
-rw-r--r--  kernel/freezer.c | 13
-rw-r--r--  kernel/futex.c | 524
-rw-r--r--  kernel/futex_compat.c | 16
-rw-r--r--  kernel/gcov/Kconfig | 5
-rw-r--r--  kernel/gcov/Makefile | 2
-rw-r--r--  kernel/gcov/fs.c | 1
-rw-r--r--  kernel/groups.c | 2
-rw-r--r--  kernel/hrtimer.c | 313
-rw-r--r--  kernel/hung_task.c | 6
-rw-r--r--  kernel/irq/Kconfig | 74
-rw-r--r--  kernel/irq/Makefile | 4
-rw-r--r--  kernel/irq/autoprobe.c | 57
-rw-r--r--  kernel/irq/chip.c | 737
-rw-r--r--  kernel/irq/debug.h | 45
-rw-r--r--  kernel/irq/dummychip.c | 59
-rw-r--r--  kernel/irq/generic-chip.c | 368
-rw-r--r--  kernel/irq/handle.c | 559
-rw-r--r--  kernel/irq/internals.h | 174
-rw-r--r--  kernel/irq/irqdesc.c | 466
-rw-r--r--  kernel/irq/manage.c | 690
-rw-r--r--  kernel/irq/migration.c | 43
-rw-r--r--  kernel/irq/numa_migrate.c | 120
-rw-r--r--  kernel/irq/pm.c | 30
-rw-r--r--  kernel/irq/proc.c | 169
-rw-r--r--  kernel/irq/resend.c | 21
-rw-r--r--  kernel/irq/settings.h | 142
-rw-r--r--  kernel/irq/spurious.c | 195
-rw-r--r--  kernel/irq_work.c | 166
-rw-r--r--  kernel/jump_label.c | 393
-rw-r--r--  kernel/kallsyms.c | 58
-rw-r--r--  kernel/kexec.c | 21
-rw-r--r--  kernel/kmod.c | 124
-rw-r--r--  kernel/kprobes.c | 631
-rw-r--r--  kernel/ksysfs.c | 10
-rw-r--r--  kernel/kthread.c | 50
-rw-r--r--  kernel/latencytop.c | 42
-rw-r--r--  kernel/lockdep.c | 281
-rw-r--r--  kernel/lockdep_proc.c | 25
-rw-r--r--  kernel/module.c | 284
-rw-r--r--  kernel/mutex-debug.c | 2
-rw-r--r--  kernel/mutex-debug.h | 2
-rw-r--r--  kernel/mutex.c | 38
-rw-r--r--  kernel/mutex.h | 2
-rw-r--r--  kernel/ns_cgroup.c | 110
-rw-r--r--  kernel/nsproxy.c | 50
-rw-r--r--  kernel/padata.c | 8
-rw-r--r--  kernel/panic.c | 11
-rw-r--r--  kernel/params.c | 84
-rw-r--r--  kernel/pid.c | 10
-rw-r--r--  kernel/pid_namespace.c | 11
-rw-r--r--  kernel/pm_qos_params.c | 101
-rw-r--r--  kernel/posix-cpu-timers.c | 124
-rw-r--r--  kernel/posix-timers.c | 379
-rw-r--r--  kernel/power/Kconfig | 253
-rw-r--r--  kernel/power/Makefile | 5
-rw-r--r--  kernel/power/block_io.c | 2
-rw-r--r--  kernel/power/hibernate.c | 331
-rw-r--r--  kernel/power/main.c | 35
-rw-r--r--  kernel/power/nvs.c | 136
-rw-r--r--  kernel/power/power.h | 14
-rw-r--r--  kernel/power/process.c | 23
-rw-r--r--  kernel/power/snapshot.c | 69
-rw-r--r--  kernel/power/suspend.c | 21
-rw-r--r--  kernel/power/swap.c | 336
-rw-r--r--  kernel/power/user.c | 13
-rw-r--r--  kernel/printk.c | 461
-rw-r--r--  kernel/profile.c | 23
-rw-r--r--  kernel/ptrace.c | 206
-rw-r--r--  kernel/range.c | 2
-rw-r--r--  kernel/rcupdate.c | 36
-rw-r--r--  kernel/rcutiny.c | 151
-rw-r--r--  kernel/rcutiny_plugin.h | 976
-rw-r--r--  kernel/rcutorture.c | 288
-rw-r--r--  kernel/rcutree.c | 573
-rw-r--r--  kernel/rcutree.h | 193
-rw-r--r--  kernel/rcutree_plugin.h | 1192
-rw-r--r--  kernel/rcutree_trace.c | 226
-rw-r--r--  kernel/relay.c | 15
-rw-r--r--  kernel/res_counter.c | 14
-rw-r--r--  kernel/resource.c | 171
-rw-r--r--  kernel/rtmutex-debug.c | 1
-rw-r--r--  kernel/rtmutex-tester.c | 34
-rw-r--r--  kernel/rtmutex.c | 318
-rw-r--r--  kernel/rtmutex_common.h | 16
-rw-r--r--  kernel/sched.c | 3432
-rw-r--r--  kernel/sched_autogroup.c | 275
-rw-r--r--  kernel/sched_autogroup.h | 41
-rw-r--r--  kernel/sched_clock.c | 2
-rw-r--r--  kernel/sched_debug.c | 127
-rw-r--r--  kernel/sched_fair.c | 1084
-rw-r--r--  kernel/sched_features.h | 15
-rw-r--r--  kernel/sched_idletask.c | 30
-rw-r--r--  kernel/sched_rt.c | 192
-rw-r--r--  kernel/sched_stats.h | 24
-rw-r--r--  kernel/sched_stoptask.c | 104
-rw-r--r--  kernel/signal.c | 893
-rw-r--r--  kernel/smp.c | 210
-rw-r--r--  kernel/softirq.c | 195
-rw-r--r--  kernel/srcu.c | 21
-rw-r--r--  kernel/stop_machine.c | 20
-rw-r--r--  kernel/sys.c | 95
-rw-r--r--  kernel/sys_ni.c | 15
-rw-r--r--  kernel/sysctl.c | 188
-rw-r--r--  kernel/sysctl_binary.c | 22
-rw-r--r--  kernel/sysctl_check.c | 10
-rw-r--r--  kernel/taskstats.c | 227
-rw-r--r--  kernel/test_kprobes.c | 12
-rw-r--r--  kernel/time.c | 39
-rw-r--r--  kernel/time/Makefile | 3
-rw-r--r--  kernel/time/alarmtimer.c | 720
-rw-r--r--  kernel/time/clockevents.c | 70
-rw-r--r--  kernel/time/clocksource.c | 77
-rw-r--r--  kernel/time/jiffies.c | 22
-rw-r--r--  kernel/time/ntp.c | 454
-rw-r--r--  kernel/time/posix-clock.c | 445
-rw-r--r--  kernel/time/tick-broadcast.c | 39
-rw-r--r--  kernel/time/tick-common.c | 9
-rw-r--r--  kernel/time/tick-internal.h | 12
-rw-r--r--  kernel/time/tick-oneshot.c | 5
-rw-r--r--  kernel/time/tick-sched.c | 8
-rw-r--r--  kernel/time/timecompare.c | 5
-rw-r--r--  kernel/time/timekeeping.c | 297
-rw-r--r--  kernel/time/timer_list.c | 12
-rw-r--r--  kernel/time/timer_stats.c | 2
-rw-r--r--  kernel/timer.c | 112
-rw-r--r--  kernel/trace/Kconfig | 26
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 128
-rw-r--r--  kernel/trace/ftrace.c | 1457
-rw-r--r--  kernel/trace/power-traces.c | 5
-rw-r--r--  kernel/trace/ring_buffer.c | 398
-rw-r--r--  kernel/trace/trace.c | 96
-rw-r--r--  kernel/trace/trace.h | 62
-rw-r--r--  kernel/trace/trace_clock.c | 2
-rw-r--r--  kernel/trace/trace_entries.h | 10
-rw-r--r--  kernel/trace/trace_event_perf.c | 59
-rw-r--r--  kernel/trace/trace_events.c | 89
-rw-r--r--  kernel/trace/trace_events_filter.c | 885
-rw-r--r--  kernel/trace/trace_export.c | 20
-rw-r--r--  kernel/trace/trace_functions.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 209
-rw-r--r--  kernel/trace/trace_irqsoff.c | 163
-rw-r--r--  kernel/trace/trace_kdb.c | 1
-rw-r--r--  kernel/trace/trace_kprobe.c | 125
-rw-r--r--  kernel/trace/trace_output.c | 66
-rw-r--r--  kernel/trace/trace_printk.c | 117
-rw-r--r--  kernel/trace/trace_sched_switch.c | 48
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 257
-rw-r--r--  kernel/trace/trace_selftest.c | 216
-rw-r--r--  kernel/trace/trace_selftest_dynamic.c | 6
-rw-r--r--  kernel/trace/trace_stack.c | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 92
-rw-r--r--  kernel/trace/trace_workqueue.c | 10
-rw-r--r--  kernel/tracepoint.c | 46
-rw-r--r--  kernel/tsacct.c | 10
-rw-r--r--  kernel/uid16.c | 2
-rw-r--r--  kernel/user-return-notifier.c | 2
-rw-r--r--  kernel/user.c | 10
-rw-r--r--  kernel/user_namespace.c | 15
-rw-r--r--  kernel/utsname.c | 51
-rw-r--r--  kernel/wait.c | 8
-rw-r--r--  kernel/watchdog.c | 185
-rw-r--r--  kernel/workqueue.c | 458
196 files changed, 24954 insertions, 11587 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 88c92fb44618..5068e2a4e75f 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -199,4 +199,4 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
199 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE 199 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
200 200
201config MUTEX_SPIN_ON_OWNER 201config MUTEX_SPIN_ON_OWNER
202 def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES 202 def_bool SMP && !DEBUG_MUTEXES
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b72d1a74be0..2d64cfcc8b42 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,8 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o range.o 13 async.o range.o jump_label.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
15obj-y += groups.o 14obj-y += groups.o
16 15
17ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
@@ -22,7 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
22CFLAGS_REMOVE_rtmutex-debug.o = -pg 21CFLAGS_REMOVE_rtmutex-debug.o = -pg
23CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
24CFLAGS_REMOVE_sched_clock.o = -pg 23CFLAGS_REMOVE_sched_clock.o = -pg
25CFLAGS_REMOVE_perf_event.o = -pg 24CFLAGS_REMOVE_irq_work.o = -pg
26endif 25endif
27 26
28obj-$(CONFIG_FREEZER) += freezer.o 27obj-$(CONFIG_FREEZER) += freezer.o
@@ -43,7 +42,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
43obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 42obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
44obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o 43obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
45obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 44obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
46obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o 45obj-$(CONFIG_SMP) += smp.o
47ifneq ($(CONFIG_SMP),y) 46ifneq ($(CONFIG_SMP),y)
48obj-y += up.o 47obj-y += up.o
49endif 48endif
@@ -62,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o
62obj-$(CONFIG_CGROUPS) += cgroup.o 61obj-$(CONFIG_CGROUPS) += cgroup.o
63obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o 62obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
64obj-$(CONFIG_CPUSETS) += cpuset.o 63obj-$(CONFIG_CPUSETS) += cpuset.o
65obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
66obj-$(CONFIG_UTS_NS) += utsname.o 64obj-$(CONFIG_UTS_NS) += utsname.o
67obj-$(CONFIG_USER_NS) += user_namespace.o 65obj-$(CONFIG_USER_NS) += user_namespace.o
68obj-$(CONFIG_PID_NS) += pid_namespace.o 66obj-$(CONFIG_PID_NS) += pid_namespace.o
@@ -86,6 +84,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o
86obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o 84obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
87obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 85obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
88obj-$(CONFIG_TINY_RCU) += rcutiny.o 86obj-$(CONFIG_TINY_RCU) += rcutiny.o
87obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
89obj-$(CONFIG_RELAY) += relay.o 88obj-$(CONFIG_RELAY) += relay.o
90obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 89obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
91obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 90obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -99,11 +98,15 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
99obj-$(CONFIG_TRACING) += trace/ 98obj-$(CONFIG_TRACING) += trace/
100obj-$(CONFIG_X86_DS) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
101obj-$(CONFIG_RING_BUFFER) += trace/ 100obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_TRACEPOINTS) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o 102obj-$(CONFIG_SMP) += sched_cpupri.o
103obj-$(CONFIG_PERF_EVENTS) += perf_event.o 103obj-$(CONFIG_IRQ_WORK) += irq_work.o
104obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 104
105obj-$(CONFIG_PERF_EVENTS) += events/
106
105obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 107obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
106obj-$(CONFIG_PADATA) += padata.o 108obj-$(CONFIG_PADATA) += padata.o
109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
107 110
108ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 111ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
109# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 112# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
@@ -119,7 +122,7 @@ $(obj)/configs.o: $(obj)/config_data.h
119# config_data.h contains the same information as ikconfig.h but gzipped. 122# config_data.h contains the same information as ikconfig.h but gzipped.
120# Info from config_data can be extracted from /proc/config* 123# Info from config_data can be extracted from /proc/config*
121targets += config_data.gz 124targets += config_data.gz
122$(obj)/config_data.gz: .config FORCE 125$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
123 $(call if_changed,gzip) 126 $(call if_changed,gzip)
124 127
125quiet_cmd_ikconfiggz = IKCFG $@ 128quiet_cmd_ikconfiggz = IKCFG $@
diff --git a/kernel/audit.c b/kernel/audit.c
index d96045789b54..939500317066 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -74,6 +74,8 @@ static int audit_initialized;
74int audit_enabled; 74int audit_enabled;
75int audit_ever_enabled; 75int audit_ever_enabled;
76 76
77EXPORT_SYMBOL_GPL(audit_enabled);
78
77/* Default state when kernel boots without any parameters. */ 79/* Default state when kernel boots without any parameters. */
78static int audit_default; 80static int audit_default;
79 81
@@ -400,7 +402,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
400 if (err < 0) { 402 if (err < 0) {
401 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ 403 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 404 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
403 audit_log_lost("auditd dissapeared\n"); 405 audit_log_lost("auditd disappeared\n");
404 audit_pid = 0; 406 audit_pid = 0;
405 /* we might get lucky and get this in the next auditd */ 407 /* we might get lucky and get this in the next auditd */
406 audit_hold_skb(skb); 408 audit_hold_skb(skb);
@@ -467,23 +469,16 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
467 struct task_struct *tsk; 469 struct task_struct *tsk;
468 int err; 470 int err;
469 471
470 read_lock(&tasklist_lock); 472 rcu_read_lock();
471 tsk = find_task_by_vpid(pid); 473 tsk = find_task_by_vpid(pid);
472 err = -ESRCH; 474 if (!tsk) {
473 if (!tsk) 475 rcu_read_unlock();
474 goto out; 476 return -ESRCH;
475 err = 0; 477 }
476 478 get_task_struct(tsk);
477 spin_lock_irq(&tsk->sighand->siglock); 479 rcu_read_unlock();
478 if (!tsk->signal->audit_tty) 480 err = tty_audit_push_task(tsk, loginuid, sessionid);
479 err = -EPERM; 481 put_task_struct(tsk);
480 spin_unlock_irq(&tsk->sighand->siglock);
481 if (err)
482 goto out;
483
484 tty_audit_push_task(tsk, loginuid, sessionid);
485out:
486 read_unlock(&tasklist_lock);
487 return err; 482 return err;
488} 483}
489 484
@@ -506,7 +501,7 @@ int audit_send_list(void *_dest)
506} 501}
507 502
508struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 503struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
509 int multi, void *payload, int size) 504 int multi, const void *payload, int size)
510{ 505{
511 struct sk_buff *skb; 506 struct sk_buff *skb;
512 struct nlmsghdr *nlh; 507 struct nlmsghdr *nlh;
@@ -555,8 +550,8 @@ static int audit_send_reply_thread(void *arg)
555 * Allocates an skb, builds the netlink message, and sends it to the pid. 550 * Allocates an skb, builds the netlink message, and sends it to the pid.
556 * No failure notifications. 551 * No failure notifications.
557 */ 552 */
558void audit_send_reply(int pid, int seq, int type, int done, int multi, 553static void audit_send_reply(int pid, int seq, int type, int done, int multi,
559 void *payload, int size) 554 const void *payload, int size)
560{ 555{
561 struct sk_buff *skb; 556 struct sk_buff *skb;
562 struct task_struct *tsk; 557 struct task_struct *tsk;
@@ -678,9 +673,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
678 673
679 pid = NETLINK_CREDS(skb)->pid; 674 pid = NETLINK_CREDS(skb)->pid;
680 uid = NETLINK_CREDS(skb)->uid; 675 uid = NETLINK_CREDS(skb)->uid;
681 loginuid = NETLINK_CB(skb).loginuid; 676 loginuid = audit_get_loginuid(current);
682 sessionid = NETLINK_CB(skb).sessionid; 677 sessionid = audit_get_sessionid(current);
683 sid = NETLINK_CB(skb).sid; 678 security_task_getsecid(current, &sid);
684 seq = nlh->nlmsg_seq; 679 seq = nlh->nlmsg_seq;
685 data = NLMSG_DATA(nlh); 680 data = NLMSG_DATA(nlh);
686 681
@@ -880,40 +875,40 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
880 case AUDIT_TTY_GET: { 875 case AUDIT_TTY_GET: {
881 struct audit_tty_status s; 876 struct audit_tty_status s;
882 struct task_struct *tsk; 877 struct task_struct *tsk;
878 unsigned long flags;
883 879
884 read_lock(&tasklist_lock); 880 rcu_read_lock();
885 tsk = find_task_by_vpid(pid); 881 tsk = find_task_by_vpid(pid);
886 if (!tsk) 882 if (tsk && lock_task_sighand(tsk, &flags)) {
887 err = -ESRCH;
888 else {
889 spin_lock_irq(&tsk->sighand->siglock);
890 s.enabled = tsk->signal->audit_tty != 0; 883 s.enabled = tsk->signal->audit_tty != 0;
891 spin_unlock_irq(&tsk->sighand->siglock); 884 unlock_task_sighand(tsk, &flags);
892 } 885 } else
893 read_unlock(&tasklist_lock); 886 err = -ESRCH;
894 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, 887 rcu_read_unlock();
895 &s, sizeof(s)); 888
889 if (!err)
890 audit_send_reply(NETLINK_CB(skb).pid, seq,
891 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
896 break; 892 break;
897 } 893 }
898 case AUDIT_TTY_SET: { 894 case AUDIT_TTY_SET: {
899 struct audit_tty_status *s; 895 struct audit_tty_status *s;
900 struct task_struct *tsk; 896 struct task_struct *tsk;
897 unsigned long flags;
901 898
902 if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) 899 if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
903 return -EINVAL; 900 return -EINVAL;
904 s = data; 901 s = data;
905 if (s->enabled != 0 && s->enabled != 1) 902 if (s->enabled != 0 && s->enabled != 1)
906 return -EINVAL; 903 return -EINVAL;
907 read_lock(&tasklist_lock); 904 rcu_read_lock();
908 tsk = find_task_by_vpid(pid); 905 tsk = find_task_by_vpid(pid);
909 if (!tsk) 906 if (tsk && lock_task_sighand(tsk, &flags)) {
910 err = -ESRCH;
911 else {
912 spin_lock_irq(&tsk->sighand->siglock);
913 tsk->signal->audit_tty = s->enabled != 0; 907 tsk->signal->audit_tty = s->enabled != 0;
914 spin_unlock_irq(&tsk->sighand->siglock); 908 unlock_task_sighand(tsk, &flags);
915 } 909 } else
916 read_unlock(&tasklist_lock); 910 err = -ESRCH;
911 rcu_read_unlock();
917 break; 912 break;
918 } 913 }
919 default: 914 default:
diff --git a/kernel/audit.h b/kernel/audit.h
index f7206db4e13d..91e7071c4d2c 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -84,10 +84,7 @@ extern int audit_compare_dname_path(const char *dname, const char *path,
84 int *dirlen); 84 int *dirlen);
85extern struct sk_buff * audit_make_reply(int pid, int seq, int type, 85extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
86 int done, int multi, 86 int done, int multi,
87 void *payload, int size); 87 const void *payload, int size);
88extern void audit_send_reply(int pid, int seq, int type,
89 int done, int multi,
90 void *payload, int size);
91extern void audit_panic(const char *message); 88extern void audit_panic(const char *message);
92 89
93struct audit_netlink_list { 90struct audit_netlink_list {
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 7f18d3a4527e..e99dda04b126 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -223,7 +223,7 @@ static void untag_chunk(struct node *p)
223{ 223{
224 struct audit_chunk *chunk = find_chunk(p); 224 struct audit_chunk *chunk = find_chunk(p);
225 struct fsnotify_mark *entry = &chunk->mark; 225 struct fsnotify_mark *entry = &chunk->mark;
226 struct audit_chunk *new; 226 struct audit_chunk *new = NULL;
227 struct audit_tree *owner; 227 struct audit_tree *owner;
228 int size = chunk->count - 1; 228 int size = chunk->count - 1;
229 int i, j; 229 int i, j;
@@ -232,9 +232,14 @@ static void untag_chunk(struct node *p)
232 232
233 spin_unlock(&hash_lock); 233 spin_unlock(&hash_lock);
234 234
235 if (size)
236 new = alloc_chunk(size);
237
235 spin_lock(&entry->lock); 238 spin_lock(&entry->lock);
236 if (chunk->dead || !entry->i.inode) { 239 if (chunk->dead || !entry->i.inode) {
237 spin_unlock(&entry->lock); 240 spin_unlock(&entry->lock);
241 if (new)
242 free_chunk(new);
238 goto out; 243 goto out;
239 } 244 }
240 245
@@ -255,9 +260,9 @@ static void untag_chunk(struct node *p)
255 goto out; 260 goto out;
256 } 261 }
257 262
258 new = alloc_chunk(size);
259 if (!new) 263 if (!new)
260 goto Fallback; 264 goto Fallback;
265
261 fsnotify_duplicate_mark(&new->mark, entry); 266 fsnotify_duplicate_mark(&new->mark, entry);
262 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { 267 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
263 free_chunk(new); 268 free_chunk(new);
@@ -602,7 +607,7 @@ void audit_trim_trees(void)
602 spin_lock(&hash_lock); 607 spin_lock(&hash_lock);
603 list_for_each_entry(node, &tree->chunks, list) { 608 list_for_each_entry(node, &tree->chunks, list) {
604 struct audit_chunk *chunk = find_chunk(node); 609 struct audit_chunk *chunk = find_chunk(node);
605 /* this could be NULL if the watch is dieing else where... */ 610 /* this could be NULL if the watch is dying else where... */
606 struct inode *inode = chunk->mark.i.inode; 611 struct inode *inode = chunk->mark.i.inode;
607 node->index |= 1U<<31; 612 node->index |= 1U<<31;
608 if (iterate_mounts(compare_root, inode, root_mnt)) 613 if (iterate_mounts(compare_root, inode, root_mnt))
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index f0c9b2e7542d..e683869365d9 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -60,7 +60,7 @@ struct audit_parent {
60}; 60};
61 61
62/* fsnotify handle. */ 62/* fsnotify handle. */
63struct fsnotify_group *audit_watch_group; 63static struct fsnotify_group *audit_watch_group;
64 64
65/* fsnotify events we care about. */ 65/* fsnotify events we care about. */
66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ 66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
@@ -123,7 +123,7 @@ void audit_put_watch(struct audit_watch *watch)
123 } 123 }
124} 124}
125 125
126void audit_remove_watch(struct audit_watch *watch) 126static void audit_remove_watch(struct audit_watch *watch)
127{ 127{
128 list_del(&watch->wlist); 128 list_del(&watch->wlist);
129 audit_put_parent(watch->parent); 129 audit_put_parent(watch->parent);
@@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
144} 144}
145 145
146/* Initialize a parent watch entry. */ 146/* Initialize a parent watch entry. */
147static struct audit_parent *audit_init_parent(struct nameidata *ndp) 147static struct audit_parent *audit_init_parent(struct path *path)
148{ 148{
149 struct inode *inode = ndp->path.dentry->d_inode; 149 struct inode *inode = path->dentry->d_inode;
150 struct audit_parent *parent; 150 struct audit_parent *parent;
151 int ret; 151 int ret;
152 152
@@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
353} 353}
354 354
355/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
356static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) 356static int audit_get_nd(struct audit_watch *watch, struct path *parent)
357{ 357{
358 struct nameidata *ndparent, *ndwatch; 358 struct nameidata nd;
359 struct dentry *d;
359 int err; 360 int err;
360 361
361 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); 362 err = kern_path_parent(watch->path, &nd);
362 if (unlikely(!ndparent)) 363 if (err)
363 return -ENOMEM; 364 return err;
364 365
365 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); 366 if (nd.last_type != LAST_NORM) {
366 if (unlikely(!ndwatch)) { 367 path_put(&nd.path);
367 kfree(ndparent); 368 return -EINVAL;
368 return -ENOMEM;
369 } 369 }
370 370
371 err = path_lookup(path, LOOKUP_PARENT, ndparent); 371 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
372 if (err) { 372 d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
373 kfree(ndparent); 373 if (IS_ERR(d)) {
374 kfree(ndwatch); 374 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
375 return err; 375 path_put(&nd.path);
376 return PTR_ERR(d);
376 } 377 }
377 378 if (d->d_inode) {
378 err = path_lookup(path, 0, ndwatch); 379 /* update watch filter fields */
379 if (err) { 380 watch->dev = d->d_inode->i_sb->s_dev;
380 kfree(ndwatch); 381 watch->ino = d->d_inode->i_ino;
381 ndwatch = NULL;
382 } 382 }
383 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
383 384
384 *ndp = ndparent; 385 *parent = nd.path;
385 *ndw = ndwatch; 386 dput(d);
386
387 return 0; 387 return 0;
388} 388}
389 389
390/* Release resources used for watch path information. */
391static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
392{
393 if (ndp) {
394 path_put(&ndp->path);
395 kfree(ndp);
396 }
397 if (ndw) {
398 path_put(&ndw->path);
399 kfree(ndw);
400 }
401}
402
403/* Associate the given rule with an existing parent. 390/* Associate the given rule with an existing parent.
404 * Caller must hold audit_filter_mutex. */ 391 * Caller must hold audit_filter_mutex. */
405static void audit_add_to_parent(struct audit_krule *krule, 392static void audit_add_to_parent(struct audit_krule *krule,
@@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
440{ 427{
441 struct audit_watch *watch = krule->watch; 428 struct audit_watch *watch = krule->watch;
442 struct audit_parent *parent; 429 struct audit_parent *parent;
443 struct nameidata *ndp = NULL, *ndw = NULL; 430 struct path parent_path;
444 int h, ret = 0; 431 int h, ret = 0;
445 432
446 mutex_unlock(&audit_filter_mutex); 433 mutex_unlock(&audit_filter_mutex);
447 434
448 /* Avoid calling path_lookup under audit_filter_mutex. */ 435 /* Avoid calling path_lookup under audit_filter_mutex. */
449 ret = audit_get_nd(watch->path, &ndp, &ndw); 436 ret = audit_get_nd(watch, &parent_path);
450 if (ret) {
451 /* caller expects mutex locked */
452 mutex_lock(&audit_filter_mutex);
453 goto error;
454 }
455 437
438 /* caller expects mutex locked */
456 mutex_lock(&audit_filter_mutex); 439 mutex_lock(&audit_filter_mutex);
457 440
458 /* update watch filter fields */ 441 if (ret)
459 if (ndw) { 442 return ret;
460 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
461 watch->ino = ndw->path.dentry->d_inode->i_ino;
462 }
463 443
464 /* either find an old parent or attach a new one */ 444 /* either find an old parent or attach a new one */
465 parent = audit_find_parent(ndp->path.dentry->d_inode); 445 parent = audit_find_parent(parent_path.dentry->d_inode);
466 if (!parent) { 446 if (!parent) {
467 parent = audit_init_parent(ndp); 447 parent = audit_init_parent(&parent_path);
468 if (IS_ERR(parent)) { 448 if (IS_ERR(parent)) {
469 ret = PTR_ERR(parent); 449 ret = PTR_ERR(parent);
470 goto error; 450 goto error;
@@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
479 h = audit_hash_ino((u32)watch->ino); 459 h = audit_hash_ino((u32)watch->ino);
480 *list = &audit_inode_hash[h]; 460 *list = &audit_inode_hash[h];
481error: 461error:
482 audit_put_nd(ndp, ndw); /* NULL args OK */ 462 path_put(&parent_path);
483 return ret; 463 return ret;
484
485} 464}
486 465
487void audit_remove_watch_rule(struct audit_krule *krule) 466void audit_remove_watch_rule(struct audit_krule *krule)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index eb7675499fb5..f8277c80d678 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1238,6 +1238,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1238 for (i = 0; i < rule->field_count; i++) { 1238 for (i = 0; i < rule->field_count; i++) {
1239 struct audit_field *f = &rule->fields[i]; 1239 struct audit_field *f = &rule->fields[i];
1240 int result = 0; 1240 int result = 0;
1241 u32 sid;
1241 1242
1242 switch (f->type) { 1243 switch (f->type) {
1243 case AUDIT_PID: 1244 case AUDIT_PID:
@@ -1250,7 +1251,22 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1250 result = audit_comparator(cb->creds.gid, f->op, f->val); 1251 result = audit_comparator(cb->creds.gid, f->op, f->val);
1251 break; 1252 break;
1252 case AUDIT_LOGINUID: 1253 case AUDIT_LOGINUID:
1253 result = audit_comparator(cb->loginuid, f->op, f->val); 1254 result = audit_comparator(audit_get_loginuid(current),
1255 f->op, f->val);
1256 break;
1257 case AUDIT_SUBJ_USER:
1258 case AUDIT_SUBJ_ROLE:
1259 case AUDIT_SUBJ_TYPE:
1260 case AUDIT_SUBJ_SEN:
1261 case AUDIT_SUBJ_CLR:
1262 if (f->lsm_rule) {
1263 security_task_getsecid(current, &sid);
1264 result = security_audit_rule_match(sid,
1265 f->type,
1266 f->op,
1267 f->lsm_rule,
1268 NULL);
1269 }
1254 break; 1270 break;
1255 } 1271 }
1256 1272
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1b31c130d034..00d79df03e76 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -241,6 +241,10 @@ struct audit_context {
241 pid_t pid; 241 pid_t pid;
242 struct audit_cap_data cap; 242 struct audit_cap_data cap;
243 } capset; 243 } capset;
244 struct {
245 int fd;
246 int flags;
247 } mmap;
244 }; 248 };
245 int fds[2]; 249 int fds[2];
246 250
@@ -439,17 +443,25 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
439 443
440/* Determine if any context name data matches a rule's watch data */ 444/* Determine if any context name data matches a rule's watch data */
441/* Compare a task_struct with an audit_rule. Return 1 on match, 0 445/* Compare a task_struct with an audit_rule. Return 1 on match, 0
442 * otherwise. */ 446 * otherwise.
447 *
448 * If task_creation is true, this is an explicit indication that we are
449 * filtering a task rule at task creation time. This and tsk == current are
450 * the only situations where tsk->cred may be accessed without an rcu read lock.
451 */
443static int audit_filter_rules(struct task_struct *tsk, 452static int audit_filter_rules(struct task_struct *tsk,
444 struct audit_krule *rule, 453 struct audit_krule *rule,
445 struct audit_context *ctx, 454 struct audit_context *ctx,
446 struct audit_names *name, 455 struct audit_names *name,
447 enum audit_state *state) 456 enum audit_state *state,
457 bool task_creation)
448{ 458{
449 const struct cred *cred = get_task_cred(tsk); 459 const struct cred *cred;
450 int i, j, need_sid = 1; 460 int i, j, need_sid = 1;
451 u32 sid; 461 u32 sid;
452 462
463 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
464
453 for (i = 0; i < rule->field_count; i++) { 465 for (i = 0; i < rule->field_count; i++) {
454 struct audit_field *f = &rule->fields[i]; 466 struct audit_field *f = &rule->fields[i];
455 int result = 0; 467 int result = 0;
@@ -633,10 +645,8 @@ static int audit_filter_rules(struct task_struct *tsk,
633 break; 645 break;
634 } 646 }
635 647
636 if (!result) { 648 if (!result)
637 put_cred(cred);
638 return 0; 649 return 0;
639 }
640 } 650 }
641 651
642 if (ctx) { 652 if (ctx) {
@@ -652,7 +662,6 @@ static int audit_filter_rules(struct task_struct *tsk,
652 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 662 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
653 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 663 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
654 } 664 }
655 put_cred(cred);
656 return 1; 665 return 1;
657} 666}
658 667
@@ -667,7 +676,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
667 676
668 rcu_read_lock(); 677 rcu_read_lock();
669 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { 678 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
670 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { 679 if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
680 &state, true)) {
671 if (state == AUDIT_RECORD_CONTEXT) 681 if (state == AUDIT_RECORD_CONTEXT)
672 *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); 682 *key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
673 rcu_read_unlock(); 683 rcu_read_unlock();
@@ -701,7 +711,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
701 list_for_each_entry_rcu(e, list, list) { 711 list_for_each_entry_rcu(e, list, list) {
702 if ((e->rule.mask[word] & bit) == bit && 712 if ((e->rule.mask[word] & bit) == bit &&
703 audit_filter_rules(tsk, &e->rule, ctx, NULL, 713 audit_filter_rules(tsk, &e->rule, ctx, NULL,
704 &state)) { 714 &state, false)) {
705 rcu_read_unlock(); 715 rcu_read_unlock();
706 ctx->current_state = state; 716 ctx->current_state = state;
707 return state; 717 return state;
@@ -739,7 +749,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
739 749
740 list_for_each_entry_rcu(e, list, list) { 750 list_for_each_entry_rcu(e, list, list) {
741 if ((e->rule.mask[word] & bit) == bit && 751 if ((e->rule.mask[word] & bit) == bit &&
742 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { 752 audit_filter_rules(tsk, &e->rule, ctx, n,
753 &state, false)) {
743 rcu_read_unlock(); 754 rcu_read_unlock();
744 ctx->current_state = state; 755 ctx->current_state = state;
745 return; 756 return;
@@ -1007,7 +1018,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1007/* 1018/*
1008 * to_send and len_sent accounting are very loose estimates. We aren't 1019 * to_send and len_sent accounting are very loose estimates. We aren't
1009 * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being 1020 * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
1010 * within about 500 bytes (next page boundry) 1021 * within about 500 bytes (next page boundary)
1011 * 1022 *
1012 * why snprintf? an int is up to 12 digits long. if we just assumed when 1023 * why snprintf? an int is up to 12 digits long. if we just assumed when
1013 * logging that a[%d]= was going to be 16 characters long we would be wasting 1024 * logging that a[%d]= was going to be 16 characters long we would be wasting
@@ -1305,6 +1316,10 @@ static void show_special(struct audit_context *context, int *call_panic)
1305 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); 1316 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
1306 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); 1317 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
1307 break; } 1318 break; }
1319 case AUDIT_MMAP: {
1320 audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
1321 context->mmap.flags);
1322 break; }
1308 } 1323 }
1309 audit_log_end(ab); 1324 audit_log_end(ab);
1310} 1325}
@@ -2476,6 +2491,14 @@ void __audit_log_capset(pid_t pid,
2476 context->type = AUDIT_CAPSET; 2491 context->type = AUDIT_CAPSET;
2477} 2492}
2478 2493
2494void __audit_mmap_fd(int fd, int flags)
2495{
2496 struct audit_context *context = current->audit_context;
2497 context->mmap.fd = fd;
2498 context->mmap.flags = flags;
2499 context->type = AUDIT_MMAP;
2500}
2501
2479/** 2502/**
2480 * audit_core_dumps - record information about processes that end abnormally 2503 * audit_core_dumps - record information about processes that end abnormally
2481 * @signr: signal value 2504 * @signr: signal value
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 98a51f26c136..0c9b862292b2 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,11 +9,13 @@
9#include <linux/page-flags.h> 9#include <linux/page-flags.h>
10#include <linux/mmzone.h> 10#include <linux/mmzone.h>
11#include <linux/kbuild.h> 11#include <linux/kbuild.h>
12#include <linux/page_cgroup.h>
12 13
13void foo(void) 14void foo(void)
14{ 15{
15 /* The enum constants to put into include/generated/bounds.h */ 16 /* The enum constants to put into include/generated/bounds.h */
16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 17 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 18 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
19 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
18 /* End of constants */ 20 /* End of constants */
19} 21}
diff --git a/kernel/capability.c b/kernel/capability.c
index 2f05303715a5..283c529f8b1c 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@
14#include <linux/security.h> 14#include <linux/security.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
17#include <linux/user_namespace.h>
17#include <asm/uaccess.h> 18#include <asm/uaccess.h>
18 19
19/* 20/*
@@ -21,12 +22,8 @@
21 */ 22 */
22 23
23const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; 24const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
24const kernel_cap_t __cap_full_set = CAP_FULL_SET;
25const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
26 25
27EXPORT_SYMBOL(__cap_empty_set); 26EXPORT_SYMBOL(__cap_empty_set);
28EXPORT_SYMBOL(__cap_full_set);
29EXPORT_SYMBOL(__cap_init_eff_set);
30 27
31int file_caps_enabled = 1; 28int file_caps_enabled = 1;
32 29
@@ -290,6 +287,60 @@ error:
290} 287}
291 288
292/** 289/**
290 * has_capability - Does a task have a capability in init_user_ns
291 * @t: The task in question
292 * @cap: The capability to be tested for
293 *
294 * Return true if the specified task has the given superior capability
295 * currently in effect to the initial user namespace, false if not.
296 *
297 * Note that this does not set PF_SUPERPRIV on the task.
298 */
299bool has_capability(struct task_struct *t, int cap)
300{
301 int ret = security_real_capable(t, &init_user_ns, cap);
302
303 return (ret == 0);
304}
305
306/**
307 * has_capability - Does a task have a capability in a specific user ns
308 * @t: The task in question
309 * @ns: target user namespace
310 * @cap: The capability to be tested for
311 *
312 * Return true if the specified task has the given superior capability
313 * currently in effect to the specified user namespace, false if not.
314 *
315 * Note that this does not set PF_SUPERPRIV on the task.
316 */
317bool has_ns_capability(struct task_struct *t,
318 struct user_namespace *ns, int cap)
319{
320 int ret = security_real_capable(t, ns, cap);
321
322 return (ret == 0);
323}
324
325/**
326 * has_capability_noaudit - Does a task have a capability (unaudited)
327 * @t: The task in question
328 * @cap: The capability to be tested for
329 *
330 * Return true if the specified task has the given superior capability
331 * currently in effect to init_user_ns, false if not. Don't write an
332 * audit message for the check.
333 *
334 * Note that this does not set PF_SUPERPRIV on the task.
335 */
336bool has_capability_noaudit(struct task_struct *t, int cap)
337{
338 int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
339
340 return (ret == 0);
341}
342
343/**
293 * capable - Determine if the current task has a superior capability in effect 344 * capable - Determine if the current task has a superior capability in effect
294 * @cap: The capability to be tested for 345 * @cap: The capability to be tested for
295 * 346 *
@@ -299,17 +350,60 @@ error:
299 * This sets PF_SUPERPRIV on the task if the capability is available on the 350 * This sets PF_SUPERPRIV on the task if the capability is available on the
300 * assumption that it's about to be used. 351 * assumption that it's about to be used.
301 */ 352 */
302int capable(int cap) 353bool capable(int cap)
354{
355 return ns_capable(&init_user_ns, cap);
356}
357EXPORT_SYMBOL(capable);
358
359/**
360 * ns_capable - Determine if the current task has a superior capability in effect
361 * @ns: The usernamespace we want the capability in
362 * @cap: The capability to be tested for
363 *
364 * Return true if the current task has the given superior capability currently
365 * available for use, false if not.
366 *
367 * This sets PF_SUPERPRIV on the task if the capability is available on the
368 * assumption that it's about to be used.
369 */
370bool ns_capable(struct user_namespace *ns, int cap)
303{ 371{
304 if (unlikely(!cap_valid(cap))) { 372 if (unlikely(!cap_valid(cap))) {
305 printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); 373 printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
306 BUG(); 374 BUG();
307 } 375 }
308 376
309 if (security_capable(cap) == 0) { 377 if (security_capable(ns, current_cred(), cap) == 0) {
310 current->flags |= PF_SUPERPRIV; 378 current->flags |= PF_SUPERPRIV;
311 return 1; 379 return true;
312 } 380 }
313 return 0; 381 return false;
382}
383EXPORT_SYMBOL(ns_capable);
384
385/**
386 * task_ns_capable - Determine whether current task has a superior
387 * capability targeted at a specific task's user namespace.
388 * @t: The task whose user namespace is targeted.
389 * @cap: The capability in question.
390 *
391 * Return true if it does, false otherwise.
392 */
393bool task_ns_capable(struct task_struct *t, int cap)
394{
395 return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
396}
397EXPORT_SYMBOL(task_ns_capable);
398
399/**
400 * nsown_capable - Check superior capability to one's own user_ns
401 * @cap: The capability in question
402 *
403 * Return true if the current task has the given superior capability
404 * targeted at its own user namespace.
405 */
406bool nsown_capable(int cap)
407{
408 return ns_capable(current_user_ns(), cap);
314} 409}
315EXPORT_SYMBOL(capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c9483d8f6140..2731d115d725 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,12 +52,12 @@
52#include <linux/cgroupstats.h> 52#include <linux/cgroupstats.h>
53#include <linux/hash.h> 53#include <linux/hash.h>
54#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/smp_lock.h>
56#include <linux/pid_namespace.h> 55#include <linux/pid_namespace.h>
57#include <linux/idr.h> 56#include <linux/idr.h>
58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59#include <linux/eventfd.h> 58#include <linux/eventfd.h>
60#include <linux/poll.h> 59#include <linux/poll.h>
60#include <linux/flex_array.h> /* used in cgroup_attach_proc */
61 61
62#include <asm/atomic.h> 62#include <asm/atomic.h>
63 63
@@ -138,7 +138,7 @@ struct css_id {
138 * is called after synchronize_rcu(). But for safe use, css_is_removed() 138 * is called after synchronize_rcu(). But for safe use, css_is_removed()
139 * css_tryget() should be used for avoiding race. 139 * css_tryget() should be used for avoiding race.
140 */ 140 */
141 struct cgroup_subsys_state *css; 141 struct cgroup_subsys_state __rcu *css;
142 /* 142 /*
143 * ID of this css. 143 * ID of this css.
144 */ 144 */
@@ -158,7 +158,7 @@ struct css_id {
158}; 158};
159 159
160/* 160/*
161 * cgroup_event represents events which userspace want to recieve. 161 * cgroup_event represents events which userspace want to receive.
162 */ 162 */
163struct cgroup_event { 163struct cgroup_event {
164 /* 164 /*
@@ -244,6 +244,11 @@ static int notify_on_release(const struct cgroup *cgrp)
244 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 244 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
245} 245}
246 246
247static int clone_children(const struct cgroup *cgrp)
248{
249 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
250}
251
247/* 252/*
248 * for_each_subsys() allows you to iterate on each subsystem attached to 253 * for_each_subsys() allows you to iterate on each subsystem attached to
249 * an active hierarchy 254 * an active hierarchy
@@ -322,12 +327,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
322 return &css_set_table[index]; 327 return &css_set_table[index];
323} 328}
324 329
325static void free_css_set_rcu(struct rcu_head *obj)
326{
327 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
328 kfree(cg);
329}
330
331/* We don't maintain the lists running through each css_set to its 330/* We don't maintain the lists running through each css_set to its
332 * task until after the first call to cgroup_iter_start(). This 331 * task until after the first call to cgroup_iter_start(). This
333 * reduces the fork()/exit() overhead for people who have cgroups 332 * reduces the fork()/exit() overhead for people who have cgroups
@@ -371,7 +370,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
371 } 370 }
372 371
373 write_unlock(&css_set_lock); 372 write_unlock(&css_set_lock);
374 call_rcu(&cg->rcu_head, free_css_set_rcu); 373 kfree_rcu(cg, rcu_head);
375} 374}
376 375
377/* 376/*
@@ -760,6 +759,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
760 */ 759 */
761 760
762static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 761static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
762static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
763static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 763static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
764static int cgroup_populate_dir(struct cgroup *cgrp); 764static int cgroup_populate_dir(struct cgroup *cgrp);
765static const struct inode_operations cgroup_dir_inode_operations; 765static const struct inode_operations cgroup_dir_inode_operations;
@@ -778,6 +778,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
778 struct inode *inode = new_inode(sb); 778 struct inode *inode = new_inode(sb);
779 779
780 if (inode) { 780 if (inode) {
781 inode->i_ino = get_next_ino();
781 inode->i_mode = mode; 782 inode->i_mode = mode;
782 inode->i_uid = current_fsuid(); 783 inode->i_uid = current_fsuid();
783 inode->i_gid = current_fsgid(); 784 inode->i_gid = current_fsgid();
@@ -806,13 +807,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
806 return ret; 807 return ret;
807} 808}
808 809
809static void free_cgroup_rcu(struct rcu_head *obj)
810{
811 struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
812
813 kfree(cgrp);
814}
815
816static void cgroup_diput(struct dentry *dentry, struct inode *inode) 810static void cgroup_diput(struct dentry *dentry, struct inode *inode)
817{ 811{
818 /* is dentry a directory ? if so, kfree() associated cgroup */ 812 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -850,11 +844,16 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
850 */ 844 */
851 BUG_ON(!list_empty(&cgrp->pidlists)); 845 BUG_ON(!list_empty(&cgrp->pidlists));
852 846
853 call_rcu(&cgrp->rcu_head, free_cgroup_rcu); 847 kfree_rcu(cgrp, rcu_head);
854 } 848 }
855 iput(inode); 849 iput(inode);
856} 850}
857 851
852static int cgroup_delete(const struct dentry *d)
853{
854 return 1;
855}
856
858static void remove_dir(struct dentry *d) 857static void remove_dir(struct dentry *d)
859{ 858{
860 struct dentry *parent = dget(d->d_parent); 859 struct dentry *parent = dget(d->d_parent);
@@ -869,25 +868,29 @@ static void cgroup_clear_directory(struct dentry *dentry)
869 struct list_head *node; 868 struct list_head *node;
870 869
871 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 870 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
872 spin_lock(&dcache_lock); 871 spin_lock(&dentry->d_lock);
873 node = dentry->d_subdirs.next; 872 node = dentry->d_subdirs.next;
874 while (node != &dentry->d_subdirs) { 873 while (node != &dentry->d_subdirs) {
875 struct dentry *d = list_entry(node, struct dentry, d_u.d_child); 874 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
875
876 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
876 list_del_init(node); 877 list_del_init(node);
877 if (d->d_inode) { 878 if (d->d_inode) {
878 /* This should never be called on a cgroup 879 /* This should never be called on a cgroup
879 * directory with child cgroups */ 880 * directory with child cgroups */
880 BUG_ON(d->d_inode->i_mode & S_IFDIR); 881 BUG_ON(d->d_inode->i_mode & S_IFDIR);
881 d = dget_locked(d); 882 dget_dlock(d);
882 spin_unlock(&dcache_lock); 883 spin_unlock(&d->d_lock);
884 spin_unlock(&dentry->d_lock);
883 d_delete(d); 885 d_delete(d);
884 simple_unlink(dentry->d_inode, d); 886 simple_unlink(dentry->d_inode, d);
885 dput(d); 887 dput(d);
886 spin_lock(&dcache_lock); 888 spin_lock(&dentry->d_lock);
887 } 889 } else
890 spin_unlock(&d->d_lock);
888 node = dentry->d_subdirs.next; 891 node = dentry->d_subdirs.next;
889 } 892 }
890 spin_unlock(&dcache_lock); 893 spin_unlock(&dentry->d_lock);
891} 894}
892 895
893/* 896/*
@@ -895,11 +898,16 @@ static void cgroup_clear_directory(struct dentry *dentry)
895 */ 898 */
896static void cgroup_d_remove_dir(struct dentry *dentry) 899static void cgroup_d_remove_dir(struct dentry *dentry)
897{ 900{
901 struct dentry *parent;
902
898 cgroup_clear_directory(dentry); 903 cgroup_clear_directory(dentry);
899 904
900 spin_lock(&dcache_lock); 905 parent = dentry->d_parent;
906 spin_lock(&parent->d_lock);
907 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
901 list_del_init(&dentry->d_u.d_child); 908 list_del_init(&dentry->d_u.d_child);
902 spin_unlock(&dcache_lock); 909 spin_unlock(&dentry->d_lock);
910 spin_unlock(&parent->d_lock);
903 remove_dir(dentry); 911 remove_dir(dentry);
904} 912}
905 913
@@ -1040,6 +1048,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1040 seq_puts(seq, ",noprefix"); 1048 seq_puts(seq, ",noprefix");
1041 if (strlen(root->release_agent_path)) 1049 if (strlen(root->release_agent_path))
1042 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1050 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1051 if (clone_children(&root->top_cgroup))
1052 seq_puts(seq, ",clone_children");
1043 if (strlen(root->name)) 1053 if (strlen(root->name))
1044 seq_printf(seq, ",name=%s", root->name); 1054 seq_printf(seq, ",name=%s", root->name);
1045 mutex_unlock(&cgroup_mutex); 1055 mutex_unlock(&cgroup_mutex);
@@ -1050,6 +1060,7 @@ struct cgroup_sb_opts {
1050 unsigned long subsys_bits; 1060 unsigned long subsys_bits;
1051 unsigned long flags; 1061 unsigned long flags;
1052 char *release_agent; 1062 char *release_agent;
1063 bool clone_children;
1053 char *name; 1064 char *name;
1054 /* User explicitly requested empty subsystem */ 1065 /* User explicitly requested empty subsystem */
1055 bool none; 1066 bool none;
@@ -1066,7 +1077,8 @@ struct cgroup_sb_opts {
1066 */ 1077 */
1067static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1078static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1068{ 1079{
1069 char *token, *o = data ?: "all"; 1080 char *token, *o = data;
1081 bool all_ss = false, one_ss = false;
1070 unsigned long mask = (unsigned long)-1; 1082 unsigned long mask = (unsigned long)-1;
1071 int i; 1083 int i;
1072 bool module_pin_failed = false; 1084 bool module_pin_failed = false;
@@ -1082,22 +1094,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1082 while ((token = strsep(&o, ",")) != NULL) { 1094 while ((token = strsep(&o, ",")) != NULL) {
1083 if (!*token) 1095 if (!*token)
1084 return -EINVAL; 1096 return -EINVAL;
1085 if (!strcmp(token, "all")) { 1097 if (!strcmp(token, "none")) {
1086 /* Add all non-disabled subsystems */
1087 opts->subsys_bits = 0;
1088 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1089 struct cgroup_subsys *ss = subsys[i];
1090 if (ss == NULL)
1091 continue;
1092 if (!ss->disabled)
1093 opts->subsys_bits |= 1ul << i;
1094 }
1095 } else if (!strcmp(token, "none")) {
1096 /* Explicitly have no subsystems */ 1098 /* Explicitly have no subsystems */
1097 opts->none = true; 1099 opts->none = true;
1098 } else if (!strcmp(token, "noprefix")) { 1100 continue;
1101 }
1102 if (!strcmp(token, "all")) {
1103 /* Mutually exclusive option 'all' + subsystem name */
1104 if (one_ss)
1105 return -EINVAL;
1106 all_ss = true;
1107 continue;
1108 }
1109 if (!strcmp(token, "noprefix")) {
1099 set_bit(ROOT_NOPREFIX, &opts->flags); 1110 set_bit(ROOT_NOPREFIX, &opts->flags);
1100 } else if (!strncmp(token, "release_agent=", 14)) { 1111 continue;
1112 }
1113 if (!strcmp(token, "clone_children")) {
1114 opts->clone_children = true;
1115 continue;
1116 }
1117 if (!strncmp(token, "release_agent=", 14)) {
1101 /* Specifying two release agents is forbidden */ 1118 /* Specifying two release agents is forbidden */
1102 if (opts->release_agent) 1119 if (opts->release_agent)
1103 return -EINVAL; 1120 return -EINVAL;
@@ -1105,7 +1122,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1105 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); 1122 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1106 if (!opts->release_agent) 1123 if (!opts->release_agent)
1107 return -ENOMEM; 1124 return -ENOMEM;
1108 } else if (!strncmp(token, "name=", 5)) { 1125 continue;
1126 }
1127 if (!strncmp(token, "name=", 5)) {
1109 const char *name = token + 5; 1128 const char *name = token + 5;
1110 /* Can't specify an empty name */ 1129 /* Can't specify an empty name */
1111 if (!strlen(name)) 1130 if (!strlen(name))
@@ -1127,20 +1146,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1127 GFP_KERNEL); 1146 GFP_KERNEL);
1128 if (!opts->name) 1147 if (!opts->name)
1129 return -ENOMEM; 1148 return -ENOMEM;
1130 } else { 1149
1131 struct cgroup_subsys *ss; 1150 continue;
1132 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1151 }
1133 ss = subsys[i]; 1152
1134 if (ss == NULL) 1153 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1135 continue; 1154 struct cgroup_subsys *ss = subsys[i];
1136 if (!strcmp(token, ss->name)) { 1155 if (ss == NULL)
1137 if (!ss->disabled) 1156 continue;
1138 set_bit(i, &opts->subsys_bits); 1157 if (strcmp(token, ss->name))
1139 break; 1158 continue;
1140 } 1159 if (ss->disabled)
1141 } 1160 continue;
1142 if (i == CGROUP_SUBSYS_COUNT) 1161
1143 return -ENOENT; 1162 /* Mutually exclusive option 'all' + subsystem name */
1163 if (all_ss)
1164 return -EINVAL;
1165 set_bit(i, &opts->subsys_bits);
1166 one_ss = true;
1167
1168 break;
1169 }
1170 if (i == CGROUP_SUBSYS_COUNT)
1171 return -ENOENT;
1172 }
1173
1174 /*
1175 * If the 'all' option was specified select all the subsystems,
1176 * otherwise 'all, 'none' and a subsystem name options were not
1177 * specified, let's default to 'all'
1178 */
1179 if (all_ss || (!all_ss && !one_ss && !opts->none)) {
1180 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1181 struct cgroup_subsys *ss = subsys[i];
1182 if (ss == NULL)
1183 continue;
1184 if (ss->disabled)
1185 continue;
1186 set_bit(i, &opts->subsys_bits);
1144 } 1187 }
1145 } 1188 }
1146 1189
@@ -1222,7 +1265,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1222 struct cgroup *cgrp = &root->top_cgroup; 1265 struct cgroup *cgrp = &root->top_cgroup;
1223 struct cgroup_sb_opts opts; 1266 struct cgroup_sb_opts opts;
1224 1267
1225 lock_kernel();
1226 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1268 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1227 mutex_lock(&cgroup_mutex); 1269 mutex_lock(&cgroup_mutex);
1228 1270
@@ -1255,7 +1297,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1255 kfree(opts.name); 1297 kfree(opts.name);
1256 mutex_unlock(&cgroup_mutex); 1298 mutex_unlock(&cgroup_mutex);
1257 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1299 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1258 unlock_kernel();
1259 return ret; 1300 return ret;
1260} 1301}
1261 1302
@@ -1357,6 +1398,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1357 strcpy(root->release_agent_path, opts->release_agent); 1398 strcpy(root->release_agent_path, opts->release_agent);
1358 if (opts->name) 1399 if (opts->name)
1359 strcpy(root->name, opts->name); 1400 strcpy(root->name, opts->name);
1401 if (opts->clone_children)
1402 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
1360 return root; 1403 return root;
1361} 1404}
1362 1405
@@ -1400,6 +1443,11 @@ static int cgroup_set_super(struct super_block *sb, void *data)
1400 1443
1401static int cgroup_get_rootdir(struct super_block *sb) 1444static int cgroup_get_rootdir(struct super_block *sb)
1402{ 1445{
1446 static const struct dentry_operations cgroup_dops = {
1447 .d_iput = cgroup_diput,
1448 .d_delete = cgroup_delete,
1449 };
1450
1403 struct inode *inode = 1451 struct inode *inode =
1404 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1452 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1405 struct dentry *dentry; 1453 struct dentry *dentry;
@@ -1417,12 +1465,14 @@ static int cgroup_get_rootdir(struct super_block *sb)
1417 return -ENOMEM; 1465 return -ENOMEM;
1418 } 1466 }
1419 sb->s_root = dentry; 1467 sb->s_root = dentry;
1468 /* for everything else we want ->d_op set */
1469 sb->s_d_op = &cgroup_dops;
1420 return 0; 1470 return 0;
1421} 1471}
1422 1472
1423static int cgroup_get_sb(struct file_system_type *fs_type, 1473static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1424 int flags, const char *unused_dev_name, 1474 int flags, const char *unused_dev_name,
1425 void *data, struct vfsmount *mnt) 1475 void *data)
1426{ 1476{
1427 struct cgroup_sb_opts opts; 1477 struct cgroup_sb_opts opts;
1428 struct cgroupfs_root *root; 1478 struct cgroupfs_root *root;
@@ -1556,10 +1606,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1556 drop_parsed_module_refcounts(opts.subsys_bits); 1606 drop_parsed_module_refcounts(opts.subsys_bits);
1557 } 1607 }
1558 1608
1559 simple_set_mnt(mnt, sb);
1560 kfree(opts.release_agent); 1609 kfree(opts.release_agent);
1561 kfree(opts.name); 1610 kfree(opts.name);
1562 return 0; 1611 return dget(sb->s_root);
1563 1612
1564 drop_new_super: 1613 drop_new_super:
1565 deactivate_locked_super(sb); 1614 deactivate_locked_super(sb);
@@ -1568,8 +1617,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1568 out_err: 1617 out_err:
1569 kfree(opts.release_agent); 1618 kfree(opts.release_agent);
1570 kfree(opts.name); 1619 kfree(opts.name);
1571 1620 return ERR_PTR(ret);
1572 return ret;
1573} 1621}
1574 1622
1575static void cgroup_kill_sb(struct super_block *sb) { 1623static void cgroup_kill_sb(struct super_block *sb) {
@@ -1619,7 +1667,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1619 1667
1620static struct file_system_type cgroup_fs_type = { 1668static struct file_system_type cgroup_fs_type = {
1621 .name = "cgroup", 1669 .name = "cgroup",
1622 .get_sb = cgroup_get_sb, 1670 .mount = cgroup_mount,
1623 .kill_sb = cgroup_kill_sb, 1671 .kill_sb = cgroup_kill_sb,
1624}; 1672};
1625 1673
@@ -1688,6 +1736,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1688} 1736}
1689EXPORT_SYMBOL_GPL(cgroup_path); 1737EXPORT_SYMBOL_GPL(cgroup_path);
1690 1738
1739/*
1740 * cgroup_task_migrate - move a task from one cgroup to another.
1741 *
1742 * 'guarantee' is set if the caller promises that a new css_set for the task
1743 * will already exist. If not set, this function might sleep, and can fail with
1744 * -ENOMEM. Otherwise, it can only fail with -ESRCH.
1745 */
1746static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1747 struct task_struct *tsk, bool guarantee)
1748{
1749 struct css_set *oldcg;
1750 struct css_set *newcg;
1751
1752 /*
1753 * get old css_set. we need to take task_lock and refcount it, because
1754 * an exiting task can change its css_set to init_css_set and drop its
1755 * old one without taking cgroup_mutex.
1756 */
1757 task_lock(tsk);
1758 oldcg = tsk->cgroups;
1759 get_css_set(oldcg);
1760 task_unlock(tsk);
1761
1762 /* locate or allocate a new css_set for this task. */
1763 if (guarantee) {
1764 /* we know the css_set we want already exists. */
1765 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1766 read_lock(&css_set_lock);
1767 newcg = find_existing_css_set(oldcg, cgrp, template);
1768 BUG_ON(!newcg);
1769 get_css_set(newcg);
1770 read_unlock(&css_set_lock);
1771 } else {
1772 might_sleep();
1773 /* find_css_set will give us newcg already referenced. */
1774 newcg = find_css_set(oldcg, cgrp);
1775 if (!newcg) {
1776 put_css_set(oldcg);
1777 return -ENOMEM;
1778 }
1779 }
1780 put_css_set(oldcg);
1781
1782 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
1783 task_lock(tsk);
1784 if (tsk->flags & PF_EXITING) {
1785 task_unlock(tsk);
1786 put_css_set(newcg);
1787 return -ESRCH;
1788 }
1789 rcu_assign_pointer(tsk->cgroups, newcg);
1790 task_unlock(tsk);
1791
1792 /* Update the css_set linked lists if we're using them */
1793 write_lock(&css_set_lock);
1794 if (!list_empty(&tsk->cg_list))
1795 list_move(&tsk->cg_list, &newcg->tasks);
1796 write_unlock(&css_set_lock);
1797
1798 /*
1799 * We just gained a reference on oldcg by taking it from the task. As
1800 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
1801 * it here; it will be freed under RCU.
1802 */
1803 put_css_set(oldcg);
1804
1805 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1806 return 0;
1807}
1808
1691/** 1809/**
1692 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1810 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
1693 * @cgrp: the cgroup the task is attaching to 1811 * @cgrp: the cgroup the task is attaching to
@@ -1698,11 +1816,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
1698 */ 1816 */
1699int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1817int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1700{ 1818{
1701 int retval = 0; 1819 int retval;
1702 struct cgroup_subsys *ss, *failed_ss = NULL; 1820 struct cgroup_subsys *ss, *failed_ss = NULL;
1703 struct cgroup *oldcgrp; 1821 struct cgroup *oldcgrp;
1704 struct css_set *cg;
1705 struct css_set *newcg;
1706 struct cgroupfs_root *root = cgrp->root; 1822 struct cgroupfs_root *root = cgrp->root;
1707 1823
1708 /* Nothing to do if the task is already in that cgroup */ 1824 /* Nothing to do if the task is already in that cgroup */
@@ -1712,7 +1828,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1712 1828
1713 for_each_subsys(root, ss) { 1829 for_each_subsys(root, ss) {
1714 if (ss->can_attach) { 1830 if (ss->can_attach) {
1715 retval = ss->can_attach(ss, cgrp, tsk, false); 1831 retval = ss->can_attach(ss, cgrp, tsk);
1716 if (retval) { 1832 if (retval) {
1717 /* 1833 /*
1718 * Remember on which subsystem the can_attach() 1834 * Remember on which subsystem the can_attach()
@@ -1724,48 +1840,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1724 goto out; 1840 goto out;
1725 } 1841 }
1726 } 1842 }
1843 if (ss->can_attach_task) {
1844 retval = ss->can_attach_task(cgrp, tsk);
1845 if (retval) {
1846 failed_ss = ss;
1847 goto out;
1848 }
1849 }
1727 } 1850 }
1728 1851
1729 task_lock(tsk); 1852 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
1730 cg = tsk->cgroups; 1853 if (retval)
1731 get_css_set(cg);
1732 task_unlock(tsk);
1733 /*
1734 * Locate or allocate a new css_set for this task,
1735 * based on its final set of cgroups
1736 */
1737 newcg = find_css_set(cg, cgrp);
1738 put_css_set(cg);
1739 if (!newcg) {
1740 retval = -ENOMEM;
1741 goto out;
1742 }
1743
1744 task_lock(tsk);
1745 if (tsk->flags & PF_EXITING) {
1746 task_unlock(tsk);
1747 put_css_set(newcg);
1748 retval = -ESRCH;
1749 goto out; 1854 goto out;
1750 }
1751 rcu_assign_pointer(tsk->cgroups, newcg);
1752 task_unlock(tsk);
1753
1754 /* Update the css_set linked lists if we're using them */
1755 write_lock(&css_set_lock);
1756 if (!list_empty(&tsk->cg_list)) {
1757 list_del(&tsk->cg_list);
1758 list_add(&tsk->cg_list, &newcg->tasks);
1759 }
1760 write_unlock(&css_set_lock);
1761 1855
1762 for_each_subsys(root, ss) { 1856 for_each_subsys(root, ss) {
1857 if (ss->pre_attach)
1858 ss->pre_attach(cgrp);
1859 if (ss->attach_task)
1860 ss->attach_task(cgrp, tsk);
1763 if (ss->attach) 1861 if (ss->attach)
1764 ss->attach(ss, cgrp, oldcgrp, tsk, false); 1862 ss->attach(ss, cgrp, oldcgrp, tsk);
1765 } 1863 }
1766 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1864
1767 synchronize_rcu(); 1865 synchronize_rcu();
1768 put_css_set(cg);
1769 1866
1770 /* 1867 /*
1771 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1868 * wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1784,7 +1881,7 @@ out:
1784 */ 1881 */
1785 break; 1882 break;
1786 if (ss->cancel_attach) 1883 if (ss->cancel_attach)
1787 ss->cancel_attach(ss, cgrp, tsk, false); 1884 ss->cancel_attach(ss, cgrp, tsk);
1788 } 1885 }
1789 } 1886 }
1790 return retval; 1887 return retval;
@@ -1815,49 +1912,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1815EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 1912EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
1816 1913
1817/* 1914/*
1818 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex 1915 * cgroup_attach_proc works in two stages, the first of which prefetches all
1819 * held. May take task_lock of task 1916 * new css_sets needed (to make sure we have enough memory before committing
1917 * to the move) and stores them in a list of entries of the following type.
1918 * TODO: possible optimization: use css_set->rcu_head for chaining instead
1919 */
1920struct cg_list_entry {
1921 struct css_set *cg;
1922 struct list_head links;
1923};
1924
1925static bool css_set_check_fetched(struct cgroup *cgrp,
1926 struct task_struct *tsk, struct css_set *cg,
1927 struct list_head *newcg_list)
1928{
1929 struct css_set *newcg;
1930 struct cg_list_entry *cg_entry;
1931 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1932
1933 read_lock(&css_set_lock);
1934 newcg = find_existing_css_set(cg, cgrp, template);
1935 if (newcg)
1936 get_css_set(newcg);
1937 read_unlock(&css_set_lock);
1938
1939 /* doesn't exist at all? */
1940 if (!newcg)
1941 return false;
1942 /* see if it's already in the list */
1943 list_for_each_entry(cg_entry, newcg_list, links) {
1944 if (cg_entry->cg == newcg) {
1945 put_css_set(newcg);
1946 return true;
1947 }
1948 }
1949
1950 /* not found */
1951 put_css_set(newcg);
1952 return false;
1953}
1954
1955/*
1956 * Find the new css_set and store it in the list in preparation for moving the
1957 * given task to the given cgroup. Returns 0 or -ENOMEM.
1958 */
1959static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
1960 struct list_head *newcg_list)
1961{
1962 struct css_set *newcg;
1963 struct cg_list_entry *cg_entry;
1964
1965 /* ensure a new css_set will exist for this thread */
1966 newcg = find_css_set(cg, cgrp);
1967 if (!newcg)
1968 return -ENOMEM;
1969 /* add it to the list */
1970 cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
1971 if (!cg_entry) {
1972 put_css_set(newcg);
1973 return -ENOMEM;
1974 }
1975 cg_entry->cg = newcg;
1976 list_add(&cg_entry->links, newcg_list);
1977 return 0;
1978}
1979
1980/**
1981 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
1982 * @cgrp: the cgroup to attach to
1983 * @leader: the threadgroup leader task_struct of the group to be attached
1984 *
1985 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
1986 * take task_lock of each thread in leader's threadgroup individually in turn.
1820 */ 1987 */
1821static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) 1988int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
1989{
1990 int retval, i, group_size;
1991 struct cgroup_subsys *ss, *failed_ss = NULL;
1992 bool cancel_failed_ss = false;
1993 /* guaranteed to be initialized later, but the compiler needs this */
1994 struct cgroup *oldcgrp = NULL;
1995 struct css_set *oldcg;
1996 struct cgroupfs_root *root = cgrp->root;
1997 /* threadgroup list cursor and array */
1998 struct task_struct *tsk;
1999 struct flex_array *group;
2000 /*
2001 * we need to make sure we have css_sets for all the tasks we're
2002 * going to move -before- we actually start moving them, so that in
2003 * case we get an ENOMEM we can bail out before making any changes.
2004 */
2005 struct list_head newcg_list;
2006 struct cg_list_entry *cg_entry, *temp_nobe;
2007
2008 /*
2009 * step 0: in order to do expensive, possibly blocking operations for
2010 * every thread, we cannot iterate the thread group list, since it needs
2011 * rcu or tasklist locked. instead, build an array of all threads in the
2012 * group - threadgroup_fork_lock prevents new threads from appearing,
2013 * and if threads exit, this will just be an over-estimate.
2014 */
2015 group_size = get_nr_threads(leader);
2016 /* flex_array supports very large thread-groups better than kmalloc. */
2017 group = flex_array_alloc(sizeof(struct task_struct *), group_size,
2018 GFP_KERNEL);
2019 if (!group)
2020 return -ENOMEM;
2021 /* pre-allocate to guarantee space while iterating in rcu read-side. */
2022 retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
2023 if (retval)
2024 goto out_free_group_list;
2025
2026 /* prevent changes to the threadgroup list while we take a snapshot. */
2027 rcu_read_lock();
2028 if (!thread_group_leader(leader)) {
2029 /*
2030 * a race with de_thread from another thread's exec() may strip
2031 * us of our leadership, making while_each_thread unsafe to use
2032 * on this task. if this happens, there is no choice but to
2033 * throw this task away and try again (from cgroup_procs_write);
2034 * this is "double-double-toil-and-trouble-check locking".
2035 */
2036 rcu_read_unlock();
2037 retval = -EAGAIN;
2038 goto out_free_group_list;
2039 }
2040 /* take a reference on each task in the group to go in the array. */
2041 tsk = leader;
2042 i = 0;
2043 do {
2044 /* as per above, nr_threads may decrease, but not increase. */
2045 BUG_ON(i >= group_size);
2046 get_task_struct(tsk);
2047 /*
2048 * saying GFP_ATOMIC has no effect here because we did prealloc
2049 * earlier, but it's good form to communicate our expectations.
2050 */
2051 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
2052 BUG_ON(retval != 0);
2053 i++;
2054 } while_each_thread(leader, tsk);
2055 /* remember the number of threads in the array for later. */
2056 group_size = i;
2057 rcu_read_unlock();
2058
2059 /*
2060 * step 1: check that we can legitimately attach to the cgroup.
2061 */
2062 for_each_subsys(root, ss) {
2063 if (ss->can_attach) {
2064 retval = ss->can_attach(ss, cgrp, leader);
2065 if (retval) {
2066 failed_ss = ss;
2067 goto out_cancel_attach;
2068 }
2069 }
2070 /* a callback to be run on every thread in the threadgroup. */
2071 if (ss->can_attach_task) {
2072 /* run on each task in the threadgroup. */
2073 for (i = 0; i < group_size; i++) {
2074 tsk = flex_array_get_ptr(group, i);
2075 retval = ss->can_attach_task(cgrp, tsk);
2076 if (retval) {
2077 failed_ss = ss;
2078 cancel_failed_ss = true;
2079 goto out_cancel_attach;
2080 }
2081 }
2082 }
2083 }
2084
2085 /*
2086 * step 2: make sure css_sets exist for all threads to be migrated.
2087 * we use find_css_set, which allocates a new one if necessary.
2088 */
2089 INIT_LIST_HEAD(&newcg_list);
2090 for (i = 0; i < group_size; i++) {
2091 tsk = flex_array_get_ptr(group, i);
2092 /* nothing to do if this task is already in the cgroup */
2093 oldcgrp = task_cgroup_from_root(tsk, root);
2094 if (cgrp == oldcgrp)
2095 continue;
2096 /* get old css_set pointer */
2097 task_lock(tsk);
2098 if (tsk->flags & PF_EXITING) {
2099 /* ignore this task if it's going away */
2100 task_unlock(tsk);
2101 continue;
2102 }
2103 oldcg = tsk->cgroups;
2104 get_css_set(oldcg);
2105 task_unlock(tsk);
 2106 /* check whether the css_set we need is already on the list */
2107 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2108 /* was already there, nothing to do. */
2109 put_css_set(oldcg);
2110 } else {
2111 /* we don't already have it. get new one. */
2112 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2113 put_css_set(oldcg);
2114 if (retval)
2115 goto out_list_teardown;
2116 }
2117 }
2118
2119 /*
2120 * step 3: now that we're guaranteed success wrt the css_sets, proceed
2121 * to move all tasks to the new cgroup, calling ss->attach_task for each
2122 * one along the way. there are no failure cases after here, so this is
2123 * the commit point.
2124 */
2125 for_each_subsys(root, ss) {
2126 if (ss->pre_attach)
2127 ss->pre_attach(cgrp);
2128 }
2129 for (i = 0; i < group_size; i++) {
2130 tsk = flex_array_get_ptr(group, i);
2131 /* leave current thread as it is if it's already there */
2132 oldcgrp = task_cgroup_from_root(tsk, root);
2133 if (cgrp == oldcgrp)
2134 continue;
2135 /* attach each task to each subsystem */
2136 for_each_subsys(root, ss) {
2137 if (ss->attach_task)
2138 ss->attach_task(cgrp, tsk);
2139 }
2140 /* if the thread is PF_EXITING, it can just get skipped. */
2141 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2142 BUG_ON(retval != 0 && retval != -ESRCH);
2143 }
2144 /* nothing is sensitive to fork() after this point. */
2145
2146 /*
2147 * step 4: do expensive, non-thread-specific subsystem callbacks.
2148 * TODO: if ever a subsystem needs to know the oldcgrp for each task
2149 * being moved, this call will need to be reworked to communicate that.
2150 */
2151 for_each_subsys(root, ss) {
2152 if (ss->attach)
2153 ss->attach(ss, cgrp, oldcgrp, leader);
2154 }
2155
2156 /*
2157 * step 5: success! and cleanup
2158 */
2159 synchronize_rcu();
2160 cgroup_wakeup_rmdir_waiter(cgrp);
2161 retval = 0;
2162out_list_teardown:
2163 /* clean up the list of prefetched css_sets. */
2164 list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
2165 list_del(&cg_entry->links);
2166 put_css_set(cg_entry->cg);
2167 kfree(cg_entry);
2168 }
2169out_cancel_attach:
2170 /* same deal as in cgroup_attach_task */
2171 if (retval) {
2172 for_each_subsys(root, ss) {
2173 if (ss == failed_ss) {
2174 if (cancel_failed_ss && ss->cancel_attach)
2175 ss->cancel_attach(ss, cgrp, leader);
2176 break;
2177 }
2178 if (ss->cancel_attach)
2179 ss->cancel_attach(ss, cgrp, leader);
2180 }
2181 }
2182 /* clean up the array of referenced threads in the group. */
2183 for (i = 0; i < group_size; i++) {
2184 tsk = flex_array_get_ptr(group, i);
2185 put_task_struct(tsk);
2186 }
2187out_free_group_list:
2188 flex_array_free(group);
2189 return retval;
2190}
2191
2192/*
2193 * Find the task_struct of the task to attach by vpid and pass it along to the
2194 * function to attach either it or all tasks in its threadgroup. Will take
2195 * cgroup_mutex; may take task_lock of task.
2196 */
2197static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
1822{ 2198{
1823 struct task_struct *tsk; 2199 struct task_struct *tsk;
1824 const struct cred *cred = current_cred(), *tcred; 2200 const struct cred *cred = current_cred(), *tcred;
1825 int ret; 2201 int ret;
1826 2202
2203 if (!cgroup_lock_live_group(cgrp))
2204 return -ENODEV;
2205
1827 if (pid) { 2206 if (pid) {
1828 rcu_read_lock(); 2207 rcu_read_lock();
1829 tsk = find_task_by_vpid(pid); 2208 tsk = find_task_by_vpid(pid);
1830 if (!tsk || tsk->flags & PF_EXITING) { 2209 if (!tsk) {
2210 rcu_read_unlock();
2211 cgroup_unlock();
2212 return -ESRCH;
2213 }
2214 if (threadgroup) {
2215 /*
2216 * RCU protects this access, since tsk was found in the
2217 * tid map. a race with de_thread may cause group_leader
2218 * to stop being the leader, but cgroup_attach_proc will
2219 * detect it later.
2220 */
2221 tsk = tsk->group_leader;
2222 } else if (tsk->flags & PF_EXITING) {
2223 /* optimization for the single-task-only case */
1831 rcu_read_unlock(); 2224 rcu_read_unlock();
2225 cgroup_unlock();
1832 return -ESRCH; 2226 return -ESRCH;
1833 } 2227 }
1834 2228
2229 /*
2230 * even if we're attaching all tasks in the thread group, we
2231 * only need to check permissions on one of them.
2232 */
1835 tcred = __task_cred(tsk); 2233 tcred = __task_cred(tsk);
1836 if (cred->euid && 2234 if (cred->euid &&
1837 cred->euid != tcred->uid && 2235 cred->euid != tcred->uid &&
1838 cred->euid != tcred->suid) { 2236 cred->euid != tcred->suid) {
1839 rcu_read_unlock(); 2237 rcu_read_unlock();
2238 cgroup_unlock();
1840 return -EACCES; 2239 return -EACCES;
1841 } 2240 }
1842 get_task_struct(tsk); 2241 get_task_struct(tsk);
1843 rcu_read_unlock(); 2242 rcu_read_unlock();
1844 } else { 2243 } else {
1845 tsk = current; 2244 if (threadgroup)
2245 tsk = current->group_leader;
2246 else
2247 tsk = current;
1846 get_task_struct(tsk); 2248 get_task_struct(tsk);
1847 } 2249 }
1848 2250
1849 ret = cgroup_attach_task(cgrp, tsk); 2251 if (threadgroup) {
2252 threadgroup_fork_write_lock(tsk);
2253 ret = cgroup_attach_proc(cgrp, tsk);
2254 threadgroup_fork_write_unlock(tsk);
2255 } else {
2256 ret = cgroup_attach_task(cgrp, tsk);
2257 }
1850 put_task_struct(tsk); 2258 put_task_struct(tsk);
2259 cgroup_unlock();
1851 return ret; 2260 return ret;
1852} 2261}
1853 2262
1854static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2263static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1855{ 2264{
2265 return attach_task_by_pid(cgrp, pid, false);
2266}
2267
2268static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2269{
1856 int ret; 2270 int ret;
1857 if (!cgroup_lock_live_group(cgrp)) 2271 do {
1858 return -ENODEV; 2272 /*
1859 ret = attach_task_by_pid(cgrp, pid); 2273 * attach_proc fails with -EAGAIN if threadgroup leadership
1860 cgroup_unlock(); 2274 * changes in the middle of the operation, in which case we need
2275 * to find the task_struct for the new leader and start over.
2276 */
2277 ret = attach_task_by_pid(cgrp, tgid, true);
2278 } while (ret == -EAGAIN);
1861 return ret; 2279 return ret;
1862} 2280}
1863 2281
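One detail worth calling out from the hunk above: cgroup_attach_proc() cannot block while walking the thread group (the walk needs RCU or tasklist_lock held), so it first snapshots the threads into a pre-allocated flex_array under rcu_read_lock() and only then does the blocking per-thread work on that snapshot. A minimal userspace sketch of the same snapshot-then-work-outside-the-lock shape, with a mutex and malloc standing in for RCU and flex_array (all names here are illustrative, not kernel APIs):

/* Sketch: copy a shared list into storage allocated up front while holding
 * the lock, then do the slow/blocking work on the snapshot with the lock
 * dropped.  Mutex + malloc stand in for RCU + flex_array. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define NITEMS 8

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int shared_list[NITEMS] = { 1, 2, 3, 4, 5, 6, 7, 8 };

int main(void)
{
	int *snapshot;
	int i, n;

	/* pre-allocate before taking the lock: no allocation on the "read side" */
	snapshot = malloc(NITEMS * sizeof(*snapshot));
	if (!snapshot)
		return 1;

	pthread_mutex_lock(&list_lock);
	for (n = 0; n < NITEMS; n++)
		snapshot[n] = shared_list[n];	/* cheap copies only, no blocking here */
	pthread_mutex_unlock(&list_lock);

	for (i = 0; i < n; i++) {
		usleep(1000);			/* blocking work happens with the lock dropped */
		printf("processed %d\n", snapshot[i]);
	}

	free(snapshot);
	return 0;
}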
@@ -1883,6 +2301,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1883 const char *buffer) 2301 const char *buffer)
1884{ 2302{
1885 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 2303 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2304 if (strlen(buffer) >= PATH_MAX)
2305 return -EINVAL;
1886 if (!cgroup_lock_live_group(cgrp)) 2306 if (!cgroup_lock_live_group(cgrp))
1887 return -ENODEV; 2307 return -ENODEV;
1888 strcpy(cgrp->root->release_agent_path, buffer); 2308 strcpy(cgrp->root->release_agent_path, buffer);
@@ -2140,12 +2560,20 @@ static const struct file_operations cgroup_file_operations = {
2140}; 2560};
2141 2561
2142static const struct inode_operations cgroup_dir_inode_operations = { 2562static const struct inode_operations cgroup_dir_inode_operations = {
2143 .lookup = simple_lookup, 2563 .lookup = cgroup_lookup,
2144 .mkdir = cgroup_mkdir, 2564 .mkdir = cgroup_mkdir,
2145 .rmdir = cgroup_rmdir, 2565 .rmdir = cgroup_rmdir,
2146 .rename = cgroup_rename, 2566 .rename = cgroup_rename,
2147}; 2567};
2148 2568
2569static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2570{
2571 if (dentry->d_name.len > NAME_MAX)
2572 return ERR_PTR(-ENAMETOOLONG);
2573 d_add(dentry, NULL);
2574 return NULL;
2575}
2576
2149/* 2577/*
2150 * Check if a file is a control file 2578 * Check if a file is a control file
2151 */ 2579 */
@@ -2159,10 +2587,6 @@ static inline struct cftype *__file_cft(struct file *file)
2159static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2587static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2160 struct super_block *sb) 2588 struct super_block *sb)
2161{ 2589{
2162 static const struct dentry_operations cgroup_dops = {
2163 .d_iput = cgroup_diput,
2164 };
2165
2166 struct inode *inode; 2590 struct inode *inode;
2167 2591
2168 if (!dentry) 2592 if (!dentry)
@@ -2188,7 +2612,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2188 inode->i_size = 0; 2612 inode->i_size = 0;
2189 inode->i_fop = &cgroup_file_operations; 2613 inode->i_fop = &cgroup_file_operations;
2190 } 2614 }
2191 dentry->d_op = &cgroup_dops;
2192 d_instantiate(dentry, inode); 2615 d_instantiate(dentry, inode);
2193 dget(dentry); /* Extra count - pin the dentry in core */ 2616 dget(dentry); /* Extra count - pin the dentry in core */
2194 return 0; 2617 return 0;
@@ -3176,6 +3599,23 @@ fail:
3176 return ret; 3599 return ret;
3177} 3600}
3178 3601
3602static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3603 struct cftype *cft)
3604{
3605 return clone_children(cgrp);
3606}
3607
3608static int cgroup_clone_children_write(struct cgroup *cgrp,
3609 struct cftype *cft,
3610 u64 val)
3611{
3612 if (val)
3613 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3614 else
3615 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3616 return 0;
3617}
3618
3179/* 3619/*
3180 * for the common functions, 'private' gives the type of file 3620 * for the common functions, 'private' gives the type of file
3181 */ 3621 */
@@ -3192,9 +3632,9 @@ static struct cftype files[] = {
3192 { 3632 {
3193 .name = CGROUP_FILE_GENERIC_PREFIX "procs", 3633 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
3194 .open = cgroup_procs_open, 3634 .open = cgroup_procs_open,
3195 /* .write_u64 = cgroup_procs_write, TODO */ 3635 .write_u64 = cgroup_procs_write,
3196 .release = cgroup_pidlist_release, 3636 .release = cgroup_pidlist_release,
3197 .mode = S_IRUGO, 3637 .mode = S_IRUGO | S_IWUSR,
3198 }, 3638 },
3199 { 3639 {
3200 .name = "notify_on_release", 3640 .name = "notify_on_release",
@@ -3206,6 +3646,11 @@ static struct cftype files[] = {
3206 .write_string = cgroup_write_event_control, 3646 .write_string = cgroup_write_event_control,
3207 .mode = S_IWUGO, 3647 .mode = S_IWUGO,
3208 }, 3648 },
3649 {
3650 .name = "cgroup.clone_children",
3651 .read_u64 = cgroup_clone_children_read,
3652 .write_u64 = cgroup_clone_children_write,
3653 },
3209}; 3654};
3210 3655
3211static struct cftype cft_release_agent = { 3656static struct cftype cft_release_agent = {
@@ -3335,6 +3780,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3335 if (notify_on_release(parent)) 3780 if (notify_on_release(parent))
3336 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3781 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3337 3782
3783 if (clone_children(parent))
3784 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3785
3338 for_each_subsys(root, ss) { 3786 for_each_subsys(root, ss) {
3339 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3787 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3340 3788
@@ -3349,6 +3797,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3349 goto err_destroy; 3797 goto err_destroy;
3350 } 3798 }
3351 /* At error, ->destroy() callback has to free assigned ID. */ 3799 /* At error, ->destroy() callback has to free assigned ID. */
3800 if (clone_children(parent) && ss->post_clone)
3801 ss->post_clone(ss, cgrp);
3352 } 3802 }
3353 3803
3354 cgroup_lock_hierarchy(root); 3804 cgroup_lock_hierarchy(root);
@@ -3563,17 +4013,15 @@ again:
3563 spin_lock(&release_list_lock); 4013 spin_lock(&release_list_lock);
3564 set_bit(CGRP_REMOVED, &cgrp->flags); 4014 set_bit(CGRP_REMOVED, &cgrp->flags);
3565 if (!list_empty(&cgrp->release_list)) 4015 if (!list_empty(&cgrp->release_list))
3566 list_del(&cgrp->release_list); 4016 list_del_init(&cgrp->release_list);
3567 spin_unlock(&release_list_lock); 4017 spin_unlock(&release_list_lock);
3568 4018
3569 cgroup_lock_hierarchy(cgrp->root); 4019 cgroup_lock_hierarchy(cgrp->root);
3570 /* delete this cgroup from parent->children */ 4020 /* delete this cgroup from parent->children */
3571 list_del(&cgrp->sibling); 4021 list_del_init(&cgrp->sibling);
3572 cgroup_unlock_hierarchy(cgrp->root); 4022 cgroup_unlock_hierarchy(cgrp->root);
3573 4023
3574 spin_lock(&cgrp->dentry->d_lock);
3575 d = dget(cgrp->dentry); 4024 d = dget(cgrp->dentry);
3576 spin_unlock(&d->d_lock);
3577 4025
3578 cgroup_d_remove_dir(d); 4026 cgroup_d_remove_dir(d);
3579 dput(d); 4027 dput(d);
@@ -3789,7 +4237,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
3789 subsys[ss->subsys_id] = NULL; 4237 subsys[ss->subsys_id] = NULL;
3790 4238
3791 /* remove subsystem from rootnode's list of subsystems */ 4239 /* remove subsystem from rootnode's list of subsystems */
3792 list_del(&ss->sibling); 4240 list_del_init(&ss->sibling);
3793 4241
3794 /* 4242 /*
3795 * disentangle the css from all css_sets attached to the dummytop. as 4243 * disentangle the css from all css_sets attached to the dummytop. as
@@ -4140,20 +4588,8 @@ void cgroup_post_fork(struct task_struct *child)
4140 */ 4588 */
4141void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4589void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4142{ 4590{
4143 int i;
4144 struct css_set *cg; 4591 struct css_set *cg;
4145 4592 int i;
4146 if (run_callbacks && need_forkexit_callback) {
4147 /*
4148 * modular subsystems can't use callbacks, so no need to lock
4149 * the subsys array
4150 */
4151 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4152 struct cgroup_subsys *ss = subsys[i];
4153 if (ss->exit)
4154 ss->exit(ss, tsk);
4155 }
4156 }
4157 4593
4158 /* 4594 /*
4159 * Unlink from the css_set task list if necessary. 4595 * Unlink from the css_set task list if necessary.
@@ -4163,7 +4599,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4163 if (!list_empty(&tsk->cg_list)) { 4599 if (!list_empty(&tsk->cg_list)) {
4164 write_lock(&css_set_lock); 4600 write_lock(&css_set_lock);
4165 if (!list_empty(&tsk->cg_list)) 4601 if (!list_empty(&tsk->cg_list))
4166 list_del(&tsk->cg_list); 4602 list_del_init(&tsk->cg_list);
4167 write_unlock(&css_set_lock); 4603 write_unlock(&css_set_lock);
4168 } 4604 }
4169 4605
@@ -4171,125 +4607,26 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4171 task_lock(tsk); 4607 task_lock(tsk);
4172 cg = tsk->cgroups; 4608 cg = tsk->cgroups;
4173 tsk->cgroups = &init_css_set; 4609 tsk->cgroups = &init_css_set;
4174 task_unlock(tsk);
4175 if (cg)
4176 put_css_set_taskexit(cg);
4177}
4178
4179/**
4180 * cgroup_clone - clone the cgroup the given subsystem is attached to
4181 * @tsk: the task to be moved
4182 * @subsys: the given subsystem
4183 * @nodename: the name for the new cgroup
4184 *
4185 * Duplicate the current cgroup in the hierarchy that the given
4186 * subsystem is attached to, and move this task into the new
4187 * child.
4188 */
4189int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
4190 char *nodename)
4191{
4192 struct dentry *dentry;
4193 int ret = 0;
4194 struct cgroup *parent, *child;
4195 struct inode *inode;
4196 struct css_set *cg;
4197 struct cgroupfs_root *root;
4198 struct cgroup_subsys *ss;
4199
4200 /* We shouldn't be called by an unregistered subsystem */
4201 BUG_ON(!subsys->active);
4202
4203 /* First figure out what hierarchy and cgroup we're dealing
4204 * with, and pin them so we can drop cgroup_mutex */
4205 mutex_lock(&cgroup_mutex);
4206 again:
4207 root = subsys->root;
4208 if (root == &rootnode) {
4209 mutex_unlock(&cgroup_mutex);
4210 return 0;
4211 }
4212 4610
4213 /* Pin the hierarchy */ 4611 if (run_callbacks && need_forkexit_callback) {
4214 if (!atomic_inc_not_zero(&root->sb->s_active)) { 4612 /*
4215 /* We race with the final deactivate_super() */ 4613 * modular subsystems can't use callbacks, so no need to lock
4216 mutex_unlock(&cgroup_mutex); 4614 * the subsys array
4217 return 0; 4615 */
4616 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4617 struct cgroup_subsys *ss = subsys[i];
4618 if (ss->exit) {
4619 struct cgroup *old_cgrp =
4620 rcu_dereference_raw(cg->subsys[i])->cgroup;
4621 struct cgroup *cgrp = task_cgroup(tsk, i);
4622 ss->exit(ss, cgrp, old_cgrp, tsk);
4623 }
4624 }
4218 } 4625 }
4219
4220 /* Keep the cgroup alive */
4221 task_lock(tsk);
4222 parent = task_cgroup(tsk, subsys->subsys_id);
4223 cg = tsk->cgroups;
4224 get_css_set(cg);
4225 task_unlock(tsk); 4626 task_unlock(tsk);
4226 4627
4227 mutex_unlock(&cgroup_mutex); 4628 if (cg)
4228 4629 put_css_set_taskexit(cg);
4229 /* Now do the VFS work to create a cgroup */
4230 inode = parent->dentry->d_inode;
4231
4232 /* Hold the parent directory mutex across this operation to
4233 * stop anyone else deleting the new cgroup */
4234 mutex_lock(&inode->i_mutex);
4235 dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
4236 if (IS_ERR(dentry)) {
4237 printk(KERN_INFO
4238 "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
4239 PTR_ERR(dentry));
4240 ret = PTR_ERR(dentry);
4241 goto out_release;
4242 }
4243
4244 /* Create the cgroup directory, which also creates the cgroup */
4245 ret = vfs_mkdir(inode, dentry, 0755);
4246 child = __d_cgrp(dentry);
4247 dput(dentry);
4248 if (ret) {
4249 printk(KERN_INFO
4250 "Failed to create cgroup %s: %d\n", nodename,
4251 ret);
4252 goto out_release;
4253 }
4254
4255 /* The cgroup now exists. Retake cgroup_mutex and check
4256 * that we're still in the same state that we thought we
4257 * were. */
4258 mutex_lock(&cgroup_mutex);
4259 if ((root != subsys->root) ||
4260 (parent != task_cgroup(tsk, subsys->subsys_id))) {
4261 /* Aargh, we raced ... */
4262 mutex_unlock(&inode->i_mutex);
4263 put_css_set(cg);
4264
4265 deactivate_super(root->sb);
4266 /* The cgroup is still accessible in the VFS, but
4267 * we're not going to try to rmdir() it at this
4268 * point. */
4269 printk(KERN_INFO
4270 "Race in cgroup_clone() - leaking cgroup %s\n",
4271 nodename);
4272 goto again;
4273 }
4274
4275 /* do any required auto-setup */
4276 for_each_subsys(root, ss) {
4277 if (ss->post_clone)
4278 ss->post_clone(ss, child);
4279 }
4280
4281 /* All seems fine. Finish by moving the task into the new cgroup */
4282 ret = cgroup_attach_task(child, tsk);
4283 mutex_unlock(&cgroup_mutex);
4284
4285 out_release:
4286 mutex_unlock(&inode->i_mutex);
4287
4288 mutex_lock(&cgroup_mutex);
4289 put_css_set(cg);
4290 mutex_unlock(&cgroup_mutex);
4291 deactivate_super(root->sb);
4292 return ret;
4293} 4630}
4294 4631
4295/** 4632/**
@@ -4530,14 +4867,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
4530 return ret; 4867 return ret;
4531} 4868}
4532 4869
4533static void __free_css_id_cb(struct rcu_head *head)
4534{
4535 struct css_id *id;
4536
4537 id = container_of(head, struct css_id, rcu_head);
4538 kfree(id);
4539}
4540
4541void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 4870void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4542{ 4871{
4543 struct css_id *id = css->id; 4872 struct css_id *id = css->id;
@@ -4552,7 +4881,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4552 spin_lock(&ss->id_lock); 4881 spin_lock(&ss->id_lock);
4553 idr_remove(&ss->idr, id->id); 4882 idr_remove(&ss->idr, id->id);
4554 spin_unlock(&ss->id_lock); 4883 spin_unlock(&ss->id_lock);
4555 call_rcu(&id->rcu_head, __free_css_id_cb); 4884 kfree_rcu(id, rcu_head);
4556} 4885}
4557EXPORT_SYMBOL_GPL(free_css_id); 4886EXPORT_SYMBOL_GPL(free_css_id);
4558 4887
@@ -4723,6 +5052,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
4723 return ret; 5052 return ret;
4724} 5053}
4725 5054
5055/*
5056 * get corresponding css from file open on cgroupfs directory
5057 */
5058struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5059{
5060 struct cgroup *cgrp;
5061 struct inode *inode;
5062 struct cgroup_subsys_state *css;
5063
5064 inode = f->f_dentry->d_inode;
5065 /* check in cgroup filesystem dir */
5066 if (inode->i_op != &cgroup_dir_inode_operations)
5067 return ERR_PTR(-EBADF);
5068
5069 if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
5070 return ERR_PTR(-EINVAL);
5071
5072 /* get cgroup */
5073 cgrp = __d_cgrp(f->f_dentry);
5074 css = cgrp->subsys[id];
5075 return css ? css : ERR_PTR(-ENOENT);
5076}
5077
4726#ifdef CONFIG_CGROUP_DEBUG 5078#ifdef CONFIG_CGROUP_DEBUG
4727static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 5079static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
4728 struct cgroup *cont) 5080 struct cgroup *cont)
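The bulk of the kernel/cgroup.c changes above introduce cgroup_attach_proc(), which splits moving a whole thread group into a prefetch phase (allocate every css_set that might be needed, so any -ENOMEM happens before shared state is touched) and a commit phase with no failure paths. A minimal userspace sketch of that prefetch-then-commit shape, using purely illustrative names rather than kernel APIs:

/* Sketch of the prefetch-then-commit pattern: allocate everything that could
 * fail up front, keep it on a list, and only then mutate state.  None of
 * these names are kernel APIs. */
#include <stdio.h>
#include <stdlib.h>

struct prefetched {
	int payload;			/* stands in for a css_set */
	struct prefetched *next;
};

static int prefetch_one(struct prefetched **list, int payload)
{
	struct prefetched *p = malloc(sizeof(*p));
	if (!p)
		return -1;		/* like -ENOMEM: nothing has changed yet */
	p->payload = payload;
	p->next = *list;
	*list = p;
	return 0;
}

static void commit_all(struct prefetched *list)
{
	/* commit point: no allocation, no failure paths from here on */
	for (; list; list = list->next)
		printf("committing %d\n", list->payload);
}

static void teardown(struct prefetched *list)
{
	while (list) {
		struct prefetched *next = list->next;
		free(list);
		list = next;
	}
}

int main(void)
{
	struct prefetched *list = NULL;
	int i;

	for (i = 0; i < 4; i++) {
		if (prefetch_one(&list, i)) {	/* bail out before committing anything */
			teardown(list);
			return 1;
		}
	}
	commit_all(list);
	teardown(list);
	return 0;
}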
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index ce71ed53e88f..e691818d7e45 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 48 struct freezer, css);
49} 49}
50 50
51int cgroup_freezing_or_frozen(struct task_struct *task) 51static inline int __cgroup_freezing_or_frozen(struct task_struct *task)
52{ 52{
53 struct freezer *freezer; 53 enum freezer_state state = task_freezer(task)->state;
54 enum freezer_state state; 54 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
55}
55 56
57int cgroup_freezing_or_frozen(struct task_struct *task)
58{
59 int result;
56 task_lock(task); 60 task_lock(task);
57 freezer = task_freezer(task); 61 result = __cgroup_freezing_or_frozen(task);
58 if (!freezer->css.cgroup->parent)
59 state = CGROUP_THAWED; /* root cgroup can't be frozen */
60 else
61 state = freezer->state;
62 task_unlock(task); 62 task_unlock(task);
63 63 return result;
64 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
65} 64}
66 65
67/* 66/*
@@ -154,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss,
154 kfree(cgroup_freezer(cgroup)); 153 kfree(cgroup_freezer(cgroup));
155} 154}
156 155
157/* Task is frozen or will freeze immediately when next it gets woken */
158static bool is_task_frozen_enough(struct task_struct *task)
159{
160 return frozen(task) ||
161 (task_is_stopped_or_traced(task) && freezing(task));
162}
163
164/* 156/*
165 * The call to cgroup_lock() in the freezer.state write method prevents 157 * The call to cgroup_lock() in the freezer.state write method prevents
166 * a write to that file racing against an attach, and hence the 158 * a write to that file racing against an attach, and hence the
@@ -168,37 +160,29 @@ static bool is_task_frozen_enough(struct task_struct *task)
168 */ 160 */
169static int freezer_can_attach(struct cgroup_subsys *ss, 161static int freezer_can_attach(struct cgroup_subsys *ss,
170 struct cgroup *new_cgroup, 162 struct cgroup *new_cgroup,
171 struct task_struct *task, bool threadgroup) 163 struct task_struct *task)
172{ 164{
173 struct freezer *freezer; 165 struct freezer *freezer;
174 166
175 /* 167 /*
176 * Anything frozen can't move or be moved to/from. 168 * Anything frozen can't move or be moved to/from.
177 *
178 * Since orig_freezer->state == FROZEN means that @task has been
179 * frozen, so it's sufficient to check the latter condition.
180 */ 169 */
181 170
182 if (is_task_frozen_enough(task))
183 return -EBUSY;
184
185 freezer = cgroup_freezer(new_cgroup); 171 freezer = cgroup_freezer(new_cgroup);
186 if (freezer->state == CGROUP_FROZEN) 172 if (freezer->state != CGROUP_THAWED)
187 return -EBUSY; 173 return -EBUSY;
188 174
189 if (threadgroup) { 175 return 0;
190 struct task_struct *c; 176}
191 177
192 rcu_read_lock(); 178static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
193 list_for_each_entry_rcu(c, &task->thread_group, thread_group) { 179{
194 if (is_task_frozen_enough(c)) { 180 rcu_read_lock();
195 rcu_read_unlock(); 181 if (__cgroup_freezing_or_frozen(tsk)) {
196 return -EBUSY;
197 }
198 }
199 rcu_read_unlock(); 182 rcu_read_unlock();
183 return -EBUSY;
200 } 184 }
201 185 rcu_read_unlock();
202 return 0; 186 return 0;
203} 187}
204 188
@@ -236,31 +220,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
236/* 220/*
237 * caller must hold freezer->lock 221 * caller must hold freezer->lock
238 */ 222 */
239static void update_freezer_state(struct cgroup *cgroup, 223static void update_if_frozen(struct cgroup *cgroup,
240 struct freezer *freezer) 224 struct freezer *freezer)
241{ 225{
242 struct cgroup_iter it; 226 struct cgroup_iter it;
243 struct task_struct *task; 227 struct task_struct *task;
244 unsigned int nfrozen = 0, ntotal = 0; 228 unsigned int nfrozen = 0, ntotal = 0;
229 enum freezer_state old_state = freezer->state;
245 230
246 cgroup_iter_start(cgroup, &it); 231 cgroup_iter_start(cgroup, &it);
247 while ((task = cgroup_iter_next(cgroup, &it))) { 232 while ((task = cgroup_iter_next(cgroup, &it))) {
248 ntotal++; 233 ntotal++;
249 if (is_task_frozen_enough(task)) 234 if (frozen(task))
250 nfrozen++; 235 nfrozen++;
251 } 236 }
252 237
253 /* 238 if (old_state == CGROUP_THAWED) {
254 * Transition to FROZEN when no new tasks can be added ensures 239 BUG_ON(nfrozen > 0);
255 * that we never exist in the FROZEN state while there are unfrozen 240 } else if (old_state == CGROUP_FREEZING) {
256 * tasks. 241 if (nfrozen == ntotal)
257 */ 242 freezer->state = CGROUP_FROZEN;
258 if (nfrozen == ntotal) 243 } else { /* old_state == CGROUP_FROZEN */
259 freezer->state = CGROUP_FROZEN; 244 BUG_ON(nfrozen != ntotal);
260 else if (nfrozen > 0) 245 }
261 freezer->state = CGROUP_FREEZING; 246
262 else
263 freezer->state = CGROUP_THAWED;
264 cgroup_iter_end(cgroup, &it); 247 cgroup_iter_end(cgroup, &it);
265} 248}
266 249
@@ -279,7 +262,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
279 if (state == CGROUP_FREEZING) { 262 if (state == CGROUP_FREEZING) {
280 /* We change from FREEZING to FROZEN lazily if the cgroup was 263 /* We change from FREEZING to FROZEN lazily if the cgroup was
281 * only partially frozen when we exited write. */ 264 * only partially frozen when we exited write. */
282 update_freezer_state(cgroup, freezer); 265 update_if_frozen(cgroup, freezer);
283 state = freezer->state; 266 state = freezer->state;
284 } 267 }
285 spin_unlock_irq(&freezer->lock); 268 spin_unlock_irq(&freezer->lock);
@@ -301,7 +284,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
301 while ((task = cgroup_iter_next(cgroup, &it))) { 284 while ((task = cgroup_iter_next(cgroup, &it))) {
302 if (!freeze_task(task, true)) 285 if (!freeze_task(task, true))
303 continue; 286 continue;
304 if (is_task_frozen_enough(task)) 287 if (frozen(task))
305 continue; 288 continue;
306 if (!freezing(task) && !freezer_should_skip(task)) 289 if (!freezing(task) && !freezer_should_skip(task))
307 num_cant_freeze_now++; 290 num_cant_freeze_now++;
@@ -335,7 +318,7 @@ static int freezer_change_state(struct cgroup *cgroup,
335 318
336 spin_lock_irq(&freezer->lock); 319 spin_lock_irq(&freezer->lock);
337 320
338 update_freezer_state(cgroup, freezer); 321 update_if_frozen(cgroup, freezer);
339 if (goal_state == freezer->state) 322 if (goal_state == freezer->state)
340 goto out; 323 goto out;
341 324
@@ -398,6 +381,9 @@ struct cgroup_subsys freezer_subsys = {
398 .populate = freezer_populate, 381 .populate = freezer_populate,
399 .subsys_id = freezer_subsys_id, 382 .subsys_id = freezer_subsys_id,
400 .can_attach = freezer_can_attach, 383 .can_attach = freezer_can_attach,
384 .can_attach_task = freezer_can_attach_task,
385 .pre_attach = NULL,
386 .attach_task = NULL,
401 .attach = NULL, 387 .attach = NULL,
402 .fork = freezer_fork, 388 .fork = freezer_fork,
403 .exit = NULL, 389 .exit = NULL,
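The freezer changes replace the threadgroup-aware can_attach() with a per-task can_attach_task() hook: the cgroup core now iterates the threads itself and lets the subsystem veto one task at a time. A rough userspace sketch of that group-check-plus-per-member-veto split (hypothetical names, not the cgroup_subsys API):

/* Sketch: one group-level precondition plus a per-member veto hook,
 * mirroring the can_attach()/can_attach_task() split.  Names are invented. */
#include <stdio.h>
#include <stdbool.h>

struct member { int id; bool frozen; };

static int group_can_attach(bool target_thawed)
{
	/* group-wide precondition, like "target freezer must be THAWED" */
	return target_thawed ? 0 : -1;
}

static int member_can_attach(const struct member *m)
{
	/* per-member veto, like __cgroup_freezing_or_frozen() */
	return m->frozen ? -1 : 0;
}

int main(void)
{
	struct member group[] = { {1, false}, {2, false}, {3, true} };
	int i;

	if (group_can_attach(true)) {
		puts("group check failed");
		return 1;
	}
	for (i = 0; i < 3; i++) {
		if (member_can_attach(&group[i])) {
			printf("member %d vetoed the attach\n", group[i].id);
			return 1;	/* the caller would run cancel_attach here */
		}
	}
	puts("attach allowed");
	return 0;
}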
diff --git a/kernel/compat.c b/kernel/compat.c
index c9e2ec0b34a8..fc9eb093acd5 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o,
52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; 52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
53} 53}
54 54
55static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
56{
57 memset(txc, 0, sizeof(struct timex));
58
59 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
60 __get_user(txc->modes, &utp->modes) ||
61 __get_user(txc->offset, &utp->offset) ||
62 __get_user(txc->freq, &utp->freq) ||
63 __get_user(txc->maxerror, &utp->maxerror) ||
64 __get_user(txc->esterror, &utp->esterror) ||
65 __get_user(txc->status, &utp->status) ||
66 __get_user(txc->constant, &utp->constant) ||
67 __get_user(txc->precision, &utp->precision) ||
68 __get_user(txc->tolerance, &utp->tolerance) ||
69 __get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
70 __get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
71 __get_user(txc->tick, &utp->tick) ||
72 __get_user(txc->ppsfreq, &utp->ppsfreq) ||
73 __get_user(txc->jitter, &utp->jitter) ||
74 __get_user(txc->shift, &utp->shift) ||
75 __get_user(txc->stabil, &utp->stabil) ||
76 __get_user(txc->jitcnt, &utp->jitcnt) ||
77 __get_user(txc->calcnt, &utp->calcnt) ||
78 __get_user(txc->errcnt, &utp->errcnt) ||
79 __get_user(txc->stbcnt, &utp->stbcnt))
80 return -EFAULT;
81
82 return 0;
83}
84
85static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
86{
87 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
88 __put_user(txc->modes, &utp->modes) ||
89 __put_user(txc->offset, &utp->offset) ||
90 __put_user(txc->freq, &utp->freq) ||
91 __put_user(txc->maxerror, &utp->maxerror) ||
92 __put_user(txc->esterror, &utp->esterror) ||
93 __put_user(txc->status, &utp->status) ||
94 __put_user(txc->constant, &utp->constant) ||
95 __put_user(txc->precision, &utp->precision) ||
96 __put_user(txc->tolerance, &utp->tolerance) ||
97 __put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
98 __put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
99 __put_user(txc->tick, &utp->tick) ||
100 __put_user(txc->ppsfreq, &utp->ppsfreq) ||
101 __put_user(txc->jitter, &utp->jitter) ||
102 __put_user(txc->shift, &utp->shift) ||
103 __put_user(txc->stabil, &utp->stabil) ||
104 __put_user(txc->jitcnt, &utp->jitcnt) ||
105 __put_user(txc->calcnt, &utp->calcnt) ||
106 __put_user(txc->errcnt, &utp->errcnt) ||
107 __put_user(txc->stbcnt, &utp->stbcnt) ||
108 __put_user(txc->tai, &utp->tai))
109 return -EFAULT;
110 return 0;
111}
112
55asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, 113asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
56 struct timezone __user *tz) 114 struct timezone __user *tz)
57{ 115{
@@ -235,6 +293,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
235 return compat_jiffies_to_clock_t(jiffies); 293 return compat_jiffies_to_clock_t(jiffies);
236} 294}
237 295
296#ifdef __ARCH_WANT_SYS_SIGPENDING
297
238/* 298/*
239 * Assumption: old_sigset_t and compat_old_sigset_t are both 299 * Assumption: old_sigset_t and compat_old_sigset_t are both
240 * types that can be passed to put_user()/get_user(). 300 * types that can be passed to put_user()/get_user().
@@ -254,6 +314,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
254 return ret; 314 return ret;
255} 315}
256 316
317#endif
318
319#ifdef __ARCH_WANT_SYS_SIGPROCMASK
320
257asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, 321asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
258 compat_old_sigset_t __user *oset) 322 compat_old_sigset_t __user *oset)
259{ 323{
@@ -275,6 +339,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
275 return ret; 339 return ret;
276} 340}
277 341
342#endif
343
278asmlinkage long compat_sys_setrlimit(unsigned int resource, 344asmlinkage long compat_sys_setrlimit(unsigned int resource,
279 struct compat_rlimit __user *rlim) 345 struct compat_rlimit __user *rlim)
280{ 346{
@@ -617,6 +683,29 @@ long compat_sys_clock_gettime(clockid_t which_clock,
617 return err; 683 return err;
618} 684}
619 685
686long compat_sys_clock_adjtime(clockid_t which_clock,
687 struct compat_timex __user *utp)
688{
689 struct timex txc;
690 mm_segment_t oldfs;
691 int err, ret;
692
693 err = compat_get_timex(&txc, utp);
694 if (err)
695 return err;
696
697 oldfs = get_fs();
698 set_fs(KERNEL_DS);
699 ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
700 set_fs(oldfs);
701
702 err = compat_put_timex(utp, &txc);
703 if (err)
704 return err;
705
706 return ret;
707}
708
620long compat_sys_clock_getres(clockid_t which_clock, 709long compat_sys_clock_getres(clockid_t which_clock,
621 struct compat_timespec __user *tp) 710 struct compat_timespec __user *tp)
622{ 711{
@@ -809,10 +898,9 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
809{ 898{
810 compat_sigset_t s32; 899 compat_sigset_t s32;
811 sigset_t s; 900 sigset_t s;
812 int sig;
813 struct timespec t; 901 struct timespec t;
814 siginfo_t info; 902 siginfo_t info;
815 long ret, timeout = 0; 903 long ret;
816 904
817 if (sigsetsize != sizeof(sigset_t)) 905 if (sigsetsize != sizeof(sigset_t))
818 return -EINVAL; 906 return -EINVAL;
@@ -820,51 +908,19 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
820 if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) 908 if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
821 return -EFAULT; 909 return -EFAULT;
822 sigset_from_compat(&s, &s32); 910 sigset_from_compat(&s, &s32);
823 sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP));
824 signotset(&s);
825 911
826 if (uts) { 912 if (uts) {
827 if (get_compat_timespec (&t, uts)) 913 if (get_compat_timespec(&t, uts))
828 return -EFAULT; 914 return -EFAULT;
829 if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
830 || t.tv_sec < 0)
831 return -EINVAL;
832 } 915 }
833 916
834 spin_lock_irq(&current->sighand->siglock); 917 ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
835 sig = dequeue_signal(current, &s, &info);
836 if (!sig) {
837 timeout = MAX_SCHEDULE_TIMEOUT;
838 if (uts)
839 timeout = timespec_to_jiffies(&t)
840 +(t.tv_sec || t.tv_nsec);
841 if (timeout) {
842 current->real_blocked = current->blocked;
843 sigandsets(&current->blocked, &current->blocked, &s);
844
845 recalc_sigpending();
846 spin_unlock_irq(&current->sighand->siglock);
847
848 timeout = schedule_timeout_interruptible(timeout);
849
850 spin_lock_irq(&current->sighand->siglock);
851 sig = dequeue_signal(current, &s, &info);
852 current->blocked = current->real_blocked;
853 siginitset(&current->real_blocked, 0);
854 recalc_sigpending();
855 }
856 }
857 spin_unlock_irq(&current->sighand->siglock);
858 918
859 if (sig) { 919 if (ret > 0 && uinfo) {
860 ret = sig; 920 if (copy_siginfo_to_user32(uinfo, &info))
861 if (uinfo) { 921 ret = -EFAULT;
862 if (copy_siginfo_to_user32(uinfo, &info))
863 ret = -EFAULT;
864 }
865 }else {
866 ret = timeout?-EINTR:-EAGAIN;
867 } 922 }
923
868 return ret; 924 return ret;
869 925
870} 926}
@@ -951,58 +1007,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
951asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) 1007asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
952{ 1008{
953 struct timex txc; 1009 struct timex txc;
954 int ret; 1010 int err, ret;
955
956 memset(&txc, 0, sizeof(struct timex));
957 1011
958 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || 1012 err = compat_get_timex(&txc, utp);
959 __get_user(txc.modes, &utp->modes) || 1013 if (err)
960 __get_user(txc.offset, &utp->offset) || 1014 return err;
961 __get_user(txc.freq, &utp->freq) ||
962 __get_user(txc.maxerror, &utp->maxerror) ||
963 __get_user(txc.esterror, &utp->esterror) ||
964 __get_user(txc.status, &utp->status) ||
965 __get_user(txc.constant, &utp->constant) ||
966 __get_user(txc.precision, &utp->precision) ||
967 __get_user(txc.tolerance, &utp->tolerance) ||
968 __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
969 __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
970 __get_user(txc.tick, &utp->tick) ||
971 __get_user(txc.ppsfreq, &utp->ppsfreq) ||
972 __get_user(txc.jitter, &utp->jitter) ||
973 __get_user(txc.shift, &utp->shift) ||
974 __get_user(txc.stabil, &utp->stabil) ||
975 __get_user(txc.jitcnt, &utp->jitcnt) ||
976 __get_user(txc.calcnt, &utp->calcnt) ||
977 __get_user(txc.errcnt, &utp->errcnt) ||
978 __get_user(txc.stbcnt, &utp->stbcnt))
979 return -EFAULT;
980 1015
981 ret = do_adjtimex(&txc); 1016 ret = do_adjtimex(&txc);
982 1017
983 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || 1018 err = compat_put_timex(utp, &txc);
984 __put_user(txc.modes, &utp->modes) || 1019 if (err)
985 __put_user(txc.offset, &utp->offset) || 1020 return err;
986 __put_user(txc.freq, &utp->freq) ||
987 __put_user(txc.maxerror, &utp->maxerror) ||
988 __put_user(txc.esterror, &utp->esterror) ||
989 __put_user(txc.status, &utp->status) ||
990 __put_user(txc.constant, &utp->constant) ||
991 __put_user(txc.precision, &utp->precision) ||
992 __put_user(txc.tolerance, &utp->tolerance) ||
993 __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
994 __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
995 __put_user(txc.tick, &utp->tick) ||
996 __put_user(txc.ppsfreq, &utp->ppsfreq) ||
997 __put_user(txc.jitter, &utp->jitter) ||
998 __put_user(txc.shift, &utp->shift) ||
999 __put_user(txc.stabil, &utp->stabil) ||
1000 __put_user(txc.jitcnt, &utp->jitcnt) ||
1001 __put_user(txc.calcnt, &utp->calcnt) ||
1002 __put_user(txc.errcnt, &utp->errcnt) ||
1003 __put_user(txc.stbcnt, &utp->stbcnt) ||
1004 __put_user(txc.tai, &utp->tai))
1005 ret = -EFAULT;
1006 1021
1007 return ret; 1022 return ret;
1008} 1023}
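In kernel/compat.c the long field-by-field __get_user()/__put_user() sequences are hoisted into compat_get_timex() and compat_put_timex(), which both compat_sys_adjtimex() and the new compat_sys_clock_adjtime() reuse. The same "marshal in, act, marshal out" factoring in plain C, with an invented two-field wire struct rather than the real timex layout:

/* Sketch of hoisting copy-in/copy-out into shared helpers so two entry
 * points share one marshalling routine.  The struct layout is made up. */
#include <stdio.h>
#include <string.h>

struct wire   { int mode; long offset; };	/* "compat" layout (illustrative) */
struct native { int mode; long offset; };	/* native layout */

static int get_native(struct native *n, const struct wire *w)
{
	memset(n, 0, sizeof(*n));
	n->mode = w->mode;
	n->offset = w->offset;
	return 0;		/* a real helper would return -EFAULT on a bad copy */
}

static int put_wire(struct wire *w, const struct native *n)
{
	w->mode = n->mode;
	w->offset = n->offset;
	return 0;
}

static long do_adjust(struct native *n)	/* stands in for do_adjtimex() */
{
	n->offset += 1;
	return 0;
}

static long entry_adjtimex(struct wire *u)	/* both entry points share the helpers */
{
	struct native n;
	long ret;

	if (get_native(&n, u))
		return -1;
	ret = do_adjust(&n);
	if (put_wire(u, &n))
		return -1;
	return ret;
}

int main(void)
{
	struct wire u = { .mode = 1, .offset = 41 };
	long ret = entry_adjtimex(&u);
	printf("ret=%ld offset=%ld\n", ret, u.offset);
	return 0;
}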
diff --git a/kernel/configs.c b/kernel/configs.c
index abaee684ecbf..b4066b44a99d 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -66,6 +66,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
66static const struct file_operations ikconfig_file_ops = { 66static const struct file_operations ikconfig_file_ops = {
67 .owner = THIS_MODULE, 67 .owner = THIS_MODULE,
68 .read = ikconfig_read_current, 68 .read = ikconfig_read_current,
69 .llseek = default_llseek,
69}; 70};
70 71
71static int __init ikconfig_init(void) 72static int __init ikconfig_init(void)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f18491..12b7458f23b1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -126,7 +126,7 @@ static void cpu_hotplug_done(void)
126#else /* #if CONFIG_HOTPLUG_CPU */ 126#else /* #if CONFIG_HOTPLUG_CPU */
127static void cpu_hotplug_begin(void) {} 127static void cpu_hotplug_begin(void) {}
128static void cpu_hotplug_done(void) {} 128static void cpu_hotplug_done(void) {}
129#endif /* #esle #if CONFIG_HOTPLUG_CPU */ 129#endif /* #else #if CONFIG_HOTPLUG_CPU */
130 130
131/* Need to know about CPUs going up/down? */ 131/* Need to know about CPUs going up/down? */
132int __ref register_cpu_notifier(struct notifier_block *nb) 132int __ref register_cpu_notifier(struct notifier_block *nb)
@@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v)
160{ 160{
161 BUG_ON(cpu_notify(val, v)); 161 BUG_ON(cpu_notify(val, v));
162} 162}
163
164EXPORT_SYMBOL(register_cpu_notifier); 163EXPORT_SYMBOL(register_cpu_notifier);
165 164
166void __ref unregister_cpu_notifier(struct notifier_block *nb) 165void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -189,7 +188,6 @@ static inline void check_for_tasks(int cpu)
189} 188}
190 189
191struct take_cpu_down_param { 190struct take_cpu_down_param {
192 struct task_struct *caller;
193 unsigned long mod; 191 unsigned long mod;
194 void *hcpu; 192 void *hcpu;
195}; 193};
@@ -198,7 +196,6 @@ struct take_cpu_down_param {
198static int __ref take_cpu_down(void *_param) 196static int __ref take_cpu_down(void *_param)
199{ 197{
200 struct take_cpu_down_param *param = _param; 198 struct take_cpu_down_param *param = _param;
201 unsigned int cpu = (unsigned long)param->hcpu;
202 int err; 199 int err;
203 200
204 /* Ensure this CPU doesn't handle any more interrupts. */ 201 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -207,12 +204,6 @@ static int __ref take_cpu_down(void *_param)
207 return err; 204 return err;
208 205
209 cpu_notify(CPU_DYING | param->mod, param->hcpu); 206 cpu_notify(CPU_DYING | param->mod, param->hcpu);
210
211 if (task_cpu(param->caller) == cpu)
212 move_task_off_dead_cpu(cpu, param->caller);
213 /* Force idle task to run as soon as we yield: it should
214 immediately notice cpu is offline and die quickly. */
215 sched_idle_next();
216 return 0; 207 return 0;
217} 208}
218 209
@@ -223,7 +214,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 void *hcpu = (void *)(long)cpu; 214 void *hcpu = (void *)(long)cpu;
224 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 215 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
225 struct take_cpu_down_param tcd_param = { 216 struct take_cpu_down_param tcd_param = {
226 .caller = current,
227 .mod = mod, 217 .mod = mod,
228 .hcpu = hcpu, 218 .hcpu = hcpu,
229 }; 219 };
@@ -235,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
235 return -EINVAL; 225 return -EINVAL;
236 226
237 cpu_hotplug_begin(); 227 cpu_hotplug_begin();
228
238 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); 229 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
239 if (err) { 230 if (err) {
240 nr_calls--; 231 nr_calls--;
@@ -253,9 +244,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
253 } 244 }
254 BUG_ON(cpu_online(cpu)); 245 BUG_ON(cpu_online(cpu));
255 246
256 /* Wait for it to sleep (leaving idle task). */ 247 /*
248 * The migration_call() CPU_DYING callback will have removed all
249 * runnable tasks from the cpu, there's only the idle task left now
250 * that the migration thread is done doing the stop_machine thing.
251 *
252 * Wait for the stop thread to go away.
253 */
257 while (!idle_cpu(cpu)) 254 while (!idle_cpu(cpu))
258 yield(); 255 cpu_relax();
259 256
260 /* This actually kills the CPU. */ 257 /* This actually kills the CPU. */
261 __cpu_die(cpu); 258 __cpu_die(cpu);
@@ -306,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
306 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 303 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
307 if (ret) { 304 if (ret) {
308 nr_calls--; 305 nr_calls--;
309 printk("%s: attempt to bring up CPU %u failed\n", 306 printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
310 __func__, cpu); 307 __func__, cpu);
311 goto out_notify; 308 goto out_notify;
312 } 309 }
@@ -386,6 +383,14 @@ out:
386#ifdef CONFIG_PM_SLEEP_SMP 383#ifdef CONFIG_PM_SLEEP_SMP
387static cpumask_var_t frozen_cpus; 384static cpumask_var_t frozen_cpus;
388 385
386void __weak arch_disable_nonboot_cpus_begin(void)
387{
388}
389
390void __weak arch_disable_nonboot_cpus_end(void)
391{
392}
393
389int disable_nonboot_cpus(void) 394int disable_nonboot_cpus(void)
390{ 395{
391 int cpu, first_cpu, error = 0; 396 int cpu, first_cpu, error = 0;
@@ -397,6 +402,7 @@ int disable_nonboot_cpus(void)
397 * with the userspace trying to use the CPU hotplug at the same time 402 * with the userspace trying to use the CPU hotplug at the same time
398 */ 403 */
399 cpumask_clear(frozen_cpus); 404 cpumask_clear(frozen_cpus);
405 arch_disable_nonboot_cpus_begin();
400 406
401 printk("Disabling non-boot CPUs ...\n"); 407 printk("Disabling non-boot CPUs ...\n");
402 for_each_online_cpu(cpu) { 408 for_each_online_cpu(cpu) {
@@ -412,6 +418,8 @@ int disable_nonboot_cpus(void)
412 } 418 }
413 } 419 }
414 420
421 arch_disable_nonboot_cpus_end();
422
415 if (!error) { 423 if (!error) {
416 BUG_ON(num_online_cpus() > 1); 424 BUG_ON(num_online_cpus() > 1);
417 /* Make sure the CPUs won't be enabled by someone else */ 425 /* Make sure the CPUs won't be enabled by someone else */
@@ -441,14 +449,14 @@ void __ref enable_nonboot_cpus(void)
441 if (cpumask_empty(frozen_cpus)) 449 if (cpumask_empty(frozen_cpus))
442 goto out; 450 goto out;
443 451
444 printk("Enabling non-boot CPUs ...\n"); 452 printk(KERN_INFO "Enabling non-boot CPUs ...\n");
445 453
446 arch_enable_nonboot_cpus_begin(); 454 arch_enable_nonboot_cpus_begin();
447 455
448 for_each_cpu(cpu, frozen_cpus) { 456 for_each_cpu(cpu, frozen_cpus) {
449 error = _cpu_up(cpu, 1); 457 error = _cpu_up(cpu, 1);
450 if (!error) { 458 if (!error) {
451 printk("CPU%d is up\n", cpu); 459 printk(KERN_INFO "CPU%d is up\n", cpu);
452 continue; 460 continue;
453 } 461 }
454 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 462 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
@@ -500,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
500 */ 508 */
501 509
502/* cpu_bit_bitmap[0] is empty - so we can back into it */ 510/* cpu_bit_bitmap[0] is empty - so we can back into it */
503#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) 511#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x))
504#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) 512#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
505#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) 513#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
506#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) 514#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
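
[Annotation] The other cpu.c change of note is the pair of __weak arch_disable_nonboot_cpus_begin()/_end() hooks bracketing the suspend-time hot-unplug loop, mirroring the existing arch_enable_nonboot_cpus_begin()/_end() pair used on resume. A minimal sketch of how an architecture could override them; the empty-body weak defaults above are the fallback, and the bodies below are purely illustrative:

/* arch/<arch>/kernel/smp.c -- illustrative only */
void arch_disable_nonboot_cpus_begin(void)
{
	/* e.g. pre-arm a firmware-assisted fast path before CPUs go down */
}

void arch_disable_nonboot_cpus_end(void)
{
	/* undo whatever _begin() prepared once all non-boot CPUs are down */
}
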
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b23c0979bbe7..9c9b7545c810 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -231,18 +231,17 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock);
231 * users. If someone tries to mount the "cpuset" filesystem, we 231 * users. If someone tries to mount the "cpuset" filesystem, we
232 * silently switch it to mount "cgroup" instead 232 * silently switch it to mount "cgroup" instead
233 */ 233 */
234static int cpuset_get_sb(struct file_system_type *fs_type, 234static struct dentry *cpuset_mount(struct file_system_type *fs_type,
235 int flags, const char *unused_dev_name, 235 int flags, const char *unused_dev_name, void *data)
236 void *data, struct vfsmount *mnt)
237{ 236{
238 struct file_system_type *cgroup_fs = get_fs_type("cgroup"); 237 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
239 int ret = -ENODEV; 238 struct dentry *ret = ERR_PTR(-ENODEV);
240 if (cgroup_fs) { 239 if (cgroup_fs) {
241 char mountopts[] = 240 char mountopts[] =
242 "cpuset,noprefix," 241 "cpuset,noprefix,"
243 "release_agent=/sbin/cpuset_release_agent"; 242 "release_agent=/sbin/cpuset_release_agent";
244 ret = cgroup_fs->get_sb(cgroup_fs, flags, 243 ret = cgroup_fs->mount(cgroup_fs, flags,
245 unused_dev_name, mountopts, mnt); 244 unused_dev_name, mountopts);
246 put_filesystem(cgroup_fs); 245 put_filesystem(cgroup_fs);
247 } 246 }
248 return ret; 247 return ret;
@@ -250,7 +249,7 @@ static int cpuset_get_sb(struct file_system_type *fs_type,
250 249
251static struct file_system_type cpuset_fs_type = { 250static struct file_system_type cpuset_fs_type = {
252 .name = "cpuset", 251 .name = "cpuset",
253 .get_sb = cpuset_get_sb, 252 .mount = cpuset_mount,
254}; 253};
255 254
256/* 255/*
@@ -1016,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
1016 struct cpuset *cs; 1015 struct cpuset *cs;
1017 int migrate; 1016 int migrate;
1018 const nodemask_t *oldmem = scan->data; 1017 const nodemask_t *oldmem = scan->data;
1019 NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); 1018 static nodemask_t newmems; /* protected by cgroup_mutex */
1020
1021 if (!newmems)
1022 return;
1023 1019
1024 cs = cgroup_cs(scan->cg); 1020 cs = cgroup_cs(scan->cg);
1025 guarantee_online_mems(cs, newmems); 1021 guarantee_online_mems(cs, &newmems);
1026 1022
1027 cpuset_change_task_nodemask(p, newmems); 1023 cpuset_change_task_nodemask(p, &newmems);
1028
1029 NODEMASK_FREE(newmems);
1030 1024
1031 mm = get_task_mm(p); 1025 mm = get_task_mm(p);
1032 if (!mm) 1026 if (!mm)
@@ -1165,7 +1159,7 @@ int current_cpuset_is_being_rebound(void)
1165static int update_relax_domain_level(struct cpuset *cs, s64 val) 1159static int update_relax_domain_level(struct cpuset *cs, s64 val)
1166{ 1160{
1167#ifdef CONFIG_SMP 1161#ifdef CONFIG_SMP
1168 if (val < -1 || val >= SD_LV_MAX) 1162 if (val < -1 || val >= sched_domain_level_max)
1169 return -EINVAL; 1163 return -EINVAL;
1170#endif 1164#endif
1171 1165
@@ -1373,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1373 return val; 1367 return val;
1374} 1368}
1375 1369
1376/* Protected by cgroup_lock */
1377static cpumask_var_t cpus_attach;
1378
1379/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1370/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1380static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1371static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1381 struct task_struct *tsk, bool threadgroup) 1372 struct task_struct *tsk)
1382{ 1373{
1383 int ret;
1384 struct cpuset *cs = cgroup_cs(cont); 1374 struct cpuset *cs = cgroup_cs(cont);
1385 1375
1386 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1376 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1397,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1397 if (tsk->flags & PF_THREAD_BOUND) 1387 if (tsk->flags & PF_THREAD_BOUND)
1398 return -EINVAL; 1388 return -EINVAL;
1399 1389
1400 ret = security_task_setscheduler(tsk, 0, NULL);
1401 if (ret)
1402 return ret;
1403 if (threadgroup) {
1404 struct task_struct *c;
1405
1406 rcu_read_lock();
1407 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1408 ret = security_task_setscheduler(c, 0, NULL);
1409 if (ret) {
1410 rcu_read_unlock();
1411 return ret;
1412 }
1413 }
1414 rcu_read_unlock();
1415 }
1416 return 0; 1390 return 0;
1417} 1391}
1418 1392
1419static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, 1393static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
1420 struct cpuset *cs) 1394{
1395 return security_task_setscheduler(task);
1396}
1397
1398/*
1399 * Protected by cgroup_lock. The nodemasks must be stored globally because
1400 * dynamically allocating them is not allowed in pre_attach, and they must
1401 * persist among pre_attach, attach_task, and attach.
1402 */
1403static cpumask_var_t cpus_attach;
1404static nodemask_t cpuset_attach_nodemask_from;
1405static nodemask_t cpuset_attach_nodemask_to;
1406
1407/* Set-up work for before attaching each task. */
1408static void cpuset_pre_attach(struct cgroup *cont)
1409{
1410 struct cpuset *cs = cgroup_cs(cont);
1411
1412 if (cs == &top_cpuset)
1413 cpumask_copy(cpus_attach, cpu_possible_mask);
1414 else
1415 guarantee_online_cpus(cs, cpus_attach);
1416
1417 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1418}
1419
1420/* Per-thread attachment work. */
1421static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
1421{ 1422{
1422 int err; 1423 int err;
1424 struct cpuset *cs = cgroup_cs(cont);
1425
1423 /* 1426 /*
1424 * can_attach beforehand should guarantee that this doesn't fail. 1427 * can_attach beforehand should guarantee that this doesn't fail.
1425 * TODO: have a better way to handle failure here 1428 * TODO: have a better way to handle failure here
@@ -1427,56 +1430,31 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1427 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1430 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1428 WARN_ON_ONCE(err); 1431 WARN_ON_ONCE(err);
1429 1432
1430 cpuset_change_task_nodemask(tsk, to); 1433 cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
1431 cpuset_update_task_spread_flag(cs, tsk); 1434 cpuset_update_task_spread_flag(cs, tsk);
1432
1433} 1435}
1434 1436
1435static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1437static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1436 struct cgroup *oldcont, struct task_struct *tsk, 1438 struct cgroup *oldcont, struct task_struct *tsk)
1437 bool threadgroup)
1438{ 1439{
1439 struct mm_struct *mm; 1440 struct mm_struct *mm;
1440 struct cpuset *cs = cgroup_cs(cont); 1441 struct cpuset *cs = cgroup_cs(cont);
1441 struct cpuset *oldcs = cgroup_cs(oldcont); 1442 struct cpuset *oldcs = cgroup_cs(oldcont);
1442 NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
1443 NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
1444
1445 if (from == NULL || to == NULL)
1446 goto alloc_fail;
1447 1443
1448 if (cs == &top_cpuset) { 1444 /*
1449 cpumask_copy(cpus_attach, cpu_possible_mask); 1445 * Change mm, possibly for multiple threads in a threadgroup. This is
1450 } else { 1446 * expensive and may sleep.
1451 guarantee_online_cpus(cs, cpus_attach); 1447 */
1452 } 1448 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1453 guarantee_online_mems(cs, to); 1449 cpuset_attach_nodemask_to = cs->mems_allowed;
1454
1455 /* do per-task migration stuff possibly for each in the threadgroup */
1456 cpuset_attach_task(tsk, to, cs);
1457 if (threadgroup) {
1458 struct task_struct *c;
1459 rcu_read_lock();
1460 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1461 cpuset_attach_task(c, to, cs);
1462 }
1463 rcu_read_unlock();
1464 }
1465
1466 /* change mm; only needs to be done once even if threadgroup */
1467 *from = oldcs->mems_allowed;
1468 *to = cs->mems_allowed;
1469 mm = get_task_mm(tsk); 1450 mm = get_task_mm(tsk);
1470 if (mm) { 1451 if (mm) {
1471 mpol_rebind_mm(mm, to); 1452 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1472 if (is_memory_migrate(cs)) 1453 if (is_memory_migrate(cs))
1473 cpuset_migrate_mm(mm, from, to); 1454 cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
1455 &cpuset_attach_nodemask_to);
1474 mmput(mm); 1456 mmput(mm);
1475 } 1457 }
1476
1477alloc_fail:
1478 NODEMASK_FREE(from);
1479 NODEMASK_FREE(to);
1480} 1458}
1481 1459
1482/* The various types of files and directories in a cpuset file system */ 1460/* The various types of files and directories in a cpuset file system */
@@ -1576,8 +1554,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1576 return -ENODEV; 1554 return -ENODEV;
1577 1555
1578 trialcs = alloc_trial_cpuset(cs); 1556 trialcs = alloc_trial_cpuset(cs);
1579 if (!trialcs) 1557 if (!trialcs) {
1580 return -ENOMEM; 1558 retval = -ENOMEM;
1559 goto out;
1560 }
1581 1561
1582 switch (cft->private) { 1562 switch (cft->private) {
1583 case FILE_CPULIST: 1563 case FILE_CPULIST:
@@ -1592,6 +1572,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1592 } 1572 }
1593 1573
1594 free_trial_cpuset(trialcs); 1574 free_trial_cpuset(trialcs);
1575out:
1595 cgroup_unlock(); 1576 cgroup_unlock();
1596 return retval; 1577 return retval;
1597} 1578}
@@ -1608,34 +1589,26 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1608 * across a page fault. 1589 * across a page fault.
1609 */ 1590 */
1610 1591
1611static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) 1592static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1612{ 1593{
1613 int ret; 1594 size_t count;
1614 1595
1615 mutex_lock(&callback_mutex); 1596 mutex_lock(&callback_mutex);
1616 ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); 1597 count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
1617 mutex_unlock(&callback_mutex); 1598 mutex_unlock(&callback_mutex);
1618 1599
1619 return ret; 1600 return count;
1620} 1601}
1621 1602
1622static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1603static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1623{ 1604{
1624 NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); 1605 size_t count;
1625 int retval;
1626
1627 if (mask == NULL)
1628 return -ENOMEM;
1629 1606
1630 mutex_lock(&callback_mutex); 1607 mutex_lock(&callback_mutex);
1631 *mask = cs->mems_allowed; 1608 count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
1632 mutex_unlock(&callback_mutex); 1609 mutex_unlock(&callback_mutex);
1633 1610
1634 retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); 1611 return count;
1635
1636 NODEMASK_FREE(mask);
1637
1638 return retval;
1639} 1612}
1640 1613
1641static ssize_t cpuset_common_file_read(struct cgroup *cont, 1614static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1829,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1829} 1802}
1830 1803
1831/* 1804/*
1832 * post_clone() is called at the end of cgroup_clone(). 1805 * post_clone() is called during cgroup_create() when the
1833 * 'cgroup' was just created automatically as a result of 1806 * clone_children mount argument was specified. The cgroup
1834 * a cgroup_clone(), and the current task is about to 1807 * can not yet have any tasks.
1835 * be moved into 'cgroup'.
1836 * 1808 *
1837 * Currently we refuse to set up the cgroup - thereby 1809 * Currently we refuse to set up the cgroup - thereby
1838 * refusing the task to be entered, and as a result refusing 1810 * refusing the task to be entered, and as a result refusing
@@ -1860,8 +1832,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1860 cs = cgroup_cs(cgroup); 1832 cs = cgroup_cs(cgroup);
1861 parent_cs = cgroup_cs(parent); 1833 parent_cs = cgroup_cs(parent);
1862 1834
1835 mutex_lock(&callback_mutex);
1863 cs->mems_allowed = parent_cs->mems_allowed; 1836 cs->mems_allowed = parent_cs->mems_allowed;
1864 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); 1837 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1838 mutex_unlock(&callback_mutex);
1865 return; 1839 return;
1866} 1840}
1867 1841
@@ -1929,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = {
1929 .create = cpuset_create, 1903 .create = cpuset_create,
1930 .destroy = cpuset_destroy, 1904 .destroy = cpuset_destroy,
1931 .can_attach = cpuset_can_attach, 1905 .can_attach = cpuset_can_attach,
1906 .can_attach_task = cpuset_can_attach_task,
1907 .pre_attach = cpuset_pre_attach,
1908 .attach_task = cpuset_attach_task,
1932 .attach = cpuset_attach, 1909 .attach = cpuset_attach,
1933 .populate = cpuset_populate, 1910 .populate = cpuset_populate,
1934 .post_clone = cpuset_post_clone, 1911 .post_clone = cpuset_post_clone,
@@ -2064,10 +2041,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2064 struct cpuset *cp; /* scans cpusets being updated */ 2041 struct cpuset *cp; /* scans cpusets being updated */
2065 struct cpuset *child; /* scans child cpusets of cp */ 2042 struct cpuset *child; /* scans child cpusets of cp */
2066 struct cgroup *cont; 2043 struct cgroup *cont;
2067 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); 2044 static nodemask_t oldmems; /* protected by cgroup_mutex */
2068
2069 if (oldmems == NULL)
2070 return;
2071 2045
2072 list_add_tail((struct list_head *)&root->stack_list, &queue); 2046 list_add_tail((struct list_head *)&root->stack_list, &queue);
2073 2047
@@ -2084,7 +2058,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2084 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2058 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2085 continue; 2059 continue;
2086 2060
2087 *oldmems = cp->mems_allowed; 2061 oldmems = cp->mems_allowed;
2088 2062
2089 /* Remove offline cpus and mems from this cpuset. */ 2063 /* Remove offline cpus and mems from this cpuset. */
2090 mutex_lock(&callback_mutex); 2064 mutex_lock(&callback_mutex);
@@ -2100,10 +2074,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2100 remove_tasks_in_empty_cpuset(cp); 2074 remove_tasks_in_empty_cpuset(cp);
2101 else { 2075 else {
2102 update_tasks_cpumask(cp, NULL); 2076 update_tasks_cpumask(cp, NULL);
2103 update_tasks_nodemask(cp, oldmems, NULL); 2077 update_tasks_nodemask(cp, &oldmems, NULL);
2104 } 2078 }
2105 } 2079 }
2106 NODEMASK_FREE(oldmems);
2107} 2080}
2108 2081
2109/* 2082/*
@@ -2145,19 +2118,16 @@ void cpuset_update_active_cpus(void)
2145static int cpuset_track_online_nodes(struct notifier_block *self, 2118static int cpuset_track_online_nodes(struct notifier_block *self,
2146 unsigned long action, void *arg) 2119 unsigned long action, void *arg)
2147{ 2120{
2148 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); 2121 static nodemask_t oldmems; /* protected by cgroup_mutex */
2149
2150 if (oldmems == NULL)
2151 return NOTIFY_DONE;
2152 2122
2153 cgroup_lock(); 2123 cgroup_lock();
2154 switch (action) { 2124 switch (action) {
2155 case MEM_ONLINE: 2125 case MEM_ONLINE:
2156 *oldmems = top_cpuset.mems_allowed; 2126 oldmems = top_cpuset.mems_allowed;
2157 mutex_lock(&callback_mutex); 2127 mutex_lock(&callback_mutex);
2158 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2128 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2159 mutex_unlock(&callback_mutex); 2129 mutex_unlock(&callback_mutex);
2160 update_tasks_nodemask(&top_cpuset, oldmems, NULL); 2130 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2161 break; 2131 break;
2162 case MEM_OFFLINE: 2132 case MEM_OFFLINE:
2163 /* 2133 /*
@@ -2171,7 +2141,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2171 } 2141 }
2172 cgroup_unlock(); 2142 cgroup_unlock();
2173 2143
2174 NODEMASK_FREE(oldmems);
2175 return NOTIFY_OK; 2144 return NOTIFY_OK;
2176} 2145}
2177#endif 2146#endif
@@ -2221,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2221 rcu_read_lock(); 2190 rcu_read_lock();
2222 cs = task_cs(tsk); 2191 cs = task_cs(tsk);
2223 if (cs) 2192 if (cs)
2224 cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed); 2193 do_set_cpus_allowed(tsk, cs->cpus_allowed);
2225 rcu_read_unlock(); 2194 rcu_read_unlock();
2226 2195
2227 /* 2196 /*
@@ -2248,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2248 * Like above we can temporary set any mask and rely on 2217 * Like above we can temporary set any mask and rely on
2249 * set_cpus_allowed_ptr() as synchronization point. 2218 * set_cpus_allowed_ptr() as synchronization point.
2250 */ 2219 */
2251 cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask); 2220 do_set_cpus_allowed(tsk, cpu_possible_mask);
2252 cpu = cpumask_any(cpu_active_mask); 2221 cpu = cpumask_any(cpu_active_mask);
2253 } 2222 }
2254 2223
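
[Annotation] Taken together, the cpuset.c hunks convert the attach path from one monolithic, threadgroup-aware callback pair into the cgroup core's finer-grained hooks: .can_attach for whole-group checks, .can_attach_task for the per-thread security check, .pre_attach to stage the shared cpus_attach/nodemask state, .attach_task for per-thread cpumask and nodemask updates, and .attach for the one-shot mm rebind. A simplified sketch of the call order the core is expected to follow; this illustrates the contract only and is not the core's actual attach code (locking and NULL-callback checks omitted):

#include <linux/cgroup.h>
#include <linux/sched.h>

/* Illustrative only: how a threadgroup attach exercises the new hooks. */
static int sketch_attach_threadgroup(struct cgroup_subsys *ss,
				     struct cgroup *cgrp,
				     struct cgroup *oldcgrp,
				     struct task_struct *leader)
{
	struct task_struct *t;
	int ret;

	ret = ss->can_attach(ss, cgrp, leader);		/* group-wide veto */
	if (ret)
		return ret;

	t = leader;
	do {
		ret = ss->can_attach_task(cgrp, t);	/* per-thread veto */
		if (ret)
			return ret;
	} while_each_thread(leader, t);

	ss->pre_attach(cgrp);				/* stage cpus_attach etc. */
	t = leader;
	do {
		ss->attach_task(cgrp, t);		/* per-thread cpus/mems */
	} while_each_thread(leader, t);

	ss->attach(ss, cgrp, oldcgrp, leader);		/* rebind the shared mm */
	return 0;
}
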
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..5f85690285d4
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,34 @@
1#include <linux/kernel.h>
2#include <linux/crash_dump.h>
3#include <linux/init.h>
4#include <linux/errno.h>
5#include <linux/module.h>
6
7/*
8 * If we have booted due to a crash, max_pfn will be a very low value. We need
9 * to know the amount of memory that the previous kernel used.
10 */
11unsigned long saved_max_pfn;
12
13/*
14 * stores the physical address of elf header of crash image
15 *
16 * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
17 * is_kdump_kernel() to determine if we are booting after a panic. Hence put
18 * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
19 */
20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
21
22/*
23 * elfcorehdr= specifies the location of elf core header stored by the crashed
24 * kernel. This option will be passed by kexec loader to the capture kernel.
25 */
26static int __init setup_elfcorehdr(char *arg)
27{
28 char *end;
29 if (!arg)
30 return -EINVAL;
31 elfcorehdr_addr = memparse(arg, &end);
32 return end > arg ? 0 : -EINVAL;
33}
34early_param("elfcorehdr", setup_elfcorehdr);
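
[Annotation] kernel/crash_dump.c is new: it hoists saved_max_pfn, elfcorehdr_addr and the elfcorehdr= parsing out of per-architecture code into one place. The ELFCORE_ADDR_MAX sentinel is what downstream helpers test; a sketch of the usual consumer, roughly what include/linux/crash_dump.h provides (an assumption stated for context, not quoted from this patch):

/* Approximate shape of the helper that keys off elfcorehdr_addr. */
static inline int is_kdump_kernel(void)
{
	return elfcorehdr_addr != ELFCORE_ADDR_MAX;
}

The kexec loader appends something like "elfcorehdr=0x2f000000" (the value here is made up) to the capture kernel's command line, and memparse() also accepts the usual K/M/G suffixes.
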
diff --git a/kernel/cred.c b/kernel/cred.c
index 9a3e22641fe7..174fa84eca30 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
1/* Task credentials management - see Documentation/credentials.txt 1/* Task credentials management - see Documentation/security/credentials.txt
2 * 2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
@@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar;
35static struct thread_group_cred init_tgcred = { 35static struct thread_group_cred init_tgcred = {
36 .usage = ATOMIC_INIT(2), 36 .usage = ATOMIC_INIT(2),
37 .tgid = 0, 37 .tgid = 0,
38 .lock = SPIN_LOCK_UNLOCKED, 38 .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
39}; 39};
40#endif 40#endif
41 41
@@ -49,11 +49,12 @@ struct cred init_cred = {
49 .magic = CRED_MAGIC, 49 .magic = CRED_MAGIC,
50#endif 50#endif
51 .securebits = SECUREBITS_DEFAULT, 51 .securebits = SECUREBITS_DEFAULT,
52 .cap_inheritable = CAP_INIT_INH_SET, 52 .cap_inheritable = CAP_EMPTY_SET,
53 .cap_permitted = CAP_FULL_SET, 53 .cap_permitted = CAP_FULL_SET,
54 .cap_effective = CAP_INIT_EFF_SET, 54 .cap_effective = CAP_FULL_SET,
55 .cap_bset = CAP_INIT_BSET, 55 .cap_bset = CAP_FULL_SET,
56 .user = INIT_USER, 56 .user = INIT_USER,
57 .user_ns = &init_user_ns,
57 .group_info = &init_groups, 58 .group_info = &init_groups,
58#ifdef CONFIG_KEYS 59#ifdef CONFIG_KEYS
59 .tgcred = &init_tgcred, 60 .tgcred = &init_tgcred,
@@ -252,13 +253,13 @@ struct cred *cred_alloc_blank(void)
252#endif 253#endif
253 254
254 atomic_set(&new->usage, 1); 255 atomic_set(&new->usage, 1);
256#ifdef CONFIG_DEBUG_CREDENTIALS
257 new->magic = CRED_MAGIC;
258#endif
255 259
256 if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) 260 if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
257 goto error; 261 goto error;
258 262
259#ifdef CONFIG_DEBUG_CREDENTIALS
260 new->magic = CRED_MAGIC;
261#endif
262 return new; 263 return new;
263 264
264error: 265error:
@@ -325,7 +326,7 @@ EXPORT_SYMBOL(prepare_creds);
325 326
326/* 327/*
327 * Prepare credentials for current to perform an execve() 328 * Prepare credentials for current to perform an execve()
328 * - The caller must hold current->cred_guard_mutex 329 * - The caller must hold ->cred_guard_mutex
329 */ 330 */
330struct cred *prepare_exec_creds(void) 331struct cred *prepare_exec_creds(void)
331{ 332{
@@ -384,8 +385,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
384 struct cred *new; 385 struct cred *new;
385 int ret; 386 int ret;
386 387
387 mutex_init(&p->cred_guard_mutex);
388
389 if ( 388 if (
390#ifdef CONFIG_KEYS 389#ifdef CONFIG_KEYS
391 !p->cred->thread_keyring && 390 !p->cred->thread_keyring &&
@@ -412,6 +411,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
412 goto error_put; 411 goto error_put;
413 } 412 }
414 413
414 /* cache user_ns in cred. Doesn't need a refcount because it will
415 * stay pinned by cred->user
416 */
417 new->user_ns = new->user->user_ns;
418
415#ifdef CONFIG_KEYS 419#ifdef CONFIG_KEYS
416 /* new threads get their own thread keyrings if their parent already 420 /* new threads get their own thread keyrings if their parent already
417 * had one */ 421 * had one */
@@ -659,6 +663,8 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
659 validate_creds(old); 663 validate_creds(old);
660 664
661 *new = *old; 665 *new = *old;
666 atomic_set(&new->usage, 1);
667 set_cred_subscribers(new, 0);
662 get_uid(new->user); 668 get_uid(new->user);
663 get_group_info(new->group_info); 669 get_group_info(new->group_info);
664 670
@@ -676,8 +682,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
676 if (security_prepare_creds(new, old, GFP_KERNEL) < 0) 682 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
677 goto error; 683 goto error;
678 684
679 atomic_set(&new->usage, 1);
680 set_cred_subscribers(new, 0);
681 put_cred(old); 685 put_cred(old);
682 validate_creds(new); 686 validate_creds(new);
683 return new; 687 return new;
@@ -750,7 +754,11 @@ bool creds_are_invalid(const struct cred *cred)
750 if (cred->magic != CRED_MAGIC) 754 if (cred->magic != CRED_MAGIC)
751 return true; 755 return true;
752#ifdef CONFIG_SECURITY_SELINUX 756#ifdef CONFIG_SECURITY_SELINUX
753 if (selinux_is_enabled()) { 757 /*
758 * cred->security == NULL if security_cred_alloc_blank() or
759 * security_prepare_creds() returned an error.
760 */
761 if (selinux_is_enabled() && cred->security) {
754 if ((unsigned long) cred->security < PAGE_SIZE) 762 if ((unsigned long) cred->security < PAGE_SIZE)
755 return true; 763 return true;
756 if ((*(u32 *)cred->security & 0xffffff00) == 764 if ((*(u32 *)cred->security & 0xffffff00) ==
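
[Annotation] Two of the cred.c hunks are ordering fixes: cred_alloc_blank() and prepare_kernel_cred() now set ->magic, ->usage and the subscriber count before calling the LSM hooks, so an early security_*_creds() failure leaves a credential the error path can validate and free. For context, a sketch of the common consumer pattern around prepare_kernel_cred(); this is standard cred API usage, not taken from this patch:

#include <linux/cred.h>
#include <linux/errno.h>

/* Illustrative: temporarily act with init credentials inside the kernel. */
static int do_privileged_work(void)
{
	const struct cred *old;
	struct cred *kcred;

	kcred = prepare_kernel_cred(NULL);	/* NULL means "clone init_cred" */
	if (!kcred)
		return -ENOMEM;

	old = override_creds(kcred);		/* current temporarily uses kcred */
	/* ... do the privileged work ... */
	revert_creds(old);			/* restore the previous creds */

	put_cred(kcred);			/* drop our reference */
	return 0;
}
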
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index de407c78178d..bad6786dee88 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -47,6 +47,7 @@
47#include <linux/pid.h> 47#include <linux/pid.h>
48#include <linux/smp.h> 48#include <linux/smp.h>
49#include <linux/mm.h> 49#include <linux/mm.h>
50#include <linux/rcupdate.h>
50 51
51#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
52#include <asm/byteorder.h> 53#include <asm/byteorder.h>
@@ -109,13 +110,15 @@ static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
109 */ 110 */
110atomic_t kgdb_active = ATOMIC_INIT(-1); 111atomic_t kgdb_active = ATOMIC_INIT(-1);
111EXPORT_SYMBOL_GPL(kgdb_active); 112EXPORT_SYMBOL_GPL(kgdb_active);
113static DEFINE_RAW_SPINLOCK(dbg_master_lock);
114static DEFINE_RAW_SPINLOCK(dbg_slave_lock);
112 115
113/* 116/*
114 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early 117 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
115 * bootup code (which might not have percpu set up yet): 118 * bootup code (which might not have percpu set up yet):
116 */ 119 */
117static atomic_t passive_cpu_wait[NR_CPUS]; 120static atomic_t masters_in_kgdb;
118static atomic_t cpu_in_kgdb[NR_CPUS]; 121static atomic_t slaves_in_kgdb;
119static atomic_t kgdb_break_tasklet_var; 122static atomic_t kgdb_break_tasklet_var;
120atomic_t kgdb_setting_breakpoint; 123atomic_t kgdb_setting_breakpoint;
121 124
@@ -206,18 +209,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
206 return 0; 209 return 0;
207} 210}
208 211
209/**
210 * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
211 * @regs: Current &struct pt_regs.
212 *
213 * This function will be called if the particular architecture must
214 * disable hardware debugging while it is processing gdb packets or
215 * handling exception.
216 */
217void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
218{
219}
220
221/* 212/*
222 * Some architectures need cache flushes when we set/clear a 213 * Some architectures need cache flushes when we set/clear a
223 * breakpoint: 214 * breakpoint:
@@ -457,26 +448,34 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
457 return 1; 448 return 1;
458} 449}
459 450
460static void dbg_cpu_switch(int cpu, int next_cpu) 451static void dbg_touch_watchdogs(void)
461{ 452{
462 /* Mark the cpu we are switching away from as a slave when it 453 touch_softlockup_watchdog_sync();
463 * holds the kgdb_active token. This must be done so that the 454 clocksource_touch_watchdog();
464 * that all the cpus wait in for the debug core will not enter 455 rcu_cpu_stall_reset();
465 * again as the master. */
466 if (cpu == atomic_read(&kgdb_active)) {
467 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
468 kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER;
469 }
470 kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER;
471} 456}
472 457
473static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) 458static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
459 int exception_state)
474{ 460{
475 unsigned long flags; 461 unsigned long flags;
476 int sstep_tries = 100; 462 int sstep_tries = 100;
477 int error; 463 int error;
478 int i, cpu; 464 int cpu;
479 int trace_on = 0; 465 int trace_on = 0;
466 int online_cpus = num_online_cpus();
467
468 kgdb_info[ks->cpu].enter_kgdb++;
469 kgdb_info[ks->cpu].exception_state |= exception_state;
470
471 if (exception_state == DCPU_WANT_MASTER)
472 atomic_inc(&masters_in_kgdb);
473 else
474 atomic_inc(&slaves_in_kgdb);
475
476 if (arch_kgdb_ops.disable_hw_break)
477 arch_kgdb_ops.disable_hw_break(regs);
478
480acquirelock: 479acquirelock:
481 /* 480 /*
482 * Interrupts will be restored by the 'trap return' code, except when 481 * Interrupts will be restored by the 'trap return' code, except when
@@ -489,14 +488,15 @@ acquirelock:
489 kgdb_info[cpu].task = current; 488 kgdb_info[cpu].task = current;
490 kgdb_info[cpu].ret_state = 0; 489 kgdb_info[cpu].ret_state = 0;
491 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; 490 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
492 /*
493 * Make sure the above info reaches the primary CPU before
494 * our cpu_in_kgdb[] flag setting does:
495 */
496 atomic_inc(&cpu_in_kgdb[cpu]);
497 491
498 if (exception_level == 1) 492 /* Make sure the above info reaches the primary CPU */
493 smp_mb();
494
495 if (exception_level == 1) {
496 if (raw_spin_trylock(&dbg_master_lock))
497 atomic_xchg(&kgdb_active, cpu);
499 goto cpu_master_loop; 498 goto cpu_master_loop;
499 }
500 500
501 /* 501 /*
502 * CPU will loop if it is a slave or request to become a kgdb 502 * CPU will loop if it is a slave or request to become a kgdb
@@ -508,10 +508,12 @@ cpu_loop:
508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; 508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
509 goto cpu_master_loop; 509 goto cpu_master_loop;
510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { 510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
511 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) 511 if (raw_spin_trylock(&dbg_master_lock)) {
512 atomic_xchg(&kgdb_active, cpu);
512 break; 513 break;
514 }
513 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { 515 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
514 if (!atomic_read(&passive_cpu_wait[cpu])) 516 if (!raw_spin_is_locked(&dbg_slave_lock))
515 goto return_normal; 517 goto return_normal;
516 } else { 518 } else {
517return_normal: 519return_normal:
@@ -522,9 +524,12 @@ return_normal:
522 arch_kgdb_ops.correct_hw_break(); 524 arch_kgdb_ops.correct_hw_break();
523 if (trace_on) 525 if (trace_on)
524 tracing_on(); 526 tracing_on();
525 atomic_dec(&cpu_in_kgdb[cpu]); 527 kgdb_info[cpu].exception_state &=
526 touch_softlockup_watchdog_sync(); 528 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
527 clocksource_touch_watchdog(); 529 kgdb_info[cpu].enter_kgdb--;
530 smp_mb__before_atomic_dec();
531 atomic_dec(&slaves_in_kgdb);
532 dbg_touch_watchdogs();
528 local_irq_restore(flags); 533 local_irq_restore(flags);
529 return 0; 534 return 0;
530 } 535 }
@@ -533,7 +538,7 @@ return_normal:
533 538
534 /* 539 /*
535 * For single stepping, try to only enter on the processor 540 * For single stepping, try to only enter on the processor
536 * that was single stepping. To gaurd against a deadlock, the 541 * that was single stepping. To guard against a deadlock, the
537 * kernel will only try for the value of sstep_tries before 542 * kernel will only try for the value of sstep_tries before
538 * giving up and continuing on. 543 * giving up and continuing on.
539 */ 544 */
@@ -541,8 +546,8 @@ return_normal:
541 (kgdb_info[cpu].task && 546 (kgdb_info[cpu].task &&
542 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { 547 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
543 atomic_set(&kgdb_active, -1); 548 atomic_set(&kgdb_active, -1);
544 touch_softlockup_watchdog_sync(); 549 raw_spin_unlock(&dbg_master_lock);
545 clocksource_touch_watchdog(); 550 dbg_touch_watchdogs();
546 local_irq_restore(flags); 551 local_irq_restore(flags);
547 552
548 goto acquirelock; 553 goto acquirelock;
@@ -563,16 +568,12 @@ return_normal:
563 if (dbg_io_ops->pre_exception) 568 if (dbg_io_ops->pre_exception)
564 dbg_io_ops->pre_exception(); 569 dbg_io_ops->pre_exception();
565 570
566 kgdb_disable_hw_debug(ks->linux_regs);
567
568 /* 571 /*
569 * Get the passive CPU lock which will hold all the non-primary 572 * Get the passive CPU lock which will hold all the non-primary
570 * CPU in a spin state while the debugger is active 573 * CPU in a spin state while the debugger is active
571 */ 574 */
572 if (!kgdb_single_step) { 575 if (!kgdb_single_step)
573 for (i = 0; i < NR_CPUS; i++) 576 raw_spin_lock(&dbg_slave_lock);
574 atomic_inc(&passive_cpu_wait[i]);
575 }
576 577
577#ifdef CONFIG_SMP 578#ifdef CONFIG_SMP
578 /* Signal the other CPUs to enter kgdb_wait() */ 579 /* Signal the other CPUs to enter kgdb_wait() */
@@ -583,10 +584,9 @@ return_normal:
583 /* 584 /*
584 * Wait for the other CPUs to be notified and be waiting for us: 585 * Wait for the other CPUs to be notified and be waiting for us:
585 */ 586 */
586 for_each_online_cpu(i) { 587 while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
587 while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i])) 588 atomic_read(&slaves_in_kgdb)) != online_cpus)
588 cpu_relax(); 589 cpu_relax();
589 }
590 590
591 /* 591 /*
592 * At this point the primary processor is completely 592 * At this point the primary processor is completely
@@ -615,7 +615,8 @@ cpu_master_loop:
615 if (error == DBG_PASS_EVENT) { 615 if (error == DBG_PASS_EVENT) {
616 dbg_kdb_mode = !dbg_kdb_mode; 616 dbg_kdb_mode = !dbg_kdb_mode;
617 } else if (error == DBG_SWITCH_CPU_EVENT) { 617 } else if (error == DBG_SWITCH_CPU_EVENT) {
618 dbg_cpu_switch(cpu, dbg_switch_cpu); 618 kgdb_info[dbg_switch_cpu].exception_state |=
619 DCPU_NEXT_MASTER;
619 goto cpu_loop; 620 goto cpu_loop;
620 } else { 621 } else {
621 kgdb_info[cpu].ret_state = error; 622 kgdb_info[cpu].ret_state = error;
@@ -627,24 +628,11 @@ cpu_master_loop:
627 if (dbg_io_ops->post_exception) 628 if (dbg_io_ops->post_exception)
628 dbg_io_ops->post_exception(); 629 dbg_io_ops->post_exception();
629 630
630 atomic_dec(&cpu_in_kgdb[ks->cpu]);
631
632 if (!kgdb_single_step) { 631 if (!kgdb_single_step) {
633 for (i = NR_CPUS-1; i >= 0; i--) 632 raw_spin_unlock(&dbg_slave_lock);
634 atomic_dec(&passive_cpu_wait[i]); 633 /* Wait till all the CPUs have quit from the debugger. */
635 /* 634 while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb))
636 * Wait till all the CPUs have quit from the debugger, 635 cpu_relax();
637 * but allow a CPU that hit an exception and is
638 * waiting to become the master to remain in the debug
639 * core.
640 */
641 for_each_online_cpu(i) {
642 while (kgdb_do_roundup &&
643 atomic_read(&cpu_in_kgdb[i]) &&
644 !(kgdb_info[i].exception_state &
645 DCPU_WANT_MASTER))
646 cpu_relax();
647 }
648 } 636 }
649 637
650kgdb_restore: 638kgdb_restore:
@@ -655,12 +643,20 @@ kgdb_restore:
655 else 643 else
656 kgdb_sstep_pid = 0; 644 kgdb_sstep_pid = 0;
657 } 645 }
646 if (arch_kgdb_ops.correct_hw_break)
647 arch_kgdb_ops.correct_hw_break();
658 if (trace_on) 648 if (trace_on)
659 tracing_on(); 649 tracing_on();
650
651 kgdb_info[cpu].exception_state &=
652 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
653 kgdb_info[cpu].enter_kgdb--;
654 smp_mb__before_atomic_dec();
655 atomic_dec(&masters_in_kgdb);
660 /* Free kgdb_active */ 656 /* Free kgdb_active */
661 atomic_set(&kgdb_active, -1); 657 atomic_set(&kgdb_active, -1);
662 touch_softlockup_watchdog_sync(); 658 raw_spin_unlock(&dbg_master_lock);
663 clocksource_touch_watchdog(); 659 dbg_touch_watchdogs();
664 local_irq_restore(flags); 660 local_irq_restore(flags);
665 661
666 return kgdb_info[cpu].ret_state; 662 return kgdb_info[cpu].ret_state;
@@ -678,7 +674,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
678{ 674{
679 struct kgdb_state kgdb_var; 675 struct kgdb_state kgdb_var;
680 struct kgdb_state *ks = &kgdb_var; 676 struct kgdb_state *ks = &kgdb_var;
681 int ret;
682 677
683 ks->cpu = raw_smp_processor_id(); 678 ks->cpu = raw_smp_processor_id();
684 ks->ex_vector = evector; 679 ks->ex_vector = evector;
@@ -689,11 +684,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
689 684
690 if (kgdb_reenter_check(ks)) 685 if (kgdb_reenter_check(ks))
691 return 0; /* Ouch, double exception ! */ 686 return 0; /* Ouch, double exception ! */
692 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; 687 if (kgdb_info[ks->cpu].enter_kgdb != 0)
693 ret = kgdb_cpu_enter(ks, regs); 688 return 0;
694 kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER | 689
695 DCPU_IS_SLAVE); 690 return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
696 return ret;
697} 691}
698 692
699int kgdb_nmicallback(int cpu, void *regs) 693int kgdb_nmicallback(int cpu, void *regs)
@@ -706,12 +700,9 @@ int kgdb_nmicallback(int cpu, void *regs)
706 ks->cpu = cpu; 700 ks->cpu = cpu;
707 ks->linux_regs = regs; 701 ks->linux_regs = regs;
708 702
709 if (!atomic_read(&cpu_in_kgdb[cpu]) && 703 if (kgdb_info[ks->cpu].enter_kgdb == 0 &&
710 atomic_read(&kgdb_active) != -1 && 704 raw_spin_is_locked(&dbg_master_lock)) {
711 atomic_read(&kgdb_active) != cpu) { 705 kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE);
712 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
713 kgdb_cpu_enter(ks, regs);
714 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
715 return 0; 706 return 0;
716 } 707 }
717#endif 708#endif
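
[Annotation] The debug_core.c rework replaces the per-CPU passive_cpu_wait[]/cpu_in_kgdb[] atomics with two raw spinlocks plus masters_in_kgdb/slaves_in_kgdb counters, tracks re-entry with kgdb_info[].enter_kgdb, and drops the weak kgdb_disable_hw_debug() hook in favour of an optional arch_kgdb_ops.disable_hw_break() callback. A hedged sketch of how an architecture would supply that callback; the structure field comes from the hunk above, while the body and the breakpoint opcode are illustrative:

#include <linux/kgdb.h>

/* Illustrative only: park hardware breakpoints while kgdb owns the CPUs. */
static void sketch_disable_hw_break(struct pt_regs *regs)
{
	/* e.g. clear the debug-control register so HW breakpoints stay quiet */
}

struct kgdb_arch arch_kgdb_ops = {
	.gdb_bpt_instr		= { 0xcc },	/* arch trap opcode, x86 shown */
	.disable_hw_break	= sketch_disable_hw_break,
};
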
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index c5d753d80f67..3494c28a7e7a 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -40,6 +40,7 @@ struct debuggerinfo_struct {
40 int exception_state; 40 int exception_state;
41 int ret_state; 41 int ret_state;
42 int irq_depth; 42 int irq_depth;
43 int enter_kgdb;
43}; 44};
44 45
45extern struct debuggerinfo_struct kgdb_info[]; 46extern struct debuggerinfo_struct kgdb_info[];
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 481a7bd2dfe7..a11db956dd62 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
1093 put_packet(remcom_out_buffer); 1093 put_packet(remcom_out_buffer);
1094 return 0; 1094 return 0;
1095} 1095}
1096
1097/**
1098 * gdbstub_exit - Send an exit message to GDB
1099 * @status: The exit code to report.
1100 */
1101void gdbstub_exit(int status)
1102{
1103 unsigned char checksum, ch, buffer[3];
1104 int loop;
1105
1106 buffer[0] = 'W';
1107 buffer[1] = hex_asc_hi(status);
1108 buffer[2] = hex_asc_lo(status);
1109
1110 dbg_io_ops->write_char('$');
1111 checksum = 0;
1112
1113 for (loop = 0; loop < 3; loop++) {
1114 ch = buffer[loop];
1115 checksum += ch;
1116 dbg_io_ops->write_char(ch);
1117 }
1118
1119 dbg_io_ops->write_char('#');
1120 dbg_io_ops->write_char(hex_asc_hi(checksum));
1121 dbg_io_ops->write_char(hex_asc_lo(checksum));
1122
1123 /* make sure the output is flushed, lest the bootloader clobber it */
1124 dbg_io_ops->flush();
1125}
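
[Annotation] The new gdbstub_exit() hand-builds a GDB remote-serial-protocol 'W' (process exited) reply so a front end sees a clean exit status before, for example, a reboot clobbers the channel. A worked example of the wire format, derived from the code above:

/*
 * gdbstub_exit(0) sends the three payload bytes 'W', '0', '0', then the
 * checksum of that payload:
 *
 *	'W' + '0' + '0' = 0x57 + 0x30 + 0x30 = 0xb7
 *
 * so the bytes on the wire are:  $W00#b7
 */
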
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index bf6e8270e957..dd0b1b7dd02c 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -86,7 +86,7 @@ int kdb_stub(struct kgdb_state *ks)
86 } 86 }
87 /* Set initial kdb state variables */ 87 /* Set initial kdb state variables */
88 KDB_STATE_CLEAR(KGDB_TRANS); 88 KDB_STATE_CLEAR(KGDB_TRANS);
89 kdb_initial_cpu = ks->cpu; 89 kdb_initial_cpu = atomic_read(&kgdb_active);
90 kdb_current_task = kgdb_info[ks->cpu].task; 90 kdb_current_task = kgdb_info[ks->cpu].task;
91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; 91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
92 /* Remove any breakpoints as needed by kdb and clear single step */ 92 /* Remove any breakpoints as needed by kdb and clear single step */
@@ -105,7 +105,6 @@ int kdb_stub(struct kgdb_state *ks)
105 ks->pass_exception = 1; 105 ks->pass_exception = 1;
106 KDB_FLAG_SET(CATASTROPHIC); 106 KDB_FLAG_SET(CATASTROPHIC);
107 } 107 }
108 kdb_initial_cpu = ks->cpu;
109 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { 108 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
110 KDB_STATE_CLEAR(SSBPT); 109 KDB_STATE_CLEAR(SSBPT);
111 KDB_STATE_CLEAR(DOING_SS); 110 KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index c9b7f4f90bba..96fdaac46a80 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -823,4 +823,4 @@ int kdb_printf(const char *fmt, ...)
823 823
824 return r; 824 return r;
825} 825}
826 826EXPORT_SYMBOL_GPL(kdb_printf);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index caf057a3de0e..be14779bcef6 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -78,11 +78,11 @@ static unsigned int kdb_continue_catastrophic;
78static kdbtab_t *kdb_commands; 78static kdbtab_t *kdb_commands;
79#define KDB_BASE_CMD_MAX 50 79#define KDB_BASE_CMD_MAX 50
80static int kdb_max_commands = KDB_BASE_CMD_MAX; 80static int kdb_max_commands = KDB_BASE_CMD_MAX;
81static kdbtab_t kdb_base_commands[50]; 81static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX];
82#define for_each_kdbcmd(cmd, num) \ 82#define for_each_kdbcmd(cmd, num) \
83 for ((cmd) = kdb_base_commands, (num) = 0; \ 83 for ((cmd) = kdb_base_commands, (num) = 0; \
84 num < kdb_max_commands; \ 84 num < kdb_max_commands; \
85 num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++) 85 num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++)
86 86
87typedef struct _kdbmsg { 87typedef struct _kdbmsg {
88 int km_diag; /* kdb diagnostic */ 88 int km_diag; /* kdb diagnostic */
@@ -441,9 +441,9 @@ static int kdb_check_regs(void)
441 * symbol name, and offset to the caller. 441 * symbol name, and offset to the caller.
442 * 442 *
443 * The argument may consist of a numeric value (decimal or 443 * The argument may consist of a numeric value (decimal or
444 * hexidecimal), a symbol name, a register name (preceeded by the 444 * hexidecimal), a symbol name, a register name (preceded by the
445 * percent sign), an environment variable with a numeric value 445 * percent sign), an environment variable with a numeric value
446 * (preceeded by a dollar sign) or a simple arithmetic expression 446 * (preceded by a dollar sign) or a simple arithmetic expression
447 * consisting of a symbol name, +/-, and a numeric constant value 447 * consisting of a symbol name, +/-, and a numeric constant value
448 * (offset). 448 * (offset).
449 * Parameters: 449 * Parameters:
@@ -646,7 +646,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
646 } 646 }
647 if (!s->usable) 647 if (!s->usable)
648 return KDB_NOTIMP; 648 return KDB_NOTIMP;
649 s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); 649 s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
650 if (!s->command) { 650 if (!s->command) {
651 kdb_printf("Could not allocate new kdb_defcmd table for %s\n", 651 kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
652 cmdstr); 652 cmdstr);
@@ -1127,7 +1127,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1127 /* special case below */ 1127 /* special case below */
1128 } else { 1128 } else {
1129 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", 1129 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
1130 kdb_current, kdb_current->pid); 1130 kdb_current, kdb_current ? kdb_current->pid : 0);
1131#if defined(CONFIG_SMP) 1131#if defined(CONFIG_SMP)
1132 kdb_printf("on processor %d ", raw_smp_processor_id()); 1132 kdb_printf("on processor %d ", raw_smp_processor_id());
1133#endif 1133#endif
@@ -1335,7 +1335,7 @@ void kdb_print_state(const char *text, int value)
1335 * error The hardware-defined error code 1335 * error The hardware-defined error code
1336 * reason2 kdb's current reason code. 1336 * reason2 kdb's current reason code.
1337 * Initially error but can change 1337 * Initially error but can change
1338 * acording to kdb state. 1338 * according to kdb state.
1339 * db_result Result code from break or debug point. 1339 * db_result Result code from break or debug point.
1340 * regs The exception frame at time of fault/breakpoint. 1340 * regs The exception frame at time of fault/breakpoint.
1341 * should always be valid. 1341 * should always be valid.
@@ -1749,13 +1749,13 @@ static int kdb_go(int argc, const char **argv)
1749 int nextarg; 1749 int nextarg;
1750 long offset; 1750 long offset;
1751 1751
1752 if (raw_smp_processor_id() != kdb_initial_cpu) {
1753 kdb_printf("go must execute on the entry cpu, "
1754 "please use \"cpu %d\" and then execute go\n",
1755 kdb_initial_cpu);
1756 return KDB_BADCPUNUM;
1757 }
1752 if (argc == 1) { 1758 if (argc == 1) {
1753 if (raw_smp_processor_id() != kdb_initial_cpu) {
1754 kdb_printf("go <address> must be issued from the "
1755 "initial cpu, do cpu %d first\n",
1756 kdb_initial_cpu);
1757 return KDB_ARGCOUNT;
1758 }
1759 nextarg = 1; 1759 nextarg = 1;
1760 diag = kdbgetaddrarg(argc, argv, &nextarg, 1760 diag = kdbgetaddrarg(argc, argv, &nextarg,
1761 &addr, &offset, NULL); 1761 &addr, &offset, NULL);
@@ -2361,7 +2361,7 @@ static int kdb_pid(int argc, const char **argv)
2361 */ 2361 */
2362static int kdb_ll(int argc, const char **argv) 2362static int kdb_ll(int argc, const char **argv)
2363{ 2363{
2364 int diag; 2364 int diag = 0;
2365 unsigned long addr; 2365 unsigned long addr;
2366 long offset = 0; 2366 long offset = 0;
2367 unsigned long va; 2367 unsigned long va;
@@ -2400,20 +2400,21 @@ static int kdb_ll(int argc, const char **argv)
2400 char buf[80]; 2400 char buf[80];
2401 2401
2402 if (KDB_FLAG(CMD_INTERRUPT)) 2402 if (KDB_FLAG(CMD_INTERRUPT))
2403 return 0; 2403 goto out;
2404 2404
2405 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); 2405 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
2406 diag = kdb_parse(buf); 2406 diag = kdb_parse(buf);
2407 if (diag) 2407 if (diag)
2408 return diag; 2408 goto out;
2409 2409
2410 addr = va + linkoffset; 2410 addr = va + linkoffset;
2411 if (kdb_getword(&va, addr, sizeof(va))) 2411 if (kdb_getword(&va, addr, sizeof(va)))
2412 return 0; 2412 goto out;
2413 } 2413 }
2414 kfree(command);
2415 2414
2416 return 0; 2415out:
2416 kfree(command);
2417 return diag;
2417} 2418}
2418 2419
2419static int kdb_kgdb(int argc, const char **argv) 2420static int kdb_kgdb(int argc, const char **argv)
@@ -2603,20 +2604,17 @@ static int kdb_summary(int argc, const char **argv)
2603 */ 2604 */
2604static int kdb_per_cpu(int argc, const char **argv) 2605static int kdb_per_cpu(int argc, const char **argv)
2605{ 2606{
2606 char buf[256], fmtstr[64]; 2607 char fmtstr[64];
2607 kdb_symtab_t symtab; 2608 int cpu, diag, nextarg = 1;
2608 cpumask_t suppress = CPU_MASK_NONE; 2609 unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL;
2609 int cpu, diag;
2610 unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL;
2611 2610
2612 if (argc < 1 || argc > 3) 2611 if (argc < 1 || argc > 3)
2613 return KDB_ARGCOUNT; 2612 return KDB_ARGCOUNT;
2614 2613
2615 snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]); 2614 diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL);
2616 if (!kdbgetsymval(buf, &symtab)) { 2615 if (diag)
2617 kdb_printf("%s is not a per_cpu variable\n", argv[1]); 2616 return diag;
2618 return KDB_BADADDR; 2617
2619 }
2620 if (argc >= 2) { 2618 if (argc >= 2) {
2621 diag = kdbgetularg(argv[2], &bytesperword); 2619 diag = kdbgetularg(argv[2], &bytesperword);
2622 if (diag) 2620 if (diag)
@@ -2649,46 +2647,25 @@ static int kdb_per_cpu(int argc, const char **argv)
2649#define KDB_PCU(cpu) 0 2647#define KDB_PCU(cpu) 0
2650#endif 2648#endif
2651#endif 2649#endif
2652
2653 for_each_online_cpu(cpu) { 2650 for_each_online_cpu(cpu) {
2651 if (KDB_FLAG(CMD_INTERRUPT))
2652 return 0;
2653
2654 if (whichcpu != ~0UL && whichcpu != cpu) 2654 if (whichcpu != ~0UL && whichcpu != cpu)
2655 continue; 2655 continue;
2656 addr = symtab.sym_start + KDB_PCU(cpu); 2656 addr = symaddr + KDB_PCU(cpu);
2657 diag = kdb_getword(&val, addr, bytesperword); 2657 diag = kdb_getword(&val, addr, bytesperword);
2658 if (diag) { 2658 if (diag) {
2659 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " 2659 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
2660 "read, diag=%d\n", cpu, addr, diag); 2660 "read, diag=%d\n", cpu, addr, diag);
2661 continue; 2661 continue;
2662 } 2662 }
2663#ifdef CONFIG_SMP
2664 if (!val) {
2665 cpu_set(cpu, suppress);
2666 continue;
2667 }
2668#endif /* CONFIG_SMP */
2669 kdb_printf("%5d ", cpu); 2663 kdb_printf("%5d ", cpu);
2670 kdb_md_line(fmtstr, addr, 2664 kdb_md_line(fmtstr, addr,
2671 bytesperword == KDB_WORD_SIZE, 2665 bytesperword == KDB_WORD_SIZE,
2672 1, bytesperword, 1, 1, 0); 2666 1, bytesperword, 1, 1, 0);
2673 } 2667 }
2674 if (cpus_weight(suppress) == 0)
2675 return 0;
2676 kdb_printf("Zero suppressed cpu(s):");
2677 for (cpu = first_cpu(suppress); cpu < num_possible_cpus();
2678 cpu = next_cpu(cpu, suppress)) {
2679 kdb_printf(" %d", cpu);
2680 if (cpu == num_possible_cpus() - 1 ||
2681 next_cpu(cpu, suppress) != cpu + 1)
2682 continue;
2683 while (cpu < num_possible_cpus() &&
2684 next_cpu(cpu, suppress) == cpu + 1)
2685 ++cpu;
2686 kdb_printf("-%d", cpu);
2687 }
2688 kdb_printf("\n");
2689
2690#undef KDB_PCU 2668#undef KDB_PCU
2691
2692 return 0; 2669 return 0;
2693} 2670}
2694 2671
@@ -2763,13 +2740,13 @@ int kdb_register_repeat(char *cmd,
2763 } 2740 }
2764 if (kdb_commands) { 2741 if (kdb_commands) {
2765 memcpy(new, kdb_commands, 2742 memcpy(new, kdb_commands,
2766 kdb_max_commands * sizeof(*new)); 2743 (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
2767 kfree(kdb_commands); 2744 kfree(kdb_commands);
2768 } 2745 }
2769 memset(new + kdb_max_commands, 0, 2746 memset(new + kdb_max_commands, 0,
2770 kdb_command_extend * sizeof(*new)); 2747 kdb_command_extend * sizeof(*new));
2771 kdb_commands = new; 2748 kdb_commands = new;
2772 kp = kdb_commands + kdb_max_commands; 2749 kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
2773 kdb_max_commands += kdb_command_extend; 2750 kdb_max_commands += kdb_command_extend;
2774 } 2751 }
2775 2752
@@ -2783,6 +2760,8 @@ int kdb_register_repeat(char *cmd,
2783 2760
2784 return 0; 2761 return 0;
2785} 2762}
2763EXPORT_SYMBOL_GPL(kdb_register_repeat);
2764
2786 2765
2787/* 2766/*
2788 * kdb_register - Compatibility register function for commands that do 2767 * kdb_register - Compatibility register function for commands that do
@@ -2805,6 +2784,7 @@ int kdb_register(char *cmd,
2805 return kdb_register_repeat(cmd, func, usage, help, minlen, 2784 return kdb_register_repeat(cmd, func, usage, help, minlen,
2806 KDB_REPEAT_NONE); 2785 KDB_REPEAT_NONE);
2807} 2786}
2787EXPORT_SYMBOL_GPL(kdb_register);
2808 2788
2809/* 2789/*
2810 * kdb_unregister - This function is used to unregister a kernel 2790 * kdb_unregister - This function is used to unregister a kernel
@@ -2823,7 +2803,7 @@ int kdb_unregister(char *cmd)
2823 /* 2803 /*
2824 * find the command. 2804 * find the command.
2825 */ 2805 */
2826 for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) { 2806 for_each_kdbcmd(kp, i) {
2827 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { 2807 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2828 kp->cmd_name = NULL; 2808 kp->cmd_name = NULL;
2829 return 0; 2809 return 0;
@@ -2833,6 +2813,7 @@ int kdb_unregister(char *cmd)
2833 /* Couldn't find it. */ 2813 /* Couldn't find it. */
2834 return 1; 2814 return 1;
2835} 2815}
2816EXPORT_SYMBOL_GPL(kdb_unregister);
2836 2817
2837/* Initialize the kdb command table. */ 2818/* Initialize the kdb command table. */
2838static void __init kdb_inittab(void) 2819static void __init kdb_inittab(void)
@@ -2911,7 +2892,7 @@ static void __init kdb_inittab(void)
2911 "Send a signal to a process", 0, KDB_REPEAT_NONE); 2892 "Send a signal to a process", 0, KDB_REPEAT_NONE);
2912 kdb_register_repeat("summary", kdb_summary, "", 2893 kdb_register_repeat("summary", kdb_summary, "",
2913 "Summarize the system", 4, KDB_REPEAT_NONE); 2894 "Summarize the system", 4, KDB_REPEAT_NONE);
2914 kdb_register_repeat("per_cpu", kdb_per_cpu, "", 2895 kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
2915 "Display per_cpu variables", 3, KDB_REPEAT_NONE); 2896 "Display per_cpu variables", 3, KDB_REPEAT_NONE);
2916 kdb_register_repeat("grephelp", kdb_grep_help, "", 2897 kdb_register_repeat("grephelp", kdb_grep_help, "",
2917 "Display help on | grep", 0, KDB_REPEAT_NONE); 2898 "Display help on | grep", 0, KDB_REPEAT_NONE);
@@ -2933,7 +2914,7 @@ static void __init kdb_cmd_init(void)
2933 } 2914 }
2934} 2915}
2935 2916
2936/* Intialize kdb_printf, breakpoint tables and kdb state */ 2917/* Initialize kdb_printf, breakpoint tables and kdb state */
2937void __init kdb_init(int lvl) 2918void __init kdb_init(int lvl)
2938{ 2919{
2939 static int kdb_init_lvl = KDB_NOT_INITIALIZED; 2920 static int kdb_init_lvl = KDB_NOT_INITIALIZED;
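
[Annotation] With kdb_printf(), kdb_register(), kdb_register_repeat() and kdb_unregister() now EXPORT_SYMBOL_GPL (and their prototypes leaving kdb_private.h in the next file), loadable modules can add kdb commands. A minimal sketch, assuming the declarations now live in the public <linux/kdb.h>; the command itself is made up:

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kdb.h>

static int kdb_hello(int argc, const char **argv)
{
	kdb_printf("hello from a module (argc=%d)\n", argc);
	return 0;
}

static int __init hello_kdb_init(void)
{
	if (kdb_register_repeat("hello", kdb_hello, "",
				"Print a test greeting", 0, KDB_REPEAT_NONE))
		return -EINVAL;	/* name already taken or table full */
	return 0;
}

static void __exit hello_kdb_exit(void)
{
	kdb_unregister("hello");
}

module_init(hello_kdb_init);
module_exit(hello_kdb_exit);
MODULE_LICENSE("GPL");
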
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index be775f7e81e0..35d69ed1dfb5 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -15,29 +15,6 @@
15#include <linux/kgdb.h> 15#include <linux/kgdb.h>
16#include "../debug_core.h" 16#include "../debug_core.h"
17 17
18/* Kernel Debugger Error codes. Must not overlap with command codes. */
19#define KDB_NOTFOUND (-1)
20#define KDB_ARGCOUNT (-2)
21#define KDB_BADWIDTH (-3)
22#define KDB_BADRADIX (-4)
23#define KDB_NOTENV (-5)
24#define KDB_NOENVVALUE (-6)
25#define KDB_NOTIMP (-7)
26#define KDB_ENVFULL (-8)
27#define KDB_ENVBUFFULL (-9)
28#define KDB_TOOMANYBPT (-10)
29#define KDB_TOOMANYDBREGS (-11)
30#define KDB_DUPBPT (-12)
31#define KDB_BPTNOTFOUND (-13)
32#define KDB_BADMODE (-14)
33#define KDB_BADINT (-15)
34#define KDB_INVADDRFMT (-16)
35#define KDB_BADREG (-17)
36#define KDB_BADCPUNUM (-18)
37#define KDB_BADLENGTH (-19)
38#define KDB_NOBP (-20)
39#define KDB_BADADDR (-21)
40
41/* Kernel Debugger Command codes. Must not overlap with error codes. */ 18/* Kernel Debugger Command codes. Must not overlap with error codes. */
42#define KDB_CMD_GO (-1001) 19#define KDB_CMD_GO (-1001)
43#define KDB_CMD_CPU (-1002) 20#define KDB_CMD_CPU (-1002)
@@ -93,17 +70,6 @@
93 */ 70 */
94#define KDB_MAXBPT 16 71#define KDB_MAXBPT 16
95 72
96/* Maximum number of arguments to a function */
97#define KDB_MAXARGS 16
98
99typedef enum {
100 KDB_REPEAT_NONE = 0, /* Do not repeat this command */
101 KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */
102 KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */
103} kdb_repeat_t;
104
105typedef int (*kdb_func_t)(int, const char **);
106
107/* Symbol table format returned by kallsyms. */ 73/* Symbol table format returned by kallsyms. */
108typedef struct __ksymtab { 74typedef struct __ksymtab {
109 unsigned long value; /* Address of symbol */ 75 unsigned long value; /* Address of symbol */
@@ -123,11 +89,6 @@ extern int kallsyms_symbol_next(char *prefix_name, int flag);
123extern int kallsyms_symbol_complete(char *prefix_name, int max_len); 89extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
124 90
125/* Exported Symbols for kernel loadable modules to use. */ 91/* Exported Symbols for kernel loadable modules to use. */
126extern int kdb_register(char *, kdb_func_t, char *, char *, short);
127extern int kdb_register_repeat(char *, kdb_func_t, char *, char *,
128 short, kdb_repeat_t);
129extern int kdb_unregister(char *);
130
131extern int kdb_getarea_size(void *, unsigned long, size_t); 92extern int kdb_getarea_size(void *, unsigned long, size_t);
132extern int kdb_putarea_size(unsigned long, void *, size_t); 93extern int kdb_putarea_size(unsigned long, void *, size_t);
133 94
@@ -144,6 +105,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t);
144extern int kdb_putword(unsigned long, unsigned long, size_t); 105extern int kdb_putword(unsigned long, unsigned long, size_t);
145 106
146extern int kdbgetularg(const char *, unsigned long *); 107extern int kdbgetularg(const char *, unsigned long *);
108extern int kdbgetu64arg(const char *, u64 *);
147extern char *kdbgetenv(const char *); 109extern char *kdbgetenv(const char *);
148extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, 110extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
149 long *, char **); 111 long *, char **);
@@ -255,14 +217,6 @@ extern void kdb_ps1(const struct task_struct *p);
255extern void kdb_print_nameval(const char *name, unsigned long val); 217extern void kdb_print_nameval(const char *name, unsigned long val);
256extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); 218extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
257extern void kdb_meminfo_proc_show(void); 219extern void kdb_meminfo_proc_show(void);
258#ifdef CONFIG_KALLSYMS
259extern const char *kdb_walk_kallsyms(loff_t *pos);
260#else /* ! CONFIG_KALLSYMS */
261static inline const char *kdb_walk_kallsyms(loff_t *pos)
262{
263 return NULL;
264}
265#endif /* ! CONFIG_KALLSYMS */
266extern char *kdb_getstr(char *, size_t, char *); 220extern char *kdb_getstr(char *, size_t, char *);
267 221
268/* Defines for kdb_symbol_print */ 222/* Defines for kdb_symbol_print */
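For context on the registration interface whose declarations are moved out of this private header above (kdb_register_repeat(), kdb_func_t, kdb_unregister(), KDB_REPEAT_NONE): a minimal, hypothetical sketch of a loadable module registering its own kdb command, based only on the prototypes shown in this hunk and assuming they stay visible through <linux/kdb.h>; the command name, handler and strings are invented for illustration.

#include <linux/module.h>
#include <linux/kdb.h>		/* assumed new home of the declarations above */

static int kdb_hello(int argc, const char **argv)	/* matches kdb_func_t */
{
	kdb_printf("hello from kdb, argc=%d\n", argc);
	return 0;			/* 0 on success, or a KDB_* error code */
}

static int __init hello_init(void)
{
	/* cmd, handler, usage string, help text, minimum match length, repeat */
	return kdb_register_repeat("hello", kdb_hello, "[arg]",
				   "Print a greeting", 0, KDB_REPEAT_NONE);
}

static void __exit hello_exit(void)
{
	kdb_unregister("hello");
}

module_init(hello_init);
module_exit(hello_exit);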
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 6b2485dcb050..5532dd37aa86 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -545,7 +545,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
545 * Mask for process state. 545 * Mask for process state.
546 * Notes: 546 * Notes:
547 * The mask folds data from several sources into a single long value, so 547 * The mask folds data from several sources into a single long value, so
548 * be carefull not to overlap the bits. TASK_* bits are in the LSB, 548 * be careful not to overlap the bits. TASK_* bits are in the LSB,
549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there 549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
550 * is no overlap between TASK_* and EXIT_* but that may not always be 550 * is no overlap between TASK_* and EXIT_* but that may not always be
551 * true, so EXIT_* bits are shifted left 16 bits before being stored in 551 * true, so EXIT_* bits are shifted left 16 bits before being stored in
diff --git a/kernel/early_res.c b/kernel/early_res.c
deleted file mode 100644
index 7bfae887f211..000000000000
--- a/kernel/early_res.c
+++ /dev/null
@@ -1,590 +0,0 @@
1/*
2 * early_res, could be used to replace bootmem
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/bootmem.h>
8#include <linux/mm.h>
9#include <linux/early_res.h>
10#include <linux/slab.h>
11#include <linux/kmemleak.h>
12
13/*
14 * Early reserved memory areas.
15 */
16/*
17 * need to make sure this one is big enough before
18 * find_fw_memmap_area could be used
19 */
20#define MAX_EARLY_RES_X 32
21
22struct early_res {
23 u64 start, end;
24 char name[15];
25 char overlap_ok;
26};
27static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
28
29static int max_early_res __initdata = MAX_EARLY_RES_X;
30static struct early_res *early_res __initdata = &early_res_x[0];
31static int early_res_count __initdata;
32
33static int __init find_overlapped_early(u64 start, u64 end)
34{
35 int i;
36 struct early_res *r;
37
38 for (i = 0; i < max_early_res && early_res[i].end; i++) {
39 r = &early_res[i];
40 if (end > r->start && start < r->end)
41 break;
42 }
43
44 return i;
45}
46
47/*
48 * Drop the i-th range from the early reservation map,
49 * by copying any higher ranges down one over it, and
50 * clearing what had been the last slot.
51 */
52static void __init drop_range(int i)
53{
54 int j;
55
56 for (j = i + 1; j < max_early_res && early_res[j].end; j++)
57 ;
58
59 memmove(&early_res[i], &early_res[i + 1],
60 (j - 1 - i) * sizeof(struct early_res));
61
62 early_res[j - 1].end = 0;
63 early_res_count--;
64}
65
66static void __init drop_range_partial(int i, u64 start, u64 end)
67{
68 u64 common_start, common_end;
69 u64 old_start, old_end;
70
71 old_start = early_res[i].start;
72 old_end = early_res[i].end;
73 common_start = max(old_start, start);
74 common_end = min(old_end, end);
75
76 /* no overlap ? */
77 if (common_start >= common_end)
78 return;
79
80 if (old_start < common_start) {
81 /* make head segment */
82 early_res[i].end = common_start;
83 if (old_end > common_end) {
84 char name[15];
85
86 /*
87 * Save a local copy of the name, since the
88 * early_res array could get resized inside
89 * reserve_early_without_check() ->
90 * __check_and_double_early_res(), which would
91 * make the current name pointer invalid.
92 */
93 strncpy(name, early_res[i].name,
94 sizeof(early_res[i].name) - 1);
95 /* add another for left over on tail */
96 reserve_early_without_check(common_end, old_end, name);
97 }
98 return;
99 } else {
100 if (old_end > common_end) {
101 /* reuse the entry for tail left */
102 early_res[i].start = common_end;
103 return;
104 }
105 /* all covered */
106 drop_range(i);
107 }
108}
109
110/*
111 * Split any existing ranges that:
112 * 1) are marked 'overlap_ok', and
113 * 2) overlap with the stated range [start, end)
114 * into whatever portion (if any) of the existing range is entirely
115 * below or entirely above the stated range. Drop the portion
116 * of the existing range that overlaps with the stated range,
117 * which will allow the caller of this routine to then add that
118 * stated range without conflicting with any existing range.
119 */
120static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
121{
122 int i;
123 struct early_res *r;
124 u64 lower_start, lower_end;
125 u64 upper_start, upper_end;
126 char name[15];
127
128 for (i = 0; i < max_early_res && early_res[i].end; i++) {
129 r = &early_res[i];
130
131 /* Continue past non-overlapping ranges */
132 if (end <= r->start || start >= r->end)
133 continue;
134
135 /*
136 * Leave non-ok overlaps as is; let caller
137 * panic "Overlapping early reservations"
138 * when it hits this overlap.
139 */
140 if (!r->overlap_ok)
141 return;
142
143 /*
144 * We have an ok overlap. We will drop it from the early
145 * reservation map, and add back in any non-overlapping
146 * portions (lower or upper) as separate, overlap_ok,
147 * non-overlapping ranges.
148 */
149
150 /* 1. Note any non-overlapping (lower or upper) ranges. */
151 strncpy(name, r->name, sizeof(name) - 1);
152
153 lower_start = lower_end = 0;
154 upper_start = upper_end = 0;
155 if (r->start < start) {
156 lower_start = r->start;
157 lower_end = start;
158 }
159 if (r->end > end) {
160 upper_start = end;
161 upper_end = r->end;
162 }
163
164 /* 2. Drop the original ok overlapping range */
165 drop_range(i);
166
167 i--; /* resume for-loop on copied down entry */
168
169 /* 3. Add back in any non-overlapping ranges. */
170 if (lower_end)
171 reserve_early_overlap_ok(lower_start, lower_end, name);
172 if (upper_end)
173 reserve_early_overlap_ok(upper_start, upper_end, name);
174 }
175}
176
177static void __init __reserve_early(u64 start, u64 end, char *name,
178 int overlap_ok)
179{
180 int i;
181 struct early_res *r;
182
183 i = find_overlapped_early(start, end);
184 if (i >= max_early_res)
185 panic("Too many early reservations");
186 r = &early_res[i];
187 if (r->end)
188 panic("Overlapping early reservations "
189 "%llx-%llx %s to %llx-%llx %s\n",
190 start, end - 1, name ? name : "", r->start,
191 r->end - 1, r->name);
192 r->start = start;
193 r->end = end;
194 r->overlap_ok = overlap_ok;
195 if (name)
196 strncpy(r->name, name, sizeof(r->name) - 1);
197 early_res_count++;
198}
199
200/*
201 * A few early reservations come here.
202 *
203 * The 'overlap_ok' in the name of this routine does -not- mean it
204 * is ok for these reservations to overlap an earlier reservation.
205 * Rather it means that it is ok for subsequent reservations to
206 * overlap this one.
207 *
208 * Use this entry point to reserve early ranges when you are doing
209 * so out of "Paranoia", reserving perhaps more memory than you need,
210 * just in case, and don't mind a subsequent overlapping reservation
211 * that is known to be needed.
212 *
213 * The drop_overlaps_that_are_ok() call here isn't really needed.
214 * It would be needed if we had two colliding 'overlap_ok'
215 * reservations, so that the second such would not panic on the
216 * overlap with the first. We don't have any such as of this
217 * writing, but might as well tolerate such if it happens in
218 * the future.
219 */
220void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
221{
222 drop_overlaps_that_are_ok(start, end);
223 __reserve_early(start, end, name, 1);
224}
225
226static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
227{
228 u64 start, end, size, mem;
229 struct early_res *new;
230
231 /* do we have enough slots left ? */
232 if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
233 return;
234
235 /* double it */
236 mem = -1ULL;
237 size = sizeof(struct early_res) * max_early_res * 2;
238 if (early_res == early_res_x)
239 start = 0;
240 else
241 start = early_res[0].end;
242 end = ex_start;
243 if (start + size < end)
244 mem = find_fw_memmap_area(start, end, size,
245 sizeof(struct early_res));
246 if (mem == -1ULL) {
247 start = ex_end;
248 end = get_max_mapped();
249 if (start + size < end)
250 mem = find_fw_memmap_area(start, end, size,
251 sizeof(struct early_res));
252 }
253 if (mem == -1ULL)
254 panic("can not find more space for early_res array");
255
256 new = __va(mem);
257 /* save the first one for own */
258 new[0].start = mem;
259 new[0].end = mem + size;
260 new[0].overlap_ok = 0;
261 /* copy old to new */
262 if (early_res == early_res_x) {
263 memcpy(&new[1], &early_res[0],
264 sizeof(struct early_res) * max_early_res);
265 memset(&new[max_early_res+1], 0,
266 sizeof(struct early_res) * (max_early_res - 1));
267 early_res_count++;
268 } else {
269 memcpy(&new[1], &early_res[1],
270 sizeof(struct early_res) * (max_early_res - 1));
271 memset(&new[max_early_res], 0,
272 sizeof(struct early_res) * max_early_res);
273 }
274 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
275 early_res = new;
276 max_early_res *= 2;
277 printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
278 max_early_res, mem, mem + size - 1);
279}
280
281/*
282 * Most early reservations come here.
283 *
284 * We first have drop_overlaps_that_are_ok() drop any pre-existing
285 * 'overlap_ok' ranges, so that we can then reserve this memory
286 * range without risk of panic'ing on an overlapping overlap_ok
287 * early reservation.
288 */
289void __init reserve_early(u64 start, u64 end, char *name)
290{
291 if (start >= end)
292 return;
293
294 __check_and_double_early_res(start, end);
295
296 drop_overlaps_that_are_ok(start, end);
297 __reserve_early(start, end, name, 0);
298}
299
300void __init reserve_early_without_check(u64 start, u64 end, char *name)
301{
302 struct early_res *r;
303
304 if (start >= end)
305 return;
306
307 __check_and_double_early_res(start, end);
308
309 r = &early_res[early_res_count];
310
311 r->start = start;
312 r->end = end;
313 r->overlap_ok = 0;
314 if (name)
315 strncpy(r->name, name, sizeof(r->name) - 1);
316 early_res_count++;
317}
318
319void __init free_early(u64 start, u64 end)
320{
321 struct early_res *r;
322 int i;
323
324 kmemleak_free_part(__va(start), end - start);
325
326 i = find_overlapped_early(start, end);
327 r = &early_res[i];
328 if (i >= max_early_res || r->end != end || r->start != start)
329 panic("free_early on not reserved area: %llx-%llx!",
330 start, end - 1);
331
332 drop_range(i);
333}
334
335void __init free_early_partial(u64 start, u64 end)
336{
337 struct early_res *r;
338 int i;
339
340 kmemleak_free_part(__va(start), end - start);
341
342 if (start == end)
343 return;
344
345 if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end))
346 return;
347
348try_next:
349 i = find_overlapped_early(start, end);
350 if (i >= max_early_res)
351 return;
352
353 r = &early_res[i];
354 /* hole ? */
355 if (r->end >= end && r->start <= start) {
356 drop_range_partial(i, start, end);
357 return;
358 }
359
360 drop_range_partial(i, start, end);
361 goto try_next;
362}
363
364#ifdef CONFIG_NO_BOOTMEM
365static void __init subtract_early_res(struct range *range, int az)
366{
367 int i, count;
368 u64 final_start, final_end;
369 int idx = 0;
370
371 count = 0;
372 for (i = 0; i < max_early_res && early_res[i].end; i++)
373 count++;
374
375 /* need to skip first one ?*/
376 if (early_res != early_res_x)
377 idx = 1;
378
379#define DEBUG_PRINT_EARLY_RES 1
380
381#if DEBUG_PRINT_EARLY_RES
382 printk(KERN_INFO "Subtract (%d early reservations)\n", count);
383#endif
384 for (i = idx; i < count; i++) {
385 struct early_res *r = &early_res[i];
386#if DEBUG_PRINT_EARLY_RES
387 printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
388 r->start, r->end, r->name);
389#endif
390 final_start = PFN_DOWN(r->start);
391 final_end = PFN_UP(r->end);
392 if (final_start >= final_end)
393 continue;
394 subtract_range(range, az, final_start, final_end);
395 }
396
397}
398
399int __init get_free_all_memory_range(struct range **rangep, int nodeid)
400{
401 int i, count;
402 u64 start = 0, end;
403 u64 size;
404 u64 mem;
405 struct range *range;
406 int nr_range;
407
408 count = 0;
409 for (i = 0; i < max_early_res && early_res[i].end; i++)
410 count++;
411
412 count *= 2;
413
414 size = sizeof(struct range) * count;
415 end = get_max_mapped();
416#ifdef MAX_DMA32_PFN
417 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
418 start = MAX_DMA32_PFN << PAGE_SHIFT;
419#endif
420 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
421 if (mem == -1ULL)
422 panic("can not find more space for range free");
423
424 range = __va(mem);
425 /* use early_node_map[] and early_res to get range array at first */
426 memset(range, 0, size);
427 nr_range = 0;
428
429 /* need to go over early_node_map to find out good range for node */
430 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
431#ifdef CONFIG_X86_32
432 subtract_range(range, count, max_low_pfn, -1ULL);
433#endif
434 subtract_early_res(range, count);
435 nr_range = clean_sort_range(range, count);
436
437 /* need to clear it ? */
438 if (nodeid == MAX_NUMNODES) {
439 memset(&early_res[0], 0,
440 sizeof(struct early_res) * max_early_res);
441 early_res = NULL;
442 max_early_res = 0;
443 }
444
445 *rangep = range;
446 return nr_range;
447}
448#else
449void __init early_res_to_bootmem(u64 start, u64 end)
450{
451 int i, count;
452 u64 final_start, final_end;
453 int idx = 0;
454
455 count = 0;
456 for (i = 0; i < max_early_res && early_res[i].end; i++)
457 count++;
458
459 /* need to skip first one ?*/
460 if (early_res != early_res_x)
461 idx = 1;
462
463 printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
464 count - idx, max_early_res, start, end);
465 for (i = idx; i < count; i++) {
466 struct early_res *r = &early_res[i];
467 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
468 r->start, r->end, r->name);
469 final_start = max(start, r->start);
470 final_end = min(end, r->end);
471 if (final_start >= final_end) {
472 printk(KERN_CONT "\n");
473 continue;
474 }
475 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
476 final_start, final_end);
477 reserve_bootmem_generic(final_start, final_end - final_start,
478 BOOTMEM_DEFAULT);
479 }
480 /* clear them */
481 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
482 early_res = NULL;
483 max_early_res = 0;
484 early_res_count = 0;
485}
486#endif
487
488/* Check for already reserved areas */
489static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
490{
491 int i;
492 u64 addr = *addrp;
493 int changed = 0;
494 struct early_res *r;
495again:
496 i = find_overlapped_early(addr, addr + size);
497 r = &early_res[i];
498 if (i < max_early_res && r->end) {
499 *addrp = addr = round_up(r->end, align);
500 changed = 1;
501 goto again;
502 }
503 return changed;
504}
505
506/* Check for already reserved areas */
507static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
508{
509 int i;
510 u64 addr = *addrp, last;
511 u64 size = *sizep;
512 int changed = 0;
513again:
514 last = addr + size;
515 for (i = 0; i < max_early_res && early_res[i].end; i++) {
516 struct early_res *r = &early_res[i];
517 if (last > r->start && addr < r->start) {
518 size = r->start - addr;
519 changed = 1;
520 goto again;
521 }
522 if (last > r->end && addr < r->end) {
523 addr = round_up(r->end, align);
524 size = last - addr;
525 changed = 1;
526 goto again;
527 }
528 if (last <= r->end && addr >= r->start) {
529 (*sizep)++;
530 return 0;
531 }
532 }
533 if (changed) {
534 *addrp = addr;
535 *sizep = size;
536 }
537 return changed;
538}
539
540/*
541 * Find a free area with specified alignment in a specific range.
542 * Only the area between start and end is an active range from early_node_map,
543 * so it is usable as RAM.
544 */
545u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
546 u64 size, u64 align)
547{
548 u64 addr, last;
549
550 addr = round_up(ei_start, align);
551 if (addr < start)
552 addr = round_up(start, align);
553 if (addr >= ei_last)
554 goto out;
555 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
556 ;
557 last = addr + size;
558 if (last > ei_last)
559 goto out;
560 if (last > end)
561 goto out;
562
563 return addr;
564
565out:
566 return -1ULL;
567}
568
569u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
570 u64 *sizep, u64 align)
571{
572 u64 addr, last;
573
574 addr = round_up(ei_start, align);
575 if (addr < start)
576 addr = round_up(start, align);
577 if (addr >= ei_last)
578 goto out;
579 *sizep = ei_last - addr;
580 while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
581 ;
582 last = addr + *sizep;
583 if (last > ei_last)
584 goto out;
585
586 return addr;
587
588out:
589 return -1ULL;
590}
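To summarize the semantics of the interface deleted above: reserve_early() panics on a conflicting overlap unless the earlier range was registered with reserve_early_overlap_ok(), in which case drop_overlaps_that_are_ok() silently splits the overlapping piece away first, and free_early() must name a reserved range exactly. A small usage sketch, using only the functions defined above; addresses and names are invented for illustration:

	/* "paranoia" range: later reservations are allowed to overlap it */
	reserve_early_overlap_ok(0x100000, 0x200000, "SETUP-PARANOIA");

	/* overlaps the range above: allowed, the overlap_ok piece is split */
	reserve_early(0x180000, 0x1c0000, "INITRD");

	/* must match a reservation exactly, otherwise free_early() panics */
	free_early(0x180000, 0x1c0000);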
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
new file mode 100644
index 000000000000..1ce23d3d8394
--- /dev/null
+++ b/kernel/events/Makefile
@@ -0,0 +1,6 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg
3endif
4
5obj-y := core.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/perf_event.c b/kernel/events/core.c
index b98bed3d8182..9efe7108ccaf 100644
--- a/kernel/perf_event.c
+++ b/kernel/events/core.c
@@ -2,8 +2,8 @@
2 * Performance events core code: 2 * Performance events core code:
3 * 3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
@@ -13,6 +13,7 @@
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/idr.h>
16#include <linux/file.h> 17#include <linux/file.h>
17#include <linux/poll.h> 18#include <linux/poll.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
@@ -21,7 +22,9 @@
21#include <linux/dcache.h> 22#include <linux/dcache.h>
22#include <linux/percpu.h> 23#include <linux/percpu.h>
23#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/reboot.h>
24#include <linux/vmstat.h> 26#include <linux/vmstat.h>
27#include <linux/device.h>
25#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
26#include <linux/hardirq.h> 29#include <linux/hardirq.h>
27#include <linux/rculist.h> 30#include <linux/rculist.h>
@@ -35,20 +38,104 @@
35 38
36#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
37 40
38/* 41struct remote_function_call {
39 * Each CPU has a list of per CPU events: 42 struct task_struct *p;
43 int (*func)(void *info);
44 void *info;
45 int ret;
46};
47
48static void remote_function(void *data)
49{
50 struct remote_function_call *tfc = data;
51 struct task_struct *p = tfc->p;
52
53 if (p) {
54 tfc->ret = -EAGAIN;
55 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
56 return;
57 }
58
59 tfc->ret = tfc->func(tfc->info);
60}
61
62/**
63 * task_function_call - call a function on the cpu on which a task runs
64 * @p: the task to evaluate
65 * @func: the function to be called
66 * @info: the function call argument
67 *
68 * Calls the function @func when the task is currently running. This might
69 * be on the current CPU, which just calls the function directly
70 *
71 * returns: @func return value, or
72 * -ESRCH - when the process isn't running
73 * -EAGAIN - when the process moved away
40 */ 74 */
41static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); 75static int
76task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
77{
78 struct remote_function_call data = {
79 .p = p,
80 .func = func,
81 .info = info,
82 .ret = -ESRCH, /* No such (running) process */
83 };
42 84
43int perf_max_events __read_mostly = 1; 85 if (task_curr(p))
44static int perf_reserved_percpu __read_mostly; 86 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
45static int perf_overcommit __read_mostly = 1; 87
88 return data.ret;
89}
90
91/**
92 * cpu_function_call - call a function on the cpu
93 * @func: the function to be called
94 * @info: the function call argument
95 *
96 * Calls the function @func on the remote cpu.
97 *
98 * returns: @func return value or -ENXIO when the cpu is offline
99 */
100static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
101{
102 struct remote_function_call data = {
103 .p = NULL,
104 .func = func,
105 .info = info,
106 .ret = -ENXIO, /* No such CPU */
107 };
108
109 smp_call_function_single(cpu, remote_function, &data, 1);
110
111 return data.ret;
112}
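The two helpers above are the pattern the rest of this file is converted to: run a callback on a given CPU, or on whichever CPU a task is currently running on, and return -ENXIO/-ESRCH/-EAGAIN so the caller can fall back or retry under ctx->lock. A minimal, hypothetical sketch of a caller; the callback and the retry policy are illustrative only:

static int touch_ctx(void *info)
{
	/* runs on the target CPU with interrupts disabled */
	struct perf_event_context *ctx = info;

	raw_spin_lock(&ctx->lock);
	/* ... operate on ctx while it cannot be switched away ... */
	raw_spin_unlock(&ctx->lock);
	return 0;
}

static void example_caller(struct task_struct *task, int cpu,
			   struct perf_event_context *ctx)
{
	/* per-CPU case: stays -ENXIO if the CPU is offline */
	if (cpu_function_call(cpu, touch_ctx, ctx))
		return;

	/* per-task case: -ESRCH/-EAGAIN mean the task is not (or no
	 * longer) running there; callers below retry under ctx->lock */
	if (task_function_call(task, touch_ctx, ctx))
		return;
}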
113
114#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
115 PERF_FLAG_FD_OUTPUT |\
116 PERF_FLAG_PID_CGROUP)
117
118enum event_type_t {
119 EVENT_FLEXIBLE = 0x1,
120 EVENT_PINNED = 0x2,
121 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
122};
123
124/*
125 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */
128struct jump_label_key perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
46 130
47static atomic_t nr_events __read_mostly;
48static atomic_t nr_mmap_events __read_mostly; 131static atomic_t nr_mmap_events __read_mostly;
49static atomic_t nr_comm_events __read_mostly; 132static atomic_t nr_comm_events __read_mostly;
50static atomic_t nr_task_events __read_mostly; 133static atomic_t nr_task_events __read_mostly;
51 134
135static LIST_HEAD(pmus);
136static DEFINE_MUTEX(pmus_lock);
137static struct srcu_struct pmus_srcu;
138
52/* 139/*
53 * perf event paranoia level: 140 * perf event paranoia level:
54 * -1 - not paranoid at all 141 * -1 - not paranoid at all
@@ -58,58 +145,445 @@ static atomic_t nr_task_events __read_mostly;
58 */ 145 */
59int sysctl_perf_event_paranoid __read_mostly = 1; 146int sysctl_perf_event_paranoid __read_mostly = 1;
60 147
61int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 148/* Minimum for 512 kiB + 1 user control page */
149int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
62 150
63/* 151/*
64 * max perf event sample rate 152 * max perf event sample rate
65 */ 153 */
66int sysctl_perf_event_sample_rate __read_mostly = 100000; 154#define DEFAULT_MAX_SAMPLE_RATE 100000
155int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
156static int max_samples_per_tick __read_mostly =
157 DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
158
159int perf_proc_update_handler(struct ctl_table *table, int write,
160 void __user *buffer, size_t *lenp,
161 loff_t *ppos)
162{
163 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
164
165 if (ret || !write)
166 return ret;
167
168 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
169
170 return 0;
171}
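A worked example of the handler above (the values are assumed, not taken from this commit): with HZ=1000 and the default sample rate of 100000, max_samples_per_tick = DIV_ROUND_UP(100000, 1000) = 100; writing 50000 to the sysctl lowers the per-tick budget to 50, and writing 250 leaves DIV_ROUND_UP(250, 1000) = 1.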
67 172
68static atomic64_t perf_event_id; 173static atomic64_t perf_event_id;
69 174
175static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
176 enum event_type_t event_type);
177
178static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
179 enum event_type_t event_type,
180 struct task_struct *task);
181
182static void update_context_time(struct perf_event_context *ctx);
183static u64 perf_event_time(struct perf_event *event);
184
185void __weak perf_event_print_debug(void) { }
186
187extern __weak const char *perf_pmu_name(void)
188{
189 return "pmu";
190}
191
192static inline u64 perf_clock(void)
193{
194 return local_clock();
195}
196
197static inline struct perf_cpu_context *
198__get_cpu_context(struct perf_event_context *ctx)
199{
200 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
201}
202
203#ifdef CONFIG_CGROUP_PERF
204
70/* 205/*
71 * Lock for (sysadmin-configurable) event reservations: 206 * Must ensure cgroup is pinned (css_get) before calling
207 * this function. In other words, we cannot call this function
208 * if there is no cgroup event for the current CPU context.
72 */ 209 */
73static DEFINE_SPINLOCK(perf_resource_lock); 210static inline struct perf_cgroup *
211perf_cgroup_from_task(struct task_struct *task)
212{
213 return container_of(task_subsys_state(task, perf_subsys_id),
214 struct perf_cgroup, css);
215}
216
217static inline bool
218perf_cgroup_match(struct perf_event *event)
219{
220 struct perf_event_context *ctx = event->ctx;
221 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
222
223 return !event->cgrp || event->cgrp == cpuctx->cgrp;
224}
225
226static inline void perf_get_cgroup(struct perf_event *event)
227{
228 css_get(&event->cgrp->css);
229}
230
231static inline void perf_put_cgroup(struct perf_event *event)
232{
233 css_put(&event->cgrp->css);
234}
235
236static inline void perf_detach_cgroup(struct perf_event *event)
237{
238 perf_put_cgroup(event);
239 event->cgrp = NULL;
240}
241
242static inline int is_cgroup_event(struct perf_event *event)
243{
244 return event->cgrp != NULL;
245}
246
247static inline u64 perf_cgroup_event_time(struct perf_event *event)
248{
249 struct perf_cgroup_info *t;
250
251 t = per_cpu_ptr(event->cgrp->info, event->cpu);
252 return t->time;
253}
254
255static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
256{
257 struct perf_cgroup_info *info;
258 u64 now;
259
260 now = perf_clock();
261
262 info = this_cpu_ptr(cgrp->info);
263
264 info->time += now - info->timestamp;
265 info->timestamp = now;
266}
267
268static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
269{
270 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
271 if (cgrp_out)
272 __update_cgrp_time(cgrp_out);
273}
274
275static inline void update_cgrp_time_from_event(struct perf_event *event)
276{
277 struct perf_cgroup *cgrp;
278
279 /*
280 * ensure we access cgroup data only when needed and
281 * when we know the cgroup is pinned (css_get)
282 */
283 if (!is_cgroup_event(event))
284 return;
285
286 cgrp = perf_cgroup_from_task(current);
287 /*
288 * Do not update time when cgroup is not active
289 */
290 if (cgrp == event->cgrp)
291 __update_cgrp_time(event->cgrp);
292}
293
294static inline void
295perf_cgroup_set_timestamp(struct task_struct *task,
296 struct perf_event_context *ctx)
297{
298 struct perf_cgroup *cgrp;
299 struct perf_cgroup_info *info;
300
301 /*
302 * ctx->lock held by caller
303 * ensure we do not access cgroup data
304 * unless we have the cgroup pinned (css_get)
305 */
306 if (!task || !ctx->nr_cgroups)
307 return;
308
309 cgrp = perf_cgroup_from_task(task);
310 info = this_cpu_ptr(cgrp->info);
311 info->timestamp = ctx->timestamp;
312}
313
314#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
315#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
74 316
75/* 317/*
76 * Architecture provided APIs - weak aliases: 318 * reschedule events based on the cgroup constraint of task.
319 *
320 * mode SWOUT : schedule out everything
321 * mode SWIN : schedule in based on cgroup for next
77 */ 322 */
78extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) 323void perf_cgroup_switch(struct task_struct *task, int mode)
79{ 324{
80 return NULL; 325 struct perf_cpu_context *cpuctx;
326 struct pmu *pmu;
327 unsigned long flags;
328
329 /*
330 * disable interrupts to avoid getting nr_cgroup
331 * changes via __perf_event_disable(). Also
332 * avoids preemption.
333 */
334 local_irq_save(flags);
335
336 /*
337 * we reschedule only in the presence of cgroup
338 * constrained events.
339 */
340 rcu_read_lock();
341
342 list_for_each_entry_rcu(pmu, &pmus, entry) {
343
344 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
345
346 perf_pmu_disable(cpuctx->ctx.pmu);
347
348 /*
349 * perf_cgroup_events says at least one
350 * context on this CPU has cgroup events.
351 *
352 * ctx->nr_cgroups reports the number of cgroup
353 * events for a context.
354 */
355 if (cpuctx->ctx.nr_cgroups > 0) {
356
357 if (mode & PERF_CGROUP_SWOUT) {
358 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
359 /*
360 * must not be done before ctxswout due
361 * to event_filter_match() in event_sched_out()
362 */
363 cpuctx->cgrp = NULL;
364 }
365
366 if (mode & PERF_CGROUP_SWIN) {
367 WARN_ON_ONCE(cpuctx->cgrp);
368 /* set cgrp before ctxsw in to
369 * allow event_filter_match() to not
370 * have to pass task around
371 */
372 cpuctx->cgrp = perf_cgroup_from_task(task);
373 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
374 }
375 }
376
377 perf_pmu_enable(cpuctx->ctx.pmu);
378 }
379
380 rcu_read_unlock();
381
382 local_irq_restore(flags);
81} 383}
82 384
83void __weak hw_perf_disable(void) { barrier(); } 385static inline void perf_cgroup_sched_out(struct task_struct *task)
84void __weak hw_perf_enable(void) { barrier(); } 386{
387 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
388}
85 389
86void __weak perf_event_print_debug(void) { } 390static inline void perf_cgroup_sched_in(struct task_struct *task)
391{
392 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
393}
394
395static inline int perf_cgroup_connect(int fd, struct perf_event *event,
396 struct perf_event_attr *attr,
397 struct perf_event *group_leader)
398{
399 struct perf_cgroup *cgrp;
400 struct cgroup_subsys_state *css;
401 struct file *file;
402 int ret = 0, fput_needed;
403
404 file = fget_light(fd, &fput_needed);
405 if (!file)
406 return -EBADF;
407
408 css = cgroup_css_from_dir(file, perf_subsys_id);
409 if (IS_ERR(css)) {
410 ret = PTR_ERR(css);
411 goto out;
412 }
413
414 cgrp = container_of(css, struct perf_cgroup, css);
415 event->cgrp = cgrp;
416
417 /* must be done before we fput() the file */
418 perf_get_cgroup(event);
87 419
88static DEFINE_PER_CPU(int, perf_disable_count); 420 /*
421 * all events in a group must monitor
422 * the same cgroup because a task belongs
423 * to only one perf cgroup at a time
424 */
425 if (group_leader && group_leader->cgrp != cgrp) {
426 perf_detach_cgroup(event);
427 ret = -EINVAL;
428 }
429out:
430 fput_light(file, fput_needed);
431 return ret;
432}
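A hypothetical userspace sketch of how the cgroup mode wired up here is meant to be driven, inferred from perf_cgroup_connect() taking a file descriptor and from the PERF_FLAG_PID_CGROUP flag defined earlier in this file; the cgroup path and attr setup are illustrative, and the raw syscall is used since there is no libc wrapper:

	struct perf_event_attr attr = {
		.type	= PERF_TYPE_HARDWARE,
		.config	= PERF_COUNT_HW_CPU_CYCLES,
		.size	= sizeof(attr),
	};
	int cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
	int ev_fd = syscall(__NR_perf_event_open, &attr,
			    cgrp_fd,	/* with PERF_FLAG_PID_CGROUP, "pid" is a cgroup dir fd */
			    0,		/* cgroup events are per-CPU, so cpu >= 0 */
			    -1,		/* no group leader */
			    PERF_FLAG_PID_CGROUP);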
89 433
90void perf_disable(void) 434static inline void
435perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
91{ 436{
92 if (!__get_cpu_var(perf_disable_count)++) 437 struct perf_cgroup_info *t;
93 hw_perf_disable(); 438 t = per_cpu_ptr(event->cgrp->info, event->cpu);
439 event->shadow_ctx_time = now - t->timestamp;
94} 440}
95 441
96void perf_enable(void) 442static inline void
443perf_cgroup_defer_enabled(struct perf_event *event)
97{ 444{
98 if (!--__get_cpu_var(perf_disable_count)) 445 /*
99 hw_perf_enable(); 446 * when the current task's perf cgroup does not match
447 * the event's, we need to remember to call the
448 * perf_mark_enable() function the first time a task with
449 * a matching perf cgroup is scheduled in.
450 */
451 if (is_cgroup_event(event) && !perf_cgroup_match(event))
452 event->cgrp_defer_enabled = 1;
100} 453}
101 454
102static void get_ctx(struct perf_event_context *ctx) 455static inline void
456perf_cgroup_mark_enabled(struct perf_event *event,
457 struct perf_event_context *ctx)
103{ 458{
104 WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); 459 struct perf_event *sub;
460 u64 tstamp = perf_event_time(event);
461
462 if (!event->cgrp_defer_enabled)
463 return;
464
465 event->cgrp_defer_enabled = 0;
466
467 event->tstamp_enabled = tstamp - event->total_time_enabled;
468 list_for_each_entry(sub, &event->sibling_list, group_entry) {
469 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
470 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
471 sub->cgrp_defer_enabled = 0;
472 }
473 }
105} 474}
475#else /* !CONFIG_CGROUP_PERF */
106 476
107static void free_ctx(struct rcu_head *head) 477static inline bool
478perf_cgroup_match(struct perf_event *event)
108{ 479{
109 struct perf_event_context *ctx; 480 return true;
481}
482
483static inline void perf_detach_cgroup(struct perf_event *event)
484{}
485
486static inline int is_cgroup_event(struct perf_event *event)
487{
488 return 0;
489}
490
491static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
492{
493 return 0;
494}
495
496static inline void update_cgrp_time_from_event(struct perf_event *event)
497{
498}
110 499
111 ctx = container_of(head, struct perf_event_context, rcu_head); 500static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
112 kfree(ctx); 501{
502}
503
504static inline void perf_cgroup_sched_out(struct task_struct *task)
505{
506}
507
508static inline void perf_cgroup_sched_in(struct task_struct *task)
509{
510}
511
512static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
513 struct perf_event_attr *attr,
514 struct perf_event *group_leader)
515{
516 return -EINVAL;
517}
518
519static inline void
520perf_cgroup_set_timestamp(struct task_struct *task,
521 struct perf_event_context *ctx)
522{
523}
524
525void
526perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
527{
528}
529
530static inline void
531perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
532{
533}
534
535static inline u64 perf_cgroup_event_time(struct perf_event *event)
536{
537 return 0;
538}
539
540static inline void
541perf_cgroup_defer_enabled(struct perf_event *event)
542{
543}
544
545static inline void
546perf_cgroup_mark_enabled(struct perf_event *event,
547 struct perf_event_context *ctx)
548{
549}
550#endif
551
552void perf_pmu_disable(struct pmu *pmu)
553{
554 int *count = this_cpu_ptr(pmu->pmu_disable_count);
555 if (!(*count)++)
556 pmu->pmu_disable(pmu);
557}
558
559void perf_pmu_enable(struct pmu *pmu)
560{
561 int *count = this_cpu_ptr(pmu->pmu_disable_count);
562 if (!--(*count))
563 pmu->pmu_enable(pmu);
564}
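The per-cpu pmu_disable_count above makes disable/enable reference counted, so sections can nest and only the outermost pair touches the hardware. A small sketch of the implied pairing; the pmu pointer is whatever the caller already holds:

	perf_pmu_disable(pmu);	/* count 0 -> 1: pmu->pmu_disable() runs */
	perf_pmu_disable(pmu);	/* count 1 -> 2: no hardware access */
	/* ... reprogram events ... */
	perf_pmu_enable(pmu);	/* count 2 -> 1: still disabled */
	perf_pmu_enable(pmu);	/* count 1 -> 0: pmu->pmu_enable() runs */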
565
566static DEFINE_PER_CPU(struct list_head, rotation_list);
567
568/*
569 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
570 * because they're strictly cpu affine and rotate_start is called with IRQs
571 * disabled, while rotate_context is called from IRQ context.
572 */
573static void perf_pmu_rotate_start(struct pmu *pmu)
574{
575 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
576 struct list_head *head = &__get_cpu_var(rotation_list);
577
578 WARN_ON(!irqs_disabled());
579
580 if (list_empty(&cpuctx->rotation_list))
581 list_add(&cpuctx->rotation_list, head);
582}
583
584static void get_ctx(struct perf_event_context *ctx)
585{
586 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
113} 587}
114 588
115static void put_ctx(struct perf_event_context *ctx) 589static void put_ctx(struct perf_event_context *ctx)
@@ -119,7 +593,7 @@ static void put_ctx(struct perf_event_context *ctx)
119 put_ctx(ctx->parent_ctx); 593 put_ctx(ctx->parent_ctx);
120 if (ctx->task) 594 if (ctx->task)
121 put_task_struct(ctx->task); 595 put_task_struct(ctx->task);
122 call_rcu(&ctx->rcu_head, free_ctx); 596 kfree_rcu(ctx, rcu_head);
123 } 597 }
124} 598}
125 599
@@ -131,6 +605,28 @@ static void unclone_ctx(struct perf_event_context *ctx)
131 } 605 }
132} 606}
133 607
608static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
609{
610 /*
611 * only top level events have the pid namespace they were created in
612 */
613 if (event->parent)
614 event = event->parent;
615
616 return task_tgid_nr_ns(p, event->ns);
617}
618
619static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
620{
621 /*
622 * only top level events have the pid namespace they were created in
623 */
624 if (event->parent)
625 event = event->parent;
626
627 return task_pid_nr_ns(p, event->ns);
628}
629
134/* 630/*
135 * If we inherit events we want to return the parent event id 631 * If we inherit events we want to return the parent event id
136 * to userspace. 632 * to userspace.
@@ -151,13 +647,13 @@ static u64 primary_event_id(struct perf_event *event)
151 * the context could get moved to another task. 647 * the context could get moved to another task.
152 */ 648 */
153static struct perf_event_context * 649static struct perf_event_context *
154perf_lock_task_context(struct task_struct *task, unsigned long *flags) 650perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
155{ 651{
156 struct perf_event_context *ctx; 652 struct perf_event_context *ctx;
157 653
158 rcu_read_lock(); 654 rcu_read_lock();
159 retry: 655retry:
160 ctx = rcu_dereference(task->perf_event_ctxp); 656 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
161 if (ctx) { 657 if (ctx) {
162 /* 658 /*
163 * If this context is a clone of another, it might 659 * If this context is a clone of another, it might
@@ -170,7 +666,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
170 * can't get swapped on us any more. 666 * can't get swapped on us any more.
171 */ 667 */
172 raw_spin_lock_irqsave(&ctx->lock, *flags); 668 raw_spin_lock_irqsave(&ctx->lock, *flags);
173 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 669 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
174 raw_spin_unlock_irqrestore(&ctx->lock, *flags); 670 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
175 goto retry; 671 goto retry;
176 } 672 }
@@ -189,12 +685,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
189 * can't get swapped to another task. This also increments its 685 * can't get swapped to another task. This also increments its
190 * reference count so that the context can't get freed. 686 * reference count so that the context can't get freed.
191 */ 687 */
192static struct perf_event_context *perf_pin_task_context(struct task_struct *task) 688static struct perf_event_context *
689perf_pin_task_context(struct task_struct *task, int ctxn)
193{ 690{
194 struct perf_event_context *ctx; 691 struct perf_event_context *ctx;
195 unsigned long flags; 692 unsigned long flags;
196 693
197 ctx = perf_lock_task_context(task, &flags); 694 ctx = perf_lock_task_context(task, ctxn, &flags);
198 if (ctx) { 695 if (ctx) {
199 ++ctx->pin_count; 696 ++ctx->pin_count;
200 raw_spin_unlock_irqrestore(&ctx->lock, flags); 697 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -209,12 +706,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
209 raw_spin_lock_irqsave(&ctx->lock, flags); 706 raw_spin_lock_irqsave(&ctx->lock, flags);
210 --ctx->pin_count; 707 --ctx->pin_count;
211 raw_spin_unlock_irqrestore(&ctx->lock, flags); 708 raw_spin_unlock_irqrestore(&ctx->lock, flags);
212 put_ctx(ctx);
213}
214
215static inline u64 perf_clock(void)
216{
217 return local_clock();
218} 709}
219 710
220/* 711/*
@@ -228,6 +719,16 @@ static void update_context_time(struct perf_event_context *ctx)
228 ctx->timestamp = now; 719 ctx->timestamp = now;
229} 720}
230 721
722static u64 perf_event_time(struct perf_event *event)
723{
724 struct perf_event_context *ctx = event->ctx;
725
726 if (is_cgroup_event(event))
727 return perf_cgroup_event_time(event);
728
729 return ctx ? ctx->time : 0;
730}
731
231/* 732/*
232 * Update the total_time_enabled and total_time_running fields for a event. 733 * Update the total_time_enabled and total_time_running fields for a event.
233 */ 734 */
@@ -239,8 +740,19 @@ static void update_event_times(struct perf_event *event)
239 if (event->state < PERF_EVENT_STATE_INACTIVE || 740 if (event->state < PERF_EVENT_STATE_INACTIVE ||
240 event->group_leader->state < PERF_EVENT_STATE_INACTIVE) 741 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
241 return; 742 return;
242 743 /*
243 if (ctx->is_active) 744 * in cgroup mode, time_enabled represents
745 * the time the event was enabled AND active
746 * tasks were in the monitored cgroup. This is
747 * independent of the activity of the context as
748 * there may be a mix of cgroup and non-cgroup events.
749 *
750 * That is why we treat cgroup events differently
751 * here.
752 */
753 if (is_cgroup_event(event))
754 run_end = perf_event_time(event);
755 else if (ctx->is_active)
244 run_end = ctx->time; 756 run_end = ctx->time;
245 else 757 else
246 run_end = event->tstamp_stopped; 758 run_end = event->tstamp_stopped;
@@ -250,9 +762,10 @@ static void update_event_times(struct perf_event *event)
250 if (event->state == PERF_EVENT_STATE_INACTIVE) 762 if (event->state == PERF_EVENT_STATE_INACTIVE)
251 run_end = event->tstamp_stopped; 763 run_end = event->tstamp_stopped;
252 else 764 else
253 run_end = ctx->time; 765 run_end = perf_event_time(event);
254 766
255 event->total_time_running = run_end - event->tstamp_running; 767 event->total_time_running = run_end - event->tstamp_running;
768
256} 769}
257 770
258/* 771/*
@@ -301,17 +814,102 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
301 list_add_tail(&event->group_entry, list); 814 list_add_tail(&event->group_entry, list);
302 } 815 }
303 816
817 if (is_cgroup_event(event))
818 ctx->nr_cgroups++;
819
304 list_add_rcu(&event->event_entry, &ctx->event_list); 820 list_add_rcu(&event->event_entry, &ctx->event_list);
821 if (!ctx->nr_events)
822 perf_pmu_rotate_start(ctx->pmu);
305 ctx->nr_events++; 823 ctx->nr_events++;
306 if (event->attr.inherit_stat) 824 if (event->attr.inherit_stat)
307 ctx->nr_stat++; 825 ctx->nr_stat++;
308} 826}
309 827
828/*
829 * Called at perf_event creation and when events are attached/detached from a
830 * group.
831 */
832static void perf_event__read_size(struct perf_event *event)
833{
834 int entry = sizeof(u64); /* value */
835 int size = 0;
836 int nr = 1;
837
838 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
839 size += sizeof(u64);
840
841 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
842 size += sizeof(u64);
843
844 if (event->attr.read_format & PERF_FORMAT_ID)
845 entry += sizeof(u64);
846
847 if (event->attr.read_format & PERF_FORMAT_GROUP) {
848 nr += event->group_leader->nr_siblings;
849 size += sizeof(u64);
850 }
851
852 size += entry * nr;
853 event->read_size = size;
854}
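A worked example of the sizing above (the read_format value is assumed for illustration): with read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_ID | PERF_FORMAT_GROUP and a leader with three siblings, entry = 8 + 8 = 16, nr = 1 + 3 = 4, and read_size = 8 (time_enabled) + 8 (the group's nr field) + 16 * 4 = 80 bytes.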
855
856static void perf_event__header_size(struct perf_event *event)
857{
858 struct perf_sample_data *data;
859 u64 sample_type = event->attr.sample_type;
860 u16 size = 0;
861
862 perf_event__read_size(event);
863
864 if (sample_type & PERF_SAMPLE_IP)
865 size += sizeof(data->ip);
866
867 if (sample_type & PERF_SAMPLE_ADDR)
868 size += sizeof(data->addr);
869
870 if (sample_type & PERF_SAMPLE_PERIOD)
871 size += sizeof(data->period);
872
873 if (sample_type & PERF_SAMPLE_READ)
874 size += event->read_size;
875
876 event->header_size = size;
877}
878
879static void perf_event__id_header_size(struct perf_event *event)
880{
881 struct perf_sample_data *data;
882 u64 sample_type = event->attr.sample_type;
883 u16 size = 0;
884
885 if (sample_type & PERF_SAMPLE_TID)
886 size += sizeof(data->tid_entry);
887
888 if (sample_type & PERF_SAMPLE_TIME)
889 size += sizeof(data->time);
890
891 if (sample_type & PERF_SAMPLE_ID)
892 size += sizeof(data->id);
893
894 if (sample_type & PERF_SAMPLE_STREAM_ID)
895 size += sizeof(data->stream_id);
896
897 if (sample_type & PERF_SAMPLE_CPU)
898 size += sizeof(data->cpu_entry);
899
900 event->id_header_size = size;
901}
902
310static void perf_group_attach(struct perf_event *event) 903static void perf_group_attach(struct perf_event *event)
311{ 904{
312 struct perf_event *group_leader = event->group_leader; 905 struct perf_event *group_leader = event->group_leader, *pos;
906
907 /*
908 * We can have double attach due to group movement in perf_event_open.
909 */
910 if (event->attach_state & PERF_ATTACH_GROUP)
911 return;
313 912
314 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
315 event->attach_state |= PERF_ATTACH_GROUP; 913 event->attach_state |= PERF_ATTACH_GROUP;
316 914
317 if (group_leader == event) 915 if (group_leader == event)
@@ -323,6 +921,11 @@ static void perf_group_attach(struct perf_event *event)
323 921
324 list_add_tail(&event->group_entry, &group_leader->sibling_list); 922 list_add_tail(&event->group_entry, &group_leader->sibling_list);
325 group_leader->nr_siblings++; 923 group_leader->nr_siblings++;
924
925 perf_event__header_size(group_leader);
926
927 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
928 perf_event__header_size(pos);
326} 929}
327 930
328/* 931/*
@@ -332,6 +935,7 @@ static void perf_group_attach(struct perf_event *event)
332static void 935static void
333list_del_event(struct perf_event *event, struct perf_event_context *ctx) 936list_del_event(struct perf_event *event, struct perf_event_context *ctx)
334{ 937{
938 struct perf_cpu_context *cpuctx;
335 /* 939 /*
336 * We can have double detach due to exit/hot-unplug + close. 940 * We can have double detach due to exit/hot-unplug + close.
337 */ 941 */
@@ -340,6 +944,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
340 944
341 event->attach_state &= ~PERF_ATTACH_CONTEXT; 945 event->attach_state &= ~PERF_ATTACH_CONTEXT;
342 946
947 if (is_cgroup_event(event)) {
948 ctx->nr_cgroups--;
949 cpuctx = __get_cpu_context(ctx);
950 /*
951 * if there are no more cgroup events
952 * then clear cgrp to avoid stale pointer
953 * in update_cgrp_time_from_cpuctx()
954 */
955 if (!ctx->nr_cgroups)
956 cpuctx->cgrp = NULL;
957 }
958
343 ctx->nr_events--; 959 ctx->nr_events--;
344 if (event->attr.inherit_stat) 960 if (event->attr.inherit_stat)
345 ctx->nr_stat--; 961 ctx->nr_stat--;
@@ -381,7 +997,7 @@ static void perf_group_detach(struct perf_event *event)
381 if (event->group_leader != event) { 997 if (event->group_leader != event) {
382 list_del_init(&event->group_entry); 998 list_del_init(&event->group_entry);
383 event->group_leader->nr_siblings--; 999 event->group_leader->nr_siblings--;
384 return; 1000 goto out;
385 } 1001 }
386 1002
387 if (!list_empty(&event->group_entry)) 1003 if (!list_empty(&event->group_entry))
@@ -400,12 +1016,19 @@ static void perf_group_detach(struct perf_event *event)
400 /* Inherit group flags from the previous leader */ 1016 /* Inherit group flags from the previous leader */
401 sibling->group_flags = event->group_flags; 1017 sibling->group_flags = event->group_flags;
402 } 1018 }
1019
1020out:
1021 perf_event__header_size(event->group_leader);
1022
1023 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1024 perf_event__header_size(tmp);
403} 1025}
404 1026
405static inline int 1027static inline int
406event_filter_match(struct perf_event *event) 1028event_filter_match(struct perf_event *event)
407{ 1029{
408 return event->cpu == -1 || event->cpu == smp_processor_id(); 1030 return (event->cpu == -1 || event->cpu == smp_processor_id())
1031 && perf_cgroup_match(event);
409} 1032}
410 1033
411static void 1034static void
@@ -413,6 +1036,7 @@ event_sched_out(struct perf_event *event,
413 struct perf_cpu_context *cpuctx, 1036 struct perf_cpu_context *cpuctx,
414 struct perf_event_context *ctx) 1037 struct perf_event_context *ctx)
415{ 1038{
1039 u64 tstamp = perf_event_time(event);
416 u64 delta; 1040 u64 delta;
417 /* 1041 /*
418 * An event which could not be activated because of 1042 * An event which could not be activated because of
@@ -422,9 +1046,9 @@ event_sched_out(struct perf_event *event,
422 */ 1046 */
423 if (event->state == PERF_EVENT_STATE_INACTIVE 1047 if (event->state == PERF_EVENT_STATE_INACTIVE
424 && !event_filter_match(event)) { 1048 && !event_filter_match(event)) {
425 delta = ctx->time - event->tstamp_stopped; 1049 delta = tstamp - event->tstamp_stopped;
426 event->tstamp_running += delta; 1050 event->tstamp_running += delta;
427 event->tstamp_stopped = ctx->time; 1051 event->tstamp_stopped = tstamp;
428 } 1052 }
429 1053
430 if (event->state != PERF_EVENT_STATE_ACTIVE) 1054 if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -435,8 +1059,8 @@ event_sched_out(struct perf_event *event,
435 event->pending_disable = 0; 1059 event->pending_disable = 0;
436 event->state = PERF_EVENT_STATE_OFF; 1060 event->state = PERF_EVENT_STATE_OFF;
437 } 1061 }
438 event->tstamp_stopped = ctx->time; 1062 event->tstamp_stopped = tstamp;
439 event->pmu->disable(event); 1063 event->pmu->del(event, 0);
440 event->oncpu = -1; 1064 event->oncpu = -1;
441 1065
442 if (!is_software_event(event)) 1066 if (!is_software_event(event))
@@ -472,51 +1096,24 @@ group_sched_out(struct perf_event *group_event,
472 * We disable the event on the hardware level first. After that we 1096 * We disable the event on the hardware level first. After that we
473 * remove it from the context list. 1097 * remove it from the context list.
474 */ 1098 */
475static void __perf_event_remove_from_context(void *info) 1099static int __perf_remove_from_context(void *info)
476{ 1100{
477 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
478 struct perf_event *event = info; 1101 struct perf_event *event = info;
479 struct perf_event_context *ctx = event->ctx; 1102 struct perf_event_context *ctx = event->ctx;
480 1103 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
481 /*
482 * If this is a task context, we need to check whether it is
483 * the current task context of this cpu. If not it has been
484 * scheduled out before the smp call arrived.
485 */
486 if (ctx->task && cpuctx->task_ctx != ctx)
487 return;
488 1104
489 raw_spin_lock(&ctx->lock); 1105 raw_spin_lock(&ctx->lock);
490 /*
491 * Protect the list operation against NMI by disabling the
492 * events on a global level.
493 */
494 perf_disable();
495
496 event_sched_out(event, cpuctx, ctx); 1106 event_sched_out(event, cpuctx, ctx);
497
498 list_del_event(event, ctx); 1107 list_del_event(event, ctx);
499
500 if (!ctx->task) {
501 /*
502 * Allow more per task events with respect to the
503 * reservation:
504 */
505 cpuctx->max_pertask =
506 min(perf_max_events - ctx->nr_events,
507 perf_max_events - perf_reserved_percpu);
508 }
509
510 perf_enable();
511 raw_spin_unlock(&ctx->lock); 1108 raw_spin_unlock(&ctx->lock);
1109
1110 return 0;
512} 1111}
513 1112
514 1113
515/* 1114/*
516 * Remove the event from a task's (or a CPU's) list of events. 1115 * Remove the event from a task's (or a CPU's) list of events.
517 * 1116 *
518 * Must be called with ctx->mutex held.
519 *
520 * CPU events are removed with a smp call. For task events we only 1117 * CPU events are removed with a smp call. For task events we only
521 * call when the task is on a CPU. 1118 * call when the task is on a CPU.
522 * 1119 *
@@ -527,60 +1124,62 @@ static void __perf_event_remove_from_context(void *info)
527 * When called from perf_event_exit_task, it's OK because the 1124 * When called from perf_event_exit_task, it's OK because the
528 * context has been detached from its task. 1125 * context has been detached from its task.
529 */ 1126 */
530static void perf_event_remove_from_context(struct perf_event *event) 1127static void perf_remove_from_context(struct perf_event *event)
531{ 1128{
532 struct perf_event_context *ctx = event->ctx; 1129 struct perf_event_context *ctx = event->ctx;
533 struct task_struct *task = ctx->task; 1130 struct task_struct *task = ctx->task;
534 1131
1132 lockdep_assert_held(&ctx->mutex);
1133
535 if (!task) { 1134 if (!task) {
536 /* 1135 /*
537 * Per cpu events are removed via an smp call and 1136 * Per cpu events are removed via an smp call and
538 * the removal is always successful. 1137 * the removal is always successful.
539 */ 1138 */
540 smp_call_function_single(event->cpu, 1139 cpu_function_call(event->cpu, __perf_remove_from_context, event);
541 __perf_event_remove_from_context,
542 event, 1);
543 return; 1140 return;
544 } 1141 }
545 1142
546retry: 1143retry:
547 task_oncpu_function_call(task, __perf_event_remove_from_context, 1144 if (!task_function_call(task, __perf_remove_from_context, event))
548 event); 1145 return;
549 1146
550 raw_spin_lock_irq(&ctx->lock); 1147 raw_spin_lock_irq(&ctx->lock);
551 /* 1148 /*
552 * If the context is active we need to retry the smp call. 1149 * If we failed to find a running task, but find the context active now
1150 * that we've acquired the ctx->lock, retry.
553 */ 1151 */
554 if (ctx->nr_active && !list_empty(&event->group_entry)) { 1152 if (ctx->is_active) {
555 raw_spin_unlock_irq(&ctx->lock); 1153 raw_spin_unlock_irq(&ctx->lock);
556 goto retry; 1154 goto retry;
557 } 1155 }
558 1156
559 /* 1157 /*
560 * The lock prevents that this context is scheduled in so we 1158 * Since the task isn't running, its safe to remove the event, us
561 * can remove the event safely, if the call above did not 1159 * holding the ctx->lock ensures the task won't get scheduled in.
562 * succeed.
563 */ 1160 */
564 if (!list_empty(&event->group_entry)) 1161 list_del_event(event, ctx);
565 list_del_event(event, ctx);
566 raw_spin_unlock_irq(&ctx->lock); 1162 raw_spin_unlock_irq(&ctx->lock);
567} 1163}
568 1164
569/* 1165/*
570 * Cross CPU call to disable a performance event 1166 * Cross CPU call to disable a performance event
571 */ 1167 */
572static void __perf_event_disable(void *info) 1168static int __perf_event_disable(void *info)
573{ 1169{
574 struct perf_event *event = info; 1170 struct perf_event *event = info;
575 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
576 struct perf_event_context *ctx = event->ctx; 1171 struct perf_event_context *ctx = event->ctx;
1172 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
577 1173
578 /* 1174 /*
579 * If this is a per-task event, need to check whether this 1175 * If this is a per-task event, need to check whether this
580 * event's task is the current task on this cpu. 1176 * event's task is the current task on this cpu.
1177 *
1178 * Can trigger due to concurrent perf_event_context_sched_out()
1179 * flipping contexts around.
581 */ 1180 */
582 if (ctx->task && cpuctx->task_ctx != ctx) 1181 if (ctx->task && cpuctx->task_ctx != ctx)
583 return; 1182 return -EINVAL;
584 1183
585 raw_spin_lock(&ctx->lock); 1184 raw_spin_lock(&ctx->lock);
586 1185
@@ -590,6 +1189,7 @@ static void __perf_event_disable(void *info)
590 */ 1189 */
591 if (event->state >= PERF_EVENT_STATE_INACTIVE) { 1190 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
592 update_context_time(ctx); 1191 update_context_time(ctx);
1192 update_cgrp_time_from_event(event);
593 update_group_times(event); 1193 update_group_times(event);
594 if (event == event->group_leader) 1194 if (event == event->group_leader)
595 group_sched_out(event, cpuctx, ctx); 1195 group_sched_out(event, cpuctx, ctx);
@@ -599,6 +1199,8 @@ static void __perf_event_disable(void *info)
599 } 1199 }
600 1200
601 raw_spin_unlock(&ctx->lock); 1201 raw_spin_unlock(&ctx->lock);
1202
1203 return 0;
602} 1204}
603 1205
604/* 1206/*
@@ -623,13 +1225,13 @@ void perf_event_disable(struct perf_event *event)
623 /* 1225 /*
624 * Disable the event on the cpu that it's on 1226 * Disable the event on the cpu that it's on
625 */ 1227 */
626 smp_call_function_single(event->cpu, __perf_event_disable, 1228 cpu_function_call(event->cpu, __perf_event_disable, event);
627 event, 1);
628 return; 1229 return;
629 } 1230 }
630 1231
631 retry: 1232retry:
632 task_oncpu_function_call(task, __perf_event_disable, event); 1233 if (!task_function_call(task, __perf_event_disable, event))
1234 return;
633 1235
634 raw_spin_lock_irq(&ctx->lock); 1236 raw_spin_lock_irq(&ctx->lock);
635 /* 1237 /*
@@ -637,6 +1239,11 @@ void perf_event_disable(struct perf_event *event)
637 */ 1239 */
638 if (event->state == PERF_EVENT_STATE_ACTIVE) { 1240 if (event->state == PERF_EVENT_STATE_ACTIVE) {
639 raw_spin_unlock_irq(&ctx->lock); 1241 raw_spin_unlock_irq(&ctx->lock);
1242 /*
1243 * Reload the task pointer, it might have been changed by
1244 * a concurrent perf_event_context_sched_out().
1245 */
1246 task = ctx->task;
640 goto retry; 1247 goto retry;
641 } 1248 }
642 1249
@@ -648,32 +1255,85 @@ void perf_event_disable(struct perf_event *event)
648 update_group_times(event); 1255 update_group_times(event);
649 event->state = PERF_EVENT_STATE_OFF; 1256 event->state = PERF_EVENT_STATE_OFF;
650 } 1257 }
651
652 raw_spin_unlock_irq(&ctx->lock); 1258 raw_spin_unlock_irq(&ctx->lock);
653} 1259}
654 1260
1261static void perf_set_shadow_time(struct perf_event *event,
1262 struct perf_event_context *ctx,
1263 u64 tstamp)
1264{
1265 /*
1266 * use the correct time source for the time snapshot
1267 *
1268 * We could get by without this by leveraging the
1269 * fact that to get to this function, the caller
1270 * has most likely already called update_context_time()
1271 * and update_cgrp_time_xx() and thus both timestamp
1272 * are identical (or very close). Given that tstamp is
1273 * already adjusted for cgroup, we could say that:
1274 * tstamp - ctx->timestamp
1275 * is equivalent to
1276 * tstamp - cgrp->timestamp.
1277 *
1278 * Then, in perf_output_read(), the calculation would
1279 * work with no changes because:
1280 * - event is guaranteed scheduled in
1281 * - no scheduled out in between
1282 * - thus the timestamp would be the same
1283 *
1284 * But this is a bit hairy.
1285 *
1286 * So instead, we have an explicit cgroup call to remain
1287 * within the time source all along. We believe it
1288 * is cleaner and simpler to understand.
1289 */
1290 if (is_cgroup_event(event))
1291 perf_cgroup_set_shadow_time(event, tstamp);
1292 else
1293 event->shadow_ctx_time = tstamp - ctx->timestamp;
1294}
1295
1296#define MAX_INTERRUPTS (~0ULL)
1297
1298static void perf_log_throttle(struct perf_event *event, int enable);
1299
655static int 1300static int
656event_sched_in(struct perf_event *event, 1301event_sched_in(struct perf_event *event,
657 struct perf_cpu_context *cpuctx, 1302 struct perf_cpu_context *cpuctx,
658 struct perf_event_context *ctx) 1303 struct perf_event_context *ctx)
659{ 1304{
1305 u64 tstamp = perf_event_time(event);
1306
660 if (event->state <= PERF_EVENT_STATE_OFF) 1307 if (event->state <= PERF_EVENT_STATE_OFF)
661 return 0; 1308 return 0;
662 1309
663 event->state = PERF_EVENT_STATE_ACTIVE; 1310 event->state = PERF_EVENT_STATE_ACTIVE;
664 event->oncpu = smp_processor_id(); 1311 event->oncpu = smp_processor_id();
1312
1313 /*
1314 * Unthrottle events, since we scheduled we might have missed several
1315 * ticks already, also for a heavily scheduling task there is little
1316 * guarantee it'll get a tick in a timely manner.
1317 */
1318 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1319 perf_log_throttle(event, 1);
1320 event->hw.interrupts = 0;
1321 }
1322
665 /* 1323 /*
666 * The new state must be visible before we turn it on in the hardware: 1324 * The new state must be visible before we turn it on in the hardware:
667 */ 1325 */
668 smp_wmb(); 1326 smp_wmb();
669 1327
670 if (event->pmu->enable(event)) { 1328 if (event->pmu->add(event, PERF_EF_START)) {
671 event->state = PERF_EVENT_STATE_INACTIVE; 1329 event->state = PERF_EVENT_STATE_INACTIVE;
672 event->oncpu = -1; 1330 event->oncpu = -1;
673 return -EAGAIN; 1331 return -EAGAIN;
674 } 1332 }
675 1333
676 event->tstamp_running += ctx->time - event->tstamp_stopped; 1334 event->tstamp_running += tstamp - event->tstamp_stopped;
1335
1336 perf_set_shadow_time(event, ctx, tstamp);
677 1337
678 if (!is_software_event(event)) 1338 if (!is_software_event(event))
679 cpuctx->active_oncpu++; 1339 cpuctx->active_oncpu++;
@@ -691,22 +1351,17 @@ group_sched_in(struct perf_event *group_event,
691 struct perf_event_context *ctx) 1351 struct perf_event_context *ctx)
692{ 1352{
693 struct perf_event *event, *partial_group = NULL; 1353 struct perf_event *event, *partial_group = NULL;
694 const struct pmu *pmu = group_event->pmu; 1354 struct pmu *pmu = group_event->pmu;
695 bool txn = false; 1355 u64 now = ctx->time;
1356 bool simulate = false;
696 1357
697 if (group_event->state == PERF_EVENT_STATE_OFF) 1358 if (group_event->state == PERF_EVENT_STATE_OFF)
698 return 0; 1359 return 0;
699 1360
700 /* Check if group transaction availabe */ 1361 pmu->start_txn(pmu);
701 if (pmu->start_txn)
702 txn = true;
703
704 if (txn)
705 pmu->start_txn(pmu);
706 1362
707 if (event_sched_in(group_event, cpuctx, ctx)) { 1363 if (event_sched_in(group_event, cpuctx, ctx)) {
708 if (txn) 1364 pmu->cancel_txn(pmu);
709 pmu->cancel_txn(pmu);
710 return -EAGAIN; 1365 return -EAGAIN;
711 } 1366 }
712 1367
@@ -720,23 +1375,38 @@ group_sched_in(struct perf_event *group_event,
720 } 1375 }
721 } 1376 }
722 1377
723 if (!txn || !pmu->commit_txn(pmu)) 1378 if (!pmu->commit_txn(pmu))
724 return 0; 1379 return 0;
725 1380
726group_error: 1381group_error:
727 /* 1382 /*
728 * Groups can be scheduled in as one unit only, so undo any 1383 * Groups can be scheduled in as one unit only, so undo any
729 * partial group before returning: 1384 * partial group before returning:
1385 * The events up to the failed event are scheduled out normally,
1386 * tstamp_stopped will be updated.
1387 *
1388 * The failed events and the remaining siblings need to have
1389 * their timings updated as if they had gone through event_sched_in()
1390 * and event_sched_out(). This is required to get consistent timings
1391 * across the group. This also takes care of the case where the group
1392 * could never be scheduled by ensuring tstamp_stopped is set to mark
1393 * the time the event was actually stopped, such that time delta
1394 * calculation in update_event_times() is correct.
730 */ 1395 */
731 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 1396 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
732 if (event == partial_group) 1397 if (event == partial_group)
733 break; 1398 simulate = true;
734 event_sched_out(event, cpuctx, ctx); 1399
1400 if (simulate) {
1401 event->tstamp_running += now - event->tstamp_stopped;
1402 event->tstamp_stopped = now;
1403 } else {
1404 event_sched_out(event, cpuctx, ctx);
1405 }
735 } 1406 }
736 event_sched_out(group_event, cpuctx, ctx); 1407 event_sched_out(group_event, cpuctx, ctx);
737 1408
738 if (txn) 1409 pmu->cancel_txn(pmu);
739 pmu->cancel_txn(pmu);
740 1410
741 return -EAGAIN; 1411 return -EAGAIN;
742} 1412}
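With this change group_sched_in() always drives the PMU through the start_txn()/commit_txn()/cancel_txn() trio instead of first checking whether the callbacks exist. The protocol is: open a transaction, add the leader and every sibling, then either commit the whole set or cancel and back out the partial group. A toy userspace model of that all-or-nothing behaviour (toy_pmu and its two-counter limit are invented for the example, not part of the perf API):

#include <stdbool.h>
#include <stdio.h>

#define MAX_COUNTERS 2           /* assumed tiny "PMU" with two counters */

struct toy_pmu {
    int used;                    /* counters handed out so far */
    int checkpoint;              /* state saved at start_txn() */
};

static void start_txn(struct toy_pmu *pmu)  { pmu->checkpoint = pmu->used; }
static void cancel_txn(struct toy_pmu *pmu) { pmu->used = pmu->checkpoint; }

static bool add_event(struct toy_pmu *pmu)
{
    if (pmu->used >= MAX_COUNTERS)
        return false;            /* no hardware counter left */
    pmu->used++;
    return true;
}

/* commit_txn(): a real PMU would validate the whole set at once here. */
static bool commit_txn(struct toy_pmu *pmu) { (void)pmu; return true; }

/* Schedule a leader plus @nr_siblings as one unit, or not at all. */
static bool toy_group_sched_in(struct toy_pmu *pmu, int nr_siblings)
{
    start_txn(pmu);

    for (int i = 0; i < 1 + nr_siblings; i++) {
        if (!add_event(pmu)) {
            cancel_txn(pmu);     /* undo the partial group */
            return false;
        }
    }

    if (commit_txn(pmu))
        return true;

    cancel_txn(pmu);
    return false;
}

int main(void)
{
    struct toy_pmu pmu = { 0, 0 };

    printf("group of 2: %s\n", toy_group_sched_in(&pmu, 1) ? "ok" : "rejected");
    printf("group of 3: %s\n", toy_group_sched_in(&pmu, 2) ? "ok" : "rejected");
    printf("counters in use: %d\n", pmu.used);
    return 0;
}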
@@ -775,52 +1445,52 @@ static int group_can_go_on(struct perf_event *event,
775static void add_event_to_ctx(struct perf_event *event, 1445static void add_event_to_ctx(struct perf_event *event,
776 struct perf_event_context *ctx) 1446 struct perf_event_context *ctx)
777{ 1447{
1448 u64 tstamp = perf_event_time(event);
1449
778 list_add_event(event, ctx); 1450 list_add_event(event, ctx);
779 perf_group_attach(event); 1451 perf_group_attach(event);
780 event->tstamp_enabled = ctx->time; 1452 event->tstamp_enabled = tstamp;
781 event->tstamp_running = ctx->time; 1453 event->tstamp_running = tstamp;
782 event->tstamp_stopped = ctx->time; 1454 event->tstamp_stopped = tstamp;
783} 1455}
784 1456
1457static void perf_event_context_sched_in(struct perf_event_context *ctx,
1458 struct task_struct *tsk);
1459
785/* 1460/*
786 * Cross CPU call to install and enable a performance event 1461 * Cross CPU call to install and enable a performance event
787 * 1462 *
788 * Must be called with ctx->mutex held 1463 * Must be called with ctx->mutex held
789 */ 1464 */
790static void __perf_install_in_context(void *info) 1465static int __perf_install_in_context(void *info)
791{ 1466{
792 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
793 struct perf_event *event = info; 1467 struct perf_event *event = info;
794 struct perf_event_context *ctx = event->ctx; 1468 struct perf_event_context *ctx = event->ctx;
795 struct perf_event *leader = event->group_leader; 1469 struct perf_event *leader = event->group_leader;
1470 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
796 int err; 1471 int err;
797 1472
798 /* 1473 /*
799 * If this is a task context, we need to check whether it is 1474 * In case we're installing a new context to an already running task,
800 * the current task context of this cpu. If not it has been 1475 * could also happen before perf_event_task_sched_in() on architectures
801 * scheduled out before the smp call arrived. 1476 * which do context switches with IRQs enabled.
802 * Or possibly this is the right context but it isn't
803 * on this cpu because it had no events.
804 */ 1477 */
805 if (ctx->task && cpuctx->task_ctx != ctx) { 1478 if (ctx->task && !cpuctx->task_ctx)
806 if (cpuctx->task_ctx || ctx->task != current) 1479 perf_event_context_sched_in(ctx, ctx->task);
807 return;
808 cpuctx->task_ctx = ctx;
809 }
810 1480
811 raw_spin_lock(&ctx->lock); 1481 raw_spin_lock(&ctx->lock);
812 ctx->is_active = 1; 1482 ctx->is_active = 1;
813 update_context_time(ctx); 1483 update_context_time(ctx);
814
815 /* 1484 /*
816 * Protect the list operation against NMI by disabling the 1485 * update cgrp time only if current cgrp
817 * events on a global level. NOP for non NMI based events. 1486 * matches event->cgrp. Must be done before
1487 * calling add_event_to_ctx()
818 */ 1488 */
819 perf_disable(); 1489 update_cgrp_time_from_event(event);
820 1490
821 add_event_to_ctx(event, ctx); 1491 add_event_to_ctx(event, ctx);
822 1492
823 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1493 if (!event_filter_match(event))
824 goto unlock; 1494 goto unlock;
825 1495
826 /* 1496 /*
@@ -855,13 +1525,10 @@ static void __perf_install_in_context(void *info)
855 } 1525 }
856 } 1526 }
857 1527
858 if (!err && !ctx->task && cpuctx->max_pertask) 1528unlock:
859 cpuctx->max_pertask--;
860
861 unlock:
862 perf_enable();
863
864 raw_spin_unlock(&ctx->lock); 1529 raw_spin_unlock(&ctx->lock);
1530
1531 return 0;
865} 1532}
866 1533
867/* 1534/*
@@ -873,8 +1540,6 @@ static void __perf_install_in_context(void *info)
873 * If the event is attached to a task which is on a CPU we use a smp 1540 * If the event is attached to a task which is on a CPU we use a smp
874 * call to enable it in the task context. The task might have been 1541 * call to enable it in the task context. The task might have been
875 * scheduled away, but we check this in the smp call again. 1542 * scheduled away, but we check this in the smp call again.
876 *
877 * Must be called with ctx->mutex held.
878 */ 1543 */
879static void 1544static void
880perf_install_in_context(struct perf_event_context *ctx, 1545perf_install_in_context(struct perf_event_context *ctx,
@@ -883,36 +1548,38 @@ perf_install_in_context(struct perf_event_context *ctx,
883{ 1548{
884 struct task_struct *task = ctx->task; 1549 struct task_struct *task = ctx->task;
885 1550
1551 lockdep_assert_held(&ctx->mutex);
1552
1553 event->ctx = ctx;
1554
886 if (!task) { 1555 if (!task) {
887 /* 1556 /*
888 * Per cpu events are installed via an smp call and 1557 * Per cpu events are installed via an smp call and
889 * the install is always successful. 1558 * the install is always successful.
890 */ 1559 */
891 smp_call_function_single(cpu, __perf_install_in_context, 1560 cpu_function_call(cpu, __perf_install_in_context, event);
892 event, 1);
893 return; 1561 return;
894 } 1562 }
895 1563
896retry: 1564retry:
897 task_oncpu_function_call(task, __perf_install_in_context, 1565 if (!task_function_call(task, __perf_install_in_context, event))
898 event); 1566 return;
899 1567
900 raw_spin_lock_irq(&ctx->lock); 1568 raw_spin_lock_irq(&ctx->lock);
901 /* 1569 /*
902 * we need to retry the smp call. 1570 * If we failed to find a running task, but find the context active now
1571 * that we've acquired the ctx->lock, retry.
903 */ 1572 */
904 if (ctx->is_active && list_empty(&event->group_entry)) { 1573 if (ctx->is_active) {
905 raw_spin_unlock_irq(&ctx->lock); 1574 raw_spin_unlock_irq(&ctx->lock);
906 goto retry; 1575 goto retry;
907 } 1576 }
908 1577
909 /* 1578 /*
910 * The lock prevents that this context is scheduled in so we 1579 * Since the task isn't running, it's safe to add the event, us holding
911 * can add the event safely, if it the call above did not 1580 * the ctx->lock ensures the task won't get scheduled in.
912 * succeed.
913 */ 1581 */
914 if (list_empty(&event->group_entry)) 1582 add_event_to_ctx(event, ctx);
915 add_event_to_ctx(event, ctx);
916 raw_spin_unlock_irq(&ctx->lock); 1583 raw_spin_unlock_irq(&ctx->lock);
917} 1584}
918 1585
@@ -928,46 +1595,48 @@ static void __perf_event_mark_enabled(struct perf_event *event,
928 struct perf_event_context *ctx) 1595 struct perf_event_context *ctx)
929{ 1596{
930 struct perf_event *sub; 1597 struct perf_event *sub;
1598 u64 tstamp = perf_event_time(event);
931 1599
932 event->state = PERF_EVENT_STATE_INACTIVE; 1600 event->state = PERF_EVENT_STATE_INACTIVE;
933 event->tstamp_enabled = ctx->time - event->total_time_enabled; 1601 event->tstamp_enabled = tstamp - event->total_time_enabled;
934 list_for_each_entry(sub, &event->sibling_list, group_entry) 1602 list_for_each_entry(sub, &event->sibling_list, group_entry) {
935 if (sub->state >= PERF_EVENT_STATE_INACTIVE) 1603 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
936 sub->tstamp_enabled = 1604 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
937 ctx->time - sub->total_time_enabled; 1605 }
938} 1606}
939 1607
940/* 1608/*
941 * Cross CPU call to enable a performance event 1609 * Cross CPU call to enable a performance event
942 */ 1610 */
943static void __perf_event_enable(void *info) 1611static int __perf_event_enable(void *info)
944{ 1612{
945 struct perf_event *event = info; 1613 struct perf_event *event = info;
946 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
947 struct perf_event_context *ctx = event->ctx; 1614 struct perf_event_context *ctx = event->ctx;
948 struct perf_event *leader = event->group_leader; 1615 struct perf_event *leader = event->group_leader;
1616 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
949 int err; 1617 int err;
950 1618
951 /* 1619 if (WARN_ON_ONCE(!ctx->is_active))
952 * If this is a per-task event, need to check whether this 1620 return -EINVAL;
953 * event's task is the current task on this cpu.
954 */
955 if (ctx->task && cpuctx->task_ctx != ctx) {
956 if (cpuctx->task_ctx || ctx->task != current)
957 return;
958 cpuctx->task_ctx = ctx;
959 }
960 1621
961 raw_spin_lock(&ctx->lock); 1622 raw_spin_lock(&ctx->lock);
962 ctx->is_active = 1;
963 update_context_time(ctx); 1623 update_context_time(ctx);
964 1624
965 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1625 if (event->state >= PERF_EVENT_STATE_INACTIVE)
966 goto unlock; 1626 goto unlock;
1627
1628 /*
1629 * set current task's cgroup time reference point
1630 */
1631 perf_cgroup_set_timestamp(current, ctx);
1632
967 __perf_event_mark_enabled(event, ctx); 1633 __perf_event_mark_enabled(event, ctx);
968 1634
969 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1635 if (!event_filter_match(event)) {
1636 if (is_cgroup_event(event))
1637 perf_cgroup_defer_enabled(event);
970 goto unlock; 1638 goto unlock;
1639 }
971 1640
972 /* 1641 /*
973 * If the event is in a group and isn't the group leader, 1642 * If the event is in a group and isn't the group leader,
@@ -979,12 +1648,10 @@ static void __perf_event_enable(void *info)
979 if (!group_can_go_on(event, cpuctx, 1)) { 1648 if (!group_can_go_on(event, cpuctx, 1)) {
980 err = -EEXIST; 1649 err = -EEXIST;
981 } else { 1650 } else {
982 perf_disable();
983 if (event == leader) 1651 if (event == leader)
984 err = group_sched_in(event, cpuctx, ctx); 1652 err = group_sched_in(event, cpuctx, ctx);
985 else 1653 else
986 err = event_sched_in(event, cpuctx, ctx); 1654 err = event_sched_in(event, cpuctx, ctx);
987 perf_enable();
988 } 1655 }
989 1656
990 if (err) { 1657 if (err) {
@@ -1000,8 +1667,10 @@ static void __perf_event_enable(void *info)
1000 } 1667 }
1001 } 1668 }
1002 1669
1003 unlock: 1670unlock:
1004 raw_spin_unlock(&ctx->lock); 1671 raw_spin_unlock(&ctx->lock);
1672
1673 return 0;
1005} 1674}
1006 1675
1007/* 1676/*
@@ -1022,8 +1691,7 @@ void perf_event_enable(struct perf_event *event)
1022 /* 1691 /*
1023 * Enable the event on the cpu that it's on 1692 * Enable the event on the cpu that it's on
1024 */ 1693 */
1025 smp_call_function_single(event->cpu, __perf_event_enable, 1694 cpu_function_call(event->cpu, __perf_event_enable, event);
1026 event, 1);
1027 return; 1695 return;
1028 } 1696 }
1029 1697
@@ -1041,9 +1709,16 @@ void perf_event_enable(struct perf_event *event)
1041 if (event->state == PERF_EVENT_STATE_ERROR) 1709 if (event->state == PERF_EVENT_STATE_ERROR)
1042 event->state = PERF_EVENT_STATE_OFF; 1710 event->state = PERF_EVENT_STATE_OFF;
1043 1711
1044 retry: 1712retry:
1713 if (!ctx->is_active) {
1714 __perf_event_mark_enabled(event, ctx);
1715 goto out;
1716 }
1717
1045 raw_spin_unlock_irq(&ctx->lock); 1718 raw_spin_unlock_irq(&ctx->lock);
1046 task_oncpu_function_call(task, __perf_event_enable, event); 1719
1720 if (!task_function_call(task, __perf_event_enable, event))
1721 return;
1047 1722
1048 raw_spin_lock_irq(&ctx->lock); 1723 raw_spin_lock_irq(&ctx->lock);
1049 1724
@@ -1051,17 +1726,16 @@ void perf_event_enable(struct perf_event *event)
1051 * If the context is active and the event is still off, 1726 * If the context is active and the event is still off,
1052 * we need to retry the cross-call. 1727 * we need to retry the cross-call.
1053 */ 1728 */
1054 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) 1729 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1730 /*
1731 * task could have been flipped by a concurrent
1732 * perf_event_context_sched_out()
1733 */
1734 task = ctx->task;
1055 goto retry; 1735 goto retry;
1736 }
1056 1737
1057 /* 1738out:
1058 * Since we have the lock this context can't be scheduled
1059 * in, so we can change the state safely.
1060 */
1061 if (event->state == PERF_EVENT_STATE_OFF)
1062 __perf_event_mark_enabled(event, ctx);
1063
1064 out:
1065 raw_spin_unlock_irq(&ctx->lock); 1739 raw_spin_unlock_irq(&ctx->lock);
1066} 1740}
1067 1741
@@ -1070,7 +1744,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1070 /* 1744 /*
1071 * not supported on inherited events 1745 * not supported on inherited events
1072 */ 1746 */
1073 if (event->attr.inherit) 1747 if (event->attr.inherit || !is_sampling_event(event))
1074 return -EINVAL; 1748 return -EINVAL;
1075 1749
1076 atomic_add(refresh, &event->event_limit); 1750 atomic_add(refresh, &event->event_limit);
@@ -1079,12 +1753,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1079 return 0; 1753 return 0;
1080} 1754}
1081 1755
1082enum event_type_t {
1083 EVENT_FLEXIBLE = 0x1,
1084 EVENT_PINNED = 0x2,
1085 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1086};
1087
1088static void ctx_sched_out(struct perf_event_context *ctx, 1756static void ctx_sched_out(struct perf_event_context *ctx,
1089 struct perf_cpu_context *cpuctx, 1757 struct perf_cpu_context *cpuctx,
1090 enum event_type_t event_type) 1758 enum event_type_t event_type)
@@ -1092,26 +1760,27 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1092 struct perf_event *event; 1760 struct perf_event *event;
1093 1761
1094 raw_spin_lock(&ctx->lock); 1762 raw_spin_lock(&ctx->lock);
1763 perf_pmu_disable(ctx->pmu);
1095 ctx->is_active = 0; 1764 ctx->is_active = 0;
1096 if (likely(!ctx->nr_events)) 1765 if (likely(!ctx->nr_events))
1097 goto out; 1766 goto out;
1098 update_context_time(ctx); 1767 update_context_time(ctx);
1768 update_cgrp_time_from_cpuctx(cpuctx);
1099 1769
1100 perf_disable();
1101 if (!ctx->nr_active) 1770 if (!ctx->nr_active)
1102 goto out_enable; 1771 goto out;
1103 1772
1104 if (event_type & EVENT_PINNED) 1773 if (event_type & EVENT_PINNED) {
1105 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 1774 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1106 group_sched_out(event, cpuctx, ctx); 1775 group_sched_out(event, cpuctx, ctx);
1776 }
1107 1777
1108 if (event_type & EVENT_FLEXIBLE) 1778 if (event_type & EVENT_FLEXIBLE) {
1109 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 1779 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1110 group_sched_out(event, cpuctx, ctx); 1780 group_sched_out(event, cpuctx, ctx);
1111 1781 }
1112 out_enable: 1782out:
1113 perf_enable(); 1783 perf_pmu_enable(ctx->pmu);
1114 out:
1115 raw_spin_unlock(&ctx->lock); 1784 raw_spin_unlock(&ctx->lock);
1116} 1785}
1117 1786
@@ -1209,34 +1878,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1209 } 1878 }
1210} 1879}
1211 1880
1212/* 1881static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1213 * Called from scheduler to remove the events of the current task, 1882 struct task_struct *next)
1214 * with interrupts disabled.
1215 *
1216 * We stop each event and update the event value in event->count.
1217 *
1218 * This does not protect us against NMI, but disable()
1219 * sets the disabled bit in the control field of event _before_
1220 * accessing the event control register. If a NMI hits, then it will
1221 * not restart the event.
1222 */
1223void perf_event_task_sched_out(struct task_struct *task,
1224 struct task_struct *next)
1225{ 1883{
1226 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1884 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1227 struct perf_event_context *ctx = task->perf_event_ctxp;
1228 struct perf_event_context *next_ctx; 1885 struct perf_event_context *next_ctx;
1229 struct perf_event_context *parent; 1886 struct perf_event_context *parent;
1887 struct perf_cpu_context *cpuctx;
1230 int do_switch = 1; 1888 int do_switch = 1;
1231 1889
1232 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); 1890 if (likely(!ctx))
1891 return;
1233 1892
1234 if (likely(!ctx || !cpuctx->task_ctx)) 1893 cpuctx = __get_cpu_context(ctx);
1894 if (!cpuctx->task_ctx)
1235 return; 1895 return;
1236 1896
1237 rcu_read_lock(); 1897 rcu_read_lock();
1238 parent = rcu_dereference(ctx->parent_ctx); 1898 parent = rcu_dereference(ctx->parent_ctx);
1239 next_ctx = next->perf_event_ctxp; 1899 next_ctx = next->perf_event_ctxp[ctxn];
1240 if (parent && next_ctx && 1900 if (parent && next_ctx &&
1241 rcu_dereference(next_ctx->parent_ctx) == parent) { 1901 rcu_dereference(next_ctx->parent_ctx) == parent) {
1242 /* 1902 /*
@@ -1255,8 +1915,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1255 * XXX do we need a memory barrier of sorts 1915 * XXX do we need a memory barrier of sorts
1256 * wrt to rcu_dereference() of perf_event_ctxp 1916 * wrt to rcu_dereference() of perf_event_ctxp
1257 */ 1917 */
1258 task->perf_event_ctxp = next_ctx; 1918 task->perf_event_ctxp[ctxn] = next_ctx;
1259 next->perf_event_ctxp = ctx; 1919 next->perf_event_ctxp[ctxn] = ctx;
1260 ctx->task = next; 1920 ctx->task = next;
1261 next_ctx->task = task; 1921 next_ctx->task = task;
1262 do_switch = 0; 1922 do_switch = 0;
@@ -1274,10 +1934,41 @@ void perf_event_task_sched_out(struct task_struct *task,
1274 } 1934 }
1275} 1935}
1276 1936
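perf_event_context_sched_out() keeps the existing optimization: when the outgoing and incoming tasks carry clones of the same inherited context, the two context pointers are simply swapped instead of scheduling every event out and back in. A stripped-down sketch of that check and swap (toy_ctx and toy_task are illustrative stand-ins; the kernel additionally holds both ctx->lock instances and goes through rcu_dereference() for parent_ctx):

#include <stdio.h>

struct toy_ctx {
    const char *owner;
    struct toy_ctx *parent_ctx;  /* clones of one parent are interchangeable */
};

struct toy_task {
    const char *comm;
    struct toy_ctx *ctx;
};

/* If prev and next carry clones of the same inherited context, swap the
 * pointers instead of scheduling every event out and back in. */
static int try_flip_contexts(struct toy_task *prev, struct toy_task *next)
{
    struct toy_ctx *a = prev->ctx, *b = next->ctx;

    if (!a || !b || !a->parent_ctx || a->parent_ctx != b->parent_ctx)
        return 0;                /* caller falls back to the slow path */

    prev->ctx = b;
    next->ctx = a;
    return 1;
}

int main(void)
{
    struct toy_ctx parent = { "parent", NULL };
    struct toy_ctx c1 = { "child-1", &parent };
    struct toy_ctx c2 = { "child-2", &parent };
    struct toy_task prev = { "prev", &c1 };
    struct toy_task next = { "next", &c2 };

    printf("flipped: %d\n", try_flip_contexts(&prev, &next));
    printf("prev now carries %s, next carries %s\n",
           prev.ctx->owner, next.ctx->owner);
    return 0;
}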
1937#define for_each_task_context_nr(ctxn) \
1938 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
1939
1940/*
1941 * Called from scheduler to remove the events of the current task,
1942 * with interrupts disabled.
1943 *
1944 * We stop each event and update the event value in event->count.
1945 *
1946 * This does not protect us against NMI, but disable()
1947 * sets the disabled bit in the control field of event _before_
1948 * accessing the event control register. If a NMI hits, then it will
1949 * not restart the event.
1950 */
1951void __perf_event_task_sched_out(struct task_struct *task,
1952 struct task_struct *next)
1953{
1954 int ctxn;
1955
1956 for_each_task_context_nr(ctxn)
1957 perf_event_context_sched_out(task, ctxn, next);
1958
1959 /*
1960 * if cgroup events exist on this CPU, then we need
1961 * to check if we have to switch out PMU state.
1962 * cgroup events are system-wide mode only
1963 */
1964 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1965 perf_cgroup_sched_out(task);
1966}
1967
1277static void task_ctx_sched_out(struct perf_event_context *ctx, 1968static void task_ctx_sched_out(struct perf_event_context *ctx,
1278 enum event_type_t event_type) 1969 enum event_type_t event_type)
1279{ 1970{
1280 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1971 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1281 1972
1282 if (!cpuctx->task_ctx) 1973 if (!cpuctx->task_ctx)
1283 return; 1974 return;
@@ -1292,14 +1983,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
1292/* 1983/*
1293 * Called with IRQs disabled 1984 * Called with IRQs disabled
1294 */ 1985 */
1295static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1296{
1297 task_ctx_sched_out(ctx, EVENT_ALL);
1298}
1299
1300/*
1301 * Called with IRQs disabled
1302 */
1303static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 1986static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1304 enum event_type_t event_type) 1987 enum event_type_t event_type)
1305{ 1988{
@@ -1315,9 +1998,13 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1315 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 1998 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1316 if (event->state <= PERF_EVENT_STATE_OFF) 1999 if (event->state <= PERF_EVENT_STATE_OFF)
1317 continue; 2000 continue;
1318 if (event->cpu != -1 && event->cpu != smp_processor_id()) 2001 if (!event_filter_match(event))
1319 continue; 2002 continue;
1320 2003
2004 /* may need to reset tstamp_enabled */
2005 if (is_cgroup_event(event))
2006 perf_cgroup_mark_enabled(event, ctx);
2007
1321 if (group_can_go_on(event, cpuctx, 1)) 2008 if (group_can_go_on(event, cpuctx, 1))
1322 group_sched_in(event, cpuctx, ctx); 2009 group_sched_in(event, cpuctx, ctx);
1323 2010
@@ -1347,29 +2034,36 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1347 * Listen to the 'cpu' scheduling filter constraint 2034 * Listen to the 'cpu' scheduling filter constraint
1348 * of events: 2035 * of events:
1349 */ 2036 */
1350 if (event->cpu != -1 && event->cpu != smp_processor_id()) 2037 if (!event_filter_match(event))
1351 continue; 2038 continue;
1352 2039
1353 if (group_can_go_on(event, cpuctx, can_add_hw)) 2040 /* may need to reset tstamp_enabled */
2041 if (is_cgroup_event(event))
2042 perf_cgroup_mark_enabled(event, ctx);
2043
2044 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1354 if (group_sched_in(event, cpuctx, ctx)) 2045 if (group_sched_in(event, cpuctx, ctx))
1355 can_add_hw = 0; 2046 can_add_hw = 0;
2047 }
1356 } 2048 }
1357} 2049}
1358 2050
1359static void 2051static void
1360ctx_sched_in(struct perf_event_context *ctx, 2052ctx_sched_in(struct perf_event_context *ctx,
1361 struct perf_cpu_context *cpuctx, 2053 struct perf_cpu_context *cpuctx,
1362 enum event_type_t event_type) 2054 enum event_type_t event_type,
2055 struct task_struct *task)
1363{ 2056{
2057 u64 now;
2058
1364 raw_spin_lock(&ctx->lock); 2059 raw_spin_lock(&ctx->lock);
1365 ctx->is_active = 1; 2060 ctx->is_active = 1;
1366 if (likely(!ctx->nr_events)) 2061 if (likely(!ctx->nr_events))
1367 goto out; 2062 goto out;
1368 2063
1369 ctx->timestamp = perf_clock(); 2064 now = perf_clock();
1370 2065 ctx->timestamp = now;
1371 perf_disable(); 2066 perf_cgroup_set_timestamp(task, ctx);
1372
1373 /* 2067 /*
1374 * First go through the list and put on any pinned groups 2068 * First go through the list and put on any pinned groups
1375 * in order to give them the best chance of going on. 2069 * in order to give them the best chance of going on.
@@ -1381,56 +2075,42 @@ ctx_sched_in(struct perf_event_context *ctx,
1381 if (event_type & EVENT_FLEXIBLE) 2075 if (event_type & EVENT_FLEXIBLE)
1382 ctx_flexible_sched_in(ctx, cpuctx); 2076 ctx_flexible_sched_in(ctx, cpuctx);
1383 2077
1384 perf_enable(); 2078out:
1385 out:
1386 raw_spin_unlock(&ctx->lock); 2079 raw_spin_unlock(&ctx->lock);
1387} 2080}
1388 2081
1389static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2082static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1390 enum event_type_t event_type) 2083 enum event_type_t event_type,
2084 struct task_struct *task)
1391{ 2085{
1392 struct perf_event_context *ctx = &cpuctx->ctx; 2086 struct perf_event_context *ctx = &cpuctx->ctx;
1393 2087
1394 ctx_sched_in(ctx, cpuctx, event_type); 2088 ctx_sched_in(ctx, cpuctx, event_type, task);
1395} 2089}
1396 2090
1397static void task_ctx_sched_in(struct task_struct *task, 2091static void task_ctx_sched_in(struct perf_event_context *ctx,
1398 enum event_type_t event_type) 2092 enum event_type_t event_type)
1399{ 2093{
1400 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 2094 struct perf_cpu_context *cpuctx;
1401 struct perf_event_context *ctx = task->perf_event_ctxp;
1402 2095
1403 if (likely(!ctx)) 2096 cpuctx = __get_cpu_context(ctx);
1404 return;
1405 if (cpuctx->task_ctx == ctx) 2097 if (cpuctx->task_ctx == ctx)
1406 return; 2098 return;
1407 ctx_sched_in(ctx, cpuctx, event_type); 2099
2100 ctx_sched_in(ctx, cpuctx, event_type, NULL);
1408 cpuctx->task_ctx = ctx; 2101 cpuctx->task_ctx = ctx;
1409} 2102}
1410/*
1411 * Called from scheduler to add the events of the current task
1412 * with interrupts disabled.
1413 *
1414 * We restore the event value and then enable it.
1415 *
1416 * This does not protect us against NMI, but enable()
1417 * sets the enabled bit in the control field of event _before_
1418 * accessing the event control register. If a NMI hits, then it will
1419 * keep the event running.
1420 */
1421void perf_event_task_sched_in(struct task_struct *task)
1422{
1423 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1424 struct perf_event_context *ctx = task->perf_event_ctxp;
1425 2103
1426 if (likely(!ctx)) 2104static void perf_event_context_sched_in(struct perf_event_context *ctx,
1427 return; 2105 struct task_struct *task)
2106{
2107 struct perf_cpu_context *cpuctx;
1428 2108
2109 cpuctx = __get_cpu_context(ctx);
1429 if (cpuctx->task_ctx == ctx) 2110 if (cpuctx->task_ctx == ctx)
1430 return; 2111 return;
1431 2112
1432 perf_disable(); 2113 perf_pmu_disable(ctx->pmu);
1433
1434 /* 2114 /*
1435 * We want to keep the following priority order: 2115 * We want to keep the following priority order:
1436 * cpu pinned (that don't need to move), task pinned, 2116 * cpu pinned (that don't need to move), task pinned,
@@ -1438,18 +2118,51 @@ void perf_event_task_sched_in(struct task_struct *task)
1438 */ 2118 */
1439 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2119 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1440 2120
1441 ctx_sched_in(ctx, cpuctx, EVENT_PINNED); 2121 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1442 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2122 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1443 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 2123 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1444 2124
1445 cpuctx->task_ctx = ctx; 2125 cpuctx->task_ctx = ctx;
1446 2126
1447 perf_enable(); 2127 /*
2128 * Since these rotations are per-cpu, we need to ensure the
2129 * cpu-context we got scheduled on is actually rotating.
2130 */
2131 perf_pmu_rotate_start(ctx->pmu);
2132 perf_pmu_enable(ctx->pmu);
1448} 2133}
1449 2134
1450#define MAX_INTERRUPTS (~0ULL) 2135/*
2136 * Called from scheduler to add the events of the current task
2137 * with interrupts disabled.
2138 *
2139 * We restore the event value and then enable it.
2140 *
2141 * This does not protect us against NMI, but enable()
2142 * sets the enabled bit in the control field of event _before_
2143 * accessing the event control register. If a NMI hits, then it will
2144 * keep the event running.
2145 */
2146void __perf_event_task_sched_in(struct task_struct *task)
2147{
2148 struct perf_event_context *ctx;
2149 int ctxn;
1451 2150
1452static void perf_log_throttle(struct perf_event *event, int enable); 2151 for_each_task_context_nr(ctxn) {
2152 ctx = task->perf_event_ctxp[ctxn];
2153 if (likely(!ctx))
2154 continue;
2155
2156 perf_event_context_sched_in(ctx, task);
2157 }
2158 /*
2159 * if cgroup events exist on this CPU, then we need
2160 * to check if we have to switch in PMU state.
2161 * cgroup event are system-wide mode only
2162 */
2163 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2164 perf_cgroup_sched_in(task);
2165}
1453 2166
1454static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2167static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1455{ 2168{
@@ -1478,7 +2191,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1478 * Reduce accuracy by one bit such that @a and @b converge 2191 * Reduce accuracy by one bit such that @a and @b converge
1479 * to a similar magnitude. 2192 * to a similar magnitude.
1480 */ 2193 */
1481#define REDUCE_FLS(a, b) \ 2194#define REDUCE_FLS(a, b) \
1482do { \ 2195do { \
1483 if (a##_fls > b##_fls) { \ 2196 if (a##_fls > b##_fls) { \
1484 a >>= 1; \ 2197 a >>= 1; \
@@ -1524,22 +2237,6 @@ do { \
1524 return div64_u64(dividend, divisor); 2237 return div64_u64(dividend, divisor);
1525} 2238}
1526 2239
1527static void perf_event_stop(struct perf_event *event)
1528{
1529 if (!event->pmu->stop)
1530 return event->pmu->disable(event);
1531
1532 return event->pmu->stop(event);
1533}
1534
1535static int perf_event_start(struct perf_event *event)
1536{
1537 if (!event->pmu->start)
1538 return event->pmu->enable(event);
1539
1540 return event->pmu->start(event);
1541}
1542
1543static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 2240static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1544{ 2241{
1545 struct hw_perf_event *hwc = &event->hw; 2242 struct hw_perf_event *hwc = &event->hw;
@@ -1559,15 +2256,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1559 hwc->sample_period = sample_period; 2256 hwc->sample_period = sample_period;
1560 2257
1561 if (local64_read(&hwc->period_left) > 8*sample_period) { 2258 if (local64_read(&hwc->period_left) > 8*sample_period) {
1562 perf_disable(); 2259 event->pmu->stop(event, PERF_EF_UPDATE);
1563 perf_event_stop(event);
1564 local64_set(&hwc->period_left, 0); 2260 local64_set(&hwc->period_left, 0);
1565 perf_event_start(event); 2261 event->pmu->start(event, PERF_EF_RELOAD);
1566 perf_enable();
1567 } 2262 }
1568} 2263}
1569 2264
1570static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 2265static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1571{ 2266{
1572 struct perf_event *event; 2267 struct perf_event *event;
1573 struct hw_perf_event *hwc; 2268 struct hw_perf_event *hwc;
@@ -1579,7 +2274,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1579 if (event->state != PERF_EVENT_STATE_ACTIVE) 2274 if (event->state != PERF_EVENT_STATE_ACTIVE)
1580 continue; 2275 continue;
1581 2276
1582 if (event->cpu != -1 && event->cpu != smp_processor_id()) 2277 if (!event_filter_match(event))
1583 continue; 2278 continue;
1584 2279
1585 hwc = &event->hw; 2280 hwc = &event->hw;
@@ -1592,23 +2287,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1592 */ 2287 */
1593 if (interrupts == MAX_INTERRUPTS) { 2288 if (interrupts == MAX_INTERRUPTS) {
1594 perf_log_throttle(event, 1); 2289 perf_log_throttle(event, 1);
1595 perf_disable(); 2290 event->pmu->start(event, 0);
1596 event->pmu->unthrottle(event);
1597 perf_enable();
1598 } 2291 }
1599 2292
1600 if (!event->attr.freq || !event->attr.sample_freq) 2293 if (!event->attr.freq || !event->attr.sample_freq)
1601 continue; 2294 continue;
1602 2295
1603 perf_disable();
1604 event->pmu->read(event); 2296 event->pmu->read(event);
1605 now = local64_read(&event->count); 2297 now = local64_read(&event->count);
1606 delta = now - hwc->freq_count_stamp; 2298 delta = now - hwc->freq_count_stamp;
1607 hwc->freq_count_stamp = now; 2299 hwc->freq_count_stamp = now;
1608 2300
1609 if (delta > 0) 2301 if (delta > 0)
1610 perf_adjust_period(event, TICK_NSEC, delta); 2302 perf_adjust_period(event, period, delta);
1611 perf_enable();
1612 } 2303 }
1613 raw_spin_unlock(&ctx->lock); 2304 raw_spin_unlock(&ctx->lock);
1614} 2305}
@@ -1620,38 +2311,48 @@ static void rotate_ctx(struct perf_event_context *ctx)
1620{ 2311{
1621 raw_spin_lock(&ctx->lock); 2312 raw_spin_lock(&ctx->lock);
1622 2313
1623 /* Rotate the first entry last of non-pinned groups */ 2314 /*
1624 list_rotate_left(&ctx->flexible_groups); 2315 * Rotate the first entry last of non-pinned groups. Rotation might be
2316 * disabled by the inheritance code.
2317 */
2318 if (!ctx->rotate_disable)
2319 list_rotate_left(&ctx->flexible_groups);
1625 2320
1626 raw_spin_unlock(&ctx->lock); 2321 raw_spin_unlock(&ctx->lock);
1627} 2322}
1628 2323
1629void perf_event_task_tick(struct task_struct *curr) 2324/*
2325 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
2326 * because they're strictly cpu affine and rotate_start is called with IRQs
2327 * disabled, while rotate_context is called from IRQ context.
2328 */
2329static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1630{ 2330{
1631 struct perf_cpu_context *cpuctx; 2331 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
1632 struct perf_event_context *ctx; 2332 struct perf_event_context *ctx = NULL;
1633 int rotate = 0; 2333 int rotate = 0, remove = 1;
1634
1635 if (!atomic_read(&nr_events))
1636 return;
1637 2334
1638 cpuctx = &__get_cpu_var(perf_cpu_context); 2335 if (cpuctx->ctx.nr_events) {
1639 if (cpuctx->ctx.nr_events && 2336 remove = 0;
1640 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 2337 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1641 rotate = 1; 2338 rotate = 1;
2339 }
1642 2340
1643 ctx = curr->perf_event_ctxp; 2341 ctx = cpuctx->task_ctx;
1644 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) 2342 if (ctx && ctx->nr_events) {
1645 rotate = 1; 2343 remove = 0;
2344 if (ctx->nr_events != ctx->nr_active)
2345 rotate = 1;
2346 }
1646 2347
1647 perf_ctx_adjust_freq(&cpuctx->ctx); 2348 perf_pmu_disable(cpuctx->ctx.pmu);
2349 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
1648 if (ctx) 2350 if (ctx)
1649 perf_ctx_adjust_freq(ctx); 2351 perf_ctx_adjust_freq(ctx, interval);
1650 2352
1651 if (!rotate) 2353 if (!rotate)
1652 return; 2354 goto done;
1653 2355
1654 perf_disable();
1655 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2356 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1656 if (ctx) 2357 if (ctx)
1657 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 2358 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1660,10 +2361,29 @@ void perf_event_task_tick(struct task_struct *curr)
1660 if (ctx) 2361 if (ctx)
1661 rotate_ctx(ctx); 2362 rotate_ctx(ctx);
1662 2363
1663 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2364 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
1664 if (ctx) 2365 if (ctx)
1665 task_ctx_sched_in(curr, EVENT_FLEXIBLE); 2366 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1666 perf_enable(); 2367
2368done:
2369 if (remove)
2370 list_del_init(&cpuctx->rotation_list);
2371
2372 perf_pmu_enable(cpuctx->ctx.pmu);
2373}
2374
2375void perf_event_task_tick(void)
2376{
2377 struct list_head *head = &__get_cpu_var(rotation_list);
2378 struct perf_cpu_context *cpuctx, *tmp;
2379
2380 WARN_ON(!irqs_disabled());
2381
2382 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
2383 if (cpuctx->jiffies_interval == 1 ||
2384 !(jiffies % cpuctx->jiffies_interval))
2385 perf_rotate_context(cpuctx);
2386 }
1667} 2387}
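perf_event_task_tick() no longer takes the current task: it walks the per-cpu rotation_list and rotates each cpu context only on ticks that match its jiffies_interval, and perf_rotate_context() then moves the first flexible group to the tail so starved groups eventually get counter time. A compact userspace model of that cadence (toy_cpuctx, the three-entry array and the interval values are all made up for illustration):

#include <stdio.h>

#define NR_CTX 2

struct toy_cpuctx {
    const char *name;
    unsigned int jiffies_interval;  /* rotate every N ticks */
    int flexible[3];                /* stand-in for ctx->flexible_groups */
};

/* Move the first flexible group to the tail, like rotate_ctx(). */
static void rotate_one(struct toy_cpuctx *c)
{
    int first = c->flexible[0];

    c->flexible[0] = c->flexible[1];
    c->flexible[1] = c->flexible[2];
    c->flexible[2] = first;
}

/* Tick handler: only rotate contexts whose interval matches this tick. */
static void tick(struct toy_cpuctx *ctxs, unsigned long jiffies)
{
    for (int i = 0; i < NR_CTX; i++) {
        struct toy_cpuctx *c = &ctxs[i];

        if (c->jiffies_interval == 1 || !(jiffies % c->jiffies_interval))
            rotate_one(c);
    }
}

int main(void)
{
    struct toy_cpuctx ctxs[NR_CTX] = {
        { "fast-pmu", 1, { 1, 2, 3 } },
        { "slow-pmu", 4, { 1, 2, 3 } },
    };

    for (unsigned long j = 1; j <= 4; j++)
        tick(ctxs, j);

    for (int i = 0; i < NR_CTX; i++)
        printf("%s: %d %d %d\n", ctxs[i].name, ctxs[i].flexible[0],
               ctxs[i].flexible[1], ctxs[i].flexible[2]);
    return 0;
}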
1668 2388
1669static int event_enable_on_exec(struct perf_event *event, 2389static int event_enable_on_exec(struct perf_event *event,
@@ -1685,20 +2405,26 @@ static int event_enable_on_exec(struct perf_event *event,
1685 * Enable all of a task's events that have been marked enable-on-exec. 2405 * Enable all of a task's events that have been marked enable-on-exec.
1686 * This expects task == current. 2406 * This expects task == current.
1687 */ 2407 */
1688static void perf_event_enable_on_exec(struct task_struct *task) 2408static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1689{ 2409{
1690 struct perf_event_context *ctx;
1691 struct perf_event *event; 2410 struct perf_event *event;
1692 unsigned long flags; 2411 unsigned long flags;
1693 int enabled = 0; 2412 int enabled = 0;
1694 int ret; 2413 int ret;
1695 2414
1696 local_irq_save(flags); 2415 local_irq_save(flags);
1697 ctx = task->perf_event_ctxp;
1698 if (!ctx || !ctx->nr_events) 2416 if (!ctx || !ctx->nr_events)
1699 goto out; 2417 goto out;
1700 2418
1701 __perf_event_task_sched_out(ctx); 2419 /*
2420 * We must ctxsw out cgroup events to avoid conflict
2421 * when invoking perf_task_event_sched_in() later on
2422 * in this function. Otherwise we end up trying to
2423 * ctxswin cgroup events which are already scheduled
2424 * in.
2425 */
2426 perf_cgroup_sched_out(current);
2427 task_ctx_sched_out(ctx, EVENT_ALL);
1702 2428
1703 raw_spin_lock(&ctx->lock); 2429 raw_spin_lock(&ctx->lock);
1704 2430
@@ -1722,8 +2448,11 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1722 2448
1723 raw_spin_unlock(&ctx->lock); 2449 raw_spin_unlock(&ctx->lock);
1724 2450
1725 perf_event_task_sched_in(task); 2451 /*
1726 out: 2452 * Also calls ctxswin for cgroup events, if any:
2453 */
2454 perf_event_context_sched_in(ctx, ctx->task);
2455out:
1727 local_irq_restore(flags); 2456 local_irq_restore(flags);
1728} 2457}
1729 2458
@@ -1732,9 +2461,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1732 */ 2461 */
1733static void __perf_event_read(void *info) 2462static void __perf_event_read(void *info)
1734{ 2463{
1735 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1736 struct perf_event *event = info; 2464 struct perf_event *event = info;
1737 struct perf_event_context *ctx = event->ctx; 2465 struct perf_event_context *ctx = event->ctx;
2466 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1738 2467
1739 /* 2468 /*
1740 * If this is a task context, we need to check whether it is 2469 * If this is a task context, we need to check whether it is
@@ -1747,11 +2476,14 @@ static void __perf_event_read(void *info)
1747 return; 2476 return;
1748 2477
1749 raw_spin_lock(&ctx->lock); 2478 raw_spin_lock(&ctx->lock);
1750 update_context_time(ctx); 2479 if (ctx->is_active) {
2480 update_context_time(ctx);
2481 update_cgrp_time_from_event(event);
2482 }
1751 update_event_times(event); 2483 update_event_times(event);
2484 if (event->state == PERF_EVENT_STATE_ACTIVE)
2485 event->pmu->read(event);
1752 raw_spin_unlock(&ctx->lock); 2486 raw_spin_unlock(&ctx->lock);
1753
1754 event->pmu->read(event);
1755} 2487}
1756 2488
1757static inline u64 perf_event_count(struct perf_event *event) 2489static inline u64 perf_event_count(struct perf_event *event)
@@ -1773,7 +2505,15 @@ static u64 perf_event_read(struct perf_event *event)
1773 unsigned long flags; 2505 unsigned long flags;
1774 2506
1775 raw_spin_lock_irqsave(&ctx->lock, flags); 2507 raw_spin_lock_irqsave(&ctx->lock, flags);
1776 update_context_time(ctx); 2508 /*
2509 * may read while context is not active
2510 * (e.g., thread is blocked), in that case
2511 * we cannot update context time
2512 */
2513 if (ctx->is_active) {
2514 update_context_time(ctx);
2515 update_cgrp_time_from_event(event);
2516 }
1777 update_event_times(event); 2517 update_event_times(event);
1778 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2518 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1779 } 2519 }
@@ -1782,11 +2522,218 @@ static u64 perf_event_read(struct perf_event *event)
1782} 2522}
1783 2523
1784/* 2524/*
1785 * Initialize the perf_event context in a task_struct: 2525 * Callchain support
1786 */ 2526 */
2527
2528struct callchain_cpus_entries {
2529 struct rcu_head rcu_head;
2530 struct perf_callchain_entry *cpu_entries[0];
2531};
2532
2533static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
2534static atomic_t nr_callchain_events;
2535static DEFINE_MUTEX(callchain_mutex);
2536struct callchain_cpus_entries *callchain_cpus_entries;
2537
2538
2539__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
2540 struct pt_regs *regs)
2541{
2542}
2543
2544__weak void perf_callchain_user(struct perf_callchain_entry *entry,
2545 struct pt_regs *regs)
2546{
2547}
2548
2549static void release_callchain_buffers_rcu(struct rcu_head *head)
2550{
2551 struct callchain_cpus_entries *entries;
2552 int cpu;
2553
2554 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
2555
2556 for_each_possible_cpu(cpu)
2557 kfree(entries->cpu_entries[cpu]);
2558
2559 kfree(entries);
2560}
2561
2562static void release_callchain_buffers(void)
2563{
2564 struct callchain_cpus_entries *entries;
2565
2566 entries = callchain_cpus_entries;
2567 rcu_assign_pointer(callchain_cpus_entries, NULL);
2568 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
2569}
2570
2571static int alloc_callchain_buffers(void)
2572{
2573 int cpu;
2574 int size;
2575 struct callchain_cpus_entries *entries;
2576
2577 /*
2578 * We can't use the percpu allocation API for data that can be
2579 * accessed from NMI. Use a temporary manual per cpu allocation
2580 * until that gets sorted out.
2581 */
2582 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
2583
2584 entries = kzalloc(size, GFP_KERNEL);
2585 if (!entries)
2586 return -ENOMEM;
2587
2588 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
2589
2590 for_each_possible_cpu(cpu) {
2591 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
2592 cpu_to_node(cpu));
2593 if (!entries->cpu_entries[cpu])
2594 goto fail;
2595 }
2596
2597 rcu_assign_pointer(callchain_cpus_entries, entries);
2598
2599 return 0;
2600
2601fail:
2602 for_each_possible_cpu(cpu)
2603 kfree(entries->cpu_entries[cpu]);
2604 kfree(entries);
2605
2606 return -ENOMEM;
2607}
2608
2609static int get_callchain_buffers(void)
2610{
2611 int err = 0;
2612 int count;
2613
2614 mutex_lock(&callchain_mutex);
2615
2616 count = atomic_inc_return(&nr_callchain_events);
2617 if (WARN_ON_ONCE(count < 1)) {
2618 err = -EINVAL;
2619 goto exit;
2620 }
2621
2622 if (count > 1) {
2623 /* If the allocation failed, give up */
2624 if (!callchain_cpus_entries)
2625 err = -ENOMEM;
2626 goto exit;
2627 }
2628
2629 err = alloc_callchain_buffers();
2630 if (err)
2631 release_callchain_buffers();
2632exit:
2633 mutex_unlock(&callchain_mutex);
2634
2635 return err;
2636}
2637
2638static void put_callchain_buffers(void)
2639{
2640 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
2641 release_callchain_buffers();
2642 mutex_unlock(&callchain_mutex);
2643 }
2644}
2645
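get_callchain_buffers()/put_callchain_buffers() above implement a first-user-allocates, last-user-frees scheme under callchain_mutex, with the counter tracking how many sampling events requested callchains. A simplified userspace sketch of the same idea (the kernel version additionally frees through RCU and allocates one entry array per possible CPU; the single calloc() here is just a placeholder, and the error handling is condensed):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t callchain_mutex = PTHREAD_MUTEX_INITIALIZER;
static int nr_callchain_users;
static void *callchain_buffers;

static int get_buffers(void)
{
    int err = 0;

    pthread_mutex_lock(&callchain_mutex);
    if (++nr_callchain_users == 1) {
        /* first user allocates for everybody */
        callchain_buffers = calloc(1, 4096);
        if (!callchain_buffers) {
            nr_callchain_users--;
            err = -1;
        }
    } else if (!callchain_buffers) {
        /* an earlier allocation failed; refuse new users too */
        nr_callchain_users--;
        err = -1;
    }
    pthread_mutex_unlock(&callchain_mutex);
    return err;
}

static void put_buffers(void)
{
    pthread_mutex_lock(&callchain_mutex);
    if (--nr_callchain_users == 0) {
        /* last user frees */
        free(callchain_buffers);
        callchain_buffers = NULL;
    }
    pthread_mutex_unlock(&callchain_mutex);
}

int main(void)
{
    if (get_buffers() == 0 && get_buffers() == 0) {
        put_buffers();
        put_buffers();
    }
    printf("buffers released: %s\n", callchain_buffers ? "no" : "yes");
    return 0;
}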
2646static int get_recursion_context(int *recursion)
2647{
2648 int rctx;
2649
2650 if (in_nmi())
2651 rctx = 3;
2652 else if (in_irq())
2653 rctx = 2;
2654 else if (in_softirq())
2655 rctx = 1;
2656 else
2657 rctx = 0;
2658
2659 if (recursion[rctx])
2660 return -1;
2661
2662 recursion[rctx]++;
2663 barrier();
2664
2665 return rctx;
2666}
2667
2668static inline void put_recursion_context(int *recursion, int rctx)
2669{
2670 barrier();
2671 recursion[rctx]--;
2672}
2673
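The recursion counters above give each execution level (task, softirq, irq, NMI) its own slot, so a callchain capture may interrupt one running at a different level, while re-entry at the same level bails out. In the kernel the slot is derived from in_nmi()/in_irq()/in_softirq() and the array is per-cpu; this little userspace demo passes the slot explicitly just to show the accounting:

#include <stdio.h>

#define NR_CONTEXTS 4    /* task, softirq, irq, nmi */

static int recursion[NR_CONTEXTS];   /* per-cpu in the kernel */

static int get_recursion(int rctx)
{
    if (recursion[rctx])
        return -1;       /* already capturing at this level */
    recursion[rctx]++;
    return rctx;
}

static void put_recursion(int rctx)
{
    recursion[rctx]--;
}

int main(void)
{
    int task = get_recursion(0);     /* task level: ok */
    int irq = get_recursion(2);      /* irq interrupts task: ok */
    int nested = get_recursion(2);   /* same level again: refused */

    printf("task=%d irq=%d nested-irq=%d\n", task, irq, nested);

    put_recursion(irq);
    put_recursion(task);
    return 0;
}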
2674static struct perf_callchain_entry *get_callchain_entry(int *rctx)
2675{
2676 int cpu;
2677 struct callchain_cpus_entries *entries;
2678
2679 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2680 if (*rctx == -1)
2681 return NULL;
2682
2683 entries = rcu_dereference(callchain_cpus_entries);
2684 if (!entries)
2685 return NULL;
2686
2687 cpu = smp_processor_id();
2688
2689 return &entries->cpu_entries[cpu][*rctx];
2690}
2691
1787static void 2692static void
1788__perf_event_init_context(struct perf_event_context *ctx, 2693put_callchain_entry(int rctx)
1789 struct task_struct *task) 2694{
2695 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2696}
2697
2698static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2699{
2700 int rctx;
2701 struct perf_callchain_entry *entry;
2702
2703
2704 entry = get_callchain_entry(&rctx);
2705 if (rctx == -1)
2706 return NULL;
2707
2708 if (!entry)
2709 goto exit_put;
2710
2711 entry->nr = 0;
2712
2713 if (!user_mode(regs)) {
2714 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2715 perf_callchain_kernel(entry, regs);
2716 if (current->mm)
2717 regs = task_pt_regs(current);
2718 else
2719 regs = NULL;
2720 }
2721
2722 if (regs) {
2723 perf_callchain_store(entry, PERF_CONTEXT_USER);
2724 perf_callchain_user(entry, regs);
2725 }
2726
2727exit_put:
2728 put_callchain_entry(rctx);
2729
2730 return entry;
2731}
2732
2733/*
2734 * Initialize the perf_event context in a task_struct:
2735 */
2736static void __perf_event_init_context(struct perf_event_context *ctx)
1790{ 2737{
1791 raw_spin_lock_init(&ctx->lock); 2738 raw_spin_lock_init(&ctx->lock);
1792 mutex_init(&ctx->mutex); 2739 mutex_init(&ctx->mutex);
@@ -1794,25 +2741,73 @@ __perf_event_init_context(struct perf_event_context *ctx,
1794 INIT_LIST_HEAD(&ctx->flexible_groups); 2741 INIT_LIST_HEAD(&ctx->flexible_groups);
1795 INIT_LIST_HEAD(&ctx->event_list); 2742 INIT_LIST_HEAD(&ctx->event_list);
1796 atomic_set(&ctx->refcount, 1); 2743 atomic_set(&ctx->refcount, 1);
1797 ctx->task = task;
1798} 2744}
1799 2745
1800static struct perf_event_context *find_get_context(pid_t pid, int cpu) 2746static struct perf_event_context *
2747alloc_perf_context(struct pmu *pmu, struct task_struct *task)
1801{ 2748{
1802 struct perf_event_context *ctx; 2749 struct perf_event_context *ctx;
1803 struct perf_cpu_context *cpuctx; 2750
2751 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
2752 if (!ctx)
2753 return NULL;
2754
2755 __perf_event_init_context(ctx);
2756 if (task) {
2757 ctx->task = task;
2758 get_task_struct(task);
2759 }
2760 ctx->pmu = pmu;
2761
2762 return ctx;
2763}
2764
2765static struct task_struct *
2766find_lively_task_by_vpid(pid_t vpid)
2767{
1804 struct task_struct *task; 2768 struct task_struct *task;
1805 unsigned long flags;
1806 int err; 2769 int err;
1807 2770
1808 if (pid == -1 && cpu != -1) { 2771 rcu_read_lock();
2772 if (!vpid)
2773 task = current;
2774 else
2775 task = find_task_by_vpid(vpid);
2776 if (task)
2777 get_task_struct(task);
2778 rcu_read_unlock();
2779
2780 if (!task)
2781 return ERR_PTR(-ESRCH);
2782
2783 /* Reuse ptrace permission checks for now. */
2784 err = -EACCES;
2785 if (!ptrace_may_access(task, PTRACE_MODE_READ))
2786 goto errout;
2787
2788 return task;
2789errout:
2790 put_task_struct(task);
2791 return ERR_PTR(err);
2792
2793}
2794
2795/*
2796 * Returns a matching context with refcount and pincount.
2797 */
2798static struct perf_event_context *
2799find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2800{
2801 struct perf_event_context *ctx;
2802 struct perf_cpu_context *cpuctx;
2803 unsigned long flags;
2804 int ctxn, err;
2805
2806 if (!task) {
1809 /* Must be root to operate on a CPU event: */ 2807 /* Must be root to operate on a CPU event: */
1810 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 2808 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1811 return ERR_PTR(-EACCES); 2809 return ERR_PTR(-EACCES);
1812 2810
1813 if (cpu < 0 || cpu >= nr_cpumask_bits)
1814 return ERR_PTR(-EINVAL);
1815
1816 /* 2811 /*
1817 * We could be clever and allow to attach a event to an 2812 * We could be clever and allow to attach a event to an
1818 * offline CPU and activate it when the CPU comes up, but 2813 * offline CPU and activate it when the CPU comes up, but
@@ -1821,67 +2816,64 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1821 if (!cpu_online(cpu)) 2816 if (!cpu_online(cpu))
1822 return ERR_PTR(-ENODEV); 2817 return ERR_PTR(-ENODEV);
1823 2818
1824 cpuctx = &per_cpu(perf_cpu_context, cpu); 2819 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
1825 ctx = &cpuctx->ctx; 2820 ctx = &cpuctx->ctx;
1826 get_ctx(ctx); 2821 get_ctx(ctx);
2822 ++ctx->pin_count;
1827 2823
1828 return ctx; 2824 return ctx;
1829 } 2825 }
1830 2826
1831 rcu_read_lock(); 2827 err = -EINVAL;
1832 if (!pid) 2828 ctxn = pmu->task_ctx_nr;
1833 task = current; 2829 if (ctxn < 0)
1834 else
1835 task = find_task_by_vpid(pid);
1836 if (task)
1837 get_task_struct(task);
1838 rcu_read_unlock();
1839
1840 if (!task)
1841 return ERR_PTR(-ESRCH);
1842
1843 /*
1844 * Can't attach events to a dying task.
1845 */
1846 err = -ESRCH;
1847 if (task->flags & PF_EXITING)
1848 goto errout;
1849
1850 /* Reuse ptrace permission checks for now. */
1851 err = -EACCES;
1852 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1853 goto errout; 2830 goto errout;
1854 2831
1855 retry: 2832retry:
1856 ctx = perf_lock_task_context(task, &flags); 2833 ctx = perf_lock_task_context(task, ctxn, &flags);
1857 if (ctx) { 2834 if (ctx) {
1858 unclone_ctx(ctx); 2835 unclone_ctx(ctx);
2836 ++ctx->pin_count;
1859 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2837 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1860 } 2838 }
1861 2839
1862 if (!ctx) { 2840 if (!ctx) {
1863 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); 2841 ctx = alloc_perf_context(pmu, task);
1864 err = -ENOMEM; 2842 err = -ENOMEM;
1865 if (!ctx) 2843 if (!ctx)
1866 goto errout; 2844 goto errout;
1867 __perf_event_init_context(ctx, task); 2845
1868 get_ctx(ctx); 2846 get_ctx(ctx);
1869 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { 2847
1870 /* 2848 err = 0;
1871 * We raced with some other task; use 2849 mutex_lock(&task->perf_event_mutex);
1872 * the context they set. 2850 /*
1873 */ 2851 * If it has already passed perf_event_exit_task().
2852 * we must see PF_EXITING, it takes this mutex too.
2853 */
2854 if (task->flags & PF_EXITING)
2855 err = -ESRCH;
2856 else if (task->perf_event_ctxp[ctxn])
2857 err = -EAGAIN;
2858 else {
2859 ++ctx->pin_count;
2860 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2861 }
2862 mutex_unlock(&task->perf_event_mutex);
2863
2864 if (unlikely(err)) {
2865 put_task_struct(task);
1874 kfree(ctx); 2866 kfree(ctx);
1875 goto retry; 2867
2868 if (err == -EAGAIN)
2869 goto retry;
2870 goto errout;
1876 } 2871 }
1877 get_task_struct(task);
1878 } 2872 }
1879 2873
1880 put_task_struct(task);
1881 return ctx; 2874 return ctx;
1882 2875
1883 errout: 2876errout:
1884 put_task_struct(task);
1885 return ERR_PTR(err); 2877 return ERR_PTR(err);
1886} 2878}
1887 2879
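The cmpxchg()-based install is replaced by publication under task->perf_event_mutex, which also lets find_get_context() observe PF_EXITING reliably: allocate a context, take the mutex, fail with -ESRCH if the task is exiting, retry with the winner's context on -EAGAIN, otherwise publish. A simplified, single-threaded userspace sketch of that publish step (toy_task is invented; the kernel looks the existing context up via RCU in perf_lock_task_context() rather than under the mutex as done here):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_task {
    pthread_mutex_t perf_event_mutex;
    int exiting;               /* stand-in for PF_EXITING */
    void *perf_event_ctxp;     /* published context, if any */
};

static void *toy_find_get_context(struct toy_task *task)
{
    void *ctx;

retry:
    pthread_mutex_lock(&task->perf_event_mutex);
    ctx = task->perf_event_ctxp;
    pthread_mutex_unlock(&task->perf_event_mutex);
    if (ctx)
        return ctx;            /* reuse the published context */

    ctx = malloc(64);          /* alloc_perf_context() stand-in */
    if (!ctx)
        return NULL;

    pthread_mutex_lock(&task->perf_event_mutex);
    if (task->exiting) {
        /* like the kernel's -ESRCH: the task already ran its exit path */
        pthread_mutex_unlock(&task->perf_event_mutex);
        free(ctx);
        errno = ESRCH;
        return NULL;
    }
    if (task->perf_event_ctxp) {
        /* lost the race (-EAGAIN): drop ours, go use the winner's */
        pthread_mutex_unlock(&task->perf_event_mutex);
        free(ctx);
        goto retry;
    }
    task->perf_event_ctxp = ctx;   /* publish */
    pthread_mutex_unlock(&task->perf_event_mutex);
    return ctx;
}

int main(void)
{
    struct toy_task task = { PTHREAD_MUTEX_INITIALIZER, 0, NULL };
    void *a = toy_find_get_context(&task);
    void *b = toy_find_get_context(&task);

    printf("same context reused: %s\n", a == b ? "yes" : "no");
    free(a);
    return 0;
}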
@@ -1898,21 +2890,27 @@ static void free_event_rcu(struct rcu_head *head)
1898 kfree(event); 2890 kfree(event);
1899} 2891}
1900 2892
1901static void perf_pending_sync(struct perf_event *event);
1902static void perf_buffer_put(struct perf_buffer *buffer); 2893static void perf_buffer_put(struct perf_buffer *buffer);
1903 2894
1904static void free_event(struct perf_event *event) 2895static void free_event(struct perf_event *event)
1905{ 2896{
1906 perf_pending_sync(event); 2897 irq_work_sync(&event->pending);
1907 2898
1908 if (!event->parent) { 2899 if (!event->parent) {
1909 atomic_dec(&nr_events); 2900 if (event->attach_state & PERF_ATTACH_TASK)
2901 jump_label_dec(&perf_sched_events);
1910 if (event->attr.mmap || event->attr.mmap_data) 2902 if (event->attr.mmap || event->attr.mmap_data)
1911 atomic_dec(&nr_mmap_events); 2903 atomic_dec(&nr_mmap_events);
1912 if (event->attr.comm) 2904 if (event->attr.comm)
1913 atomic_dec(&nr_comm_events); 2905 atomic_dec(&nr_comm_events);
1914 if (event->attr.task) 2906 if (event->attr.task)
1915 atomic_dec(&nr_task_events); 2907 atomic_dec(&nr_task_events);
2908 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2909 put_callchain_buffers();
2910 if (is_cgroup_event(event)) {
2911 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2912 jump_label_dec(&perf_sched_events);
2913 }
1916 } 2914 }
1917 2915
1918 if (event->buffer) { 2916 if (event->buffer) {
@@ -1920,10 +2918,15 @@ static void free_event(struct perf_event *event)
1920 event->buffer = NULL; 2918 event->buffer = NULL;
1921 } 2919 }
1922 2920
2921 if (is_cgroup_event(event))
2922 perf_detach_cgroup(event);
2923
1923 if (event->destroy) 2924 if (event->destroy)
1924 event->destroy(event); 2925 event->destroy(event);
1925 2926
1926 put_ctx(event->ctx); 2927 if (event->ctx)
2928 put_ctx(event->ctx);
2929
1927 call_rcu(&event->rcu_head, free_event_rcu); 2930 call_rcu(&event->rcu_head, free_event_rcu);
1928} 2931}
1929 2932
@@ -1957,11 +2960,6 @@ int perf_event_release_kernel(struct perf_event *event)
1957 raw_spin_unlock_irq(&ctx->lock); 2960 raw_spin_unlock_irq(&ctx->lock);
1958 mutex_unlock(&ctx->mutex); 2961 mutex_unlock(&ctx->mutex);
1959 2962
1960 mutex_lock(&event->owner->perf_event_mutex);
1961 list_del_init(&event->owner_entry);
1962 mutex_unlock(&event->owner->perf_event_mutex);
1963 put_task_struct(event->owner);
1964
1965 free_event(event); 2963 free_event(event);
1966 2964
1967 return 0; 2965 return 0;
@@ -1974,35 +2972,44 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1974static int perf_release(struct inode *inode, struct file *file) 2972static int perf_release(struct inode *inode, struct file *file)
1975{ 2973{
1976 struct perf_event *event = file->private_data; 2974 struct perf_event *event = file->private_data;
2975 struct task_struct *owner;
1977 2976
1978 file->private_data = NULL; 2977 file->private_data = NULL;
1979 2978
1980 return perf_event_release_kernel(event); 2979 rcu_read_lock();
1981} 2980 owner = ACCESS_ONCE(event->owner);
1982 2981 /*
1983static int perf_event_read_size(struct perf_event *event) 2982 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
1984{ 2983 * !owner it means the list deletion is complete and we can indeed
1985 int entry = sizeof(u64); /* value */ 2984 * free this event, otherwise we need to serialize on
1986 int size = 0; 2985 * owner->perf_event_mutex.
1987 int nr = 1; 2986 */
1988 2987 smp_read_barrier_depends();
1989 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 2988 if (owner) {
1990 size += sizeof(u64); 2989 /*
1991 2990 * Since delayed_put_task_struct() also drops the last
1992 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 2991 * task reference we can safely take a new reference
1993 size += sizeof(u64); 2992 * while holding the rcu_read_lock().
1994 2993 */
1995 if (event->attr.read_format & PERF_FORMAT_ID) 2994 get_task_struct(owner);
1996 entry += sizeof(u64);
1997
1998 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1999 nr += event->group_leader->nr_siblings;
2000 size += sizeof(u64);
2001 } 2995 }
2996 rcu_read_unlock();
2002 2997
2003 size += entry * nr; 2998 if (owner) {
2999 mutex_lock(&owner->perf_event_mutex);
3000 /*
3001 * We have to re-check the event->owner field, if it is cleared
3002 * we raced with perf_event_exit_task(), acquiring the mutex
3003 * ensured they're done, and we can proceed with freeing the
3004 * event.
3005 */
3006 if (event->owner)
3007 list_del_init(&event->owner_entry);
3008 mutex_unlock(&owner->perf_event_mutex);
3009 put_task_struct(owner);
3010 }
2004 3011
2005 return size; 3012 return perf_event_release_kernel(event);
2006} 3013}
2007 3014
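A compact sketch of the RCU-protected owner lookup that perf_release() now performs (example_* is illustrative, not a new kernel helper): take the task reference inside the RCU read side, then re-check event->owner under owner->perf_event_mutex before touching owner_entry, as the hunk above does.

static struct task_struct *example_get_owner(struct perf_event *event)
{
	struct task_struct *owner;

	rcu_read_lock();
	owner = ACCESS_ONCE(event->owner);
	/* pairs with the smp_wmb() in perf_event_exit_task() */
	smp_read_barrier_depends();
	if (owner)
		get_task_struct(owner);	/* safe: the task itself is freed via RCU */
	rcu_read_unlock();

	return owner;			/* NULL means exit already detached the event */
}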
2008u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 3015u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
@@ -2119,7 +3126,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
2119 if (event->state == PERF_EVENT_STATE_ERROR) 3126 if (event->state == PERF_EVENT_STATE_ERROR)
2120 return 0; 3127 return 0;
2121 3128
2122 if (count < perf_event_read_size(event)) 3129 if (count < event->read_size)
2123 return -ENOSPC; 3130 return -ENOSPC;
2124 3131
2125 WARN_ON_ONCE(event->ctx->parent_ctx); 3132 WARN_ON_ONCE(event->ctx->parent_ctx);
@@ -2205,7 +3212,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
2205 int ret = 0; 3212 int ret = 0;
2206 u64 value; 3213 u64 value;
2207 3214
2208 if (!event->attr.sample_period) 3215 if (!is_sampling_event(event))
2209 return -EINVAL; 3216 return -EINVAL;
2210 3217
2211 if (copy_from_user(&value, arg, sizeof(value))) 3218 if (copy_from_user(&value, arg, sizeof(value)))
@@ -2342,6 +3349,9 @@ int perf_event_task_disable(void)
2342 3349
2343static int perf_event_index(struct perf_event *event) 3350static int perf_event_index(struct perf_event *event)
2344{ 3351{
3352 if (event->hw.state & PERF_HES_STOPPED)
3353 return 0;
3354
2345 if (event->state != PERF_EVENT_STATE_ACTIVE) 3355 if (event->state != PERF_EVENT_STATE_ACTIVE)
2346 return 0; 3356 return 0;
2347 3357
@@ -2845,16 +3855,7 @@ void perf_event_wakeup(struct perf_event *event)
2845 } 3855 }
2846} 3856}
2847 3857
2848/* 3858static void perf_pending_event(struct irq_work *entry)
2849 * Pending wakeups
2850 *
2851 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2852 *
2853 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2854 * single linked list and use cmpxchg() to add entries lockless.
2855 */
2856
2857static void perf_pending_event(struct perf_pending_entry *entry)
2858{ 3859{
2859 struct perf_event *event = container_of(entry, 3860 struct perf_event *event = container_of(entry,
2860 struct perf_event, pending); 3861 struct perf_event, pending);
@@ -2870,99 +3871,6 @@ static void perf_pending_event(struct perf_pending_entry *entry)
2870 } 3871 }
2871} 3872}
2872 3873
2873#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2874
2875static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2876 PENDING_TAIL,
2877};
2878
2879static void perf_pending_queue(struct perf_pending_entry *entry,
2880 void (*func)(struct perf_pending_entry *))
2881{
2882 struct perf_pending_entry **head;
2883
2884 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2885 return;
2886
2887 entry->func = func;
2888
2889 head = &get_cpu_var(perf_pending_head);
2890
2891 do {
2892 entry->next = *head;
2893 } while (cmpxchg(head, entry->next, entry) != entry->next);
2894
2895 set_perf_event_pending();
2896
2897 put_cpu_var(perf_pending_head);
2898}
2899
2900static int __perf_pending_run(void)
2901{
2902 struct perf_pending_entry *list;
2903 int nr = 0;
2904
2905 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2906 while (list != PENDING_TAIL) {
2907 void (*func)(struct perf_pending_entry *);
2908 struct perf_pending_entry *entry = list;
2909
2910 list = list->next;
2911
2912 func = entry->func;
2913 entry->next = NULL;
2914 /*
2915 * Ensure we observe the unqueue before we issue the wakeup,
2916 * so that we won't be waiting forever.
2917 * -- see perf_not_pending().
2918 */
2919 smp_wmb();
2920
2921 func(entry);
2922 nr++;
2923 }
2924
2925 return nr;
2926}
2927
2928static inline int perf_not_pending(struct perf_event *event)
2929{
2930 /*
2931 * If we flush on whatever cpu we run, there is a chance we don't
2932 * need to wait.
2933 */
2934 get_cpu();
2935 __perf_pending_run();
2936 put_cpu();
2937
2938 /*
2939 * Ensure we see the proper queue state before going to sleep
2940 * so that we do not miss the wakeup. -- see perf_pending_handle()
2941 */
2942 smp_rmb();
2943 return event->pending.next == NULL;
2944}
2945
2946static void perf_pending_sync(struct perf_event *event)
2947{
2948 wait_event(event->waitq, perf_not_pending(event));
2949}
2950
2951void perf_event_do_pending(void)
2952{
2953 __perf_pending_run();
2954}
2955
2956/*
2957 * Callchain support -- arch specific
2958 */
2959
2960__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2961{
2962 return NULL;
2963}
2964
2965
2966/* 3874/*
2967 * We assume there is only KVM supporting the callbacks. 3875 * We assume there is only KVM supporting the callbacks.
2968 * Later on, we might change it to a list if there is 3876 * Later on, we might change it to a list if there is
@@ -3012,8 +3920,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
3012 3920
3013 if (handle->nmi) { 3921 if (handle->nmi) {
3014 handle->event->pending_wakeup = 1; 3922 handle->event->pending_wakeup = 1;
3015 perf_pending_queue(&handle->event->pending, 3923 irq_work_queue(&handle->event->pending);
3016 perf_pending_event);
3017 } else 3924 } else
3018 perf_event_wakeup(handle->event); 3925 perf_event_wakeup(handle->event);
3019} 3926}
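The hand-rolled perf_pending_* queue removed above is replaced by the generic irq_work facility; a hedged usage sketch, assuming the irq_work API introduced alongside this rework (the example_* names are illustrative):

#include <linux/irq_work.h>

static void example_cb(struct irq_work *entry)
{
	/* runs in hard-irq context shortly after the queueing NMI returns */
}

static struct irq_work example_work;

static void example_use(void)
{
	init_irq_work(&example_work, example_cb);
	irq_work_queue(&example_work);	/* NMI-safe, lockless enqueue */
	irq_work_sync(&example_work);	/* wait for the callback before freeing */
}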
@@ -3069,7 +3976,7 @@ again:
3069 if (handle->wakeup != local_read(&buffer->wakeup)) 3976 if (handle->wakeup != local_read(&buffer->wakeup))
3070 perf_output_wakeup(handle); 3977 perf_output_wakeup(handle);
3071 3978
3072 out: 3979out:
3073 preempt_enable(); 3980 preempt_enable();
3074} 3981}
3075 3982
@@ -3096,6 +4003,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
3096 } while (len); 4003 } while (len);
3097} 4004}
3098 4005
4006static void __perf_event_header__init_id(struct perf_event_header *header,
4007 struct perf_sample_data *data,
4008 struct perf_event *event)
4009{
4010 u64 sample_type = event->attr.sample_type;
4011
4012 data->type = sample_type;
4013 header->size += event->id_header_size;
4014
4015 if (sample_type & PERF_SAMPLE_TID) {
4016 /* namespace issues */
4017 data->tid_entry.pid = perf_event_pid(event, current);
4018 data->tid_entry.tid = perf_event_tid(event, current);
4019 }
4020
4021 if (sample_type & PERF_SAMPLE_TIME)
4022 data->time = perf_clock();
4023
4024 if (sample_type & PERF_SAMPLE_ID)
4025 data->id = primary_event_id(event);
4026
4027 if (sample_type & PERF_SAMPLE_STREAM_ID)
4028 data->stream_id = event->id;
4029
4030 if (sample_type & PERF_SAMPLE_CPU) {
4031 data->cpu_entry.cpu = raw_smp_processor_id();
4032 data->cpu_entry.reserved = 0;
4033 }
4034}
4035
4036static void perf_event_header__init_id(struct perf_event_header *header,
4037 struct perf_sample_data *data,
4038 struct perf_event *event)
4039{
4040 if (event->attr.sample_id_all)
4041 __perf_event_header__init_id(header, data, event);
4042}
4043
4044static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4045 struct perf_sample_data *data)
4046{
4047 u64 sample_type = data->type;
4048
4049 if (sample_type & PERF_SAMPLE_TID)
4050 perf_output_put(handle, data->tid_entry);
4051
4052 if (sample_type & PERF_SAMPLE_TIME)
4053 perf_output_put(handle, data->time);
4054
4055 if (sample_type & PERF_SAMPLE_ID)
4056 perf_output_put(handle, data->id);
4057
4058 if (sample_type & PERF_SAMPLE_STREAM_ID)
4059 perf_output_put(handle, data->stream_id);
4060
4061 if (sample_type & PERF_SAMPLE_CPU)
4062 perf_output_put(handle, data->cpu_entry);
4063}
4064
4065static void perf_event__output_id_sample(struct perf_event *event,
4066 struct perf_output_handle *handle,
4067 struct perf_sample_data *sample)
4068{
4069 if (event->attr.sample_id_all)
4070 __perf_event__output_id_sample(handle, sample);
4071}
4072
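The two helpers above give every side-band record (COMM, MMAP, LOST, ...) the optional trailing id sample requested by attr.sample_id_all. A sketch of the intended call pattern, using the file-local helpers and a hypothetical example_emit_record():

static void example_emit_record(struct perf_event *event,
				struct perf_event_header *hdr)
{
	struct perf_output_handle handle;
	struct perf_sample_data sample;

	/* grows hdr->size by event->id_header_size when sample_id_all is set */
	perf_event_header__init_id(hdr, &sample, event);

	if (perf_output_begin(&handle, event, hdr->size, 0, 0))
		return;

	perf_output_put(&handle, *hdr);
	/* ... record-specific payload via perf_output_copy()/perf_output_put() ... */
	perf_event__output_id_sample(event, &handle, &sample);
	perf_output_end(&handle);
}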
3099int perf_output_begin(struct perf_output_handle *handle, 4073int perf_output_begin(struct perf_output_handle *handle,
3100 struct perf_event *event, unsigned int size, 4074 struct perf_event *event, unsigned int size,
3101 int nmi, int sample) 4075 int nmi, int sample)
@@ -3103,6 +4077,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3103 struct perf_buffer *buffer; 4077 struct perf_buffer *buffer;
3104 unsigned long tail, offset, head; 4078 unsigned long tail, offset, head;
3105 int have_lost; 4079 int have_lost;
4080 struct perf_sample_data sample_data;
3106 struct { 4081 struct {
3107 struct perf_event_header header; 4082 struct perf_event_header header;
3108 u64 id; 4083 u64 id;
@@ -3129,8 +4104,12 @@ int perf_output_begin(struct perf_output_handle *handle,
3129 goto out; 4104 goto out;
3130 4105
3131 have_lost = local_read(&buffer->lost); 4106 have_lost = local_read(&buffer->lost);
3132 if (have_lost) 4107 if (have_lost) {
3133 size += sizeof(lost_event); 4108 lost_event.header.size = sizeof(lost_event);
4109 perf_event_header__init_id(&lost_event.header, &sample_data,
4110 event);
4111 size += lost_event.header.size;
4112 }
3134 4113
3135 perf_output_get_handle(handle); 4114 perf_output_get_handle(handle);
3136 4115
@@ -3161,11 +4140,11 @@ int perf_output_begin(struct perf_output_handle *handle,
3161 if (have_lost) { 4140 if (have_lost) {
3162 lost_event.header.type = PERF_RECORD_LOST; 4141 lost_event.header.type = PERF_RECORD_LOST;
3163 lost_event.header.misc = 0; 4142 lost_event.header.misc = 0;
3164 lost_event.header.size = sizeof(lost_event);
3165 lost_event.id = event->id; 4143 lost_event.id = event->id;
3166 lost_event.lost = local_xchg(&buffer->lost, 0); 4144 lost_event.lost = local_xchg(&buffer->lost, 0);
3167 4145
3168 perf_output_put(handle, lost_event); 4146 perf_output_put(handle, lost_event);
4147 perf_event__output_id_sample(event, handle, &sample_data);
3169 } 4148 }
3170 4149
3171 return 0; 4150 return 0;
@@ -3198,30 +4177,9 @@ void perf_output_end(struct perf_output_handle *handle)
3198 rcu_read_unlock(); 4177 rcu_read_unlock();
3199} 4178}
3200 4179
3201static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
3202{
3203 /*
3204 * only top level events have the pid namespace they were created in
3205 */
3206 if (event->parent)
3207 event = event->parent;
3208
3209 return task_tgid_nr_ns(p, event->ns);
3210}
3211
3212static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3213{
3214 /*
3215 * only top level events have the pid namespace they were created in
3216 */
3217 if (event->parent)
3218 event = event->parent;
3219
3220 return task_pid_nr_ns(p, event->ns);
3221}
3222
3223static void perf_output_read_one(struct perf_output_handle *handle, 4180static void perf_output_read_one(struct perf_output_handle *handle,
3224 struct perf_event *event) 4181 struct perf_event *event,
4182 u64 enabled, u64 running)
3225{ 4183{
3226 u64 read_format = event->attr.read_format; 4184 u64 read_format = event->attr.read_format;
3227 u64 values[4]; 4185 u64 values[4];
@@ -3229,11 +4187,11 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3229 4187
3230 values[n++] = perf_event_count(event); 4188 values[n++] = perf_event_count(event);
3231 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 4189 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3232 values[n++] = event->total_time_enabled + 4190 values[n++] = enabled +
3233 atomic64_read(&event->child_total_time_enabled); 4191 atomic64_read(&event->child_total_time_enabled);
3234 } 4192 }
3235 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 4193 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3236 values[n++] = event->total_time_running + 4194 values[n++] = running +
3237 atomic64_read(&event->child_total_time_running); 4195 atomic64_read(&event->child_total_time_running);
3238 } 4196 }
3239 if (read_format & PERF_FORMAT_ID) 4197 if (read_format & PERF_FORMAT_ID)
@@ -3246,7 +4204,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3246 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. 4204 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3247 */ 4205 */
3248static void perf_output_read_group(struct perf_output_handle *handle, 4206static void perf_output_read_group(struct perf_output_handle *handle,
3249 struct perf_event *event) 4207 struct perf_event *event,
4208 u64 enabled, u64 running)
3250{ 4209{
3251 struct perf_event *leader = event->group_leader, *sub; 4210 struct perf_event *leader = event->group_leader, *sub;
3252 u64 read_format = event->attr.read_format; 4211 u64 read_format = event->attr.read_format;
@@ -3256,10 +4215,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3256 values[n++] = 1 + leader->nr_siblings; 4215 values[n++] = 1 + leader->nr_siblings;
3257 4216
3258 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 4217 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3259 values[n++] = leader->total_time_enabled; 4218 values[n++] = enabled;
3260 4219
3261 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 4220 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3262 values[n++] = leader->total_time_running; 4221 values[n++] = running;
3263 4222
3264 if (leader != event) 4223 if (leader != event)
3265 leader->pmu->read(leader); 4224 leader->pmu->read(leader);
@@ -3284,13 +4243,35 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3284 } 4243 }
3285} 4244}
3286 4245
4246#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
4247 PERF_FORMAT_TOTAL_TIME_RUNNING)
4248
3287static void perf_output_read(struct perf_output_handle *handle, 4249static void perf_output_read(struct perf_output_handle *handle,
3288 struct perf_event *event) 4250 struct perf_event *event)
3289{ 4251{
4252 u64 enabled = 0, running = 0, now, ctx_time;
4253 u64 read_format = event->attr.read_format;
4254
4255 /*
4256 * compute total_time_enabled, total_time_running
4257 * based on snapshot values taken when the event
4258 * was last scheduled in.
4259 *
4260 * we cannot simply call update_context_time()
4261 * because of locking issues, as we are called in
4262 * NMI context
4263 */
4264 if (read_format & PERF_FORMAT_TOTAL_TIMES) {
4265 now = perf_clock();
4266 ctx_time = event->shadow_ctx_time + now;
4267 enabled = ctx_time - event->tstamp_enabled;
4268 running = ctx_time - event->tstamp_running;
4269 }
4270
3290 if (event->attr.read_format & PERF_FORMAT_GROUP) 4271 if (event->attr.read_format & PERF_FORMAT_GROUP)
3291 perf_output_read_group(handle, event); 4272 perf_output_read_group(handle, event, enabled, running);
3292 else 4273 else
3293 perf_output_read_one(handle, event); 4274 perf_output_read_one(handle, event, enabled, running);
3294} 4275}
3295 4276
3296void perf_output_sample(struct perf_output_handle *handle, 4277void perf_output_sample(struct perf_output_handle *handle,
@@ -3370,61 +4351,16 @@ void perf_prepare_sample(struct perf_event_header *header,
3370{ 4351{
3371 u64 sample_type = event->attr.sample_type; 4352 u64 sample_type = event->attr.sample_type;
3372 4353
3373 data->type = sample_type;
3374
3375 header->type = PERF_RECORD_SAMPLE; 4354 header->type = PERF_RECORD_SAMPLE;
3376 header->size = sizeof(*header); 4355 header->size = sizeof(*header) + event->header_size;
3377 4356
3378 header->misc = 0; 4357 header->misc = 0;
3379 header->misc |= perf_misc_flags(regs); 4358 header->misc |= perf_misc_flags(regs);
3380 4359
3381 if (sample_type & PERF_SAMPLE_IP) { 4360 __perf_event_header__init_id(header, data, event);
3382 data->ip = perf_instruction_pointer(regs);
3383
3384 header->size += sizeof(data->ip);
3385 }
3386
3387 if (sample_type & PERF_SAMPLE_TID) {
3388 /* namespace issues */
3389 data->tid_entry.pid = perf_event_pid(event, current);
3390 data->tid_entry.tid = perf_event_tid(event, current);
3391
3392 header->size += sizeof(data->tid_entry);
3393 }
3394
3395 if (sample_type & PERF_SAMPLE_TIME) {
3396 data->time = perf_clock();
3397
3398 header->size += sizeof(data->time);
3399 }
3400
3401 if (sample_type & PERF_SAMPLE_ADDR)
3402 header->size += sizeof(data->addr);
3403
3404 if (sample_type & PERF_SAMPLE_ID) {
3405 data->id = primary_event_id(event);
3406
3407 header->size += sizeof(data->id);
3408 }
3409
3410 if (sample_type & PERF_SAMPLE_STREAM_ID) {
3411 data->stream_id = event->id;
3412
3413 header->size += sizeof(data->stream_id);
3414 }
3415
3416 if (sample_type & PERF_SAMPLE_CPU) {
3417 data->cpu_entry.cpu = raw_smp_processor_id();
3418 data->cpu_entry.reserved = 0;
3419 4361
3420 header->size += sizeof(data->cpu_entry); 4362 if (sample_type & PERF_SAMPLE_IP)
3421 } 4363 data->ip = perf_instruction_pointer(regs);
3422
3423 if (sample_type & PERF_SAMPLE_PERIOD)
3424 header->size += sizeof(data->period);
3425
3426 if (sample_type & PERF_SAMPLE_READ)
3427 header->size += perf_event_read_size(event);
3428 4364
3429 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 4365 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3430 int size = 1; 4366 int size = 1;
@@ -3457,14 +4393,20 @@ static void perf_event_output(struct perf_event *event, int nmi,
3457 struct perf_output_handle handle; 4393 struct perf_output_handle handle;
3458 struct perf_event_header header; 4394 struct perf_event_header header;
3459 4395
4396 /* protect the callchain buffers */
4397 rcu_read_lock();
4398
3460 perf_prepare_sample(&header, data, event, regs); 4399 perf_prepare_sample(&header, data, event, regs);
3461 4400
3462 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 4401 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3463 return; 4402 goto exit;
3464 4403
3465 perf_output_sample(&handle, &header, data, event); 4404 perf_output_sample(&handle, &header, data, event);
3466 4405
3467 perf_output_end(&handle); 4406 perf_output_end(&handle);
4407
4408exit:
4409 rcu_read_unlock();
3468} 4410}
3469 4411
3470/* 4412/*
@@ -3483,23 +4425,26 @@ perf_event_read_event(struct perf_event *event,
3483 struct task_struct *task) 4425 struct task_struct *task)
3484{ 4426{
3485 struct perf_output_handle handle; 4427 struct perf_output_handle handle;
4428 struct perf_sample_data sample;
3486 struct perf_read_event read_event = { 4429 struct perf_read_event read_event = {
3487 .header = { 4430 .header = {
3488 .type = PERF_RECORD_READ, 4431 .type = PERF_RECORD_READ,
3489 .misc = 0, 4432 .misc = 0,
3490 .size = sizeof(read_event) + perf_event_read_size(event), 4433 .size = sizeof(read_event) + event->read_size,
3491 }, 4434 },
3492 .pid = perf_event_pid(event, task), 4435 .pid = perf_event_pid(event, task),
3493 .tid = perf_event_tid(event, task), 4436 .tid = perf_event_tid(event, task),
3494 }; 4437 };
3495 int ret; 4438 int ret;
3496 4439
4440 perf_event_header__init_id(&read_event.header, &sample, event);
3497 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); 4441 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3498 if (ret) 4442 if (ret)
3499 return; 4443 return;
3500 4444
3501 perf_output_put(&handle, read_event); 4445 perf_output_put(&handle, read_event);
3502 perf_output_read(&handle, event); 4446 perf_output_read(&handle, event);
4447 perf_event__output_id_sample(event, &handle, &sample);
3503 4448
3504 perf_output_end(&handle); 4449 perf_output_end(&handle);
3505} 4450}
@@ -3529,14 +4474,16 @@ static void perf_event_task_output(struct perf_event *event,
3529 struct perf_task_event *task_event) 4474 struct perf_task_event *task_event)
3530{ 4475{
3531 struct perf_output_handle handle; 4476 struct perf_output_handle handle;
4477 struct perf_sample_data sample;
3532 struct task_struct *task = task_event->task; 4478 struct task_struct *task = task_event->task;
3533 int size, ret; 4479 int ret, size = task_event->event_id.header.size;
3534 4480
3535 size = task_event->event_id.header.size; 4481 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
3536 ret = perf_output_begin(&handle, event, size, 0, 0);
3537 4482
4483 ret = perf_output_begin(&handle, event,
4484 task_event->event_id.header.size, 0, 0);
3538 if (ret) 4485 if (ret)
3539 return; 4486 goto out;
3540 4487
3541 task_event->event_id.pid = perf_event_pid(event, task); 4488 task_event->event_id.pid = perf_event_pid(event, task);
3542 task_event->event_id.ppid = perf_event_pid(event, current); 4489 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3546,7 +4493,11 @@ static void perf_event_task_output(struct perf_event *event,
3546 4493
3547 perf_output_put(&handle, task_event->event_id); 4494 perf_output_put(&handle, task_event->event_id);
3548 4495
4496 perf_event__output_id_sample(event, &handle, &sample);
4497
3549 perf_output_end(&handle); 4498 perf_output_end(&handle);
4499out:
4500 task_event->event_id.header.size = size;
3550} 4501}
3551 4502
3552static int perf_event_task_match(struct perf_event *event) 4503static int perf_event_task_match(struct perf_event *event)
@@ -3554,7 +4505,7 @@ static int perf_event_task_match(struct perf_event *event)
3554 if (event->state < PERF_EVENT_STATE_INACTIVE) 4505 if (event->state < PERF_EVENT_STATE_INACTIVE)
3555 return 0; 4506 return 0;
3556 4507
3557 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4508 if (!event_filter_match(event))
3558 return 0; 4509 return 0;
3559 4510
3560 if (event->attr.comm || event->attr.mmap || 4511 if (event->attr.comm || event->attr.mmap ||
@@ -3578,16 +4529,29 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3578static void perf_event_task_event(struct perf_task_event *task_event) 4529static void perf_event_task_event(struct perf_task_event *task_event)
3579{ 4530{
3580 struct perf_cpu_context *cpuctx; 4531 struct perf_cpu_context *cpuctx;
3581 struct perf_event_context *ctx = task_event->task_ctx; 4532 struct perf_event_context *ctx;
4533 struct pmu *pmu;
4534 int ctxn;
3582 4535
3583 rcu_read_lock(); 4536 rcu_read_lock();
3584 cpuctx = &get_cpu_var(perf_cpu_context); 4537 list_for_each_entry_rcu(pmu, &pmus, entry) {
3585 perf_event_task_ctx(&cpuctx->ctx, task_event); 4538 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3586 if (!ctx) 4539 if (cpuctx->active_pmu != pmu)
3587 ctx = rcu_dereference(current->perf_event_ctxp); 4540 goto next;
3588 if (ctx) 4541 perf_event_task_ctx(&cpuctx->ctx, task_event);
3589 perf_event_task_ctx(ctx, task_event); 4542
3590 put_cpu_var(perf_cpu_context); 4543 ctx = task_event->task_ctx;
4544 if (!ctx) {
4545 ctxn = pmu->task_ctx_nr;
4546 if (ctxn < 0)
4547 goto next;
4548 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4549 }
4550 if (ctx)
4551 perf_event_task_ctx(ctx, task_event);
4552next:
4553 put_cpu_ptr(pmu->pmu_cpu_context);
4554 }
3591 rcu_read_unlock(); 4555 rcu_read_unlock();
3592} 4556}
3593 4557
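The same per-pmu walk is repeated below for comm and mmap events; a skeleton of the pattern, assuming the file-local pmus list (the example_* name is illustrative). A negative task_ctx_nr means the pmu has no per-task context, and the active_pmu check de-duplicates pmus that share one cpu context.

static void example_for_each_context(void (*fn)(struct perf_event_context *))
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;
	struct pmu *pmu;
	int ctxn;

	rcu_read_lock();
	list_for_each_entry_rcu(pmu, &pmus, entry) {
		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
		if (cpuctx->active_pmu == pmu) {
			fn(&cpuctx->ctx);

			ctxn = pmu->task_ctx_nr;
			if (ctxn >= 0) {
				ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
				if (ctx)
					fn(ctx);
			}
		}
		put_cpu_ptr(pmu->pmu_cpu_context);
	}
	rcu_read_unlock();
}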
@@ -3648,11 +4612,16 @@ static void perf_event_comm_output(struct perf_event *event,
3648 struct perf_comm_event *comm_event) 4612 struct perf_comm_event *comm_event)
3649{ 4613{
3650 struct perf_output_handle handle; 4614 struct perf_output_handle handle;
4615 struct perf_sample_data sample;
3651 int size = comm_event->event_id.header.size; 4616 int size = comm_event->event_id.header.size;
3652 int ret = perf_output_begin(&handle, event, size, 0, 0); 4617 int ret;
4618
4619 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4620 ret = perf_output_begin(&handle, event,
4621 comm_event->event_id.header.size, 0, 0);
3653 4622
3654 if (ret) 4623 if (ret)
3655 return; 4624 goto out;
3656 4625
3657 comm_event->event_id.pid = perf_event_pid(event, comm_event->task); 4626 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3658 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 4627 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
@@ -3660,7 +4629,12 @@ static void perf_event_comm_output(struct perf_event *event,
3660 perf_output_put(&handle, comm_event->event_id); 4629 perf_output_put(&handle, comm_event->event_id);
3661 perf_output_copy(&handle, comm_event->comm, 4630 perf_output_copy(&handle, comm_event->comm,
3662 comm_event->comm_size); 4631 comm_event->comm_size);
4632
4633 perf_event__output_id_sample(event, &handle, &sample);
4634
3663 perf_output_end(&handle); 4635 perf_output_end(&handle);
4636out:
4637 comm_event->event_id.header.size = size;
3664} 4638}
3665 4639
3666static int perf_event_comm_match(struct perf_event *event) 4640static int perf_event_comm_match(struct perf_event *event)
@@ -3668,7 +4642,7 @@ static int perf_event_comm_match(struct perf_event *event)
3668 if (event->state < PERF_EVENT_STATE_INACTIVE) 4642 if (event->state < PERF_EVENT_STATE_INACTIVE)
3669 return 0; 4643 return 0;
3670 4644
3671 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4645 if (!event_filter_match(event))
3672 return 0; 4646 return 0;
3673 4647
3674 if (event->attr.comm) 4648 if (event->attr.comm)
@@ -3692,8 +4666,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3692{ 4666{
3693 struct perf_cpu_context *cpuctx; 4667 struct perf_cpu_context *cpuctx;
3694 struct perf_event_context *ctx; 4668 struct perf_event_context *ctx;
3695 unsigned int size;
3696 char comm[TASK_COMM_LEN]; 4669 char comm[TASK_COMM_LEN];
4670 unsigned int size;
4671 struct pmu *pmu;
4672 int ctxn;
3697 4673
3698 memset(comm, 0, sizeof(comm)); 4674 memset(comm, 0, sizeof(comm));
3699 strlcpy(comm, comm_event->task->comm, sizeof(comm)); 4675 strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -3703,23 +4679,39 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3703 comm_event->comm_size = size; 4679 comm_event->comm_size = size;
3704 4680
3705 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 4681 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3706
3707 rcu_read_lock(); 4682 rcu_read_lock();
3708 cpuctx = &get_cpu_var(perf_cpu_context); 4683 list_for_each_entry_rcu(pmu, &pmus, entry) {
3709 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 4684 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3710 ctx = rcu_dereference(current->perf_event_ctxp); 4685 if (cpuctx->active_pmu != pmu)
3711 if (ctx) 4686 goto next;
3712 perf_event_comm_ctx(ctx, comm_event); 4687 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3713 put_cpu_var(perf_cpu_context); 4688
4689 ctxn = pmu->task_ctx_nr;
4690 if (ctxn < 0)
4691 goto next;
4692
4693 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4694 if (ctx)
4695 perf_event_comm_ctx(ctx, comm_event);
4696next:
4697 put_cpu_ptr(pmu->pmu_cpu_context);
4698 }
3714 rcu_read_unlock(); 4699 rcu_read_unlock();
3715} 4700}
3716 4701
3717void perf_event_comm(struct task_struct *task) 4702void perf_event_comm(struct task_struct *task)
3718{ 4703{
3719 struct perf_comm_event comm_event; 4704 struct perf_comm_event comm_event;
4705 struct perf_event_context *ctx;
4706 int ctxn;
4707
4708 for_each_task_context_nr(ctxn) {
4709 ctx = task->perf_event_ctxp[ctxn];
4710 if (!ctx)
4711 continue;
3720 4712
3721 if (task->perf_event_ctxp) 4713 perf_event_enable_on_exec(ctx);
3722 perf_event_enable_on_exec(task); 4714 }
3723 4715
3724 if (!atomic_read(&nr_comm_events)) 4716 if (!atomic_read(&nr_comm_events))
3725 return; 4717 return;
@@ -3767,11 +4759,15 @@ static void perf_event_mmap_output(struct perf_event *event,
3767 struct perf_mmap_event *mmap_event) 4759 struct perf_mmap_event *mmap_event)
3768{ 4760{
3769 struct perf_output_handle handle; 4761 struct perf_output_handle handle;
4762 struct perf_sample_data sample;
3770 int size = mmap_event->event_id.header.size; 4763 int size = mmap_event->event_id.header.size;
3771 int ret = perf_output_begin(&handle, event, size, 0, 0); 4764 int ret;
3772 4765
4766 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4767 ret = perf_output_begin(&handle, event,
4768 mmap_event->event_id.header.size, 0, 0);
3773 if (ret) 4769 if (ret)
3774 return; 4770 goto out;
3775 4771
3776 mmap_event->event_id.pid = perf_event_pid(event, current); 4772 mmap_event->event_id.pid = perf_event_pid(event, current);
3777 mmap_event->event_id.tid = perf_event_tid(event, current); 4773 mmap_event->event_id.tid = perf_event_tid(event, current);
@@ -3779,7 +4775,12 @@ static void perf_event_mmap_output(struct perf_event *event,
3779 perf_output_put(&handle, mmap_event->event_id); 4775 perf_output_put(&handle, mmap_event->event_id);
3780 perf_output_copy(&handle, mmap_event->file_name, 4776 perf_output_copy(&handle, mmap_event->file_name,
3781 mmap_event->file_size); 4777 mmap_event->file_size);
4778
4779 perf_event__output_id_sample(event, &handle, &sample);
4780
3782 perf_output_end(&handle); 4781 perf_output_end(&handle);
4782out:
4783 mmap_event->event_id.header.size = size;
3783} 4784}
3784 4785
3785static int perf_event_mmap_match(struct perf_event *event, 4786static int perf_event_mmap_match(struct perf_event *event,
@@ -3789,7 +4790,7 @@ static int perf_event_mmap_match(struct perf_event *event,
3789 if (event->state < PERF_EVENT_STATE_INACTIVE) 4790 if (event->state < PERF_EVENT_STATE_INACTIVE)
3790 return 0; 4791 return 0;
3791 4792
3792 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4793 if (!event_filter_match(event))
3793 return 0; 4794 return 0;
3794 4795
3795 if ((!executable && event->attr.mmap_data) || 4796 if ((!executable && event->attr.mmap_data) ||
@@ -3821,6 +4822,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3821 char tmp[16]; 4822 char tmp[16];
3822 char *buf = NULL; 4823 char *buf = NULL;
3823 const char *name; 4824 const char *name;
4825 struct pmu *pmu;
4826 int ctxn;
3824 4827
3825 memset(tmp, 0, sizeof(tmp)); 4828 memset(tmp, 0, sizeof(tmp));
3826 4829
@@ -3873,12 +4876,25 @@ got_name:
3873 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 4876 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3874 4877
3875 rcu_read_lock(); 4878 rcu_read_lock();
3876 cpuctx = &get_cpu_var(perf_cpu_context); 4879 list_for_each_entry_rcu(pmu, &pmus, entry) {
3877 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); 4880 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3878 ctx = rcu_dereference(current->perf_event_ctxp); 4881 if (cpuctx->active_pmu != pmu)
3879 if (ctx) 4882 goto next;
3880 perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); 4883 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
3881 put_cpu_var(perf_cpu_context); 4884 vma->vm_flags & VM_EXEC);
4885
4886 ctxn = pmu->task_ctx_nr;
4887 if (ctxn < 0)
4888 goto next;
4889
4890 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4891 if (ctx) {
4892 perf_event_mmap_ctx(ctx, mmap_event,
4893 vma->vm_flags & VM_EXEC);
4894 }
4895next:
4896 put_cpu_ptr(pmu->pmu_cpu_context);
4897 }
3882 rcu_read_unlock(); 4898 rcu_read_unlock();
3883 4899
3884 kfree(buf); 4900 kfree(buf);
@@ -3919,6 +4935,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
3919static void perf_log_throttle(struct perf_event *event, int enable) 4935static void perf_log_throttle(struct perf_event *event, int enable)
3920{ 4936{
3921 struct perf_output_handle handle; 4937 struct perf_output_handle handle;
4938 struct perf_sample_data sample;
3922 int ret; 4939 int ret;
3923 4940
3924 struct { 4941 struct {
@@ -3940,11 +4957,15 @@ static void perf_log_throttle(struct perf_event *event, int enable)
3940 if (enable) 4957 if (enable)
3941 throttle_event.header.type = PERF_RECORD_UNTHROTTLE; 4958 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3942 4959
3943 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); 4960 perf_event_header__init_id(&throttle_event.header, &sample, event);
4961
4962 ret = perf_output_begin(&handle, event,
4963 throttle_event.header.size, 1, 0);
3944 if (ret) 4964 if (ret)
3945 return; 4965 return;
3946 4966
3947 perf_output_put(&handle, throttle_event); 4967 perf_output_put(&handle, throttle_event);
4968 perf_event__output_id_sample(event, &handle, &sample);
3948 perf_output_end(&handle); 4969 perf_output_end(&handle);
3949} 4970}
3950 4971
@@ -3960,28 +4981,21 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3960 struct hw_perf_event *hwc = &event->hw; 4981 struct hw_perf_event *hwc = &event->hw;
3961 int ret = 0; 4982 int ret = 0;
3962 4983
3963 throttle = (throttle && event->pmu->unthrottle != NULL); 4984 /*
4985 * Non-sampling counters might still use the PMI to fold short
4986 * hardware counters, ignore those.
4987 */
4988 if (unlikely(!is_sampling_event(event)))
4989 return 0;
3964 4990
3965 if (!throttle) { 4991 if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
3966 hwc->interrupts++; 4992 if (throttle) {
3967 } else { 4993 hwc->interrupts = MAX_INTERRUPTS;
3968 if (hwc->interrupts != MAX_INTERRUPTS) { 4994 perf_log_throttle(event, 0);
3969 hwc->interrupts++;
3970 if (HZ * hwc->interrupts >
3971 (u64)sysctl_perf_event_sample_rate) {
3972 hwc->interrupts = MAX_INTERRUPTS;
3973 perf_log_throttle(event, 0);
3974 ret = 1;
3975 }
3976 } else {
3977 /*
3978 * Keep re-disabling events even though on the previous
3979 * pass we disabled it - just in case we raced with a
3980 * sched-in and the event got enabled again:
3981 */
3982 ret = 1; 4995 ret = 1;
3983 } 4996 }
3984 } 4997 } else
4998 hwc->interrupts++;
3985 4999
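The rewritten check above replaces the old HZ * interrupts > sysctl_perf_event_sample_rate test with a per-tick budget; roughly as follows (illustrative sketch; the real path also emits a PERF_RECORD_UNTHROTTLE/THROTTLE pair via perf_log_throttle()):

static int example_should_throttle(struct hw_perf_event *hwc, int throttle)
{
	if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
		if (throttle) {
			hwc->interrupts = MAX_INTERRUPTS;	/* parked until the tick unthrottles it */
			return 1;
		}
		return 0;
	}
	hwc->interrupts++;
	return 0;
}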
3986 if (event->attr.freq) { 5000 if (event->attr.freq) {
3987 u64 now = perf_clock(); 5001 u64 now = perf_clock();
@@ -4004,8 +5018,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4004 event->pending_kill = POLL_HUP; 5018 event->pending_kill = POLL_HUP;
4005 if (nmi) { 5019 if (nmi) {
4006 event->pending_disable = 1; 5020 event->pending_disable = 1;
4007 perf_pending_queue(&event->pending, 5021 irq_work_queue(&event->pending);
4008 perf_pending_event);
4009 } else 5022 } else
4010 perf_event_disable(event); 5023 perf_event_disable(event);
4011 } 5024 }
@@ -4015,6 +5028,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4015 else 5028 else
4016 perf_event_output(event, nmi, data, regs); 5029 perf_event_output(event, nmi, data, regs);
4017 5030
5031 if (event->fasync && event->pending_kill) {
5032 if (nmi) {
5033 event->pending_wakeup = 1;
5034 irq_work_queue(&event->pending);
5035 } else
5036 perf_event_wakeup(event);
5037 }
5038
4018 return ret; 5039 return ret;
4019} 5040}
4020 5041
@@ -4029,6 +5050,17 @@ int perf_event_overflow(struct perf_event *event, int nmi,
4029 * Generic software event infrastructure 5050 * Generic software event infrastructure
4030 */ 5051 */
4031 5052
5053struct swevent_htable {
5054 struct swevent_hlist *swevent_hlist;
5055 struct mutex hlist_mutex;
5056 int hlist_refcount;
5057
5058 /* Recursion avoidance in each contexts */
5059 int recursion[PERF_NR_CONTEXTS];
5060};
5061
5062static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
5063
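swevent_htable moves the software-event hash list and the recursion counters (one slot per context: task, softirq, irq, NMI) out of perf_cpu_context into their own per-cpu structure. The delivery path keeps the same shape as __perf_sw_event(); a sketch using the file-local helpers (illustrative only):

static void example_sw_event(u32 event_id, u64 nr, struct pt_regs *regs)
{
	struct perf_sample_data data;
	int rctx;

	rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		return;				/* this cpu is already inside the swevent path */

	perf_sample_data_init(&data, 0);
	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, 1, &data, regs);

	perf_swevent_put_recursion_context(rctx);
}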
4032/* 5064/*
4033 * We directly increment event->count and keep a second value in 5065 * We directly increment event->count and keep a second value in
4034 * event->hw.period_left to count intervals. This period event 5066 * event->hw.period_left to count intervals. This period event
@@ -4086,7 +5118,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4086 } 5118 }
4087} 5119}
4088 5120
4089static void perf_swevent_add(struct perf_event *event, u64 nr, 5121static void perf_swevent_event(struct perf_event *event, u64 nr,
4090 int nmi, struct perf_sample_data *data, 5122 int nmi, struct perf_sample_data *data,
4091 struct pt_regs *regs) 5123 struct pt_regs *regs)
4092{ 5124{
@@ -4097,7 +5129,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4097 if (!regs) 5129 if (!regs)
4098 return; 5130 return;
4099 5131
4100 if (!hwc->sample_period) 5132 if (!is_sampling_event(event))
4101 return; 5133 return;
4102 5134
4103 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 5135 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
@@ -4112,6 +5144,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4112static int perf_exclude_event(struct perf_event *event, 5144static int perf_exclude_event(struct perf_event *event,
4113 struct pt_regs *regs) 5145 struct pt_regs *regs)
4114{ 5146{
5147 if (event->hw.state & PERF_HES_STOPPED)
5148 return 1;
5149
4115 if (regs) { 5150 if (regs) {
4116 if (event->attr.exclude_user && user_mode(regs)) 5151 if (event->attr.exclude_user && user_mode(regs))
4117 return 1; 5152 return 1;
@@ -4158,11 +5193,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4158 5193
4159/* For the read side: events when they trigger */ 5194/* For the read side: events when they trigger */
4160static inline struct hlist_head * 5195static inline struct hlist_head *
4161find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) 5196find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
4162{ 5197{
4163 struct swevent_hlist *hlist; 5198 struct swevent_hlist *hlist;
4164 5199
4165 hlist = rcu_dereference(ctx->swevent_hlist); 5200 hlist = rcu_dereference(swhash->swevent_hlist);
4166 if (!hlist) 5201 if (!hlist)
4167 return NULL; 5202 return NULL;
4168 5203
@@ -4171,7 +5206,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4171 5206
4172/* For the event head insertion and removal in the hlist */ 5207/* For the event head insertion and removal in the hlist */
4173static inline struct hlist_head * 5208static inline struct hlist_head *
4174find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) 5209find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
4175{ 5210{
4176 struct swevent_hlist *hlist; 5211 struct swevent_hlist *hlist;
4177 u32 event_id = event->attr.config; 5212 u32 event_id = event->attr.config;
@@ -4182,7 +5217,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4182 * and release. Which makes the protected version suitable here. 5217 * and release. Which makes the protected version suitable here.
4183 * The context lock guarantees that. 5218 * The context lock guarantees that.
4184 */ 5219 */
4185 hlist = rcu_dereference_protected(ctx->swevent_hlist, 5220 hlist = rcu_dereference_protected(swhash->swevent_hlist,
4186 lockdep_is_held(&event->ctx->lock)); 5221 lockdep_is_held(&event->ctx->lock));
4187 if (!hlist) 5222 if (!hlist)
4188 return NULL; 5223 return NULL;
@@ -4195,23 +5230,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4195 struct perf_sample_data *data, 5230 struct perf_sample_data *data,
4196 struct pt_regs *regs) 5231 struct pt_regs *regs)
4197{ 5232{
4198 struct perf_cpu_context *cpuctx; 5233 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4199 struct perf_event *event; 5234 struct perf_event *event;
4200 struct hlist_node *node; 5235 struct hlist_node *node;
4201 struct hlist_head *head; 5236 struct hlist_head *head;
4202 5237
4203 cpuctx = &__get_cpu_var(perf_cpu_context);
4204
4205 rcu_read_lock(); 5238 rcu_read_lock();
4206 5239 head = find_swevent_head_rcu(swhash, type, event_id);
4207 head = find_swevent_head_rcu(cpuctx, type, event_id);
4208
4209 if (!head) 5240 if (!head)
4210 goto end; 5241 goto end;
4211 5242
4212 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5243 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4213 if (perf_swevent_match(event, type, event_id, data, regs)) 5244 if (perf_swevent_match(event, type, event_id, data, regs))
4214 perf_swevent_add(event, nr, nmi, data, regs); 5245 perf_swevent_event(event, nr, nmi, data, regs);
4215 } 5246 }
4216end: 5247end:
4217 rcu_read_unlock(); 5248 rcu_read_unlock();
@@ -4219,33 +5250,17 @@ end:
4219 5250
4220int perf_swevent_get_recursion_context(void) 5251int perf_swevent_get_recursion_context(void)
4221{ 5252{
4222 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 5253 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4223 int rctx;
4224
4225 if (in_nmi())
4226 rctx = 3;
4227 else if (in_irq())
4228 rctx = 2;
4229 else if (in_softirq())
4230 rctx = 1;
4231 else
4232 rctx = 0;
4233
4234 if (cpuctx->recursion[rctx])
4235 return -1;
4236
4237 cpuctx->recursion[rctx]++;
4238 barrier();
4239 5254
4240 return rctx; 5255 return get_recursion_context(swhash->recursion);
4241} 5256}
4242EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 5257EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4243 5258
4244void inline perf_swevent_put_recursion_context(int rctx) 5259inline void perf_swevent_put_recursion_context(int rctx)
4245{ 5260{
4246 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 5261 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4247 barrier(); 5262
4248 cpuctx->recursion[rctx]--; 5263 put_recursion_context(swhash->recursion, rctx);
4249} 5264}
4250 5265
4251void __perf_sw_event(u32 event_id, u64 nr, int nmi, 5266void __perf_sw_event(u32 event_id, u64 nr, int nmi,
@@ -4271,20 +5286,20 @@ static void perf_swevent_read(struct perf_event *event)
4271{ 5286{
4272} 5287}
4273 5288
4274static int perf_swevent_enable(struct perf_event *event) 5289static int perf_swevent_add(struct perf_event *event, int flags)
4275{ 5290{
5291 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4276 struct hw_perf_event *hwc = &event->hw; 5292 struct hw_perf_event *hwc = &event->hw;
4277 struct perf_cpu_context *cpuctx;
4278 struct hlist_head *head; 5293 struct hlist_head *head;
4279 5294
4280 cpuctx = &__get_cpu_var(perf_cpu_context); 5295 if (is_sampling_event(event)) {
4281
4282 if (hwc->sample_period) {
4283 hwc->last_period = hwc->sample_period; 5296 hwc->last_period = hwc->sample_period;
4284 perf_swevent_set_period(event); 5297 perf_swevent_set_period(event);
4285 } 5298 }
4286 5299
4287 head = find_swevent_head(cpuctx, event); 5300 hwc->state = !(flags & PERF_EF_START);
5301
5302 head = find_swevent_head(swhash, event);
4288 if (WARN_ON_ONCE(!head)) 5303 if (WARN_ON_ONCE(!head))
4289 return -EINVAL; 5304 return -EINVAL;
4290 5305
@@ -4293,233 +5308,50 @@ static int perf_swevent_enable(struct perf_event *event)
4293 return 0; 5308 return 0;
4294} 5309}
4295 5310
4296static void perf_swevent_disable(struct perf_event *event) 5311static void perf_swevent_del(struct perf_event *event, int flags)
4297{ 5312{
4298 hlist_del_rcu(&event->hlist_entry); 5313 hlist_del_rcu(&event->hlist_entry);
4299} 5314}
4300 5315
4301static void perf_swevent_void(struct perf_event *event) 5316static void perf_swevent_start(struct perf_event *event, int flags)
4302{ 5317{
5318 event->hw.state = 0;
4303} 5319}
4304 5320
4305static int perf_swevent_int(struct perf_event *event) 5321static void perf_swevent_stop(struct perf_event *event, int flags)
4306{ 5322{
4307 return 0; 5323 event->hw.state = PERF_HES_STOPPED;
4308} 5324}
4309 5325
4310static const struct pmu perf_ops_generic = {
4311 .enable = perf_swevent_enable,
4312 .disable = perf_swevent_disable,
4313 .start = perf_swevent_int,
4314 .stop = perf_swevent_void,
4315 .read = perf_swevent_read,
4316 .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
4317};
4318
4319/*
4320 * hrtimer based swevent callback
4321 */
4322
4323static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4324{
4325 enum hrtimer_restart ret = HRTIMER_RESTART;
4326 struct perf_sample_data data;
4327 struct pt_regs *regs;
4328 struct perf_event *event;
4329 u64 period;
4330
4331 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4332 event->pmu->read(event);
4333
4334 perf_sample_data_init(&data, 0);
4335 data.period = event->hw.last_period;
4336 regs = get_irq_regs();
4337
4338 if (regs && !perf_exclude_event(event, regs)) {
4339 if (!(event->attr.exclude_idle && current->pid == 0))
4340 if (perf_event_overflow(event, 0, &data, regs))
4341 ret = HRTIMER_NORESTART;
4342 }
4343
4344 period = max_t(u64, 10000, event->hw.sample_period);
4345 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4346
4347 return ret;
4348}
4349
4350static void perf_swevent_start_hrtimer(struct perf_event *event)
4351{
4352 struct hw_perf_event *hwc = &event->hw;
4353
4354 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4355 hwc->hrtimer.function = perf_swevent_hrtimer;
4356 if (hwc->sample_period) {
4357 u64 period;
4358
4359 if (hwc->remaining) {
4360 if (hwc->remaining < 0)
4361 period = 10000;
4362 else
4363 period = hwc->remaining;
4364 hwc->remaining = 0;
4365 } else {
4366 period = max_t(u64, 10000, hwc->sample_period);
4367 }
4368 __hrtimer_start_range_ns(&hwc->hrtimer,
4369 ns_to_ktime(period), 0,
4370 HRTIMER_MODE_REL, 0);
4371 }
4372}
4373
4374static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4375{
4376 struct hw_perf_event *hwc = &event->hw;
4377
4378 if (hwc->sample_period) {
4379 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4380 hwc->remaining = ktime_to_ns(remaining);
4381
4382 hrtimer_cancel(&hwc->hrtimer);
4383 }
4384}
4385
4386/*
4387 * Software event: cpu wall time clock
4388 */
4389
4390static void cpu_clock_perf_event_update(struct perf_event *event)
4391{
4392 int cpu = raw_smp_processor_id();
4393 s64 prev;
4394 u64 now;
4395
4396 now = cpu_clock(cpu);
4397 prev = local64_xchg(&event->hw.prev_count, now);
4398 local64_add(now - prev, &event->count);
4399}
4400
4401static int cpu_clock_perf_event_enable(struct perf_event *event)
4402{
4403 struct hw_perf_event *hwc = &event->hw;
4404 int cpu = raw_smp_processor_id();
4405
4406 local64_set(&hwc->prev_count, cpu_clock(cpu));
4407 perf_swevent_start_hrtimer(event);
4408
4409 return 0;
4410}
4411
4412static void cpu_clock_perf_event_disable(struct perf_event *event)
4413{
4414 perf_swevent_cancel_hrtimer(event);
4415 cpu_clock_perf_event_update(event);
4416}
4417
4418static void cpu_clock_perf_event_read(struct perf_event *event)
4419{
4420 cpu_clock_perf_event_update(event);
4421}
4422
4423static const struct pmu perf_ops_cpu_clock = {
4424 .enable = cpu_clock_perf_event_enable,
4425 .disable = cpu_clock_perf_event_disable,
4426 .read = cpu_clock_perf_event_read,
4427};
4428
4429/*
4430 * Software event: task time clock
4431 */
4432
4433static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4434{
4435 u64 prev;
4436 s64 delta;
4437
4438 prev = local64_xchg(&event->hw.prev_count, now);
4439 delta = now - prev;
4440 local64_add(delta, &event->count);
4441}
4442
4443static int task_clock_perf_event_enable(struct perf_event *event)
4444{
4445 struct hw_perf_event *hwc = &event->hw;
4446 u64 now;
4447
4448 now = event->ctx->time;
4449
4450 local64_set(&hwc->prev_count, now);
4451
4452 perf_swevent_start_hrtimer(event);
4453
4454 return 0;
4455}
4456
4457static void task_clock_perf_event_disable(struct perf_event *event)
4458{
4459 perf_swevent_cancel_hrtimer(event);
4460 task_clock_perf_event_update(event, event->ctx->time);
4461
4462}
4463
4464static void task_clock_perf_event_read(struct perf_event *event)
4465{
4466 u64 time;
4467
4468 if (!in_nmi()) {
4469 update_context_time(event->ctx);
4470 time = event->ctx->time;
4471 } else {
4472 u64 now = perf_clock();
4473 u64 delta = now - event->ctx->timestamp;
4474 time = event->ctx->time + delta;
4475 }
4476
4477 task_clock_perf_event_update(event, time);
4478}
4479
4480static const struct pmu perf_ops_task_clock = {
4481 .enable = task_clock_perf_event_enable,
4482 .disable = task_clock_perf_event_disable,
4483 .read = task_clock_perf_event_read,
4484};
4485
4486/* Deref the hlist from the update side */ 5326/* Deref the hlist from the update side */
4487static inline struct swevent_hlist * 5327static inline struct swevent_hlist *
4488swevent_hlist_deref(struct perf_cpu_context *cpuctx) 5328swevent_hlist_deref(struct swevent_htable *swhash)
4489{
4490 return rcu_dereference_protected(cpuctx->swevent_hlist,
4491 lockdep_is_held(&cpuctx->hlist_mutex));
4492}
4493
4494static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4495{ 5329{
4496 struct swevent_hlist *hlist; 5330 return rcu_dereference_protected(swhash->swevent_hlist,
4497 5331 lockdep_is_held(&swhash->hlist_mutex));
4498 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
4499 kfree(hlist);
4500} 5332}
4501 5333
4502static void swevent_hlist_release(struct perf_cpu_context *cpuctx) 5334static void swevent_hlist_release(struct swevent_htable *swhash)
4503{ 5335{
4504 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); 5336 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
4505 5337
4506 if (!hlist) 5338 if (!hlist)
4507 return; 5339 return;
4508 5340
4509 rcu_assign_pointer(cpuctx->swevent_hlist, NULL); 5341 rcu_assign_pointer(swhash->swevent_hlist, NULL);
4510 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 5342 kfree_rcu(hlist, rcu_head);
4511} 5343}
4512 5344
4513static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 5345static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4514{ 5346{
4515 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 5347 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4516 5348
4517 mutex_lock(&cpuctx->hlist_mutex); 5349 mutex_lock(&swhash->hlist_mutex);
4518 5350
4519 if (!--cpuctx->hlist_refcount) 5351 if (!--swhash->hlist_refcount)
4520 swevent_hlist_release(cpuctx); 5352 swevent_hlist_release(swhash);
4521 5353
4522 mutex_unlock(&cpuctx->hlist_mutex); 5354 mutex_unlock(&swhash->hlist_mutex);
4523} 5355}
4524 5356
4525static void swevent_hlist_put(struct perf_event *event) 5357static void swevent_hlist_put(struct perf_event *event)
@@ -4537,12 +5369,12 @@ static void swevent_hlist_put(struct perf_event *event)
4537 5369
4538static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) 5370static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4539{ 5371{
4540 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 5372 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4541 int err = 0; 5373 int err = 0;
4542 5374
4543 mutex_lock(&cpuctx->hlist_mutex); 5375 mutex_lock(&swhash->hlist_mutex);
4544 5376
4545 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { 5377 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
4546 struct swevent_hlist *hlist; 5378 struct swevent_hlist *hlist;
4547 5379
4548 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 5380 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4550,11 +5382,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4550 err = -ENOMEM; 5382 err = -ENOMEM;
4551 goto exit; 5383 goto exit;
4552 } 5384 }
4553 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 5385 rcu_assign_pointer(swhash->swevent_hlist, hlist);
4554 } 5386 }
4555 cpuctx->hlist_refcount++; 5387 swhash->hlist_refcount++;
4556 exit: 5388exit:
4557 mutex_unlock(&cpuctx->hlist_mutex); 5389 mutex_unlock(&swhash->hlist_mutex);
4558 5390
4559 return err; 5391 return err;
4560} 5392}
@@ -4578,7 +5410,7 @@ static int swevent_hlist_get(struct perf_event *event)
4578 put_online_cpus(); 5410 put_online_cpus();
4579 5411
4580 return 0; 5412 return 0;
4581 fail: 5413fail:
4582 for_each_possible_cpu(cpu) { 5414 for_each_possible_cpu(cpu) {
4583 if (cpu == failed_cpu) 5415 if (cpu == failed_cpu)
4584 break; 5416 break;
@@ -4589,17 +5421,64 @@ static int swevent_hlist_get(struct perf_event *event)
4589 return err; 5421 return err;
4590} 5422}
4591 5423
4592#ifdef CONFIG_EVENT_TRACING 5424struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
5425
5426static void sw_perf_event_destroy(struct perf_event *event)
5427{
5428 u64 event_id = event->attr.config;
5429
5430 WARN_ON(event->parent);
5431
5432 jump_label_dec(&perf_swevent_enabled[event_id]);
5433 swevent_hlist_put(event);
5434}
5435
5436static int perf_swevent_init(struct perf_event *event)
5437{
5438 int event_id = event->attr.config;
5439
5440 if (event->attr.type != PERF_TYPE_SOFTWARE)
5441 return -ENOENT;
5442
5443 switch (event_id) {
5444 case PERF_COUNT_SW_CPU_CLOCK:
5445 case PERF_COUNT_SW_TASK_CLOCK:
5446 return -ENOENT;
5447
5448 default:
5449 break;
5450 }
5451
5452 if (event_id >= PERF_COUNT_SW_MAX)
5453 return -ENOENT;
5454
5455 if (!event->parent) {
5456 int err;
5457
5458 err = swevent_hlist_get(event);
5459 if (err)
5460 return err;
5461
5462 jump_label_inc(&perf_swevent_enabled[event_id]);
5463 event->destroy = sw_perf_event_destroy;
5464 }
4593 5465
4594static const struct pmu perf_ops_tracepoint = { 5466 return 0;
4595 .enable = perf_trace_enable, 5467}
4596 .disable = perf_trace_disable, 5468
4597 .start = perf_swevent_int, 5469static struct pmu perf_swevent = {
4598 .stop = perf_swevent_void, 5470 .task_ctx_nr = perf_sw_context,
5471
5472 .event_init = perf_swevent_init,
5473 .add = perf_swevent_add,
5474 .del = perf_swevent_del,
5475 .start = perf_swevent_start,
5476 .stop = perf_swevent_stop,
4599 .read = perf_swevent_read, 5477 .read = perf_swevent_read,
4600 .unthrottle = perf_swevent_void,
4601}; 5478};
4602 5479
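perf_swevent above is the first user of the reworked struct pmu interface: event_init claims events of a matching type (returning -ENOENT lets the core try the next pmu), and add/del/start/stop replace the old enable/disable/unthrottle hooks. A hedged skeleton of a minimal pmu under those assumptions (all example_* names are illustrative):

static int example_event_init(struct perf_event *event)
{
	if (event->attr.type != PERF_TYPE_SOFTWARE)
		return -ENOENT;			/* not ours; core moves on to the next pmu */
	return 0;
}

static int example_add(struct perf_event *event, int flags)
{
	event->hw.state = (flags & PERF_EF_START) ? 0 : PERF_HES_STOPPED;
	return 0;
}

static void example_del(struct perf_event *event, int flags)	{ }
static void example_start(struct perf_event *event, int flags)	{ event->hw.state = 0; }
static void example_stop(struct perf_event *event, int flags)	{ event->hw.state = PERF_HES_STOPPED; }
static void example_read(struct perf_event *event)		{ }

static struct pmu example_pmu = {
	.task_ctx_nr	= perf_sw_context,
	.event_init	= example_event_init,
	.add		= example_add,
	.del		= example_del,
	.start		= example_start,
	.stop		= example_stop,
	.read		= example_read,
};

/* registered once at init, like perf_tp_register() below:
 *	perf_pmu_register(&example_pmu, "example", -1);
 */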
5480#ifdef CONFIG_EVENT_TRACING
5481
4603static int perf_tp_filter_match(struct perf_event *event, 5482static int perf_tp_filter_match(struct perf_event *event,
4604 struct perf_sample_data *data) 5483 struct perf_sample_data *data)
4605{ 5484{
@@ -4614,6 +5493,8 @@ static int perf_tp_event_match(struct perf_event *event,
4614 struct perf_sample_data *data, 5493 struct perf_sample_data *data,
4615 struct pt_regs *regs) 5494 struct pt_regs *regs)
4616{ 5495{
5496 if (event->hw.state & PERF_HES_STOPPED)
5497 return 0;
4617 /* 5498 /*
4618 * All tracepoints are from kernel-space. 5499 * All tracepoints are from kernel-space.
4619 */ 5500 */
@@ -4643,7 +5524,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4643 5524
4644 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5525 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4645 if (perf_tp_event_match(event, &data, regs)) 5526 if (perf_tp_event_match(event, &data, regs))
4646 perf_swevent_add(event, count, 1, &data, regs); 5527 perf_swevent_event(event, count, 1, &data, regs);
4647 } 5528 }
4648 5529
4649 perf_swevent_put_recursion_context(rctx); 5530 perf_swevent_put_recursion_context(rctx);
@@ -4655,26 +5536,36 @@ static void tp_perf_event_destroy(struct perf_event *event)
4655 perf_trace_destroy(event); 5536 perf_trace_destroy(event);
4656} 5537}
4657 5538
4658static const struct pmu *tp_perf_event_init(struct perf_event *event) 5539static int perf_tp_event_init(struct perf_event *event)
4659{ 5540{
4660 int err; 5541 int err;
4661 5542
4662 /* 5543 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4663 * Raw tracepoint data is a severe data leak, only allow root to 5544 return -ENOENT;
4664 * have these.
4665 */
4666 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4667 perf_paranoid_tracepoint_raw() &&
4668 !capable(CAP_SYS_ADMIN))
4669 return ERR_PTR(-EPERM);
4670 5545
4671 err = perf_trace_init(event); 5546 err = perf_trace_init(event);
4672 if (err) 5547 if (err)
4673 return NULL; 5548 return err;
4674 5549
4675 event->destroy = tp_perf_event_destroy; 5550 event->destroy = tp_perf_event_destroy;
4676 5551
4677 return &perf_ops_tracepoint; 5552 return 0;
5553}
5554
5555static struct pmu perf_tracepoint = {
5556 .task_ctx_nr = perf_sw_context,
5557
5558 .event_init = perf_tp_event_init,
5559 .add = perf_trace_add,
5560 .del = perf_trace_del,
5561 .start = perf_swevent_start,
5562 .stop = perf_swevent_stop,
5563 .read = perf_swevent_read,
5564};
5565
5566static inline void perf_tp_register(void)
5567{
5568 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
4678} 5569}
4679 5570
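
Tracepoint events now go through the same struct pmu machinery: perf_tp_register() registers perf_tracepoint under the fixed PERF_TYPE_TRACEPOINT id and event_init simply defers to perf_trace_init(). From userspace the selection is unchanged: attr.config carries the tracepoint id exported under the tracing debugfs tree. A hedged sketch (the /sys/kernel/debug mount point and the sched_switch event are only the usual conventions):

	/* Sketch: open a tracepoint event by id; assumes debugfs at /sys/kernel/debug. */
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	static long long read_tp_id(const char *path)
	{
		long long id = -1;
		FILE *f = fopen(path, "r");

		if (f) {
			if (fscanf(f, "%lld", &id) != 1)
				id = -1;
			fclose(f);
		}
		return id;
	}

	int main(void)
	{
		struct perf_event_attr attr;
		long long id = read_tp_id(
			"/sys/kernel/debug/tracing/events/sched/sched_switch/id");
		int fd;

		if (id < 0)
			return 1;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_TRACEPOINT;	/* routed to the perf_tracepoint pmu */
		attr.config = id;			/* which tracepoint */

		fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		return fd < 0;
	}
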
4680static int perf_event_set_filter(struct perf_event *event, void __user *arg) 5571static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4702,9 +5593,8 @@ static void perf_event_free_filter(struct perf_event *event)
4702 5593
4703#else 5594#else
4704 5595
4705static const struct pmu *tp_perf_event_init(struct perf_event *event) 5596static inline void perf_tp_register(void)
4706{ 5597{
4707 return NULL;
4708} 5598}
4709 5599
4710static int perf_event_set_filter(struct perf_event *event, void __user *arg) 5600static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4719,105 +5609,535 @@ static void perf_event_free_filter(struct perf_event *event)
4719#endif /* CONFIG_EVENT_TRACING */ 5609#endif /* CONFIG_EVENT_TRACING */
4720 5610
4721#ifdef CONFIG_HAVE_HW_BREAKPOINT 5611#ifdef CONFIG_HAVE_HW_BREAKPOINT
4722static void bp_perf_event_destroy(struct perf_event *event) 5612void perf_bp_event(struct perf_event *bp, void *data)
4723{ 5613{
4724 release_bp_slot(event); 5614 struct perf_sample_data sample;
5615 struct pt_regs *regs = data;
5616
5617 perf_sample_data_init(&sample, bp->attr.bp_addr);
5618
5619 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5620 perf_swevent_event(bp, 1, 1, &sample, regs);
4725} 5621}
5622#endif
5623
5624/*
5625 * hrtimer based swevent callback
5626 */
4726 5627
4727static const struct pmu *bp_perf_event_init(struct perf_event *bp) 5628static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4728{ 5629{
4729 int err; 5630 enum hrtimer_restart ret = HRTIMER_RESTART;
5631 struct perf_sample_data data;
5632 struct pt_regs *regs;
5633 struct perf_event *event;
5634 u64 period;
4730 5635
4731 err = register_perf_hw_breakpoint(bp); 5636 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4732 if (err) 5637
4733 return ERR_PTR(err); 5638 if (event->state != PERF_EVENT_STATE_ACTIVE)
5639 return HRTIMER_NORESTART;
5640
5641 event->pmu->read(event);
5642
5643 perf_sample_data_init(&data, 0);
5644 data.period = event->hw.last_period;
5645 regs = get_irq_regs();
5646
5647 if (regs && !perf_exclude_event(event, regs)) {
5648 if (!(event->attr.exclude_idle && current->pid == 0))
5649 if (perf_event_overflow(event, 0, &data, regs))
5650 ret = HRTIMER_NORESTART;
5651 }
4734 5652
4735 bp->destroy = bp_perf_event_destroy; 5653 period = max_t(u64, 10000, event->hw.sample_period);
5654 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4736 5655
4737 return &perf_ops_bp; 5656 return ret;
4738} 5657}
4739 5658
4740void perf_bp_event(struct perf_event *bp, void *data) 5659static void perf_swevent_start_hrtimer(struct perf_event *event)
4741{ 5660{
4742 struct perf_sample_data sample; 5661 struct hw_perf_event *hwc = &event->hw;
4743 struct pt_regs *regs = data; 5662 s64 period;
4744 5663
4745 perf_sample_data_init(&sample, bp->attr.bp_addr); 5664 if (!is_sampling_event(event))
5665 return;
4746 5666
4747 if (!perf_exclude_event(bp, regs)) 5667 period = local64_read(&hwc->period_left);
4748 perf_swevent_add(bp, 1, 1, &sample, regs); 5668 if (period) {
5669 if (period < 0)
5670 period = 10000;
5671
5672 local64_set(&hwc->period_left, 0);
5673 } else {
5674 period = max_t(u64, 10000, hwc->sample_period);
5675 }
5676 __hrtimer_start_range_ns(&hwc->hrtimer,
5677 ns_to_ktime(period), 0,
5678 HRTIMER_MODE_REL_PINNED, 0);
4749} 5679}
4750#else 5680
4751static const struct pmu *bp_perf_event_init(struct perf_event *bp) 5681static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5682{
5683 struct hw_perf_event *hwc = &event->hw;
5684
5685 if (is_sampling_event(event)) {
5686 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
5687 local64_set(&hwc->period_left, ktime_to_ns(remaining));
5688
5689 hrtimer_cancel(&hwc->hrtimer);
5690 }
5691}
5692
5693static void perf_swevent_init_hrtimer(struct perf_event *event)
5694{
5695 struct hw_perf_event *hwc = &event->hw;
5696
5697 if (!is_sampling_event(event))
5698 return;
5699
5700 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5701 hwc->hrtimer.function = perf_swevent_hrtimer;
5702
5703 /*
5704 * Since hrtimers have a fixed rate, we can do a static freq->period
5705 * mapping and avoid the whole period adjust feedback stuff.
5706 */
5707 if (event->attr.freq) {
5708 long freq = event->attr.sample_freq;
5709
5710 event->attr.sample_period = NSEC_PER_SEC / freq;
5711 hwc->sample_period = event->attr.sample_period;
5712 local64_set(&hwc->period_left, hwc->sample_period);
5713 event->attr.freq = 0;
5714 }
5715}
5716
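
Because the hrtimer fires at whatever period it is programmed with, perf_swevent_init_hrtimer() can convert a requested sample_freq into a fixed period once, up front, instead of using the adaptive period-adjustment feedback that hardware counters need. A worked example of that arithmetic, plus the 10 us floor applied when the timer is armed (illustrative numbers only):

	/* Worked example of the static freq -> period mapping (illustrative only). */
	#include <stdio.h>
	#include <stdint.h>

	#define NSEC_PER_SEC 1000000000ULL

	int main(void)
	{
		uint64_t freq = 4000;				/* requested sample_freq, Hz */
		uint64_t period = NSEC_PER_SEC / freq;		/* 250000 ns between samples */

		/* perf_swevent_start_hrtimer() floors freshly computed periods at 10 us */
		if (period < 10000)
			period = 10000;

		printf("freq=%lluHz -> period=%lluns\n",
		       (unsigned long long)freq, (unsigned long long)period);
		return 0;
	}
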
5717/*
5718 * Software event: cpu wall time clock
5719 */
5720
5721static void cpu_clock_event_update(struct perf_event *event)
5722{
5723 s64 prev;
5724 u64 now;
5725
5726 now = local_clock();
5727 prev = local64_xchg(&event->hw.prev_count, now);
5728 local64_add(now - prev, &event->count);
5729}
5730
5731static void cpu_clock_event_start(struct perf_event *event, int flags)
5732{
5733 local64_set(&event->hw.prev_count, local_clock());
5734 perf_swevent_start_hrtimer(event);
5735}
5736
5737static void cpu_clock_event_stop(struct perf_event *event, int flags)
5738{
5739 perf_swevent_cancel_hrtimer(event);
5740 cpu_clock_event_update(event);
5741}
5742
5743static int cpu_clock_event_add(struct perf_event *event, int flags)
5744{
5745 if (flags & PERF_EF_START)
5746 cpu_clock_event_start(event, flags);
5747
5748 return 0;
5749}
5750
5751static void cpu_clock_event_del(struct perf_event *event, int flags)
5752{
5753 cpu_clock_event_stop(event, flags);
5754}
5755
5756static void cpu_clock_event_read(struct perf_event *event)
5757{
5758 cpu_clock_event_update(event);
5759}
5760
5761static int cpu_clock_event_init(struct perf_event *event)
5762{
5763 if (event->attr.type != PERF_TYPE_SOFTWARE)
5764 return -ENOENT;
5765
5766 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5767 return -ENOENT;
5768
5769 perf_swevent_init_hrtimer(event);
5770
5771 return 0;
5772}
5773
5774static struct pmu perf_cpu_clock = {
5775 .task_ctx_nr = perf_sw_context,
5776
5777 .event_init = cpu_clock_event_init,
5778 .add = cpu_clock_event_add,
5779 .del = cpu_clock_event_del,
5780 .start = cpu_clock_event_start,
5781 .stop = cpu_clock_event_stop,
5782 .read = cpu_clock_event_read,
5783};
5784
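
cpu_clock_event_update() relies on an exchange-and-accumulate idiom: swap the current timestamp into prev_count and add only the difference to the event count, so each elapsed slice is accounted exactly once no matter how start/stop/read interleave. A userspace analog of the same pattern, with local64_t approximated by C11 atomics (illustrative, not kernel code):

	/* Userspace analog of the prev_count/xchg delta accumulation (illustrative). */
	#include <stdio.h>
	#include <stdint.h>
	#include <time.h>
	#include <stdatomic.h>

	static _Atomic uint64_t prev_count;
	static _Atomic uint64_t count;

	static uint64_t now_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
	}

	static void clock_event_update(void)
	{
		uint64_t now = now_ns();
		uint64_t prev = atomic_exchange(&prev_count, now);

		atomic_fetch_add(&count, now - prev);	/* only the new delta is added */
	}

	int main(void)
	{
		atomic_store(&prev_count, now_ns());	/* "start": snapshot the clock */
		/* ... time passes ... */
		clock_event_update();			/* "read" or "stop" */
		printf("elapsed: %llu ns\n", (unsigned long long)atomic_load(&count));
		return 0;
	}
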
5785/*
5786 * Software event: task time clock
5787 */
5788
5789static void task_clock_event_update(struct perf_event *event, u64 now)
5790{
5791 u64 prev;
5792 s64 delta;
5793
5794 prev = local64_xchg(&event->hw.prev_count, now);
5795 delta = now - prev;
5796 local64_add(delta, &event->count);
5797}
5798
5799static void task_clock_event_start(struct perf_event *event, int flags)
5800{
5801 local64_set(&event->hw.prev_count, event->ctx->time);
5802 perf_swevent_start_hrtimer(event);
5803}
5804
5805static void task_clock_event_stop(struct perf_event *event, int flags)
5806{
5807 perf_swevent_cancel_hrtimer(event);
5808 task_clock_event_update(event, event->ctx->time);
5809}
5810
5811static int task_clock_event_add(struct perf_event *event, int flags)
5812{
5813 if (flags & PERF_EF_START)
5814 task_clock_event_start(event, flags);
5815
5816 return 0;
5817}
5818
5819static void task_clock_event_del(struct perf_event *event, int flags)
5820{
5821 task_clock_event_stop(event, PERF_EF_UPDATE);
5822}
5823
5824static void task_clock_event_read(struct perf_event *event)
5825{
5826 u64 now = perf_clock();
5827 u64 delta = now - event->ctx->timestamp;
5828 u64 time = event->ctx->time + delta;
5829
5830 task_clock_event_update(event, time);
5831}
5832
5833static int task_clock_event_init(struct perf_event *event)
5834{
5835 if (event->attr.type != PERF_TYPE_SOFTWARE)
5836 return -ENOENT;
5837
5838 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5839 return -ENOENT;
5840
5841 perf_swevent_init_hrtimer(event);
5842
5843 return 0;
5844}
5845
5846static struct pmu perf_task_clock = {
5847 .task_ctx_nr = perf_sw_context,
5848
5849 .event_init = task_clock_event_init,
5850 .add = task_clock_event_add,
5851 .del = task_clock_event_del,
5852 .start = task_clock_event_start,
5853 .stop = task_clock_event_stop,
5854 .read = task_clock_event_read,
5855};
5856
5857static void perf_pmu_nop_void(struct pmu *pmu)
5858{
5859}
5860
5861static int perf_pmu_nop_int(struct pmu *pmu)
5862{
5863 return 0;
5864}
5865
5866static void perf_pmu_start_txn(struct pmu *pmu)
5867{
5868 perf_pmu_disable(pmu);
5869}
5870
5871static int perf_pmu_commit_txn(struct pmu *pmu)
5872{
5873 perf_pmu_enable(pmu);
5874 return 0;
5875}
5876
5877static void perf_pmu_cancel_txn(struct pmu *pmu)
5878{
5879 perf_pmu_enable(pmu);
5880}
5881
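
These transaction stubs give every pmu a uniform start_txn/commit_txn/cancel_txn interface: for drivers that expose pmu_enable/pmu_disable, a group add is simply bracketed by one disable and one enable so the hardware reprogramming can be batched; drivers without them get no-ops. A userspace analog of how a group add uses that bracket, with invented names and a pretend two-counter limit:

	/* Userspace analog of the start_txn/add/commit_txn bracket (illustrative). */
	#include <stdio.h>

	struct fake_pmu {
		int disabled;
		int programmed;
	};

	static void pmu_disable(struct fake_pmu *p) { p->disabled = 1; }
	static void pmu_enable(struct fake_pmu *p)  { p->disabled = 0; }

	static void start_txn(struct fake_pmu *p)  { pmu_disable(p); }	/* like perf_pmu_start_txn */
	static int  commit_txn(struct fake_pmu *p) { pmu_enable(p); return 0; }
	static void cancel_txn(struct fake_pmu *p) { pmu_enable(p); }

	static int add_event(struct fake_pmu *p, int ev)
	{
		(void)ev;
		if (p->programmed >= 2)		/* pretend only two counters exist */
			return -1;
		p->programmed++;
		return 0;
	}

	static int group_sched_in(struct fake_pmu *p, const int *group, int n)
	{
		int i;

		start_txn(p);			/* batch the hardware accesses */
		for (i = 0; i < n; i++) {
			if (add_event(p, group[i])) {
				while (i--)
					p->programmed--;	/* roll back partial adds */
				cancel_txn(p);
				return -1;
			}
		}
		return commit_txn(p);		/* one enable flushes the whole group */
	}

	int main(void)
	{
		struct fake_pmu pmu = { 0, 0 };
		int group[] = { 1, 2, 3 };

		printf("group of 3: %s\n",
		       group_sched_in(&pmu, group, 3) ? "rejected" : "scheduled");
		return 0;
	}
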
5882/*
5883 * Ensures all contexts with the same task_ctx_nr have the same
5884 * pmu_cpu_context too.
5885 */
5886static void *find_pmu_context(int ctxn)
4752{ 5887{
5888 struct pmu *pmu;
5889
5890 if (ctxn < 0)
5891 return NULL;
5892
5893 list_for_each_entry(pmu, &pmus, entry) {
5894 if (pmu->task_ctx_nr == ctxn)
5895 return pmu->pmu_cpu_context;
5896 }
5897
4753 return NULL; 5898 return NULL;
4754} 5899}
4755 5900
4756void perf_bp_event(struct perf_event *bp, void *regs) 5901static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
4757{ 5902{
5903 int cpu;
5904
5905 for_each_possible_cpu(cpu) {
5906 struct perf_cpu_context *cpuctx;
5907
5908 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5909
5910 if (cpuctx->active_pmu == old_pmu)
5911 cpuctx->active_pmu = pmu;
5912 }
4758} 5913}
4759#endif
4760 5914
4761atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 5915static void free_pmu_context(struct pmu *pmu)
5916{
5917 struct pmu *i;
4762 5918
4763static void sw_perf_event_destroy(struct perf_event *event) 5919 mutex_lock(&pmus_lock);
5920 /*
5921 * Like a real lame refcount.
5922 */
5923 list_for_each_entry(i, &pmus, entry) {
5924 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
5925 update_pmu_context(i, pmu);
5926 goto out;
5927 }
5928 }
5929
5930 free_percpu(pmu->pmu_cpu_context);
5931out:
5932 mutex_unlock(&pmus_lock);
5933}
5934static struct idr pmu_idr;
5935
5936static ssize_t
5937type_show(struct device *dev, struct device_attribute *attr, char *page)
4764{ 5938{
4765 u64 event_id = event->attr.config; 5939 struct pmu *pmu = dev_get_drvdata(dev);
4766 5940
4767 WARN_ON(event->parent); 5941 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5942}
4768 5943
4769 atomic_dec(&perf_swevent_enabled[event_id]); 5944static struct device_attribute pmu_dev_attrs[] = {
4770 swevent_hlist_put(event); 5945 __ATTR_RO(type),
5946 __ATTR_NULL,
5947};
5948
5949static int pmu_bus_running;
5950static struct bus_type pmu_bus = {
5951 .name = "event_source",
5952 .dev_attrs = pmu_dev_attrs,
5953};
5954
5955static void pmu_dev_release(struct device *dev)
5956{
5957 kfree(dev);
4771} 5958}
4772 5959
4773static const struct pmu *sw_perf_event_init(struct perf_event *event) 5960static int pmu_dev_alloc(struct pmu *pmu)
4774{ 5961{
4775 const struct pmu *pmu = NULL; 5962 int ret = -ENOMEM;
4776 u64 event_id = event->attr.config; 5963
5964 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
5965 if (!pmu->dev)
5966 goto out;
5967
5968 device_initialize(pmu->dev);
5969 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5970 if (ret)
5971 goto free_dev;
5972
5973 dev_set_drvdata(pmu->dev, pmu);
5974 pmu->dev->bus = &pmu_bus;
5975 pmu->dev->release = pmu_dev_release;
5976 ret = device_add(pmu->dev);
5977 if (ret)
5978 goto free_dev;
5979
5980out:
5981 return ret;
5982
5983free_dev:
5984 put_device(pmu->dev);
5985 goto out;
5986}
5987
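
pmu_dev_alloc() gives every named pmu a device on the new "event_source" bus, whose type attribute (type_show() above) lets tools discover dynamically assigned pmu types at run time instead of hard-coding PERF_TYPE_* values. A sketch of the userspace side, assuming the usual /sys/bus/event_source/devices/<pmu>/type layout produced by this bus/attribute pair:

	/* Sketch: look up a PMU's type from sysfs (path assumed conventional). */
	#include <stdio.h>
	#include <string.h>
	#include <linux/perf_event.h>

	static int read_pmu_type(const char *pmu_name)
	{
		char path[256];
		int type = -1;
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/bus/event_source/devices/%s/type", pmu_name);
		f = fopen(path, "r");
		if (f) {
			if (fscanf(f, "%d", &type) != 1)
				type = -1;
			fclose(f);
		}
		return type;
	}

	int main(void)
	{
		struct perf_event_attr attr;
		int type = read_pmu_type("software");	/* registered in perf_event_init() below */

		if (type < 0)
			return 1;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = type;		/* matches pmu->type shown by type_show() */
		/* ... set attr.config and call perf_event_open() ... */
		printf("software pmu type = %d\n", type);
		return 0;
	}
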
5988static struct lock_class_key cpuctx_mutex;
5989
5990int perf_pmu_register(struct pmu *pmu, char *name, int type)
5991{
5992 int cpu, ret;
5993
5994 mutex_lock(&pmus_lock);
5995 ret = -ENOMEM;
5996 pmu->pmu_disable_count = alloc_percpu(int);
5997 if (!pmu->pmu_disable_count)
5998 goto unlock;
5999
6000 pmu->type = -1;
6001 if (!name)
6002 goto skip_type;
6003 pmu->name = name;
6004
6005 if (type < 0) {
6006 int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
6007 if (!err)
6008 goto free_pdc;
6009
6010 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
6011 if (err) {
6012 ret = err;
6013 goto free_pdc;
6014 }
6015 }
6016 pmu->type = type;
6017
6018 if (pmu_bus_running) {
6019 ret = pmu_dev_alloc(pmu);
6020 if (ret)
6021 goto free_idr;
6022 }
6023
6024skip_type:
6025 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
6026 if (pmu->pmu_cpu_context)
6027 goto got_cpu_context;
6028
6029 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
6030 if (!pmu->pmu_cpu_context)
6031 goto free_dev;
6032
6033 for_each_possible_cpu(cpu) {
6034 struct perf_cpu_context *cpuctx;
6035
6036 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6037 __perf_event_init_context(&cpuctx->ctx);
6038 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
6039 cpuctx->ctx.type = cpu_context;
6040 cpuctx->ctx.pmu = pmu;
6041 cpuctx->jiffies_interval = 1;
6042 INIT_LIST_HEAD(&cpuctx->rotation_list);
6043 cpuctx->active_pmu = pmu;
6044 }
6045
6046got_cpu_context:
6047 if (!pmu->start_txn) {
6048 if (pmu->pmu_enable) {
6049 /*
6050 * If we have pmu_enable/pmu_disable calls, install
6051 * transaction stubs that use that to try and batch
6052 * hardware accesses.
6053 */
6054 pmu->start_txn = perf_pmu_start_txn;
6055 pmu->commit_txn = perf_pmu_commit_txn;
6056 pmu->cancel_txn = perf_pmu_cancel_txn;
6057 } else {
6058 pmu->start_txn = perf_pmu_nop_void;
6059 pmu->commit_txn = perf_pmu_nop_int;
6060 pmu->cancel_txn = perf_pmu_nop_void;
6061 }
6062 }
6063
6064 if (!pmu->pmu_enable) {
6065 pmu->pmu_enable = perf_pmu_nop_void;
6066 pmu->pmu_disable = perf_pmu_nop_void;
6067 }
6068
6069 list_add_rcu(&pmu->entry, &pmus);
6070 ret = 0;
6071unlock:
6072 mutex_unlock(&pmus_lock);
6073
6074 return ret;
6075
6076free_dev:
6077 device_del(pmu->dev);
6078 put_device(pmu->dev);
6079
6080free_idr:
6081 if (pmu->type >= PERF_TYPE_MAX)
6082 idr_remove(&pmu_idr, pmu->type);
6083
6084free_pdc:
6085 free_percpu(pmu->pmu_disable_count);
6086 goto unlock;
6087}
6088
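
perf_pmu_register() is now the single registration point for event providers: a non-NULL name later gets a node on the event_source bus, type == -1 requests a dynamic id from pmu_idr, and the built-in pmus pass their fixed PERF_TYPE_* values. A hypothetical stub, shown only to illustrate the callbacks and signatures a minimal pmu needs (all demo_* names are invented, it counts nothing, and it would have to be built in, since this patch does not export perf_pmu_register to modules):

	/* Hypothetical built-in stub pmu, illustrating the registration API only. */
	#include <linux/init.h>
	#include <linux/errno.h>
	#include <linux/perf_event.h>

	static struct pmu demo_pmu;

	static int demo_event_init(struct perf_event *event)
	{
		if (event->attr.type != demo_pmu.type)
			return -ENOENT;		/* not ours: the core keeps searching */
		return 0;
	}

	static int  demo_add(struct perf_event *event, int flags)   { return 0; }
	static void demo_del(struct perf_event *event, int flags)   { }
	static void demo_start(struct perf_event *event, int flags) { }
	static void demo_stop(struct perf_event *event, int flags)  { }
	static void demo_read(struct perf_event *event)              { }

	static struct pmu demo_pmu = {
		.task_ctx_nr	= perf_sw_context,
		.event_init	= demo_event_init,
		.add		= demo_add,
		.del		= demo_del,
		.start		= demo_start,
		.stop		= demo_stop,
		.read		= demo_read,
	};

	static int __init demo_pmu_init(void)
	{
		/* name => sysfs node on the event_source bus, -1 => dynamic type id */
		return perf_pmu_register(&demo_pmu, "demo", -1);
	}
	device_initcall(demo_pmu_init);
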
6089void perf_pmu_unregister(struct pmu *pmu)
6090{
6091 mutex_lock(&pmus_lock);
6092 list_del_rcu(&pmu->entry);
6093 mutex_unlock(&pmus_lock);
4777 6094
4778 /* 6095 /*
4779 * Software events (currently) can't in general distinguish 6096 * We dereference the pmu list under both SRCU and regular RCU, so
4780 * between user, kernel and hypervisor events. 6097 * synchronize against both of those.
4781 * However, context switches and cpu migrations are considered
4782 * to be kernel events, and page faults are never hypervisor
4783 * events.
4784 */ 6098 */
4785 switch (event_id) { 6099 synchronize_srcu(&pmus_srcu);
4786 case PERF_COUNT_SW_CPU_CLOCK: 6100 synchronize_rcu();
4787 pmu = &perf_ops_cpu_clock;
4788 6101
4789 break; 6102 free_percpu(pmu->pmu_disable_count);
4790 case PERF_COUNT_SW_TASK_CLOCK: 6103 if (pmu->type >= PERF_TYPE_MAX)
4791 /* 6104 idr_remove(&pmu_idr, pmu->type);
4792 * If the user instantiates this as a per-cpu event, 6105 device_del(pmu->dev);
4793 * use the cpu_clock event instead. 6106 put_device(pmu->dev);
4794 */ 6107 free_pmu_context(pmu);
4795 if (event->ctx->task) 6108}
4796 pmu = &perf_ops_task_clock;
4797 else
4798 pmu = &perf_ops_cpu_clock;
4799 6109
4800 break; 6110struct pmu *perf_init_event(struct perf_event *event)
4801 case PERF_COUNT_SW_PAGE_FAULTS: 6111{
4802 case PERF_COUNT_SW_PAGE_FAULTS_MIN: 6112 struct pmu *pmu = NULL;
4803 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 6113 int idx;
4804 case PERF_COUNT_SW_CONTEXT_SWITCHES: 6114 int ret;
4805 case PERF_COUNT_SW_CPU_MIGRATIONS: 6115
4806 case PERF_COUNT_SW_ALIGNMENT_FAULTS: 6116 idx = srcu_read_lock(&pmus_srcu);
4807 case PERF_COUNT_SW_EMULATION_FAULTS: 6117
4808 if (!event->parent) { 6118 rcu_read_lock();
4809 int err; 6119 pmu = idr_find(&pmu_idr, event->attr.type);
4810 6120 rcu_read_unlock();
4811 err = swevent_hlist_get(event); 6121 if (pmu) {
4812 if (err) 6122 ret = pmu->event_init(event);
4813 return ERR_PTR(err); 6123 if (ret)
6124 pmu = ERR_PTR(ret);
6125 goto unlock;
6126 }
4814 6127
4815 atomic_inc(&perf_swevent_enabled[event_id]); 6128 list_for_each_entry_rcu(pmu, &pmus, entry) {
4816 event->destroy = sw_perf_event_destroy; 6129 ret = pmu->event_init(event);
6130 if (!ret)
6131 goto unlock;
6132
6133 if (ret != -ENOENT) {
6134 pmu = ERR_PTR(ret);
6135 goto unlock;
4817 } 6136 }
4818 pmu = &perf_ops_generic;
4819 break;
4820 } 6137 }
6138 pmu = ERR_PTR(-ENOENT);
6139unlock:
6140 srcu_read_unlock(&pmus_srcu, idx);
4821 6141
4822 return pmu; 6142 return pmu;
4823} 6143}
@@ -4826,20 +6146,23 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4826 * Allocate and initialize an event structure 6146
4827 */ 6147 */
4828static struct perf_event * 6148static struct perf_event *
4829perf_event_alloc(struct perf_event_attr *attr, 6149perf_event_alloc(struct perf_event_attr *attr, int cpu,
4830 int cpu, 6150 struct task_struct *task,
4831 struct perf_event_context *ctx, 6151 struct perf_event *group_leader,
4832 struct perf_event *group_leader, 6152 struct perf_event *parent_event,
4833 struct perf_event *parent_event, 6153 perf_overflow_handler_t overflow_handler)
4834 perf_overflow_handler_t overflow_handler, 6154{
4835 gfp_t gfpflags) 6155 struct pmu *pmu;
4836{
4837 const struct pmu *pmu;
4838 struct perf_event *event; 6156 struct perf_event *event;
4839 struct hw_perf_event *hwc; 6157 struct hw_perf_event *hwc;
4840 long err; 6158 long err;
4841 6159
4842 event = kzalloc(sizeof(*event), gfpflags); 6160 if ((unsigned)cpu >= nr_cpu_ids) {
6161 if (!task || cpu != -1)
6162 return ERR_PTR(-EINVAL);
6163 }
6164
6165 event = kzalloc(sizeof(*event), GFP_KERNEL);
4843 if (!event) 6166 if (!event)
4844 return ERR_PTR(-ENOMEM); 6167 return ERR_PTR(-ENOMEM);
4845 6168
@@ -4857,6 +6180,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4857 INIT_LIST_HEAD(&event->event_entry); 6180 INIT_LIST_HEAD(&event->event_entry);
4858 INIT_LIST_HEAD(&event->sibling_list); 6181 INIT_LIST_HEAD(&event->sibling_list);
4859 init_waitqueue_head(&event->waitq); 6182 init_waitqueue_head(&event->waitq);
6183 init_irq_work(&event->pending, perf_pending_event);
4860 6184
4861 mutex_init(&event->mmap_mutex); 6185 mutex_init(&event->mmap_mutex);
4862 6186
@@ -4864,7 +6188,6 @@ perf_event_alloc(struct perf_event_attr *attr,
4864 event->attr = *attr; 6188 event->attr = *attr;
4865 event->group_leader = group_leader; 6189 event->group_leader = group_leader;
4866 event->pmu = NULL; 6190 event->pmu = NULL;
4867 event->ctx = ctx;
4868 event->oncpu = -1; 6191 event->oncpu = -1;
4869 6192
4870 event->parent = parent_event; 6193 event->parent = parent_event;
@@ -4874,9 +6197,20 @@ perf_event_alloc(struct perf_event_attr *attr,
4874 6197
4875 event->state = PERF_EVENT_STATE_INACTIVE; 6198 event->state = PERF_EVENT_STATE_INACTIVE;
4876 6199
6200 if (task) {
6201 event->attach_state = PERF_ATTACH_TASK;
6202#ifdef CONFIG_HAVE_HW_BREAKPOINT
6203 /*
6204 * hw_breakpoint is a bit difficult here..
6205 */
6206 if (attr->type == PERF_TYPE_BREAKPOINT)
6207 event->hw.bp_target = task;
6208#endif
6209 }
6210
4877 if (!overflow_handler && parent_event) 6211 if (!overflow_handler && parent_event)
4878 overflow_handler = parent_event->overflow_handler; 6212 overflow_handler = parent_event->overflow_handler;
4879 6213
4880 event->overflow_handler = overflow_handler; 6214 event->overflow_handler = overflow_handler;
4881 6215
4882 if (attr->disabled) 6216 if (attr->disabled)
@@ -4898,29 +6232,8 @@ perf_event_alloc(struct perf_event_attr *attr,
4898 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 6232 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4899 goto done; 6233 goto done;
4900 6234
4901 switch (attr->type) { 6235 pmu = perf_init_event(event);
4902 case PERF_TYPE_RAW:
4903 case PERF_TYPE_HARDWARE:
4904 case PERF_TYPE_HW_CACHE:
4905 pmu = hw_perf_event_init(event);
4906 break;
4907
4908 case PERF_TYPE_SOFTWARE:
4909 pmu = sw_perf_event_init(event);
4910 break;
4911
4912 case PERF_TYPE_TRACEPOINT:
4913 pmu = tp_perf_event_init(event);
4914 break;
4915
4916 case PERF_TYPE_BREAKPOINT:
4917 pmu = bp_perf_event_init(event);
4918 break;
4919
4920 6236
4921 default:
4922 break;
4923 }
4924done: 6237done:
4925 err = 0; 6238 err = 0;
4926 if (!pmu) 6239 if (!pmu)
@@ -4938,13 +6251,21 @@ done:
4938 event->pmu = pmu; 6251 event->pmu = pmu;
4939 6252
4940 if (!event->parent) { 6253 if (!event->parent) {
4941 atomic_inc(&nr_events); 6254 if (event->attach_state & PERF_ATTACH_TASK)
6255 jump_label_inc(&perf_sched_events);
4942 if (event->attr.mmap || event->attr.mmap_data) 6256 if (event->attr.mmap || event->attr.mmap_data)
4943 atomic_inc(&nr_mmap_events); 6257 atomic_inc(&nr_mmap_events);
4944 if (event->attr.comm) 6258 if (event->attr.comm)
4945 atomic_inc(&nr_comm_events); 6259 atomic_inc(&nr_comm_events);
4946 if (event->attr.task) 6260 if (event->attr.task)
4947 atomic_inc(&nr_task_events); 6261 atomic_inc(&nr_task_events);
6262 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
6263 err = get_callchain_buffers();
6264 if (err) {
6265 free_event(event);
6266 return ERR_PTR(err);
6267 }
6268 }
4948 } 6269 }
4949 6270
4950 return event; 6271 return event;
@@ -5092,17 +6413,21 @@ SYSCALL_DEFINE5(perf_event_open,
5092 struct perf_event_attr __user *, attr_uptr, 6413 struct perf_event_attr __user *, attr_uptr,
5093 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 6414 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
5094{ 6415{
5095 struct perf_event *event, *group_leader = NULL, *output_event = NULL; 6416 struct perf_event *group_leader = NULL, *output_event = NULL;
6417 struct perf_event *event, *sibling;
5096 struct perf_event_attr attr; 6418 struct perf_event_attr attr;
5097 struct perf_event_context *ctx; 6419 struct perf_event_context *ctx;
5098 struct file *event_file = NULL; 6420 struct file *event_file = NULL;
5099 struct file *group_file = NULL; 6421 struct file *group_file = NULL;
6422 struct task_struct *task = NULL;
6423 struct pmu *pmu;
5100 int event_fd; 6424 int event_fd;
6425 int move_group = 0;
5101 int fput_needed = 0; 6426 int fput_needed = 0;
5102 int err; 6427 int err;
5103 6428
5104 /* for future expandability... */ 6429 /* for future expandability... */
5105 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) 6430 if (flags & ~PERF_FLAG_ALL)
5106 return -EINVAL; 6431 return -EINVAL;
5107 6432
5108 err = perf_copy_attr(attr_uptr, &attr); 6433 err = perf_copy_attr(attr_uptr, &attr);
@@ -5119,24 +6444,24 @@ SYSCALL_DEFINE5(perf_event_open,
5119 return -EINVAL; 6444 return -EINVAL;
5120 } 6445 }
5121 6446
6447 /*
6448 * In cgroup mode, the pid argument is used to pass the fd
6449 * opened to the cgroup directory in cgroupfs. The cpu argument
6450 * designates the cpu on which to monitor threads from that
6451 * cgroup.
6452 */
6453 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6454 return -EINVAL;
6455
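
So in cgroup mode the caller passes no pid at all: the pid argument carries a file descriptor for the monitored cgroup's directory in the perf_event cgroup filesystem, and cpu must name a real CPU. A hedged usage sketch (/sys/fs/cgroup/perf_event/mygroup is only the customary mount point plus an invented group name):

	/* Sketch: per-cgroup, per-cpu counting via PERF_FLAG_PID_CGROUP. */
	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	int main(void)
	{
		struct perf_event_attr attr;
		int cgrp_fd, fd;

		/* the cgroup directory fd stands in for the pid argument */
		cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
		if (cgrp_fd < 0)
			return 1;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_SOFTWARE;
		attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;

		/* cpu must be >= 0 in cgroup mode, as checked above */
		fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 0 /* cpu */,
			     -1 /* group_fd */, PERF_FLAG_PID_CGROUP);
		close(cgrp_fd);
		return fd < 0;
	}
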
5122 event_fd = get_unused_fd_flags(O_RDWR); 6456 event_fd = get_unused_fd_flags(O_RDWR);
5123 if (event_fd < 0) 6457 if (event_fd < 0)
5124 return event_fd; 6458 return event_fd;
5125 6459
5126 /*
5127 * Get the target context (task or percpu):
5128 */
5129 ctx = find_get_context(pid, cpu);
5130 if (IS_ERR(ctx)) {
5131 err = PTR_ERR(ctx);
5132 goto err_fd;
5133 }
5134
5135 if (group_fd != -1) { 6460 if (group_fd != -1) {
5136 group_leader = perf_fget_light(group_fd, &fput_needed); 6461 group_leader = perf_fget_light(group_fd, &fput_needed);
5137 if (IS_ERR(group_leader)) { 6462 if (IS_ERR(group_leader)) {
5138 err = PTR_ERR(group_leader); 6463 err = PTR_ERR(group_leader);
5139 goto err_put_context; 6464 goto err_fd;
5140 } 6465 }
5141 group_file = group_leader->filp; 6466 group_file = group_leader->filp;
5142 if (flags & PERF_FLAG_FD_OUTPUT) 6467 if (flags & PERF_FLAG_FD_OUTPUT)
@@ -5145,6 +6470,76 @@ SYSCALL_DEFINE5(perf_event_open,
5145 group_leader = NULL; 6470 group_leader = NULL;
5146 } 6471 }
5147 6472
6473 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
6474 task = find_lively_task_by_vpid(pid);
6475 if (IS_ERR(task)) {
6476 err = PTR_ERR(task);
6477 goto err_group_fd;
6478 }
6479 }
6480
6481 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
6482 if (IS_ERR(event)) {
6483 err = PTR_ERR(event);
6484 goto err_task;
6485 }
6486
6487 if (flags & PERF_FLAG_PID_CGROUP) {
6488 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6489 if (err)
6490 goto err_alloc;
6491 /*
6492 * one more event:
6493 * - that has cgroup constraint on event->cpu
6494 * - that may need work on context switch
6495 */
6496 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6497 jump_label_inc(&perf_sched_events);
6498 }
6499
6500 /*
6501 * Special case software events and allow them to be part of
6502 * any hardware group.
6503 */
6504 pmu = event->pmu;
6505
6506 if (group_leader &&
6507 (is_software_event(event) != is_software_event(group_leader))) {
6508 if (is_software_event(event)) {
6509 /*
6510 * If event and group_leader are not both a software
6511 * event, and event is, then group leader is not.
6512 *
6513 * Allow the addition of software events to !software
6514 * groups, this is safe because software events never
6515 * fail to schedule.
6516 */
6517 pmu = group_leader->pmu;
6518 } else if (is_software_event(group_leader) &&
6519 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
6520 /*
6521 * In case the group is a pure software group, and we
6522 * try to add a hardware event, move the whole group to
6523 * the hardware context.
6524 */
6525 move_group = 1;
6526 }
6527 }
6528
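
The net effect of the block above: software events may always join a hardware leader's group, and adding the first hardware event to a purely software group migrates the whole group into the hardware context (the move_group path below). A hedged sketch of the first, simpler case from userspace:

	/* Sketch: a software event joining a hardware group via group_fd. */
	#define _GNU_SOURCE
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	static int open_event(struct perf_event_attr *attr, int group_fd)
	{
		return syscall(__NR_perf_event_open, attr, 0 /* this task */,
			       -1 /* any cpu */, group_fd, 0);
	}

	int main(void)
	{
		struct perf_event_attr hw, sw;
		int leader, member;

		memset(&hw, 0, sizeof(hw));
		hw.size = sizeof(hw);
		hw.type = PERF_TYPE_HARDWARE;
		hw.config = PERF_COUNT_HW_CPU_CYCLES;

		memset(&sw, 0, sizeof(sw));
		sw.size = sizeof(sw);
		sw.type = PERF_TYPE_SOFTWARE;		/* different pmu than the leader */
		sw.config = PERF_COUNT_SW_CONTEXT_SWITCHES;

		leader = open_event(&hw, -1);		/* hardware group leader */
		if (leader < 0)
			return 1;

		/* allowed: software events never fail to schedule alongside hw */
		member = open_event(&sw, leader);
		if (member >= 0)
			close(member);
		close(leader);
		return member < 0;
	}
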
6529 /*
6530 * Get the target context (task or percpu):
6531 */
6532 ctx = find_get_context(pmu, task, cpu);
6533 if (IS_ERR(ctx)) {
6534 err = PTR_ERR(ctx);
6535 goto err_alloc;
6536 }
6537
6538 if (task) {
6539 put_task_struct(task);
6540 task = NULL;
6541 }
6542
5148 /* 6543 /*
5149 * Look up the group leader (we will attach this event to it): 6544 * Look up the group leader (we will attach this event to it):
5150 */ 6545 */
@@ -5156,53 +6551,84 @@ SYSCALL_DEFINE5(perf_event_open,
5156 * becoming part of another group-sibling): 6551 * becoming part of another group-sibling):
5157 */ 6552 */
5158 if (group_leader->group_leader != group_leader) 6553 if (group_leader->group_leader != group_leader)
5159 goto err_put_context; 6554 goto err_context;
5160 /* 6555 /*
5161 * Do not allow to attach to a group in a different 6556 * Do not allow to attach to a group in a different
5162 * task or CPU context: 6557 * task or CPU context:
5163 */ 6558 */
5164 if (group_leader->ctx != ctx) 6559 if (move_group) {
5165 goto err_put_context; 6560 if (group_leader->ctx->type != ctx->type)
6561 goto err_context;
6562 } else {
6563 if (group_leader->ctx != ctx)
6564 goto err_context;
6565 }
6566
5166 /* 6567 /*
5167 * Only a group leader can be exclusive or pinned 6568 * Only a group leader can be exclusive or pinned
5168 */ 6569 */
5169 if (attr.exclusive || attr.pinned) 6570 if (attr.exclusive || attr.pinned)
5170 goto err_put_context; 6571 goto err_context;
5171 }
5172
5173 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
5174 NULL, NULL, GFP_KERNEL);
5175 if (IS_ERR(event)) {
5176 err = PTR_ERR(event);
5177 goto err_put_context;
5178 } 6572 }
5179 6573
5180 if (output_event) { 6574 if (output_event) {
5181 err = perf_event_set_output(event, output_event); 6575 err = perf_event_set_output(event, output_event);
5182 if (err) 6576 if (err)
5183 goto err_free_put_context; 6577 goto err_context;
5184 } 6578 }
5185 6579
5186 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); 6580 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
5187 if (IS_ERR(event_file)) { 6581 if (IS_ERR(event_file)) {
5188 err = PTR_ERR(event_file); 6582 err = PTR_ERR(event_file);
5189 goto err_free_put_context; 6583 goto err_context;
6584 }
6585
6586 if (move_group) {
6587 struct perf_event_context *gctx = group_leader->ctx;
6588
6589 mutex_lock(&gctx->mutex);
6590 perf_remove_from_context(group_leader);
6591 list_for_each_entry(sibling, &group_leader->sibling_list,
6592 group_entry) {
6593 perf_remove_from_context(sibling);
6594 put_ctx(gctx);
6595 }
6596 mutex_unlock(&gctx->mutex);
6597 put_ctx(gctx);
5190 } 6598 }
5191 6599
5192 event->filp = event_file; 6600 event->filp = event_file;
5193 WARN_ON_ONCE(ctx->parent_ctx); 6601 WARN_ON_ONCE(ctx->parent_ctx);
5194 mutex_lock(&ctx->mutex); 6602 mutex_lock(&ctx->mutex);
6603
6604 if (move_group) {
6605 perf_install_in_context(ctx, group_leader, cpu);
6606 get_ctx(ctx);
6607 list_for_each_entry(sibling, &group_leader->sibling_list,
6608 group_entry) {
6609 perf_install_in_context(ctx, sibling, cpu);
6610 get_ctx(ctx);
6611 }
6612 }
6613
5195 perf_install_in_context(ctx, event, cpu); 6614 perf_install_in_context(ctx, event, cpu);
5196 ++ctx->generation; 6615 ++ctx->generation;
6616 perf_unpin_context(ctx);
5197 mutex_unlock(&ctx->mutex); 6617 mutex_unlock(&ctx->mutex);
5198 6618
5199 event->owner = current; 6619 event->owner = current;
5200 get_task_struct(current); 6620
5201 mutex_lock(&current->perf_event_mutex); 6621 mutex_lock(&current->perf_event_mutex);
5202 list_add_tail(&event->owner_entry, &current->perf_event_list); 6622 list_add_tail(&event->owner_entry, &current->perf_event_list);
5203 mutex_unlock(&current->perf_event_mutex); 6623 mutex_unlock(&current->perf_event_mutex);
5204 6624
5205 /* 6625 /*
6626 * Precalculate sample_data sizes
6627 */
6628 perf_event__header_size(event);
6629 perf_event__id_header_size(event);
6630
6631 /*
5206 * Drop the reference on the group_event after placing the 6632 * Drop the reference on the group_event after placing the
5207 * new event on the sibling_list. This ensures destruction 6633 * new event on the sibling_list. This ensures destruction
5208 * of the group leader will find the pointer to itself in 6634 * of the group leader will find the pointer to itself in
@@ -5212,11 +6638,16 @@ SYSCALL_DEFINE5(perf_event_open,
5212 fd_install(event_fd, event_file); 6638 fd_install(event_fd, event_file);
5213 return event_fd; 6639 return event_fd;
5214 6640
5215err_free_put_context: 6641err_context:
6642 perf_unpin_context(ctx);
6643 put_ctx(ctx);
6644err_alloc:
5216 free_event(event); 6645 free_event(event);
5217err_put_context: 6646err_task:
6647 if (task)
6648 put_task_struct(task);
6649err_group_fd:
5218 fput_light(group_file, fput_needed); 6650 fput_light(group_file, fput_needed);
5219 put_ctx(ctx);
5220err_fd: 6651err_fd:
5221 put_unused_fd(event_fd); 6652 put_unused_fd(event_fd);
5222 return err; 6653 return err;
@@ -5227,32 +6658,31 @@ err_fd:
5227 * 6658 *
5228 * @attr: attributes of the counter to create 6659 * @attr: attributes of the counter to create
5229 * @cpu: cpu in which the counter is bound 6660 * @cpu: cpu in which the counter is bound
5230 * @pid: task to profile 6661 * @task: task to profile (NULL for percpu)
5231 */ 6662 */
5232struct perf_event * 6663struct perf_event *
5233perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 6664perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5234 pid_t pid, 6665 struct task_struct *task,
5235 perf_overflow_handler_t overflow_handler) 6666 perf_overflow_handler_t overflow_handler)
5236{ 6667{
5237 struct perf_event *event;
5238 struct perf_event_context *ctx; 6668 struct perf_event_context *ctx;
6669 struct perf_event *event;
5239 int err; 6670 int err;
5240 6671
5241 /* 6672 /*
5242 * Get the target context (task or percpu): 6673 * Get the target context (task or percpu):
5243 */ 6674 */
5244 6675
5245 ctx = find_get_context(pid, cpu); 6676 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
5246 if (IS_ERR(ctx)) {
5247 err = PTR_ERR(ctx);
5248 goto err_exit;
5249 }
5250
5251 event = perf_event_alloc(attr, cpu, ctx, NULL,
5252 NULL, overflow_handler, GFP_KERNEL);
5253 if (IS_ERR(event)) { 6677 if (IS_ERR(event)) {
5254 err = PTR_ERR(event); 6678 err = PTR_ERR(event);
5255 goto err_put_context; 6679 goto err;
6680 }
6681
6682 ctx = find_get_context(event->pmu, task, cpu);
6683 if (IS_ERR(ctx)) {
6684 err = PTR_ERR(ctx);
6685 goto err_free;
5256 } 6686 }
5257 6687
5258 event->filp = NULL; 6688 event->filp = NULL;
@@ -5260,122 +6690,18 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5260 mutex_lock(&ctx->mutex); 6690 mutex_lock(&ctx->mutex);
5261 perf_install_in_context(ctx, event, cpu); 6691 perf_install_in_context(ctx, event, cpu);
5262 ++ctx->generation; 6692 ++ctx->generation;
6693 perf_unpin_context(ctx);
5263 mutex_unlock(&ctx->mutex); 6694 mutex_unlock(&ctx->mutex);
5264 6695
5265 event->owner = current;
5266 get_task_struct(current);
5267 mutex_lock(&current->perf_event_mutex);
5268 list_add_tail(&event->owner_entry, &current->perf_event_list);
5269 mutex_unlock(&current->perf_event_mutex);
5270
5271 return event; 6696 return event;
5272 6697
5273 err_put_context: 6698err_free:
5274 put_ctx(ctx); 6699 free_event(event);
5275 err_exit: 6700err:
5276 return ERR_PTR(err); 6701 return ERR_PTR(err);
5277} 6702}
5278EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 6703EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
5279 6704
5280/*
5281 * inherit an event from parent task to child task:
5282 */
5283static struct perf_event *
5284inherit_event(struct perf_event *parent_event,
5285 struct task_struct *parent,
5286 struct perf_event_context *parent_ctx,
5287 struct task_struct *child,
5288 struct perf_event *group_leader,
5289 struct perf_event_context *child_ctx)
5290{
5291 struct perf_event *child_event;
5292
5293 /*
5294 * Instead of creating recursive hierarchies of events,
5295 * we link inherited events back to the original parent,
5296 * which has a filp for sure, which we use as the reference
5297 * count:
5298 */
5299 if (parent_event->parent)
5300 parent_event = parent_event->parent;
5301
5302 child_event = perf_event_alloc(&parent_event->attr,
5303 parent_event->cpu, child_ctx,
5304 group_leader, parent_event,
5305 NULL, GFP_KERNEL);
5306 if (IS_ERR(child_event))
5307 return child_event;
5308 get_ctx(child_ctx);
5309
5310 /*
5311 * Make the child state follow the state of the parent event,
5312 * not its attr.disabled bit. We hold the parent's mutex,
5313 * so we won't race with perf_event_{en, dis}able_family.
5314 */
5315 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
5316 child_event->state = PERF_EVENT_STATE_INACTIVE;
5317 else
5318 child_event->state = PERF_EVENT_STATE_OFF;
5319
5320 if (parent_event->attr.freq) {
5321 u64 sample_period = parent_event->hw.sample_period;
5322 struct hw_perf_event *hwc = &child_event->hw;
5323
5324 hwc->sample_period = sample_period;
5325 hwc->last_period = sample_period;
5326
5327 local64_set(&hwc->period_left, sample_period);
5328 }
5329
5330 child_event->overflow_handler = parent_event->overflow_handler;
5331
5332 /*
5333 * Link it up in the child's context:
5334 */
5335 add_event_to_ctx(child_event, child_ctx);
5336
5337 /*
5338 * Get a reference to the parent filp - we will fput it
5339 * when the child event exits. This is safe to do because
5340 * we are in the parent and we know that the filp still
5341 * exists and has a nonzero count:
5342 */
5343 atomic_long_inc(&parent_event->filp->f_count);
5344
5345 /*
5346 * Link this into the parent event's child list
5347 */
5348 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5349 mutex_lock(&parent_event->child_mutex);
5350 list_add_tail(&child_event->child_list, &parent_event->child_list);
5351 mutex_unlock(&parent_event->child_mutex);
5352
5353 return child_event;
5354}
5355
5356static int inherit_group(struct perf_event *parent_event,
5357 struct task_struct *parent,
5358 struct perf_event_context *parent_ctx,
5359 struct task_struct *child,
5360 struct perf_event_context *child_ctx)
5361{
5362 struct perf_event *leader;
5363 struct perf_event *sub;
5364 struct perf_event *child_ctr;
5365
5366 leader = inherit_event(parent_event, parent, parent_ctx,
5367 child, NULL, child_ctx);
5368 if (IS_ERR(leader))
5369 return PTR_ERR(leader);
5370 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
5371 child_ctr = inherit_event(sub, parent, parent_ctx,
5372 child, leader, child_ctx);
5373 if (IS_ERR(child_ctr))
5374 return PTR_ERR(child_ctr);
5375 }
5376 return 0;
5377}
5378
5379static void sync_child_event(struct perf_event *child_event, 6705static void sync_child_event(struct perf_event *child_event,
5380 struct task_struct *child) 6706 struct task_struct *child)
5381{ 6707{
@@ -5416,32 +6742,32 @@ __perf_event_exit_task(struct perf_event *child_event,
5416 struct perf_event_context *child_ctx, 6742 struct perf_event_context *child_ctx,
5417 struct task_struct *child) 6743 struct task_struct *child)
5418{ 6744{
5419 struct perf_event *parent_event; 6745 if (child_event->parent) {
6746 raw_spin_lock_irq(&child_ctx->lock);
6747 perf_group_detach(child_event);
6748 raw_spin_unlock_irq(&child_ctx->lock);
6749 }
5420 6750
5421 perf_event_remove_from_context(child_event); 6751 perf_remove_from_context(child_event);
5422 6752
5423 parent_event = child_event->parent;
5424 /* 6753 /*
5425 * It can happen that parent exits first, and has events 6754 * It can happen that the parent exits first, and has events
5426 * that are still around due to the child reference. These 6755 * that are still around due to the child reference. These
5427 * events need to be zapped - but otherwise linger. 6756 * events need to be zapped.
5428 */ 6757 */
5429 if (parent_event) { 6758 if (child_event->parent) {
5430 sync_child_event(child_event, child); 6759 sync_child_event(child_event, child);
5431 free_event(child_event); 6760 free_event(child_event);
5432 } 6761 }
5433} 6762}
5434 6763
5435/* 6764static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
5436 * When a child task exits, feed back event values to parent events.
5437 */
5438void perf_event_exit_task(struct task_struct *child)
5439{ 6765{
5440 struct perf_event *child_event, *tmp; 6766 struct perf_event *child_event, *tmp;
5441 struct perf_event_context *child_ctx; 6767 struct perf_event_context *child_ctx;
5442 unsigned long flags; 6768 unsigned long flags;
5443 6769
5444 if (likely(!child->perf_event_ctxp)) { 6770 if (likely(!child->perf_event_ctxp[ctxn])) {
5445 perf_event_task(child, NULL, 0); 6771 perf_event_task(child, NULL, 0);
5446 return; 6772 return;
5447 } 6773 }
@@ -5453,8 +6779,8 @@ void perf_event_exit_task(struct task_struct *child)
5453 * scheduled, so we are now safe from rescheduling changing 6779 * scheduled, so we are now safe from rescheduling changing
5454 * our context. 6780 * our context.
5455 */ 6781 */
5456 child_ctx = child->perf_event_ctxp; 6782 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
5457 __perf_event_task_sched_out(child_ctx); 6783 task_ctx_sched_out(child_ctx, EVENT_ALL);
5458 6784
5459 /* 6785 /*
5460 * Take the context lock here so that if find_get_context is 6786 * Take the context lock here so that if find_get_context is
@@ -5462,7 +6788,7 @@ void perf_event_exit_task(struct task_struct *child)
5462 * incremented the context's refcount before we do put_ctx below. 6788 * incremented the context's refcount before we do put_ctx below.
5463 */ 6789 */
5464 raw_spin_lock(&child_ctx->lock); 6790 raw_spin_lock(&child_ctx->lock);
5465 child->perf_event_ctxp = NULL; 6791 child->perf_event_ctxp[ctxn] = NULL;
5466 /* 6792 /*
5467 * If this context is a clone; unclone it so it can't get 6793 * If this context is a clone; unclone it so it can't get
5468 * swapped to another process while we're removing all 6794 * swapped to another process while we're removing all
@@ -5515,6 +6841,33 @@ again:
5515 put_ctx(child_ctx); 6841 put_ctx(child_ctx);
5516} 6842}
5517 6843
6844/*
6845 * When a child task exits, feed back event values to parent events.
6846 */
6847void perf_event_exit_task(struct task_struct *child)
6848{
6849 struct perf_event *event, *tmp;
6850 int ctxn;
6851
6852 mutex_lock(&child->perf_event_mutex);
6853 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
6854 owner_entry) {
6855 list_del_init(&event->owner_entry);
6856
6857 /*
6858 * Ensure the list deletion is visible before we clear
6859 * the owner, closes a race against perf_release() where
6860 * we need to serialize on the owner->perf_event_mutex.
6861 */
6862 smp_wmb();
6863 event->owner = NULL;
6864 }
6865 mutex_unlock(&child->perf_event_mutex);
6866
6867 for_each_task_context_nr(ctxn)
6868 perf_event_exit_task_context(child, ctxn);
6869}
6870
5518static void perf_free_event(struct perf_event *event, 6871static void perf_free_event(struct perf_event *event,
5519 struct perf_event_context *ctx) 6872 struct perf_event_context *ctx)
5520{ 6873{
@@ -5536,48 +6889,172 @@ static void perf_free_event(struct perf_event *event,
5536 6889
5537/* 6890/*
5538 * free an unexposed, unused context as created by inheritance by 6891 * free an unexposed, unused context as created by inheritance by
5539 * init_task below, used by fork() in case of fail. 6892 * perf_event_init_task below, used by fork() in case of fail.
5540 */ 6893 */
5541void perf_event_free_task(struct task_struct *task) 6894void perf_event_free_task(struct task_struct *task)
5542{ 6895{
5543 struct perf_event_context *ctx = task->perf_event_ctxp; 6896 struct perf_event_context *ctx;
5544 struct perf_event *event, *tmp; 6897 struct perf_event *event, *tmp;
6898 int ctxn;
5545 6899
5546 if (!ctx) 6900 for_each_task_context_nr(ctxn) {
5547 return; 6901 ctx = task->perf_event_ctxp[ctxn];
6902 if (!ctx)
6903 continue;
5548 6904
5549 mutex_lock(&ctx->mutex); 6905 mutex_lock(&ctx->mutex);
5550again: 6906again:
5551 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 6907 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
5552 perf_free_event(event, ctx); 6908 group_entry)
6909 perf_free_event(event, ctx);
5553 6910
5554 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, 6911 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5555 group_entry) 6912 group_entry)
5556 perf_free_event(event, ctx); 6913 perf_free_event(event, ctx);
5557 6914
5558 if (!list_empty(&ctx->pinned_groups) || 6915 if (!list_empty(&ctx->pinned_groups) ||
5559 !list_empty(&ctx->flexible_groups)) 6916 !list_empty(&ctx->flexible_groups))
5560 goto again; 6917 goto again;
5561 6918
5562 mutex_unlock(&ctx->mutex); 6919 mutex_unlock(&ctx->mutex);
5563 6920
5564 put_ctx(ctx); 6921 put_ctx(ctx);
6922 }
6923}
6924
6925void perf_event_delayed_put(struct task_struct *task)
6926{
6927 int ctxn;
6928
6929 for_each_task_context_nr(ctxn)
6930 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
6931}
6932
6933/*
6934 * inherit an event from parent task to child task:
6935 */
6936static struct perf_event *
6937inherit_event(struct perf_event *parent_event,
6938 struct task_struct *parent,
6939 struct perf_event_context *parent_ctx,
6940 struct task_struct *child,
6941 struct perf_event *group_leader,
6942 struct perf_event_context *child_ctx)
6943{
6944 struct perf_event *child_event;
6945 unsigned long flags;
6946
6947 /*
6948 * Instead of creating recursive hierarchies of events,
6949 * we link inherited events back to the original parent,
6950 * which has a filp for sure, which we use as the reference
6951 * count:
6952 */
6953 if (parent_event->parent)
6954 parent_event = parent_event->parent;
6955
6956 child_event = perf_event_alloc(&parent_event->attr,
6957 parent_event->cpu,
6958 child,
6959 group_leader, parent_event,
6960 NULL);
6961 if (IS_ERR(child_event))
6962 return child_event;
6963 get_ctx(child_ctx);
6964
6965 /*
6966 * Make the child state follow the state of the parent event,
6967 * not its attr.disabled bit. We hold the parent's mutex,
6968 * so we won't race with perf_event_{en, dis}able_family.
6969 */
6970 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
6971 child_event->state = PERF_EVENT_STATE_INACTIVE;
6972 else
6973 child_event->state = PERF_EVENT_STATE_OFF;
6974
6975 if (parent_event->attr.freq) {
6976 u64 sample_period = parent_event->hw.sample_period;
6977 struct hw_perf_event *hwc = &child_event->hw;
6978
6979 hwc->sample_period = sample_period;
6980 hwc->last_period = sample_period;
6981
6982 local64_set(&hwc->period_left, sample_period);
6983 }
6984
6985 child_event->ctx = child_ctx;
6986 child_event->overflow_handler = parent_event->overflow_handler;
6987
6988 /*
6989 * Precalculate sample_data sizes
6990 */
6991 perf_event__header_size(child_event);
6992 perf_event__id_header_size(child_event);
6993
6994 /*
6995 * Link it up in the child's context:
6996 */
6997 raw_spin_lock_irqsave(&child_ctx->lock, flags);
6998 add_event_to_ctx(child_event, child_ctx);
6999 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
7000
7001 /*
7002 * Get a reference to the parent filp - we will fput it
7003 * when the child event exits. This is safe to do because
7004 * we are in the parent and we know that the filp still
7005 * exists and has a nonzero count:
7006 */
7007 atomic_long_inc(&parent_event->filp->f_count);
7008
7009 /*
7010 * Link this into the parent event's child list
7011 */
7012 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
7013 mutex_lock(&parent_event->child_mutex);
7014 list_add_tail(&child_event->child_list, &parent_event->child_list);
7015 mutex_unlock(&parent_event->child_mutex);
7016
7017 return child_event;
7018}
7019
7020static int inherit_group(struct perf_event *parent_event,
7021 struct task_struct *parent,
7022 struct perf_event_context *parent_ctx,
7023 struct task_struct *child,
7024 struct perf_event_context *child_ctx)
7025{
7026 struct perf_event *leader;
7027 struct perf_event *sub;
7028 struct perf_event *child_ctr;
7029
7030 leader = inherit_event(parent_event, parent, parent_ctx,
7031 child, NULL, child_ctx);
7032 if (IS_ERR(leader))
7033 return PTR_ERR(leader);
7034 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
7035 child_ctr = inherit_event(sub, parent, parent_ctx,
7036 child, leader, child_ctx);
7037 if (IS_ERR(child_ctr))
7038 return PTR_ERR(child_ctr);
7039 }
7040 return 0;
5565} 7041}
5566 7042
5567static int 7043static int
5568inherit_task_group(struct perf_event *event, struct task_struct *parent, 7044inherit_task_group(struct perf_event *event, struct task_struct *parent,
5569 struct perf_event_context *parent_ctx, 7045 struct perf_event_context *parent_ctx,
5570 struct task_struct *child, 7046 struct task_struct *child, int ctxn,
5571 int *inherited_all) 7047 int *inherited_all)
5572{ 7048{
5573 int ret; 7049 int ret;
5574 struct perf_event_context *child_ctx = child->perf_event_ctxp; 7050 struct perf_event_context *child_ctx;
5575 7051
5576 if (!event->attr.inherit) { 7052 if (!event->attr.inherit) {
5577 *inherited_all = 0; 7053 *inherited_all = 0;
5578 return 0; 7054 return 0;
5579 } 7055 }
5580 7056
7057 child_ctx = child->perf_event_ctxp[ctxn];
5581 if (!child_ctx) { 7058 if (!child_ctx) {
5582 /* 7059 /*
5583 * This is executed from the parent task context, so 7060 * This is executed from the parent task context, so
@@ -5586,14 +7063,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5586 * child. 7063 * child.
5587 */ 7064 */
5588 7065
5589 child_ctx = kzalloc(sizeof(struct perf_event_context), 7066 child_ctx = alloc_perf_context(event->pmu, child);
5590 GFP_KERNEL);
5591 if (!child_ctx) 7067 if (!child_ctx)
5592 return -ENOMEM; 7068 return -ENOMEM;
5593 7069
5594 __perf_event_init_context(child_ctx, child); 7070 child->perf_event_ctxp[ctxn] = child_ctx;
5595 child->perf_event_ctxp = child_ctx;
5596 get_task_struct(child);
5597 } 7071 }
5598 7072
5599 ret = inherit_group(event, parent, parent_ctx, 7073 ret = inherit_group(event, parent, parent_ctx,
@@ -5605,32 +7079,27 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5605 return ret; 7079 return ret;
5606} 7080}
5607 7081
5608
5609/* 7082/*
5610 * Initialize the perf_event context in task_struct 7083 * Initialize the perf_event context in task_struct
5611 */ 7084 */
5612int perf_event_init_task(struct task_struct *child) 7085int perf_event_init_context(struct task_struct *child, int ctxn)
5613{ 7086{
5614 struct perf_event_context *child_ctx, *parent_ctx; 7087 struct perf_event_context *child_ctx, *parent_ctx;
5615 struct perf_event_context *cloned_ctx; 7088 struct perf_event_context *cloned_ctx;
5616 struct perf_event *event; 7089 struct perf_event *event;
5617 struct task_struct *parent = current; 7090 struct task_struct *parent = current;
5618 int inherited_all = 1; 7091 int inherited_all = 1;
7092 unsigned long flags;
5619 int ret = 0; 7093 int ret = 0;
5620 7094
5621 child->perf_event_ctxp = NULL; 7095 if (likely(!parent->perf_event_ctxp[ctxn]))
5622
5623 mutex_init(&child->perf_event_mutex);
5624 INIT_LIST_HEAD(&child->perf_event_list);
5625
5626 if (likely(!parent->perf_event_ctxp))
5627 return 0; 7096 return 0;
5628 7097
5629 /* 7098 /*
5630 * If the parent's context is a clone, pin it so it won't get 7099 * If the parent's context is a clone, pin it so it won't get
5631 * swapped under us. 7100 * swapped under us.
5632 */ 7101 */
5633 parent_ctx = perf_pin_task_context(parent); 7102 parent_ctx = perf_pin_task_context(parent, ctxn);
5634 7103
5635 /* 7104 /*
5636 * No need to check if parent_ctx != NULL here; since we saw 7105 * No need to check if parent_ctx != NULL here; since we saw
@@ -5650,31 +7119,42 @@ int perf_event_init_task(struct task_struct *child)
5650 * the list, not manipulating it: 7119 * the list, not manipulating it:
5651 */ 7120 */
5652 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { 7121 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5653 ret = inherit_task_group(event, parent, parent_ctx, child, 7122 ret = inherit_task_group(event, parent, parent_ctx,
5654 &inherited_all); 7123 child, ctxn, &inherited_all);
5655 if (ret) 7124 if (ret)
5656 break; 7125 break;
5657 } 7126 }
5658 7127
7128 /*
7129 * We can't hold ctx->lock when iterating the ->flexible_group list due
7130 * to allocations, but we need to prevent rotation because
7131 * rotate_ctx() will change the list from interrupt context.
7132 */
7133 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7134 parent_ctx->rotate_disable = 1;
7135 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
7136
5659 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 7137 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5660 ret = inherit_task_group(event, parent, parent_ctx, child, 7138 ret = inherit_task_group(event, parent, parent_ctx,
5661 &inherited_all); 7139 child, ctxn, &inherited_all);
5662 if (ret) 7140 if (ret)
5663 break; 7141 break;
5664 } 7142 }
5665 7143
5666 child_ctx = child->perf_event_ctxp; 7144 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7145 parent_ctx->rotate_disable = 0;
7146
7147 child_ctx = child->perf_event_ctxp[ctxn];
5667 7148
5668 if (child_ctx && inherited_all) { 7149 if (child_ctx && inherited_all) {
5669 /* 7150 /*
5670 * Mark the child context as a clone of the parent 7151 * Mark the child context as a clone of the parent
5671 * context, or of whatever the parent is a clone of. 7152 * context, or of whatever the parent is a clone of.
5672 * Note that if the parent is a clone, it could get 7153 *
5673 * uncloned at any point, but that doesn't matter 7154 * Note that if the parent is a clone, the holding of
5674 * because the list of events and the generation 7155 * parent_ctx->lock avoids it from being uncloned.
5675 * count can't have changed since we took the mutex.
5676 */ 7156 */
5677 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); 7157 cloned_ctx = parent_ctx->parent_ctx;
5678 if (cloned_ctx) { 7158 if (cloned_ctx) {
5679 child_ctx->parent_ctx = cloned_ctx; 7159 child_ctx->parent_ctx = cloned_ctx;
5680 child_ctx->parent_gen = parent_ctx->parent_gen; 7160 child_ctx->parent_gen = parent_ctx->parent_gen;
@@ -5685,75 +7165,136 @@ int perf_event_init_task(struct task_struct *child)
5685 get_ctx(child_ctx->parent_ctx); 7165 get_ctx(child_ctx->parent_ctx);
5686 } 7166 }
5687 7167
7168 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
5688 mutex_unlock(&parent_ctx->mutex); 7169 mutex_unlock(&parent_ctx->mutex);
5689 7170
5690 perf_unpin_context(parent_ctx); 7171 perf_unpin_context(parent_ctx);
7172 put_ctx(parent_ctx);
5691 7173
5692 return ret; 7174 return ret;
5693} 7175}
5694 7176
7177/*
7178 * Initialize the perf_event context in task_struct
7179 */
7180int perf_event_init_task(struct task_struct *child)
7181{
7182 int ctxn, ret;
7183
7184 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
7185 mutex_init(&child->perf_event_mutex);
7186 INIT_LIST_HEAD(&child->perf_event_list);
7187
7188 for_each_task_context_nr(ctxn) {
7189 ret = perf_event_init_context(child, ctxn);
7190 if (ret)
7191 return ret;
7192 }
7193
7194 return 0;
7195}
7196
5695static void __init perf_event_init_all_cpus(void) 7197static void __init perf_event_init_all_cpus(void)
5696{ 7198{
7199 struct swevent_htable *swhash;
5697 int cpu; 7200 int cpu;
5698 struct perf_cpu_context *cpuctx;
5699 7201
5700 for_each_possible_cpu(cpu) { 7202 for_each_possible_cpu(cpu) {
5701 cpuctx = &per_cpu(perf_cpu_context, cpu); 7203 swhash = &per_cpu(swevent_htable, cpu);
5702 mutex_init(&cpuctx->hlist_mutex); 7204 mutex_init(&swhash->hlist_mutex);
5703 __perf_event_init_context(&cpuctx->ctx, NULL); 7205 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
5704 } 7206 }
5705} 7207}
5706 7208
5707static void __cpuinit perf_event_init_cpu(int cpu) 7209static void __cpuinit perf_event_init_cpu(int cpu)
5708{ 7210{
5709 struct perf_cpu_context *cpuctx; 7211 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5710 7212
5711 cpuctx = &per_cpu(perf_cpu_context, cpu); 7213 mutex_lock(&swhash->hlist_mutex);
5712 7214 if (swhash->hlist_refcount > 0) {
5713 spin_lock(&perf_resource_lock);
5714 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5715 spin_unlock(&perf_resource_lock);
5716
5717 mutex_lock(&cpuctx->hlist_mutex);
5718 if (cpuctx->hlist_refcount > 0) {
5719 struct swevent_hlist *hlist; 7215 struct swevent_hlist *hlist;
5720 7216
5721 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 7217 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
5722 WARN_ON_ONCE(!hlist); 7218 WARN_ON(!hlist);
5723 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 7219 rcu_assign_pointer(swhash->swevent_hlist, hlist);
5724 } 7220 }
5725 mutex_unlock(&cpuctx->hlist_mutex); 7221 mutex_unlock(&swhash->hlist_mutex);
5726} 7222}
5727 7223
5728#ifdef CONFIG_HOTPLUG_CPU 7224#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
5729static void __perf_event_exit_cpu(void *info) 7225static void perf_pmu_rotate_stop(struct pmu *pmu)
5730{ 7226{
5731 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 7227 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
5732 struct perf_event_context *ctx = &cpuctx->ctx; 7228
7229 WARN_ON(!irqs_disabled());
7230
7231 list_del_init(&cpuctx->rotation_list);
7232}
7233
7234static void __perf_event_exit_context(void *__info)
7235{
7236 struct perf_event_context *ctx = __info;
5733 struct perf_event *event, *tmp; 7237 struct perf_event *event, *tmp;
5734 7238
7239 perf_pmu_rotate_stop(ctx->pmu);
7240
5735 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 7241 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5736 __perf_event_remove_from_context(event); 7242 __perf_remove_from_context(event);
5737 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 7243 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5738 __perf_event_remove_from_context(event); 7244 __perf_remove_from_context(event);
7245}
7246
7247static void perf_event_exit_cpu_context(int cpu)
7248{
7249 struct perf_event_context *ctx;
7250 struct pmu *pmu;
7251 int idx;
7252
7253 idx = srcu_read_lock(&pmus_srcu);
7254 list_for_each_entry_rcu(pmu, &pmus, entry) {
7255 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
7256
7257 mutex_lock(&ctx->mutex);
7258 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
7259 mutex_unlock(&ctx->mutex);
7260 }
7261 srcu_read_unlock(&pmus_srcu, idx);
5739} 7262}
7263
5740static void perf_event_exit_cpu(int cpu) 7264static void perf_event_exit_cpu(int cpu)
5741{ 7265{
5742 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 7266 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5743 struct perf_event_context *ctx = &cpuctx->ctx;
5744 7267
5745 mutex_lock(&cpuctx->hlist_mutex); 7268 mutex_lock(&swhash->hlist_mutex);
5746 swevent_hlist_release(cpuctx); 7269 swevent_hlist_release(swhash);
5747 mutex_unlock(&cpuctx->hlist_mutex); 7270 mutex_unlock(&swhash->hlist_mutex);
5748 7271
5749 mutex_lock(&ctx->mutex); 7272 perf_event_exit_cpu_context(cpu);
5750 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5751 mutex_unlock(&ctx->mutex);
5752} 7273}
5753#else 7274#else
5754static inline void perf_event_exit_cpu(int cpu) { } 7275static inline void perf_event_exit_cpu(int cpu) { }
5755#endif 7276#endif
5756 7277
7278static int
7279perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
7280{
7281 int cpu;
7282
7283 for_each_online_cpu(cpu)
7284 perf_event_exit_cpu(cpu);
7285
7286 return NOTIFY_OK;
7287}
7288
7289/*
7290 * Run the perf reboot notifier at the very last possible moment so that
7291 * the generic watchdog code runs as long as possible.
7292 */
7293static struct notifier_block perf_reboot_notifier = {
7294 .notifier_call = perf_reboot,
7295 .priority = INT_MIN,
7296};
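
For comparison, any module can hook the same reboot notifier chain; a minimal sketch follows (the example_* names and the message are made up, and priority 0 is the ordinary default — the point of the perf notifier above is that INT_MIN makes it run after everybody else):

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int example_reboot(struct notifier_block *nb, unsigned long action,
                          void *data)
{
        pr_info("example: system going down (action=%lu)\n", action);
        return NOTIFY_DONE;
}

static struct notifier_block example_reboot_nb = {
        .notifier_call  = example_reboot,
        .priority       = 0,    /* default ordering; perf deliberately uses INT_MIN */
};

static int __init example_init(void)
{
        return register_reboot_notifier(&example_reboot_nb);
}

static void __exit example_exit(void)
{
        unregister_reboot_notifier(&example_reboot_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
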
7297
5757static int __cpuinit 7298static int __cpuinit
5758perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) 7299perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5759{ 7300{
@@ -5778,118 +7319,115 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5778 return NOTIFY_OK; 7319 return NOTIFY_OK;
5779} 7320}
5780 7321
5781/*
5782 * This has to have a higher priority than migration_notifier in sched.c.
5783 */
5784static struct notifier_block __cpuinitdata perf_cpu_nb = {
5785 .notifier_call = perf_cpu_notify,
5786 .priority = 20,
5787};
5788
5789void __init perf_event_init(void) 7322void __init perf_event_init(void)
5790{ 7323{
7324 int ret;
7325
7326 idr_init(&pmu_idr);
7327
5791 perf_event_init_all_cpus(); 7328 perf_event_init_all_cpus();
5792 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 7329 init_srcu_struct(&pmus_srcu);
5793 (void *)(long)smp_processor_id()); 7330 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
5794 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 7331 perf_pmu_register(&perf_cpu_clock, NULL, -1);
5795 (void *)(long)smp_processor_id()); 7332 perf_pmu_register(&perf_task_clock, NULL, -1);
5796 register_cpu_notifier(&perf_cpu_nb); 7333 perf_tp_register();
5797} 7334 perf_cpu_notifier(perf_cpu_notify);
7335 register_reboot_notifier(&perf_reboot_notifier);
5798 7336
5799static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, 7337 ret = init_hw_breakpoint();
5800 struct sysdev_class_attribute *attr, 7338 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
5801 char *buf)
5802{
5803 return sprintf(buf, "%d\n", perf_reserved_percpu);
5804} 7339}
5805 7340
5806static ssize_t 7341static int __init perf_event_sysfs_init(void)
5807perf_set_reserve_percpu(struct sysdev_class *class,
5808 struct sysdev_class_attribute *attr,
5809 const char *buf,
5810 size_t count)
5811{ 7342{
5812 struct perf_cpu_context *cpuctx; 7343 struct pmu *pmu;
5813 unsigned long val; 7344 int ret;
5814 int err, cpu, mpt;
5815 7345
5816 err = strict_strtoul(buf, 10, &val); 7346 mutex_lock(&pmus_lock);
5817 if (err) 7347
5818 return err; 7348 ret = bus_register(&pmu_bus);
5819 if (val > perf_max_events) 7349 if (ret)
5820 return -EINVAL; 7350 goto unlock;
7351
7352 list_for_each_entry(pmu, &pmus, entry) {
7353 if (!pmu->name || pmu->type < 0)
7354 continue;
5821 7355
5822 spin_lock(&perf_resource_lock); 7356 ret = pmu_dev_alloc(pmu);
5823 perf_reserved_percpu = val; 7357 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
5824 for_each_online_cpu(cpu) {
5825 cpuctx = &per_cpu(perf_cpu_context, cpu);
5826 raw_spin_lock_irq(&cpuctx->ctx.lock);
5827 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5828 perf_max_events - perf_reserved_percpu);
5829 cpuctx->max_pertask = mpt;
5830 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5831 } 7358 }
5832 spin_unlock(&perf_resource_lock); 7359 pmu_bus_running = 1;
7360 ret = 0;
5833 7361
5834 return count; 7362unlock:
5835} 7363 mutex_unlock(&pmus_lock);
5836 7364
5837static ssize_t perf_show_overcommit(struct sysdev_class *class, 7365 return ret;
5838 struct sysdev_class_attribute *attr,
5839 char *buf)
5840{
5841 return sprintf(buf, "%d\n", perf_overcommit);
5842} 7366}
7367device_initcall(perf_event_sysfs_init);
5843 7368
5844static ssize_t 7369#ifdef CONFIG_CGROUP_PERF
5845perf_set_overcommit(struct sysdev_class *class, 7370static struct cgroup_subsys_state *perf_cgroup_create(
5846 struct sysdev_class_attribute *attr, 7371 struct cgroup_subsys *ss, struct cgroup *cont)
5847 const char *buf, size_t count)
5848{ 7372{
5849 unsigned long val; 7373 struct perf_cgroup *jc;
5850 int err;
5851 7374
5852 err = strict_strtoul(buf, 10, &val); 7375 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
5853 if (err) 7376 if (!jc)
5854 return err; 7377 return ERR_PTR(-ENOMEM);
5855 if (val > 1)
5856 return -EINVAL;
5857 7378
5858 spin_lock(&perf_resource_lock); 7379 jc->info = alloc_percpu(struct perf_cgroup_info);
5859 perf_overcommit = val; 7380 if (!jc->info) {
5860 spin_unlock(&perf_resource_lock); 7381 kfree(jc);
7382 return ERR_PTR(-ENOMEM);
7383 }
5861 7384
5862 return count; 7385 return &jc->css;
5863} 7386}
5864 7387
5865static SYSDEV_CLASS_ATTR( 7388static void perf_cgroup_destroy(struct cgroup_subsys *ss,
5866 reserve_percpu, 7389 struct cgroup *cont)
5867 0644, 7390{
5868 perf_show_reserve_percpu, 7391 struct perf_cgroup *jc;
5869 perf_set_reserve_percpu 7392 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
5870 ); 7393 struct perf_cgroup, css);
5871 7394 free_percpu(jc->info);
5872static SYSDEV_CLASS_ATTR( 7395 kfree(jc);
5873 overcommit, 7396}
5874 0644,
5875 perf_show_overcommit,
5876 perf_set_overcommit
5877 );
5878 7397
5879static struct attribute *perfclass_attrs[] = { 7398static int __perf_cgroup_move(void *info)
5880 &attr_reserve_percpu.attr, 7399{
5881 &attr_overcommit.attr, 7400 struct task_struct *task = info;
5882 NULL 7401 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
5883}; 7402 return 0;
7403}
5884 7404
5885static struct attribute_group perfclass_attr_group = { 7405static void
5886 .attrs = perfclass_attrs, 7406perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task)
5887 .name = "perf_events", 7407{
5888}; 7408 task_function_call(task, __perf_cgroup_move, task);
7409}
5889 7410
5890static int __init perf_event_sysfs_init(void) 7411static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7412 struct cgroup *old_cgrp, struct task_struct *task)
5891{ 7413{
5892 return sysfs_create_group(&cpu_sysdev_class.kset.kobj, 7414 /*
5893 &perfclass_attr_group); 7415 * cgroup_exit() is called in the copy_process() failure path.
7416 * Ignore this case since the task hasn't ran yet, this avoids
7417 * trying to poke a half freed task state from generic code.
7418 */
7419 if (!(task->flags & PF_EXITING))
7420 return;
7421
7422 perf_cgroup_attach_task(cgrp, task);
5894} 7423}
5895device_initcall(perf_event_sysfs_init); 7424
7425struct cgroup_subsys perf_subsys = {
7426 .name = "perf_event",
7427 .subsys_id = perf_subsys_id,
7428 .create = perf_cgroup_create,
7429 .destroy = perf_cgroup_destroy,
7430 .exit = perf_cgroup_exit,
7431 .attach_task = perf_cgroup_attach_task,
7432};
7433#endif /* CONFIG_CGROUP_PERF */
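
A hedged sketch of how the new perf_event cgroup controller can be driven from user space: assuming the controller is mounted at /sys/fs/cgroup/perf_event and a group named "mygrp" already exists, the cgroup directory fd is passed in place of a pid together with PERF_FLAG_PID_CGROUP (cgroup events are per-CPU, so cpu must name a real CPU). The fallback define covers older headers that lack the flag.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

#ifndef PERF_FLAG_PID_CGROUP
#define PERF_FLAG_PID_CGROUP    (1U << 2)
#endif

int main(void)
{
        struct perf_event_attr attr;
        long long count;
        int cgrp_fd, fd;

        /* assumed mount point and pre-existing group */
        cgrp_fd = open("/sys/fs/cgroup/perf_event/mygrp", O_RDONLY);
        if (cgrp_fd < 0) {
                perror("open cgroup dir");
                return 1;
        }

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;

        /* the cgroup fd takes the place of a pid; cpu 0 here */
        fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 0, -1,
                     PERF_FLAG_PID_CGROUP);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        sleep(1);
        read(fd, &count, sizeof(count));
        printf("context switches on cpu0 for the cgroup: %lld\n", count);
        return 0;
}
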
diff --git a/kernel/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index c7c2aed9e2dc..086adf25a55e 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -113,12 +113,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
113 */ 113 */
114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) 114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
115{ 115{
116 struct perf_event_context *ctx = bp->ctx; 116 struct task_struct *tsk = bp->hw.bp_target;
117 struct perf_event *iter; 117 struct perf_event *iter;
118 int count = 0; 118 int count = 0;
119 119
120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) { 120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
121 if (iter->ctx == ctx && find_slot_idx(iter) == type) 121 if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
122 count += hw_breakpoint_weight(iter); 122 count += hw_breakpoint_weight(iter);
123 } 123 }
124 124
@@ -134,7 +134,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
134 enum bp_type_idx type) 134 enum bp_type_idx type)
135{ 135{
136 int cpu = bp->cpu; 136 int cpu = bp->cpu;
137 struct task_struct *tsk = bp->ctx->task; 137 struct task_struct *tsk = bp->hw.bp_target;
138 138
139 if (cpu >= 0) { 139 if (cpu >= 0) {
140 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); 140 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -213,7 +213,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
213 int weight) 213 int weight)
214{ 214{
215 int cpu = bp->cpu; 215 int cpu = bp->cpu;
216 struct task_struct *tsk = bp->ctx->task; 216 struct task_struct *tsk = bp->hw.bp_target;
217 217
218 /* Pinned counter cpu profiling */ 218 /* Pinned counter cpu profiling */
219 if (!tsk) { 219 if (!tsk) {
@@ -433,8 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
433 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
434 struct task_struct *tsk) 434 struct task_struct *tsk)
435{ 435{
436 return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk), 436 return perf_event_create_kernel_counter(attr, -1, tsk, triggered);
437 triggered);
438} 437}
439EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
440 439
@@ -516,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
516 get_online_cpus(); 515 get_online_cpus();
517 for_each_online_cpu(cpu) { 516 for_each_online_cpu(cpu) {
518 pevent = per_cpu_ptr(cpu_events, cpu); 517 pevent = per_cpu_ptr(cpu_events, cpu);
519 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); 518 bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered);
520 519
521 *pevent = bp; 520 *pevent = bp;
522 521
@@ -566,7 +565,62 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
566 .priority = 0x7fffffff 565 .priority = 0x7fffffff
567}; 566};
568 567
569static int __init init_hw_breakpoint(void) 568static void bp_perf_event_destroy(struct perf_event *event)
569{
570 release_bp_slot(event);
571}
572
573static int hw_breakpoint_event_init(struct perf_event *bp)
574{
575 int err;
576
577 if (bp->attr.type != PERF_TYPE_BREAKPOINT)
578 return -ENOENT;
579
580 err = register_perf_hw_breakpoint(bp);
581 if (err)
582 return err;
583
584 bp->destroy = bp_perf_event_destroy;
585
586 return 0;
587}
588
589static int hw_breakpoint_add(struct perf_event *bp, int flags)
590{
591 if (!(flags & PERF_EF_START))
592 bp->hw.state = PERF_HES_STOPPED;
593
594 return arch_install_hw_breakpoint(bp);
595}
596
597static void hw_breakpoint_del(struct perf_event *bp, int flags)
598{
599 arch_uninstall_hw_breakpoint(bp);
600}
601
602static void hw_breakpoint_start(struct perf_event *bp, int flags)
603{
604 bp->hw.state = 0;
605}
606
607static void hw_breakpoint_stop(struct perf_event *bp, int flags)
608{
609 bp->hw.state = PERF_HES_STOPPED;
610}
611
612static struct pmu perf_breakpoint = {
613 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
614
615 .event_init = hw_breakpoint_event_init,
616 .add = hw_breakpoint_add,
617 .del = hw_breakpoint_del,
618 .start = hw_breakpoint_start,
619 .stop = hw_breakpoint_stop,
620 .read = hw_breakpoint_pmu_read,
621};
622
623int __init init_hw_breakpoint(void)
570{ 624{
571 unsigned int **task_bp_pinned; 625 unsigned int **task_bp_pinned;
572 int cpu, err_cpu; 626 int cpu, err_cpu;
@@ -587,6 +641,8 @@ static int __init init_hw_breakpoint(void)
587 641
588 constraints_initialized = 1; 642 constraints_initialized = 1;
589 643
644 perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
645
590 return register_die_notifier(&hw_breakpoint_exceptions_nb); 646 return register_die_notifier(&hw_breakpoint_exceptions_nb);
591 647
592 err_alloc: 648 err_alloc:
@@ -599,11 +655,5 @@ static int __init init_hw_breakpoint(void)
599 655
600 return -ENOMEM; 656 return -ENOMEM;
601} 657}
602core_initcall(init_hw_breakpoint);
603 658
604 659
605struct pmu perf_ops_bp = {
606 .enable = arch_install_hw_breakpoint,
607 .disable = arch_uninstall_hw_breakpoint,
608 .read = hw_breakpoint_pmu_read,
609};
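
With the breakpoint logic now expressed as a regular pmu, a hardware watchpoint can still be requested from user space through perf_event_open(). A minimal sketch (the watched variable and the LEN_4 choice are arbitrary; depending on perf_event_paranoid this may need elevated privileges):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

static int watched;     /* the word being watched */

int main(void)
{
        struct perf_event_attr attr;
        long long hits;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_BREAKPOINT;       /* routed to the "breakpoint" pmu */
        attr.bp_type = HW_BREAKPOINT_W;
        attr.bp_addr = (unsigned long)&watched;
        attr.bp_len = HW_BREAKPOINT_LEN_4;

        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        watched = 1;    /* each write bumps the counter */
        watched = 2;

        read(fd, &hits, sizeof(hits));
        printf("writes observed: %lld\n", hits);
        return 0;
}
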
diff --git a/kernel/exit.c b/kernel/exit.c
index b9d3bc6c21ec..64879bdff921 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -50,6 +50,7 @@
50#include <linux/perf_event.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/unistd.h> 56#include <asm/unistd.h>
@@ -70,7 +71,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
70 71
71 list_del_rcu(&p->tasks); 72 list_del_rcu(&p->tasks);
72 list_del_init(&p->sibling); 73 list_del_init(&p->sibling);
73 __get_cpu_var(process_counts)--; 74 __this_cpu_dec(process_counts);
74 } 75 }
75 list_del_rcu(&p->thread_group); 76 list_del_rcu(&p->thread_group);
76} 77}
@@ -97,6 +98,14 @@ static void __exit_signal(struct task_struct *tsk)
97 sig->tty = NULL; 98 sig->tty = NULL;
98 } else { 99 } else {
99 /* 100 /*
101 * This can only happen if the caller is de_thread().
 102 * FIXME: this is a temporary hack; we should teach
103 * posix-cpu-timers to handle this case correctly.
104 */
105 if (unlikely(has_group_leader_pid(tsk)))
106 posix_cpu_timers_exit_group(tsk);
107
108 /*
100 * If there is any task waiting for the group exit 109 * If there is any task waiting for the group exit
101 * then notify it: 110 * then notify it:
102 */ 111 */
@@ -151,9 +160,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
151{ 160{
152 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 161 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
153 162
154#ifdef CONFIG_PERF_EVENTS 163 perf_event_delayed_put(tsk);
155 WARN_ON_ONCE(tsk->perf_event_ctxp);
156#endif
157 trace_sched_process_free(tsk); 164 trace_sched_process_free(tsk);
158 put_task_struct(tsk); 165 put_task_struct(tsk);
159} 166}
@@ -556,29 +563,28 @@ void exit_files(struct task_struct *tsk)
556 563
557#ifdef CONFIG_MM_OWNER 564#ifdef CONFIG_MM_OWNER
558/* 565/*
559 * Task p is exiting and it owned mm, lets find a new owner for it 566 * A task is exiting. If it owned this mm, find a new owner for the mm.
560 */ 567 */
561static inline int
562mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
563{
564 /*
565 * If there are other users of the mm and the owner (us) is exiting
566 * we need to find a new owner to take on the responsibility.
567 */
568 if (atomic_read(&mm->mm_users) <= 1)
569 return 0;
570 if (mm->owner != p)
571 return 0;
572 return 1;
573}
574
575void mm_update_next_owner(struct mm_struct *mm) 568void mm_update_next_owner(struct mm_struct *mm)
576{ 569{
577 struct task_struct *c, *g, *p = current; 570 struct task_struct *c, *g, *p = current;
578 571
579retry: 572retry:
580 if (!mm_need_new_owner(mm, p)) 573 /*
574 * If the exiting or execing task is not the owner, it's
575 * someone else's problem.
576 */
577 if (mm->owner != p)
578 return;
579 /*
580 * The current owner is exiting/execing and there are no other
581 * candidates. Do not leave the mm pointing to a possibly
582 * freed task structure.
583 */
584 if (atomic_read(&mm->mm_users) <= 1) {
585 mm->owner = NULL;
581 return; 586 return;
587 }
582 588
583 read_lock(&tasklist_lock); 589 read_lock(&tasklist_lock);
584 /* 590 /*
@@ -691,6 +697,8 @@ static void exit_mm(struct task_struct * tsk)
691 enter_lazy_tlb(mm, current); 697 enter_lazy_tlb(mm, current);
692 /* We don't want this task to be frozen prematurely */ 698 /* We don't want this task to be frozen prematurely */
693 clear_freeze_flag(tsk); 699 clear_freeze_flag(tsk);
700 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
701 atomic_dec(&mm->oom_disable_count);
694 task_unlock(tsk); 702 task_unlock(tsk);
695 mm_update_next_owner(mm); 703 mm_update_next_owner(mm);
696 mmput(mm); 704 mmput(mm);
@@ -704,6 +712,8 @@ static void exit_mm(struct task_struct * tsk)
704 * space. 712 * space.
705 */ 713 */
706static struct task_struct *find_new_reaper(struct task_struct *father) 714static struct task_struct *find_new_reaper(struct task_struct *father)
715 __releases(&tasklist_lock)
716 __acquires(&tasklist_lock)
707{ 717{
708 struct pid_namespace *pid_ns = task_active_pid_ns(father); 718 struct pid_namespace *pid_ns = task_active_pid_ns(father);
709 struct task_struct *thread; 719 struct task_struct *thread;
@@ -832,7 +842,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
832 /* Let father know we died 842 /* Let father know we died
833 * 843 *
834 * Thread signals are configurable, but you aren't going to use 844 * Thread signals are configurable, but you aren't going to use
835 * that to send signals to arbitary processes. 845 * that to send signals to arbitrary processes.
836 * That stops right now. 846 * That stops right now.
837 * 847 *
838 * If the parent exec id doesn't match the exec id we saved 848 * If the parent exec id doesn't match the exec id we saved
@@ -899,12 +909,22 @@ NORET_TYPE void do_exit(long code)
899 profile_task_exit(tsk); 909 profile_task_exit(tsk);
900 910
901 WARN_ON(atomic_read(&tsk->fs_excl)); 911 WARN_ON(atomic_read(&tsk->fs_excl));
912 WARN_ON(blk_needs_flush_plug(tsk));
902 913
903 if (unlikely(in_interrupt())) 914 if (unlikely(in_interrupt()))
904 panic("Aiee, killing interrupt handler!"); 915 panic("Aiee, killing interrupt handler!");
905 if (unlikely(!tsk->pid)) 916 if (unlikely(!tsk->pid))
906 panic("Attempted to kill the idle task!"); 917 panic("Attempted to kill the idle task!");
907 918
919 /*
 920 * If do_exit is called because this process oopsed, it's possible
921 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
922 * continuing. Amongst other possible reasons, this is to prevent
923 * mm_release()->clear_child_tid() from writing to a user-controlled
924 * kernel address.
925 */
926 set_fs(USER_DS);
927
908 tracehook_report_exit(&code); 928 tracehook_report_exit(&code);
909 929
910 validate_creds_for_do_exit(tsk); 930 validate_creds_for_do_exit(tsk);
@@ -978,6 +998,15 @@ NORET_TYPE void do_exit(long code)
978 exit_fs(tsk); 998 exit_fs(tsk);
979 check_stack_usage(); 999 check_stack_usage();
980 exit_thread(); 1000 exit_thread();
1001
1002 /*
1003 * Flush inherited counters to the parent - before the parent
1004 * gets woken up by child-exit notifications.
1005 *
1006 * because of cgroup mode, must be called before cgroup_exit()
1007 */
1008 perf_event_exit_task(tsk);
1009
981 cgroup_exit(tsk, 1); 1010 cgroup_exit(tsk, 1);
982 1011
983 if (group_dead) 1012 if (group_dead)
@@ -990,12 +1019,7 @@ NORET_TYPE void do_exit(long code)
990 /* 1019 /*
991 * FIXME: do that only when needed, using sched_exit tracepoint 1020 * FIXME: do that only when needed, using sched_exit tracepoint
992 */ 1021 */
993 flush_ptrace_hw_breakpoint(tsk); 1022 ptrace_put_breakpoints(tsk);
994 /*
995 * Flush inherited counters to the parent - before the parent
996 * gets woken up by child-exit notifications.
997 */
998 perf_event_exit_task(tsk);
999 1023
1000 exit_notify(tsk, group_dead); 1024 exit_notify(tsk, group_dead);
1001#ifdef CONFIG_NUMA 1025#ifdef CONFIG_NUMA
@@ -1356,11 +1380,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
1356 return NULL; 1380 return NULL;
1357} 1381}
1358 1382
1359/* 1383/**
1360 * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold 1384 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1361 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1385 * @wo: wait options
1362 * the lock and this task is uninteresting. If we return nonzero, we have 1386 * @ptrace: is the wait for ptrace
1363 * released the lock and the system call should return. 1387 * @p: task to wait for
1388 *
1389 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1390 *
1391 * CONTEXT:
1392 * read_lock(&tasklist_lock), which is released if return value is
1393 * non-zero. Also, grabs and releases @p->sighand->siglock.
1394 *
1395 * RETURNS:
1396 * 0 if wait condition didn't exist and search for other wait conditions
1397 * should continue. Non-zero return, -errno on failure and @p's pid on
1398 * success, implies that tasklist_lock is released and wait condition
1399 * search should terminate.
1364 */ 1400 */
1365static int wait_task_stopped(struct wait_opts *wo, 1401static int wait_task_stopped(struct wait_opts *wo,
1366 int ptrace, struct task_struct *p) 1402 int ptrace, struct task_struct *p)
@@ -1376,6 +1412,9 @@ static int wait_task_stopped(struct wait_opts *wo,
1376 if (!ptrace && !(wo->wo_flags & WUNTRACED)) 1412 if (!ptrace && !(wo->wo_flags & WUNTRACED))
1377 return 0; 1413 return 0;
1378 1414
1415 if (!task_stopped_code(p, ptrace))
1416 return 0;
1417
1379 exit_code = 0; 1418 exit_code = 0;
1380 spin_lock_irq(&p->sighand->siglock); 1419 spin_lock_irq(&p->sighand->siglock);
1381 1420
@@ -1517,33 +1556,84 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1517 return 0; 1556 return 0;
1518 } 1557 }
1519 1558
1520 if (likely(!ptrace) && unlikely(task_ptrace(p))) { 1559 /* dead body doesn't have much to contribute */
1560 if (p->exit_state == EXIT_DEAD)
1561 return 0;
1562
1563 /* slay zombie? */
1564 if (p->exit_state == EXIT_ZOMBIE) {
1521 /* 1565 /*
1522 * This child is hidden by ptrace. 1566 * A zombie ptracee is only visible to its ptracer.
1523 * We aren't allowed to see it now, but eventually we will. 1567 * Notification and reaping will be cascaded to the real
1568 * parent when the ptracer detaches.
1569 */
1570 if (likely(!ptrace) && unlikely(task_ptrace(p))) {
1571 /* it will become visible, clear notask_error */
1572 wo->notask_error = 0;
1573 return 0;
1574 }
1575
1576 /* we don't reap group leaders with subthreads */
1577 if (!delay_group_leader(p))
1578 return wait_task_zombie(wo, p);
1579
1580 /*
1581 * Allow access to stopped/continued state via zombie by
1582 * falling through. Clearing of notask_error is complex.
1583 *
1584 * When !@ptrace:
1585 *
1586 * If WEXITED is set, notask_error should naturally be
1587 * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
1588 * so, if there are live subthreads, there are events to
1589 * wait for. If all subthreads are dead, it's still safe
1590 * to clear - this function will be called again in finite
1591 * amount time once all the subthreads are released and
1592 * will then return without clearing.
1593 *
1594 * When @ptrace:
1595 *
1596 * Stopped state is per-task and thus can't change once the
1597 * target task dies. Only continued and exited can happen.
1598 * Clear notask_error if WCONTINUED | WEXITED.
1599 */
1600 if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1601 wo->notask_error = 0;
1602 } else {
1603 /*
1604 * If @p is ptraced by a task in its real parent's group,
1605 * hide group stop/continued state when looking at @p as
1606 * the real parent; otherwise, a single stop can be
1607 * reported twice as group and ptrace stops.
1608 *
1609 * If a ptracer wants to distinguish the two events for its
1610 * own children, it should create a separate process which
1611 * takes the role of real parent.
1612 */
1613 if (likely(!ptrace) && task_ptrace(p) &&
1614 same_thread_group(p->parent, p->real_parent))
1615 return 0;
1616
1617 /*
1618 * @p is alive and it's gonna stop, continue or exit, so
1619 * there always is something to wait for.
1524 */ 1620 */
1525 wo->notask_error = 0; 1621 wo->notask_error = 0;
1526 return 0;
1527 } 1622 }
1528 1623
1529 if (p->exit_state == EXIT_DEAD)
1530 return 0;
1531
1532 /* 1624 /*
1533 * We don't reap group leaders with subthreads. 1625 * Wait for stopped. Depending on @ptrace, different stopped state
1626 * is used and the two don't interact with each other.
1534 */ 1627 */
1535 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) 1628 ret = wait_task_stopped(wo, ptrace, p);
1536 return wait_task_zombie(wo, p); 1629 if (ret)
1630 return ret;
1537 1631
1538 /* 1632 /*
1539 * It's stopped or running now, so it might 1633 * Wait for continued. There's only one continued state and the
1540 * later continue, exit, or stop again. 1634 * ptracer can consume it which can confuse the real parent. Don't
1635 * use WCONTINUED from ptracer. You don't need or want it.
1541 */ 1636 */
1542 wo->notask_error = 0;
1543
1544 if (task_stopped_code(p, ptrace))
1545 return wait_task_stopped(wo, ptrace, p);
1546
1547 return wait_task_continued(wo, p); 1637 return wait_task_continued(wo, p);
1548} 1638}
1549 1639
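
The stopped/continued states that this rework juggles are the ones user space observes through waitpid(). A small illustrative program, independent of the patch itself:

#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        int status;
        pid_t pid = fork();

        if (pid == 0) {                 /* child just idles */
                pause();
                _exit(0);
        }

        kill(pid, SIGSTOP);
        waitpid(pid, &status, WUNTRACED);
        if (WIFSTOPPED(status))
                printf("child stopped by signal %d\n", WSTOPSIG(status));

        kill(pid, SIGCONT);
        waitpid(pid, &status, WCONTINUED);
        if (WIFCONTINUED(status))
                printf("child continued\n");

        kill(pid, SIGKILL);
        waitpid(pid, &status, 0);       /* reap the zombie */
        return 0;
}
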
diff --git a/kernel/extable.c b/kernel/extable.c
index 7f8f263f8524..5339705b8241 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -72,6 +72,24 @@ int core_kernel_text(unsigned long addr)
72 return 0; 72 return 0;
73} 73}
74 74
75/**
76 * core_kernel_data - tell if addr points to kernel data
77 * @addr: address to test
78 *
79 * Returns true if @addr passed in is from the core kernel data
80 * section.
81 *
82 * Note: On some archs it may return true for core RODATA, and false
83 * for others. But will always be true for core RW data.
84 */
85int core_kernel_data(unsigned long addr)
86{
87 if (addr >= (unsigned long)_sdata &&
88 addr < (unsigned long)_edata)
89 return 1;
90 return 0;
91}
92
75int __kernel_text_address(unsigned long addr) 93int __kernel_text_address(unsigned long addr)
76{ 94{
77 if (core_kernel_text(addr)) 95 if (core_kernel_text(addr))
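
The new helper is aimed at callers that must decide whether a structure handed to them lives in static kernel data (ftrace, for instance, uses it to distinguish statically defined ops from dynamically allocated ones). A short, hypothetical fragment — not a standalone module, and the ops_is_core_static() name is made up — showing the intended pattern:

#include <linux/kernel.h>

/* hypothetical helper: true when "ops" sits in the core kernel's .data
 * section, i.e. it was neither allocated dynamically nor defined in a module */
static bool ops_is_core_static(void *ops)
{
        return core_kernel_data((unsigned long)ops);
}
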
diff --git a/kernel/fork.c b/kernel/fork.c
index ab7f29d906c7..25c6111fe3a6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
40#include <linux/tracehook.h> 40#include <linux/tracehook.h>
41#include <linux/futex.h> 41#include <linux/futex.h>
42#include <linux/compat.h> 42#include <linux/compat.h>
43#include <linux/kthread.h>
43#include <linux/task_io_accounting_ops.h> 44#include <linux/task_io_accounting_ops.h>
44#include <linux/rcupdate.h> 45#include <linux/rcupdate.h>
45#include <linux/ptrace.h> 46#include <linux/ptrace.h>
@@ -58,13 +59,14 @@
58#include <linux/taskstats_kern.h> 59#include <linux/taskstats_kern.h>
59#include <linux/random.h> 60#include <linux/random.h>
60#include <linux/tty.h> 61#include <linux/tty.h>
61#include <linux/proc_fs.h>
62#include <linux/blkdev.h> 62#include <linux/blkdev.h>
63#include <linux/fs_struct.h> 63#include <linux/fs_struct.h>
64#include <linux/magic.h> 64#include <linux/magic.h>
65#include <linux/perf_event.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h> 67#include <linux/user-return-notifier.h>
68#include <linux/oom.h>
69#include <linux/khugepaged.h>
68 70
69#include <asm/pgtable.h> 71#include <asm/pgtable.h>
70#include <asm/pgalloc.h> 72#include <asm/pgalloc.h>
@@ -110,20 +112,25 @@ int nr_processes(void)
110} 112}
111 113
112#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 114#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
113# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) 115# define alloc_task_struct_node(node) \
114# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) 116 kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
117# define free_task_struct(tsk) \
118 kmem_cache_free(task_struct_cachep, (tsk))
115static struct kmem_cache *task_struct_cachep; 119static struct kmem_cache *task_struct_cachep;
116#endif 120#endif
117 121
118#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR 122#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
119static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) 123static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
124 int node)
120{ 125{
121#ifdef CONFIG_DEBUG_STACK_USAGE 126#ifdef CONFIG_DEBUG_STACK_USAGE
122 gfp_t mask = GFP_KERNEL | __GFP_ZERO; 127 gfp_t mask = GFP_KERNEL | __GFP_ZERO;
123#else 128#else
124 gfp_t mask = GFP_KERNEL; 129 gfp_t mask = GFP_KERNEL;
125#endif 130#endif
126 return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); 131 struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
132
133 return page ? page_address(page) : NULL;
127} 134}
128 135
129static inline void free_thread_info(struct thread_info *ti) 136static inline void free_thread_info(struct thread_info *ti)
@@ -171,6 +178,7 @@ EXPORT_SYMBOL(free_task);
171static inline void free_signal_struct(struct signal_struct *sig) 178static inline void free_signal_struct(struct signal_struct *sig)
172{ 179{
173 taskstats_tgid_free(sig); 180 taskstats_tgid_free(sig);
181 sched_autogroup_exit(sig);
174 kmem_cache_free(signal_cachep, sig); 182 kmem_cache_free(signal_cachep, sig);
175} 183}
176 184
@@ -194,6 +202,7 @@ void __put_task_struct(struct task_struct *tsk)
194 if (!profile_handoff_task(tsk)) 202 if (!profile_handoff_task(tsk))
195 free_task(tsk); 203 free_task(tsk);
196} 204}
205EXPORT_SYMBOL_GPL(__put_task_struct);
197 206
198/* 207/*
199 * macro override instead of weak attribute alias, to workaround 208 * macro override instead of weak attribute alias, to workaround
@@ -249,16 +258,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
249 struct task_struct *tsk; 258 struct task_struct *tsk;
250 struct thread_info *ti; 259 struct thread_info *ti;
251 unsigned long *stackend; 260 unsigned long *stackend;
252 261 int node = tsk_fork_get_node(orig);
253 int err; 262 int err;
254 263
255 prepare_to_copy(orig); 264 prepare_to_copy(orig);
256 265
257 tsk = alloc_task_struct(); 266 tsk = alloc_task_struct_node(node);
258 if (!tsk) 267 if (!tsk)
259 return NULL; 268 return NULL;
260 269
261 ti = alloc_thread_info(tsk); 270 ti = alloc_thread_info_node(tsk, node);
262 if (!ti) { 271 if (!ti) {
263 free_task_struct(tsk); 272 free_task_struct(tsk);
264 return NULL; 273 return NULL;
@@ -279,6 +288,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
279 288
280 setup_thread_stack(tsk, orig); 289 setup_thread_stack(tsk, orig);
281 clear_user_return_notifier(tsk); 290 clear_user_return_notifier(tsk);
291 clear_tsk_need_resched(tsk);
282 stackend = end_of_stack(tsk); 292 stackend = end_of_stack(tsk);
283 *stackend = STACK_END_MAGIC; /* for overflow detection */ 293 *stackend = STACK_END_MAGIC; /* for overflow detection */
284 294
@@ -334,6 +344,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
334 retval = ksm_fork(mm, oldmm); 344 retval = ksm_fork(mm, oldmm);
335 if (retval) 345 if (retval)
336 goto out; 346 goto out;
347 retval = khugepaged_fork(mm, oldmm);
348 if (retval)
349 goto out;
337 350
338 prev = NULL; 351 prev = NULL;
339 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 352 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -376,15 +389,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
376 get_file(file); 389 get_file(file);
377 if (tmp->vm_flags & VM_DENYWRITE) 390 if (tmp->vm_flags & VM_DENYWRITE)
378 atomic_dec(&inode->i_writecount); 391 atomic_dec(&inode->i_writecount);
379 spin_lock(&mapping->i_mmap_lock); 392 mutex_lock(&mapping->i_mmap_mutex);
380 if (tmp->vm_flags & VM_SHARED) 393 if (tmp->vm_flags & VM_SHARED)
381 mapping->i_mmap_writable++; 394 mapping->i_mmap_writable++;
382 tmp->vm_truncate_count = mpnt->vm_truncate_count;
383 flush_dcache_mmap_lock(mapping); 395 flush_dcache_mmap_lock(mapping);
384 /* insert tmp into the share list, just after mpnt */ 396 /* insert tmp into the share list, just after mpnt */
385 vma_prio_tree_add(tmp, mpnt); 397 vma_prio_tree_add(tmp, mpnt);
386 flush_dcache_mmap_unlock(mapping); 398 flush_dcache_mmap_unlock(mapping);
387 spin_unlock(&mapping->i_mmap_lock); 399 mutex_unlock(&mapping->i_mmap_mutex);
388 } 400 }
389 401
390 /* 402 /*
@@ -495,6 +507,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
495 mm->cached_hole_size = ~0UL; 507 mm->cached_hole_size = ~0UL;
496 mm_init_aio(mm); 508 mm_init_aio(mm);
497 mm_init_owner(mm, p); 509 mm_init_owner(mm, p);
510 atomic_set(&mm->oom_disable_count, 0);
498 511
499 if (likely(!mm_alloc_pgd(mm))) { 512 if (likely(!mm_alloc_pgd(mm))) {
500 mm->def_flags = 0; 513 mm->def_flags = 0;
@@ -514,11 +527,12 @@ struct mm_struct * mm_alloc(void)
514 struct mm_struct * mm; 527 struct mm_struct * mm;
515 528
516 mm = allocate_mm(); 529 mm = allocate_mm();
517 if (mm) { 530 if (!mm)
518 memset(mm, 0, sizeof(*mm)); 531 return NULL;
519 mm = mm_init(mm, current); 532
520 } 533 memset(mm, 0, sizeof(*mm));
521 return mm; 534 mm_init_cpumask(mm);
535 return mm_init(mm, current);
522} 536}
523 537
524/* 538/*
@@ -532,6 +546,9 @@ void __mmdrop(struct mm_struct *mm)
532 mm_free_pgd(mm); 546 mm_free_pgd(mm);
533 destroy_context(mm); 547 destroy_context(mm);
534 mmu_notifier_mm_destroy(mm); 548 mmu_notifier_mm_destroy(mm);
549#ifdef CONFIG_TRANSPARENT_HUGEPAGE
550 VM_BUG_ON(mm->pmd_huge_pte);
551#endif
535 free_mm(mm); 552 free_mm(mm);
536} 553}
537EXPORT_SYMBOL_GPL(__mmdrop); 554EXPORT_SYMBOL_GPL(__mmdrop);
@@ -546,6 +563,7 @@ void mmput(struct mm_struct *mm)
546 if (atomic_dec_and_test(&mm->mm_users)) { 563 if (atomic_dec_and_test(&mm->mm_users)) {
547 exit_aio(mm); 564 exit_aio(mm);
548 ksm_exit(mm); 565 ksm_exit(mm);
566 khugepaged_exit(mm); /* must run before exit_mmap */
549 exit_mmap(mm); 567 exit_mmap(mm);
550 set_mm_exe_file(mm, NULL); 568 set_mm_exe_file(mm, NULL);
551 if (!list_empty(&mm->mmlist)) { 569 if (!list_empty(&mm->mmlist)) {
@@ -561,6 +579,57 @@ void mmput(struct mm_struct *mm)
561} 579}
562EXPORT_SYMBOL_GPL(mmput); 580EXPORT_SYMBOL_GPL(mmput);
563 581
582/*
583 * We added or removed a vma mapping the executable. The vmas are only mapped
584 * during exec and are not mapped with the mmap system call.
585 * Callers must hold down_write() on the mm's mmap_sem for these
586 */
587void added_exe_file_vma(struct mm_struct *mm)
588{
589 mm->num_exe_file_vmas++;
590}
591
592void removed_exe_file_vma(struct mm_struct *mm)
593{
594 mm->num_exe_file_vmas--;
595 if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
596 fput(mm->exe_file);
597 mm->exe_file = NULL;
598 }
599
600}
601
602void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
603{
604 if (new_exe_file)
605 get_file(new_exe_file);
606 if (mm->exe_file)
607 fput(mm->exe_file);
608 mm->exe_file = new_exe_file;
609 mm->num_exe_file_vmas = 0;
610}
611
612struct file *get_mm_exe_file(struct mm_struct *mm)
613{
614 struct file *exe_file;
615
616 /* We need mmap_sem to protect against races with removal of
617 * VM_EXECUTABLE vmas */
618 down_read(&mm->mmap_sem);
619 exe_file = mm->exe_file;
620 if (exe_file)
621 get_file(exe_file);
622 up_read(&mm->mmap_sem);
623 return exe_file;
624}
625
626static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
627{
628 /* It's safe to write the exe_file pointer without exe_file_lock because
629 * this is called during fork when the task is not yet in /proc */
630 newmm->exe_file = get_mm_exe_file(oldmm);
631}
632
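
These exe_file helpers track the struct file of the executable mapping and, among other things, back the /proc/<pid>/exe symlink. From user space the reference is simply visible as a readlink target, e.g.:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char path[4096];
        ssize_t n = readlink("/proc/self/exe", path, sizeof(path) - 1);

        if (n < 0) {
                perror("readlink");
                return 1;
        }
        path[n] = '\0';
        printf("running from %s\n", path);
        return 0;
}
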
564/** 633/**
565 * get_task_mm - acquire a reference to the task's mm 634 * get_task_mm - acquire a reference to the task's mm
566 * 635 *
@@ -667,11 +736,16 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
667 goto fail_nomem; 736 goto fail_nomem;
668 737
669 memcpy(mm, oldmm, sizeof(*mm)); 738 memcpy(mm, oldmm, sizeof(*mm));
739 mm_init_cpumask(mm);
670 740
671 /* Initializing for Swap token stuff */ 741 /* Initializing for Swap token stuff */
672 mm->token_priority = 0; 742 mm->token_priority = 0;
673 mm->last_interval = 0; 743 mm->last_interval = 0;
674 744
745#ifdef CONFIG_TRANSPARENT_HUGEPAGE
746 mm->pmd_huge_pte = NULL;
747#endif
748
675 if (!mm_init(mm, tsk)) 749 if (!mm_init(mm, tsk))
676 goto fail_nomem; 750 goto fail_nomem;
677 751
@@ -748,6 +822,8 @@ good_mm:
748 /* Initializing for Swap token stuff */ 822 /* Initializing for Swap token stuff */
749 mm->token_priority = 0; 823 mm->token_priority = 0;
750 mm->last_interval = 0; 824 mm->last_interval = 0;
825 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
826 atomic_inc(&mm->oom_disable_count);
751 827
752 tsk->mm = mm; 828 tsk->mm = mm;
753 tsk->active_mm = mm; 829 tsk->active_mm = mm;
@@ -907,9 +983,17 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
907 posix_cpu_timers_init_group(sig); 983 posix_cpu_timers_init_group(sig);
908 984
909 tty_audit_fork(sig); 985 tty_audit_fork(sig);
986 sched_autogroup_fork(sig);
987
988#ifdef CONFIG_CGROUPS
989 init_rwsem(&sig->threadgroup_fork_lock);
990#endif
910 991
911 sig->oom_adj = current->signal->oom_adj; 992 sig->oom_adj = current->signal->oom_adj;
912 sig->oom_score_adj = current->signal->oom_score_adj; 993 sig->oom_score_adj = current->signal->oom_score_adj;
994 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
995
996 mutex_init(&sig->cred_guard_mutex);
913 997
914 return 0; 998 return 0;
915} 999}
@@ -1081,12 +1165,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1081 1165
1082 posix_cpu_timers_init(p); 1166 posix_cpu_timers_init(p);
1083 1167
1084 p->lock_depth = -1; /* -1 = no lock */
1085 do_posix_clock_monotonic_gettime(&p->start_time); 1168 do_posix_clock_monotonic_gettime(&p->start_time);
1086 p->real_start_time = p->start_time; 1169 p->real_start_time = p->start_time;
1087 monotonic_to_bootbased(&p->real_start_time); 1170 monotonic_to_bootbased(&p->real_start_time);
1088 p->io_context = NULL; 1171 p->io_context = NULL;
1089 p->audit_context = NULL; 1172 p->audit_context = NULL;
1173 if (clone_flags & CLONE_THREAD)
1174 threadgroup_fork_read_lock(current);
1090 cgroup_fork(p); 1175 cgroup_fork(p);
1091#ifdef CONFIG_NUMA 1176#ifdef CONFIG_NUMA
1092 p->mempolicy = mpol_dup(p->mempolicy); 1177 p->mempolicy = mpol_dup(p->mempolicy);
@@ -1131,7 +1216,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1131#endif 1216#endif
1132 1217
1133 /* Perform scheduler related setup. Assign this task to a CPU. */ 1218 /* Perform scheduler related setup. Assign this task to a CPU. */
1134 sched_fork(p, clone_flags); 1219 sched_fork(p);
1135 1220
1136 retval = perf_event_init_task(p); 1221 retval = perf_event_init_task(p);
1137 if (retval) 1222 if (retval)
@@ -1165,12 +1250,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1165 pid = alloc_pid(p->nsproxy->pid_ns); 1250 pid = alloc_pid(p->nsproxy->pid_ns);
1166 if (!pid) 1251 if (!pid)
1167 goto bad_fork_cleanup_io; 1252 goto bad_fork_cleanup_io;
1168
1169 if (clone_flags & CLONE_NEWPID) {
1170 retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
1171 if (retval < 0)
1172 goto bad_fork_free_pid;
1173 }
1174 } 1253 }
1175 1254
1176 p->pid = pid_nr(pid); 1255 p->pid = pid_nr(pid);
@@ -1178,17 +1257,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1178 if (clone_flags & CLONE_THREAD) 1257 if (clone_flags & CLONE_THREAD)
1179 p->tgid = current->tgid; 1258 p->tgid = current->tgid;
1180 1259
1181 if (current->nsproxy != p->nsproxy) {
1182 retval = ns_cgroup_clone(p, pid);
1183 if (retval)
1184 goto bad_fork_free_pid;
1185 }
1186
1187 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1260 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1188 /* 1261 /*
1189 * Clear TID on mm_release()? 1262 * Clear TID on mm_release()?
1190 */ 1263 */
1191 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1264 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1265#ifdef CONFIG_BLOCK
1266 p->plug = NULL;
1267#endif
1192#ifdef CONFIG_FUTEX 1268#ifdef CONFIG_FUTEX
1193 p->robust_list = NULL; 1269 p->robust_list = NULL;
1194#ifdef CONFIG_COMPAT 1270#ifdef CONFIG_COMPAT
@@ -1274,7 +1350,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1274 tracehook_finish_clone(p, clone_flags, trace); 1350 tracehook_finish_clone(p, clone_flags, trace);
1275 1351
1276 if (thread_group_leader(p)) { 1352 if (thread_group_leader(p)) {
1277 if (clone_flags & CLONE_NEWPID) 1353 if (is_child_reaper(pid))
1278 p->nsproxy->pid_ns->child_reaper = p; 1354 p->nsproxy->pid_ns->child_reaper = p;
1279 1355
1280 p->signal->leader_pid = pid; 1356 p->signal->leader_pid = pid;
@@ -1283,7 +1359,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1283 attach_pid(p, PIDTYPE_SID, task_session(current)); 1359 attach_pid(p, PIDTYPE_SID, task_session(current));
1284 list_add_tail(&p->sibling, &p->real_parent->children); 1360 list_add_tail(&p->sibling, &p->real_parent->children);
1285 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1361 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1286 __get_cpu_var(process_counts)++; 1362 __this_cpu_inc(process_counts);
1287 } 1363 }
1288 attach_pid(p, PIDTYPE_PID, pid); 1364 attach_pid(p, PIDTYPE_PID, pid);
1289 nr_threads++; 1365 nr_threads++;
@@ -1294,6 +1370,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1294 write_unlock_irq(&tasklist_lock); 1370 write_unlock_irq(&tasklist_lock);
1295 proc_fork_connector(p); 1371 proc_fork_connector(p);
1296 cgroup_post_fork(p); 1372 cgroup_post_fork(p);
1373 if (clone_flags & CLONE_THREAD)
1374 threadgroup_fork_read_unlock(current);
1297 perf_event_fork(p); 1375 perf_event_fork(p);
1298 return p; 1376 return p;
1299 1377
@@ -1306,8 +1384,13 @@ bad_fork_cleanup_io:
1306bad_fork_cleanup_namespaces: 1384bad_fork_cleanup_namespaces:
1307 exit_task_namespaces(p); 1385 exit_task_namespaces(p);
1308bad_fork_cleanup_mm: 1386bad_fork_cleanup_mm:
1309 if (p->mm) 1387 if (p->mm) {
1388 task_lock(p);
1389 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1390 atomic_dec(&p->mm->oom_disable_count);
1391 task_unlock(p);
1310 mmput(p->mm); 1392 mmput(p->mm);
1393 }
1311bad_fork_cleanup_signal: 1394bad_fork_cleanup_signal:
1312 if (!(clone_flags & CLONE_THREAD)) 1395 if (!(clone_flags & CLONE_THREAD))
1313 free_signal_struct(p->signal); 1396 free_signal_struct(p->signal);
@@ -1327,6 +1410,8 @@ bad_fork_cleanup_policy:
1327 mpol_put(p->mempolicy); 1410 mpol_put(p->mempolicy);
1328bad_fork_cleanup_cgroup: 1411bad_fork_cleanup_cgroup:
1329#endif 1412#endif
1413 if (clone_flags & CLONE_THREAD)
1414 threadgroup_fork_read_unlock(current);
1330 cgroup_exit(p, cgroup_callbacks_done); 1415 cgroup_exit(p, cgroup_callbacks_done);
1331 delayacct_tsk_free(p); 1416 delayacct_tsk_free(p);
1332 module_put(task_thread_info(p)->exec_domain->module); 1417 module_put(task_thread_info(p)->exec_domain->module);
@@ -1403,23 +1488,6 @@ long do_fork(unsigned long clone_flags,
1403 } 1488 }
1404 1489
1405 /* 1490 /*
1406 * We hope to recycle these flags after 2.6.26
1407 */
1408 if (unlikely(clone_flags & CLONE_STOPPED)) {
1409 static int __read_mostly count = 100;
1410
1411 if (count > 0 && printk_ratelimit()) {
1412 char comm[TASK_COMM_LEN];
1413
1414 count--;
1415 printk(KERN_INFO "fork(): process `%s' used deprecated "
1416 "clone flags 0x%lx\n",
1417 get_task_comm(comm, current),
1418 clone_flags & CLONE_STOPPED);
1419 }
1420 }
1421
1422 /*
1423 * When called from kernel_thread, don't do user tracing stuff. 1491 * When called from kernel_thread, don't do user tracing stuff.
1424 */ 1492 */
1425 if (likely(user_mode(regs))) 1493 if (likely(user_mode(regs)))
@@ -1457,16 +1525,7 @@ long do_fork(unsigned long clone_flags,
1457 */ 1525 */
1458 p->flags &= ~PF_STARTING; 1526 p->flags &= ~PF_STARTING;
1459 1527
1460 if (unlikely(clone_flags & CLONE_STOPPED)) { 1528 wake_up_new_task(p);
1461 /*
1462 * We'll start up with an immediate SIGSTOP.
1463 */
1464 sigaddset(&p->pending.signal, SIGSTOP);
1465 set_tsk_thread_flag(p, TIF_SIGPENDING);
1466 __set_task_state(p, TASK_STOPPED);
1467 } else {
1468 wake_up_new_task(p, clone_flags);
1469 }
1470 1529
1471 tracehook_report_clone_complete(trace, regs, 1530 tracehook_report_clone_complete(trace, regs,
1472 clone_flags, nr, p); 1531 clone_flags, nr, p);
@@ -1510,6 +1569,13 @@ void __init proc_caches_init(void)
1510 fs_cachep = kmem_cache_create("fs_cache", 1569 fs_cachep = kmem_cache_create("fs_cache",
1511 sizeof(struct fs_struct), 0, 1570 sizeof(struct fs_struct), 0,
1512 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); 1571 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1572 /*
1573 * FIXME! The "sizeof(struct mm_struct)" currently includes the
1574 * whole struct cpumask for the OFFSTACK case. We could change
1575 * this to *only* allocate as much of it as required by the
1576 * maximum number of CPU's we can ever have. The cpumask_allocation
1577 * is at the end of the structure, exactly for that reason.
1578 */
1513 mm_cachep = kmem_cache_create("mm_struct", 1579 mm_cachep = kmem_cache_create("mm_struct",
1514 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1580 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1515 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); 1581 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
@@ -1518,38 +1584,24 @@ void __init proc_caches_init(void)
1518} 1584}
1519 1585
1520/* 1586/*
1521 * Check constraints on flags passed to the unshare system call and 1587 * Check constraints on flags passed to the unshare system call.
1522 * force unsharing of additional process context as appropriate.
1523 */ 1588 */
1524static void check_unshare_flags(unsigned long *flags_ptr) 1589static int check_unshare_flags(unsigned long unshare_flags)
1525{ 1590{
1591 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1592 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1593 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1594 return -EINVAL;
1526 /* 1595 /*
1527 * If unsharing a thread from a thread group, must also 1596 * Not implemented, but pretend it works if there is nothing to
1528 * unshare vm. 1597 * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
1529 */ 1598 * needs to unshare vm.
1530 if (*flags_ptr & CLONE_THREAD)
1531 *flags_ptr |= CLONE_VM;
1532
1533 /*
1534 * If unsharing vm, must also unshare signal handlers.
1535 */
1536 if (*flags_ptr & CLONE_VM)
1537 *flags_ptr |= CLONE_SIGHAND;
1538
1539 /*
1540 * If unsharing namespace, must also unshare filesystem information.
1541 */ 1599 */
1542 if (*flags_ptr & CLONE_NEWNS) 1600 if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
1543 *flags_ptr |= CLONE_FS; 1601 /* FIXME: get_task_mm() increments ->mm_users */
1544} 1602 if (atomic_read(&current->mm->mm_users) > 1)
1545 1603 return -EINVAL;
1546/* 1604 }
1547 * Unsharing of tasks created with CLONE_THREAD is not supported yet
1548 */
1549static int unshare_thread(unsigned long unshare_flags)
1550{
1551 if (unshare_flags & CLONE_THREAD)
1552 return -EINVAL;
1553 1605
1554 return 0; 1606 return 0;
1555} 1607}
@@ -1576,34 +1628,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1576} 1628}
1577 1629
1578/* 1630/*
1579 * Unsharing of sighand is not supported yet
1580 */
1581static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
1582{
1583 struct sighand_struct *sigh = current->sighand;
1584
1585 if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
1586 return -EINVAL;
1587 else
1588 return 0;
1589}
1590
1591/*
1592 * Unshare vm if it is being shared
1593 */
1594static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
1595{
1596 struct mm_struct *mm = current->mm;
1597
1598 if ((unshare_flags & CLONE_VM) &&
1599 (mm && atomic_read(&mm->mm_users) > 1)) {
1600 return -EINVAL;
1601 }
1602
1603 return 0;
1604}
1605
1606/*
1607 * Unshare file descriptor table if it is being shared 1631 * Unshare file descriptor table if it is being shared
1608 */ 1632 */
1609static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) 1633static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
@@ -1631,45 +1655,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
1631 */ 1655 */
1632SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) 1656SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1633{ 1657{
1634 int err = 0;
1635 struct fs_struct *fs, *new_fs = NULL; 1658 struct fs_struct *fs, *new_fs = NULL;
1636 struct sighand_struct *new_sigh = NULL;
1637 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1638 struct files_struct *fd, *new_fd = NULL; 1659 struct files_struct *fd, *new_fd = NULL;
1639 struct nsproxy *new_nsproxy = NULL; 1660 struct nsproxy *new_nsproxy = NULL;
1640 int do_sysvsem = 0; 1661 int do_sysvsem = 0;
1662 int err;
1641 1663
1642 check_unshare_flags(&unshare_flags); 1664 err = check_unshare_flags(unshare_flags);
1643 1665 if (err)
1644 /* Return -EINVAL for all unsupported flags */
1645 err = -EINVAL;
1646 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1647 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1648 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1649 goto bad_unshare_out; 1666 goto bad_unshare_out;
1650 1667
1651 /* 1668 /*
1669 * If unsharing namespace, must also unshare filesystem information.
1670 */
1671 if (unshare_flags & CLONE_NEWNS)
1672 unshare_flags |= CLONE_FS;
1673 /*
1652 * CLONE_NEWIPC must also detach from the undolist: after switching 1674 * CLONE_NEWIPC must also detach from the undolist: after switching
1653 * to a new ipc namespace, the semaphore arrays from the old 1675 * to a new ipc namespace, the semaphore arrays from the old
1654 * namespace are unreachable. 1676 * namespace are unreachable.
1655 */ 1677 */
1656 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) 1678 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1657 do_sysvsem = 1; 1679 do_sysvsem = 1;
1658 if ((err = unshare_thread(unshare_flags)))
1659 goto bad_unshare_out;
1660 if ((err = unshare_fs(unshare_flags, &new_fs))) 1680 if ((err = unshare_fs(unshare_flags, &new_fs)))
1661 goto bad_unshare_cleanup_thread; 1681 goto bad_unshare_out;
1662 if ((err = unshare_sighand(unshare_flags, &new_sigh)))
1663 goto bad_unshare_cleanup_fs;
1664 if ((err = unshare_vm(unshare_flags, &new_mm)))
1665 goto bad_unshare_cleanup_sigh;
1666 if ((err = unshare_fd(unshare_flags, &new_fd))) 1682 if ((err = unshare_fd(unshare_flags, &new_fd)))
1667 goto bad_unshare_cleanup_vm; 1683 goto bad_unshare_cleanup_fs;
1668 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1684 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1669 new_fs))) 1685 new_fs)))
1670 goto bad_unshare_cleanup_fd; 1686 goto bad_unshare_cleanup_fd;
1671 1687
1672 if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { 1688 if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
1673 if (do_sysvsem) { 1689 if (do_sysvsem) {
1674 /* 1690 /*
1675 * CLONE_SYSVSEM is equivalent to sys_exit(). 1691 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1695,15 +1711,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1695 spin_unlock(&fs->lock); 1711 spin_unlock(&fs->lock);
1696 } 1712 }
1697 1713
1698 if (new_mm) {
1699 mm = current->mm;
1700 active_mm = current->active_mm;
1701 current->mm = new_mm;
1702 current->active_mm = new_mm;
1703 activate_mm(active_mm, new_mm);
1704 new_mm = mm;
1705 }
1706
1707 if (new_fd) { 1714 if (new_fd) {
1708 fd = current->files; 1715 fd = current->files;
1709 current->files = new_fd; 1716 current->files = new_fd;
@@ -1720,20 +1727,10 @@ bad_unshare_cleanup_fd:
1720 if (new_fd) 1727 if (new_fd)
1721 put_files_struct(new_fd); 1728 put_files_struct(new_fd);
1722 1729
1723bad_unshare_cleanup_vm:
1724 if (new_mm)
1725 mmput(new_mm);
1726
1727bad_unshare_cleanup_sigh:
1728 if (new_sigh)
1729 if (atomic_dec_and_test(&new_sigh->count))
1730 kmem_cache_free(sighand_cachep, new_sigh);
1731
1732bad_unshare_cleanup_fs: 1730bad_unshare_cleanup_fs:
1733 if (new_fs) 1731 if (new_fs)
1734 free_fs_struct(new_fs); 1732 free_fs_struct(new_fs);
1735 1733
1736bad_unshare_cleanup_thread:
1737bad_unshare_out: 1734bad_unshare_out:
1738 return err; 1735 return err;
1739} 1736}
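
The simplified check_unshare_flags() above still accepts the namespace flags. A small user-space example exercising one of them (requires CAP_SYS_ADMIN; the "sandbox" hostname is arbitrary):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/utsname.h>

int main(void)
{
        struct utsname uts;

        if (unshare(CLONE_NEWUTS) < 0) {        /* one of the accepted flags */
                perror("unshare");              /* needs CAP_SYS_ADMIN */
                return 1;
        }
        sethostname("sandbox", strlen("sandbox"));
        uname(&uts);
        printf("hostname inside the new UTS namespace: %s\n", uts.nodename);
        return 0;
}
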
diff --git a/kernel/freezer.c b/kernel/freezer.c
index bd1d42b17cb2..7b01de98bb6a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -17,7 +17,7 @@ static inline void frozen_process(void)
17{ 17{
18 if (!unlikely(current->flags & PF_NOFREEZE)) { 18 if (!unlikely(current->flags & PF_NOFREEZE)) {
19 current->flags |= PF_FROZEN; 19 current->flags |= PF_FROZEN;
20 wmb(); 20 smp_wmb();
21 } 21 }
22 clear_freeze_flag(current); 22 clear_freeze_flag(current);
23} 23}
@@ -93,7 +93,7 @@ bool freeze_task(struct task_struct *p, bool sig_only)
93 * the task as frozen and next clears its TIF_FREEZE. 93 * the task as frozen and next clears its TIF_FREEZE.
94 */ 94 */
95 if (!freezing(p)) { 95 if (!freezing(p)) {
96 rmb(); 96 smp_rmb();
97 if (frozen(p)) 97 if (frozen(p))
98 return false; 98 return false;
99 99
@@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only)
104 } 104 }
105 105
106 if (should_send_signal(p)) { 106 if (should_send_signal(p)) {
107 if (!signal_pending(p)) 107 fake_signal_wake_up(p);
108 fake_signal_wake_up(p); 108 /*
109 * fake_signal_wake_up() goes through p's scheduler
110 * lock and guarantees that TASK_STOPPED/TRACED ->
111 * TASK_RUNNING transition can't race with task state
112 * testing in try_to_freeze_tasks().
113 */
109 } else if (sig_only) { 114 } else if (sig_only) {
110 return false; 115 return false;
111 } else { 116 } else {
diff --git a/kernel/futex.c b/kernel/futex.c
index 6a3a5fa1526d..fe28dc282eae 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled;
69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
70 70
71/* 71/*
72 * Futex flags used to encode options to functions and preserve them across
73 * restarts.
74 */
75#define FLAGS_SHARED 0x01
76#define FLAGS_CLOCKRT 0x02
77#define FLAGS_HAS_TIMEOUT 0x04
78
79/*
72 * Priority Inheritance state: 80 * Priority Inheritance state:
73 */ 81 */
74struct futex_pi_state { 82struct futex_pi_state {
@@ -91,6 +99,7 @@ struct futex_pi_state {
91 99
92/** 100/**
93 * struct futex_q - The hashed futex queue entry, one per waiting task 101 * struct futex_q - The hashed futex queue entry, one per waiting task
102 * @list: priority-sorted list of tasks waiting on this futex
94 * @task: the task waiting on the futex 103 * @task: the task waiting on the futex
95 * @lock_ptr: the hash bucket lock 104 * @lock_ptr: the hash bucket lock
96 * @key: the key the futex is hashed on 105 * @key: the key the futex is hashed on
@@ -104,7 +113,7 @@ struct futex_pi_state {
104 * 113 *
105 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 114 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
106 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 115 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
107 * The order of wakup is always to make the first condition true, then 116 * The order of wakeup is always to make the first condition true, then
108 * the second. 117 * the second.
109 * 118 *
110 * PI futexes are typically woken before they are removed from the hash list via 119 * PI futexes are typically woken before they are removed from the hash list via
@@ -122,6 +131,12 @@ struct futex_q {
122 u32 bitset; 131 u32 bitset;
123}; 132};
124 133
134static const struct futex_q futex_q_init = {
135 /* list gets initialized in queue_me()*/
136 .key = FUTEX_KEY_INIT,
137 .bitset = FUTEX_BITSET_MATCH_ANY
138};
139
125/* 140/*
126 * Hash buckets are shared by all the futex_keys that hash to the same 141 * Hash buckets are shared by all the futex_keys that hash to the same
127 * location. Each key may have multiple futex_q structures, one for each task 142 * location. Each key may have multiple futex_q structures, one for each task
@@ -168,7 +183,7 @@ static void get_futex_key_refs(union futex_key *key)
168 183
169 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 184 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
170 case FUT_OFF_INODE: 185 case FUT_OFF_INODE:
171 atomic_inc(&key->shared.inode->i_count); 186 ihold(key->shared.inode);
172 break; 187 break;
173 case FUT_OFF_MMSHARED: 188 case FUT_OFF_MMSHARED:
174 atomic_inc(&key->private.mm->mm_count); 189 atomic_inc(&key->private.mm->mm_count);
@@ -218,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
218{ 233{
219 unsigned long address = (unsigned long)uaddr; 234 unsigned long address = (unsigned long)uaddr;
220 struct mm_struct *mm = current->mm; 235 struct mm_struct *mm = current->mm;
221 struct page *page; 236 struct page *page, *page_head;
222 int err; 237 int err;
223 238
224 /* 239 /*
@@ -250,11 +265,46 @@ again:
250 if (err < 0) 265 if (err < 0)
251 return err; 266 return err;
252 267
253 page = compound_head(page); 268#ifdef CONFIG_TRANSPARENT_HUGEPAGE
254 lock_page(page); 269 page_head = page;
255 if (!page->mapping) { 270 if (unlikely(PageTail(page))) {
256 unlock_page(page);
257 put_page(page); 271 put_page(page);
272 /* serialize against __split_huge_page_splitting() */
273 local_irq_disable();
274 if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
275 page_head = compound_head(page);
276 /*
277 * page_head is valid pointer but we must pin
278 * it before taking the PG_lock and/or
279 * PG_compound_lock. The moment we re-enable
280 * irqs __split_huge_page_splitting() can
281 * return and the head page can be freed from
282 * under us. We can't take the PG_lock and/or
283 * PG_compound_lock on a page that could be
284 * freed from under us.
285 */
286 if (page != page_head) {
287 get_page(page_head);
288 put_page(page);
289 }
290 local_irq_enable();
291 } else {
292 local_irq_enable();
293 goto again;
294 }
295 }
296#else
297 page_head = compound_head(page);
298 if (page != page_head) {
299 get_page(page_head);
300 put_page(page);
301 }
302#endif
303
304 lock_page(page_head);
305 if (!page_head->mapping) {
306 unlock_page(page_head);
307 put_page(page_head);
258 goto again; 308 goto again;
259 } 309 }
260 310
@@ -265,25 +315,24 @@ again:
265 * it's a read-only handle, it's expected that futexes attach to 315 * it's a read-only handle, it's expected that futexes attach to
266 * the object not the particular process. 316 * the object not the particular process.
267 */ 317 */
268 if (PageAnon(page)) { 318 if (PageAnon(page_head)) {
269 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 319 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
270 key->private.mm = mm; 320 key->private.mm = mm;
271 key->private.address = address; 321 key->private.address = address;
272 } else { 322 } else {
273 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 323 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
274 key->shared.inode = page->mapping->host; 324 key->shared.inode = page_head->mapping->host;
275 key->shared.pgoff = page->index; 325 key->shared.pgoff = page_head->index;
276 } 326 }
277 327
278 get_futex_key_refs(key); 328 get_futex_key_refs(key);
279 329
280 unlock_page(page); 330 unlock_page(page_head);
281 put_page(page); 331 put_page(page_head);
282 return 0; 332 return 0;
283} 333}
284 334
285static inline 335static inline void put_futex_key(union futex_key *key)
286void put_futex_key(int fshared, union futex_key *key)
287{ 336{
288 drop_futex_key_refs(key); 337 drop_futex_key_refs(key);
289} 338}
@@ -295,7 +344,7 @@ void put_futex_key(int fshared, union futex_key *key)
295 * Slow path to fixup the fault we just took in the atomic write 344 * Slow path to fixup the fault we just took in the atomic write
296 * access to @uaddr. 345 * access to @uaddr.
297 * 346 *
298 * We have no generic implementation of a non destructive write to the 347 * We have no generic implementation of a non-destructive write to the
299 * user address. We know that we faulted in the atomic pagefault 348 * user address. We know that we faulted in the atomic pagefault
300 * disabled section so we can as well avoid the #PF overhead by 349 * disabled section so we can as well avoid the #PF overhead by
301 * calling get_user_pages() right away. 350 * calling get_user_pages() right away.
@@ -332,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
332 return NULL; 381 return NULL;
333} 382}
334 383
335static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 384static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
385 u32 uval, u32 newval)
336{ 386{
337 u32 curval; 387 int ret;
338 388
339 pagefault_disable(); 389 pagefault_disable();
340 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 390 ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
341 pagefault_enable(); 391 pagefault_enable();
342 392
343 return curval; 393 return ret;
344} 394}
345 395
346static int get_futex_value_locked(u32 *dest, u32 __user *from) 396static int get_futex_value_locked(u32 *dest, u32 __user *from)
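
Review note: the hunk above changes cmpxchg_futex_value_locked() from returning the old futex value (with -EFAULT overloaded onto that same u32) to returning an error code and storing the old value through an out-parameter, so a fault can no longer be confused with a futex that legitimately contains the bit pattern of -EFAULT. A hedged userspace model of the new calling convention; fake_user_cmpxchg stands in for futex_atomic_cmpxchg_inatomic() and is not a real kernel API.

#include <stdatomic.h>
#include <stdio.h>

#define EFAULT 14

static atomic_uint futex_word = 0;

/* Stand-in for futex_atomic_cmpxchg_inatomic(): returns 0 or -EFAULT,
 * and reports the value found at the address via *curval.             */
static int fake_user_cmpxchg(unsigned int *curval, atomic_uint *uaddr,
			     unsigned int uval, unsigned int newval)
{
	if (!uaddr)
		return -EFAULT;            /* models the page-fault case */

	unsigned int expected = uval;
	atomic_compare_exchange_strong(uaddr, &expected, newval);
	*curval = expected;                /* old value, whatever it was */
	return 0;
}

int main(void)
{
	unsigned int curval, tid = 1234;

	/* New-style caller: the return value is only success/failure,
	 * curval carries the data. Compare futex_lock_pi_atomic() above. */
	if (fake_user_cmpxchg(&curval, &futex_word, 0, tid))
		return EFAULT;
	if (curval != 0)
		printf("lost the race, owner tid %u\n",
		       curval & 0x3fffffff);          /* ~ FUTEX_TID_MASK */
	else
		printf("acquired, word now %u\n", atomic_load(&futex_word));
	return 0;
}
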
@@ -515,7 +565,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
515 */ 565 */
516 pi_state = this->pi_state; 566 pi_state = this->pi_state;
517 /* 567 /*
518 * Userspace might have messed up non PI and PI futexes 568 * Userspace might have messed up non-PI and PI futexes
519 */ 569 */
520 if (unlikely(!pi_state)) 570 if (unlikely(!pi_state))
521 return -EINVAL; 571 return -EINVAL;
@@ -625,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
625 struct task_struct *task, int set_waiters) 675 struct task_struct *task, int set_waiters)
626{ 676{
627 int lock_taken, ret, ownerdied = 0; 677 int lock_taken, ret, ownerdied = 0;
628 u32 uval, newval, curval; 678 u32 uval, newval, curval, vpid = task_pid_vnr(task);
629 679
630retry: 680retry:
631 ret = lock_taken = 0; 681 ret = lock_taken = 0;
@@ -635,19 +685,17 @@ retry:
635 * (by doing a 0 -> TID atomic cmpxchg), while holding all 685 * (by doing a 0 -> TID atomic cmpxchg), while holding all
636 * the locks. It will most likely not succeed. 686 * the locks. It will most likely not succeed.
637 */ 687 */
638 newval = task_pid_vnr(task); 688 newval = vpid;
639 if (set_waiters) 689 if (set_waiters)
640 newval |= FUTEX_WAITERS; 690 newval |= FUTEX_WAITERS;
641 691
642 curval = cmpxchg_futex_value_locked(uaddr, 0, newval); 692 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
643
644 if (unlikely(curval == -EFAULT))
645 return -EFAULT; 693 return -EFAULT;
646 694
647 /* 695 /*
648 * Detect deadlocks. 696 * Detect deadlocks.
649 */ 697 */
650 if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) 698 if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
651 return -EDEADLK; 699 return -EDEADLK;
652 700
653 /* 701 /*
@@ -674,14 +722,12 @@ retry:
674 */ 722 */
675 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { 723 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
676 /* Keep the OWNER_DIED bit */ 724 /* Keep the OWNER_DIED bit */
677 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); 725 newval = (curval & ~FUTEX_TID_MASK) | vpid;
678 ownerdied = 0; 726 ownerdied = 0;
679 lock_taken = 1; 727 lock_taken = 1;
680 } 728 }
681 729
682 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 730 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
683
684 if (unlikely(curval == -EFAULT))
685 return -EFAULT; 731 return -EFAULT;
686 if (unlikely(curval != uval)) 732 if (unlikely(curval != uval))
687 goto retry; 733 goto retry;
@@ -726,6 +772,24 @@ retry:
726 return ret; 772 return ret;
727} 773}
728 774
775/**
776 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
777 * @q: The futex_q to unqueue
778 *
779 * The q->lock_ptr must not be NULL and must be held by the caller.
780 */
781static void __unqueue_futex(struct futex_q *q)
782{
783 struct futex_hash_bucket *hb;
784
785 if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
786 || WARN_ON(plist_node_empty(&q->list)))
787 return;
788
789 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
790 plist_del(&q->list, &hb->chain);
791}
792
729/* 793/*
730 * The hash bucket lock must be held when this is called. 794 * The hash bucket lock must be held when this is called.
731 * Afterwards, the futex_q must not be accessed. 795 * Afterwards, the futex_q must not be accessed.
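
Review note: the new __unqueue_futex() above recovers the futex_hash_bucket from q->lock_ptr with container_of() instead of keeping a separate back-pointer, which is also what allows the q->list.plist back-pointer bookkeeping to be deleted further down. A self-contained sketch of that pointer arithmetic; the struct names here are illustrative, not the kernel's.

#include <stddef.h>
#include <stdio.h>

/* Same idea as the kernel's container_of() macro. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct bucket {
	int lock;          /* stand-in for spinlock_t      */
	int nr_waiters;    /* stand-in for the plist chain */
};

struct waiter {
	int *lock_ptr;     /* points at some bucket's lock */
};

int main(void)
{
	struct bucket hb = { .lock = 0, .nr_waiters = 1 };
	struct waiter q  = { .lock_ptr = &hb.lock };

	/* Given only q.lock_ptr, recover the enclosing bucket, exactly
	 * like __unqueue_futex() does before plist_del() on hb->chain. */
	struct bucket *found = container_of(q.lock_ptr, struct bucket, lock);

	found->nr_waiters--;
	printf("bucket %p now has %d waiters\n",
	       (void *)found, found->nr_waiters);
	return 0;
}
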
@@ -736,14 +800,14 @@ static void wake_futex(struct futex_q *q)
736 800
737 /* 801 /*
738 * We set q->lock_ptr = NULL _before_ we wake up the task. If 802 * We set q->lock_ptr = NULL _before_ we wake up the task. If
739 * a non futex wake up happens on another CPU then the task 803 * a non-futex wake up happens on another CPU then the task
740 * might exit and p would dereference a non existing task 804 * might exit and p would dereference a non-existing task
741 * struct. Prevent this by holding a reference on p across the 805 * struct. Prevent this by holding a reference on p across the
742 * wake up. 806 * wake up.
743 */ 807 */
744 get_task_struct(p); 808 get_task_struct(p);
745 809
746 plist_del(&q->list, &q->list.plist); 810 __unqueue_futex(q);
747 /* 811 /*
748 * The waiting task can free the futex_q as soon as 812 * The waiting task can free the futex_q as soon as
749 * q->lock_ptr = NULL is written, without taking any locks. A 813 * q->lock_ptr = NULL is written, without taking any locks. A
@@ -777,10 +841,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
777 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 841 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
778 842
779 /* 843 /*
780 * This happens when we have stolen the lock and the original 844 * It is possible that the next waiter (the one that brought
781 * pending owner did not enqueue itself back on the rt_mutex. 845 * this owner to the kernel) timed out and is no longer
782 * Thats not a tragedy. We know that way, that a lock waiter 846 * waiting on the lock.
783 * is on the fly. We make the futex_q waiter the pending owner.
784 */ 847 */
785 if (!new_owner) 848 if (!new_owner)
786 new_owner = this->task; 849 new_owner = this->task;
@@ -795,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
795 858
796 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 859 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
797 860
798 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 861 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
799
800 if (curval == -EFAULT)
801 ret = -EFAULT; 862 ret = -EFAULT;
802 else if (curval != uval) 863 else if (curval != uval)
803 ret = -EINVAL; 864 ret = -EINVAL;
@@ -832,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
832 * There is no waiter, so we unlock the futex. The owner died 893 * There is no waiter, so we unlock the futex. The owner died
833 * bit has not to be preserved here. We are the owner: 894 * bit has not to be preserved here. We are the owner:
834 */ 895 */
835 oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); 896 if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
836 897 return -EFAULT;
837 if (oldval == -EFAULT)
838 return oldval;
839 if (oldval != uval) 898 if (oldval != uval)
840 return -EAGAIN; 899 return -EAGAIN;
841 900
@@ -869,7 +928,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
869/* 928/*
870 * Wake up waiters matching bitset queued on this futex (uaddr). 929 * Wake up waiters matching bitset queued on this futex (uaddr).
871 */ 930 */
872static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) 931static int
932futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
873{ 933{
874 struct futex_hash_bucket *hb; 934 struct futex_hash_bucket *hb;
875 struct futex_q *this, *next; 935 struct futex_q *this, *next;
@@ -880,7 +940,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
880 if (!bitset) 940 if (!bitset)
881 return -EINVAL; 941 return -EINVAL;
882 942
883 ret = get_futex_key(uaddr, fshared, &key); 943 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
884 if (unlikely(ret != 0)) 944 if (unlikely(ret != 0))
885 goto out; 945 goto out;
886 946
@@ -906,7 +966,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
906 } 966 }
907 967
908 spin_unlock(&hb->lock); 968 spin_unlock(&hb->lock);
909 put_futex_key(fshared, &key); 969 put_futex_key(&key);
910out: 970out:
911 return ret; 971 return ret;
912} 972}
@@ -916,7 +976,7 @@ out:
916 * to this virtual address: 976 * to this virtual address:
917 */ 977 */
918static int 978static int
919futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 979futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
920 int nr_wake, int nr_wake2, int op) 980 int nr_wake, int nr_wake2, int op)
921{ 981{
922 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 982 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -926,10 +986,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
926 int ret, op_ret; 986 int ret, op_ret;
927 987
928retry: 988retry:
929 ret = get_futex_key(uaddr1, fshared, &key1); 989 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
930 if (unlikely(ret != 0)) 990 if (unlikely(ret != 0))
931 goto out; 991 goto out;
932 ret = get_futex_key(uaddr2, fshared, &key2); 992 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
933 if (unlikely(ret != 0)) 993 if (unlikely(ret != 0))
934 goto out_put_key1; 994 goto out_put_key1;
935 995
@@ -961,11 +1021,11 @@ retry_private:
961 if (ret) 1021 if (ret)
962 goto out_put_keys; 1022 goto out_put_keys;
963 1023
964 if (!fshared) 1024 if (!(flags & FLAGS_SHARED))
965 goto retry_private; 1025 goto retry_private;
966 1026
967 put_futex_key(fshared, &key2); 1027 put_futex_key(&key2);
968 put_futex_key(fshared, &key1); 1028 put_futex_key(&key1);
969 goto retry; 1029 goto retry;
970 } 1030 }
971 1031
@@ -995,9 +1055,9 @@ retry_private:
995 1055
996 double_unlock_hb(hb1, hb2); 1056 double_unlock_hb(hb1, hb2);
997out_put_keys: 1057out_put_keys:
998 put_futex_key(fshared, &key2); 1058 put_futex_key(&key2);
999out_put_key1: 1059out_put_key1:
1000 put_futex_key(fshared, &key1); 1060 put_futex_key(&key1);
1001out: 1061out:
1002 return ret; 1062 return ret;
1003} 1063}
@@ -1022,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1022 plist_del(&q->list, &hb1->chain); 1082 plist_del(&q->list, &hb1->chain);
1023 plist_add(&q->list, &hb2->chain); 1083 plist_add(&q->list, &hb2->chain);
1024 q->lock_ptr = &hb2->lock; 1084 q->lock_ptr = &hb2->lock;
1025#ifdef CONFIG_DEBUG_PI_LIST
1026 q->list.plist.spinlock = &hb2->lock;
1027#endif
1028 } 1085 }
1029 get_futex_key_refs(key2); 1086 get_futex_key_refs(key2);
1030 q->key = *key2; 1087 q->key = *key2;
@@ -1051,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1051 get_futex_key_refs(key); 1108 get_futex_key_refs(key);
1052 q->key = *key; 1109 q->key = *key;
1053 1110
1054 WARN_ON(plist_node_empty(&q->list)); 1111 __unqueue_futex(q);
1055 plist_del(&q->list, &q->list.plist);
1056 1112
1057 WARN_ON(!q->rt_waiter); 1113 WARN_ON(!q->rt_waiter);
1058 q->rt_waiter = NULL; 1114 q->rt_waiter = NULL;
1059 1115
1060 q->lock_ptr = &hb->lock; 1116 q->lock_ptr = &hb->lock;
1061#ifdef CONFIG_DEBUG_PI_LIST
1062 q->list.plist.spinlock = &hb->lock;
1063#endif
1064 1117
1065 wake_up_state(q->task, TASK_NORMAL); 1118 wake_up_state(q->task, TASK_NORMAL);
1066} 1119}
@@ -1131,12 +1184,14 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1131 1184
1132/** 1185/**
1133 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1186 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1134 * uaddr1: source futex user address 1187 * @uaddr1: source futex user address
1135 * uaddr2: target futex user address 1188 * @flags: futex flags (FLAGS_SHARED, etc.)
1136 * nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1189 * @uaddr2: target futex user address
1137 * nr_requeue: number of waiters to requeue (0-INT_MAX) 1190 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1138 * requeue_pi: if we are attempting to requeue from a non-pi futex to a 1191 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1139 * pi futex (pi to pi requeue is not supported) 1192 * @cmpval: @uaddr1 expected value (or %NULL)
1193 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1194 * pi futex (pi to pi requeue is not supported)
1140 * 1195 *
1141 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1196 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1142 * uaddr2 atomically on behalf of the top waiter. 1197 * uaddr2 atomically on behalf of the top waiter.
@@ -1145,9 +1200,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1145 * >=0 - on success, the number of tasks requeued or woken 1200 * >=0 - on success, the number of tasks requeued or woken
1146 * <0 - on error 1201 * <0 - on error
1147 */ 1202 */
1148static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 1203static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1149 int nr_wake, int nr_requeue, u32 *cmpval, 1204 u32 __user *uaddr2, int nr_wake, int nr_requeue,
1150 int requeue_pi) 1205 u32 *cmpval, int requeue_pi)
1151{ 1206{
1152 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1207 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1153 int drop_count = 0, task_count = 0, ret; 1208 int drop_count = 0, task_count = 0, ret;
@@ -1188,10 +1243,10 @@ retry:
1188 pi_state = NULL; 1243 pi_state = NULL;
1189 } 1244 }
1190 1245
1191 ret = get_futex_key(uaddr1, fshared, &key1); 1246 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
1192 if (unlikely(ret != 0)) 1247 if (unlikely(ret != 0))
1193 goto out; 1248 goto out;
1194 ret = get_futex_key(uaddr2, fshared, &key2); 1249 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
1195 if (unlikely(ret != 0)) 1250 if (unlikely(ret != 0))
1196 goto out_put_key1; 1251 goto out_put_key1;
1197 1252
@@ -1213,11 +1268,11 @@ retry_private:
1213 if (ret) 1268 if (ret)
1214 goto out_put_keys; 1269 goto out_put_keys;
1215 1270
1216 if (!fshared) 1271 if (!(flags & FLAGS_SHARED))
1217 goto retry_private; 1272 goto retry_private;
1218 1273
1219 put_futex_key(fshared, &key2); 1274 put_futex_key(&key2);
1220 put_futex_key(fshared, &key1); 1275 put_futex_key(&key1);
1221 goto retry; 1276 goto retry;
1222 } 1277 }
1223 if (curval != *cmpval) { 1278 if (curval != *cmpval) {
@@ -1257,8 +1312,8 @@ retry_private:
1257 break; 1312 break;
1258 case -EFAULT: 1313 case -EFAULT:
1259 double_unlock_hb(hb1, hb2); 1314 double_unlock_hb(hb1, hb2);
1260 put_futex_key(fshared, &key2); 1315 put_futex_key(&key2);
1261 put_futex_key(fshared, &key1); 1316 put_futex_key(&key1);
1262 ret = fault_in_user_writeable(uaddr2); 1317 ret = fault_in_user_writeable(uaddr2);
1263 if (!ret) 1318 if (!ret)
1264 goto retry; 1319 goto retry;
@@ -1266,8 +1321,8 @@ retry_private:
1266 case -EAGAIN: 1321 case -EAGAIN:
1267 /* The owner was exiting, try again. */ 1322 /* The owner was exiting, try again. */
1268 double_unlock_hb(hb1, hb2); 1323 double_unlock_hb(hb1, hb2);
1269 put_futex_key(fshared, &key2); 1324 put_futex_key(&key2);
1270 put_futex_key(fshared, &key1); 1325 put_futex_key(&key1);
1271 cond_resched(); 1326 cond_resched();
1272 goto retry; 1327 goto retry;
1273 default: 1328 default:
@@ -1349,9 +1404,9 @@ out_unlock:
1349 drop_futex_key_refs(&key1); 1404 drop_futex_key_refs(&key1);
1350 1405
1351out_put_keys: 1406out_put_keys:
1352 put_futex_key(fshared, &key2); 1407 put_futex_key(&key2);
1353out_put_key1: 1408out_put_key1:
1354 put_futex_key(fshared, &key1); 1409 put_futex_key(&key1);
1355out: 1410out:
1356 if (pi_state != NULL) 1411 if (pi_state != NULL)
1357 free_pi_state(pi_state); 1412 free_pi_state(pi_state);
@@ -1360,10 +1415,10 @@ out:
1360 1415
1361/* The key must be already stored in q->key. */ 1416/* The key must be already stored in q->key. */
1362static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) 1417static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1418 __acquires(&hb->lock)
1363{ 1419{
1364 struct futex_hash_bucket *hb; 1420 struct futex_hash_bucket *hb;
1365 1421
1366 get_futex_key_refs(&q->key);
1367 hb = hash_futex(&q->key); 1422 hb = hash_futex(&q->key);
1368 q->lock_ptr = &hb->lock; 1423 q->lock_ptr = &hb->lock;
1369 1424
@@ -1373,9 +1428,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1373 1428
1374static inline void 1429static inline void
1375queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1430queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1431 __releases(&hb->lock)
1376{ 1432{
1377 spin_unlock(&hb->lock); 1433 spin_unlock(&hb->lock);
1378 drop_futex_key_refs(&q->key);
1379} 1434}
1380 1435
1381/** 1436/**
@@ -1391,6 +1446,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1391 * an example). 1446 * an example).
1392 */ 1447 */
1393static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1448static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1449 __releases(&hb->lock)
1394{ 1450{
1395 int prio; 1451 int prio;
1396 1452
@@ -1405,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1405 prio = min(current->normal_prio, MAX_RT_PRIO); 1461 prio = min(current->normal_prio, MAX_RT_PRIO);
1406 1462
1407 plist_node_init(&q->list, prio); 1463 plist_node_init(&q->list, prio);
1408#ifdef CONFIG_DEBUG_PI_LIST
1409 q->list.plist.spinlock = &hb->lock;
1410#endif
1411 plist_add(&q->list, &hb->chain); 1464 plist_add(&q->list, &hb->chain);
1412 q->task = current; 1465 q->task = current;
1413 spin_unlock(&hb->lock); 1466 spin_unlock(&hb->lock);
@@ -1452,8 +1505,7 @@ retry:
1452 spin_unlock(lock_ptr); 1505 spin_unlock(lock_ptr);
1453 goto retry; 1506 goto retry;
1454 } 1507 }
1455 WARN_ON(plist_node_empty(&q->list)); 1508 __unqueue_futex(q);
1456 plist_del(&q->list, &q->list.plist);
1457 1509
1458 BUG_ON(q->pi_state); 1510 BUG_ON(q->pi_state);
1459 1511
@@ -1471,17 +1523,15 @@ retry:
1471 * and dropped here. 1523 * and dropped here.
1472 */ 1524 */
1473static void unqueue_me_pi(struct futex_q *q) 1525static void unqueue_me_pi(struct futex_q *q)
1526 __releases(q->lock_ptr)
1474{ 1527{
1475 WARN_ON(plist_node_empty(&q->list)); 1528 __unqueue_futex(q);
1476 plist_del(&q->list, &q->list.plist);
1477 1529
1478 BUG_ON(!q->pi_state); 1530 BUG_ON(!q->pi_state);
1479 free_pi_state(q->pi_state); 1531 free_pi_state(q->pi_state);
1480 q->pi_state = NULL; 1532 q->pi_state = NULL;
1481 1533
1482 spin_unlock(q->lock_ptr); 1534 spin_unlock(q->lock_ptr);
1483
1484 drop_futex_key_refs(&q->key);
1485} 1535}
1486 1536
1487/* 1537/*
@@ -1491,7 +1541,7 @@ static void unqueue_me_pi(struct futex_q *q)
1491 * private futexes. 1541 * private futexes.
1492 */ 1542 */
1493static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1543static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1494 struct task_struct *newowner, int fshared) 1544 struct task_struct *newowner)
1495{ 1545{
1496 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1546 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1497 struct futex_pi_state *pi_state = q->pi_state; 1547 struct futex_pi_state *pi_state = q->pi_state;
@@ -1505,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1505 1555
1506 /* 1556 /*
1507 * We are here either because we stole the rtmutex from the 1557 * We are here either because we stole the rtmutex from the
1508 * pending owner or we are the pending owner which failed to 1558 * previous highest priority waiter or we are the highest priority
1509 * get the rtmutex. We have to replace the pending owner TID 1559 * waiter but failed to get the rtmutex the first time.
1510 * in the user space variable. This must be atomic as we have 1560 * We have to replace the newowner TID in the user space variable.
1511 * to preserve the owner died bit here. 1561 * This must be atomic as we have to preserve the owner died bit here.
1512 * 1562 *
1513 * Note: We write the user space value _before_ changing the pi_state 1563 * Note: We write the user space value _before_ changing the pi_state
1514 * because we can fault here. Imagine swapped out pages or a fork 1564 * because we can fault here. Imagine swapped out pages or a fork
@@ -1527,9 +1577,7 @@ retry:
1527 while (1) { 1577 while (1) {
1528 newval = (uval & FUTEX_OWNER_DIED) | newtid; 1578 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1529 1579
1530 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 1580 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1531
1532 if (curval == -EFAULT)
1533 goto handle_fault; 1581 goto handle_fault;
1534 if (curval == uval) 1582 if (curval == uval)
1535 break; 1583 break;
@@ -1557,8 +1605,8 @@ retry:
1557 1605
1558 /* 1606 /*
1559 * To handle the page fault we need to drop the hash bucket 1607 * To handle the page fault we need to drop the hash bucket
1560 * lock here. That gives the other task (either the pending 1608 * lock here. That gives the other task (either the highest priority
1561 * owner itself or the task which stole the rtmutex) the 1609 * waiter itself or the task which stole the rtmutex) the
1562 * chance to try the fixup of the pi_state. So once we are 1610 * chance to try the fixup of the pi_state. So once we are
1563 * back from handling the fault we need to check the pi_state 1611 * back from handling the fault we need to check the pi_state
1564 * after reacquiring the hash bucket lock and before trying to 1612 * after reacquiring the hash bucket lock and before trying to
@@ -1584,20 +1632,11 @@ handle_fault:
1584 goto retry; 1632 goto retry;
1585} 1633}
1586 1634
1587/*
1588 * In case we must use restart_block to restart a futex_wait,
1589 * we encode in the 'flags' shared capability
1590 */
1591#define FLAGS_SHARED 0x01
1592#define FLAGS_CLOCKRT 0x02
1593#define FLAGS_HAS_TIMEOUT 0x04
1594
1595static long futex_wait_restart(struct restart_block *restart); 1635static long futex_wait_restart(struct restart_block *restart);
1596 1636
1597/** 1637/**
1598 * fixup_owner() - Post lock pi_state and corner case management 1638 * fixup_owner() - Post lock pi_state and corner case management
1599 * @uaddr: user address of the futex 1639 * @uaddr: user address of the futex
1600 * @fshared: whether the futex is shared (1) or not (0)
1601 * @q: futex_q (contains pi_state and access to the rt_mutex) 1640 * @q: futex_q (contains pi_state and access to the rt_mutex)
1602 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) 1641 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
1603 * 1642 *
@@ -1610,8 +1649,7 @@ static long futex_wait_restart(struct restart_block *restart);
1610 * 0 - success, lock not taken 1649 * 0 - success, lock not taken
1611 * <0 - on error (-EFAULT) 1650 * <0 - on error (-EFAULT)
1612 */ 1651 */
1613static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, 1652static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1614 int locked)
1615{ 1653{
1616 struct task_struct *owner; 1654 struct task_struct *owner;
1617 int ret = 0; 1655 int ret = 0;
@@ -1622,7 +1660,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1622 * did a lock-steal - fix up the PI-state in that case: 1660 * did a lock-steal - fix up the PI-state in that case:
1623 */ 1661 */
1624 if (q->pi_state->owner != current) 1662 if (q->pi_state->owner != current)
1625 ret = fixup_pi_state_owner(uaddr, q, current, fshared); 1663 ret = fixup_pi_state_owner(uaddr, q, current);
1626 goto out; 1664 goto out;
1627 } 1665 }
1628 1666
@@ -1644,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1644 /* 1682 /*
1645 * pi_state is incorrect, some other task did a lock steal and 1683 * pi_state is incorrect, some other task did a lock steal and
1646 * we returned due to timeout or signal without taking the 1684 * we returned due to timeout or signal without taking the
1647 * rt_mutex. Too late. We can access the rt_mutex_owner without 1685 * rt_mutex. Too late.
1648 * locking, as the other task is now blocked on the hash bucket
1649 * lock. Fix the state up.
1650 */ 1686 */
1687 raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1651 owner = rt_mutex_owner(&q->pi_state->pi_mutex); 1688 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1652 ret = fixup_pi_state_owner(uaddr, q, owner, fshared); 1689 if (!owner)
1690 owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1691 raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1692 ret = fixup_pi_state_owner(uaddr, q, owner);
1653 goto out; 1693 goto out;
1654 } 1694 }
1655 1695
1656 /* 1696 /*
1657 * Paranoia check. If we did not take the lock, then we should not be 1697 * Paranoia check. If we did not take the lock, then we should not be
1658 * the owner, nor the pending owner, of the rt_mutex. 1698 * the owner of the rt_mutex.
1659 */ 1699 */
1660 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) 1700 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1661 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " 1701 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
@@ -1712,7 +1752,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1712 * futex_wait_setup() - Prepare to wait on a futex 1752 * futex_wait_setup() - Prepare to wait on a futex
1713 * @uaddr: the futex userspace address 1753 * @uaddr: the futex userspace address
1714 * @val: the expected value 1754 * @val: the expected value
1715 * @fshared: whether the futex is shared (1) or not (0) 1755 * @flags: futex flags (FLAGS_SHARED, etc.)
1716 * @q: the associated futex_q 1756 * @q: the associated futex_q
1717 * @hb: storage for hash_bucket pointer to be returned to caller 1757 * @hb: storage for hash_bucket pointer to be returned to caller
1718 * 1758 *
@@ -1725,7 +1765,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1725 * 0 - uaddr contains val and hb has been locked 1765 * 0 - uaddr contains val and hb has been locked
1726 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked 1766 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
1727 */ 1767 */
1728static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, 1768static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1729 struct futex_q *q, struct futex_hash_bucket **hb) 1769 struct futex_q *q, struct futex_hash_bucket **hb)
1730{ 1770{
1731 u32 uval; 1771 u32 uval;
@@ -1740,17 +1780,17 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1740 * 1780 *
1741 * The basic logical guarantee of a futex is that it blocks ONLY 1781 * The basic logical guarantee of a futex is that it blocks ONLY
1742 * if cond(var) is known to be true at the time of blocking, for 1782 * if cond(var) is known to be true at the time of blocking, for
1743 * any cond. If we queued after testing *uaddr, that would open 1783 * any cond. If we locked the hash-bucket after testing *uaddr, that
1744 * a race condition where we could block indefinitely with 1784 * would open a race condition where we could block indefinitely with
1745 * cond(var) false, which would violate the guarantee. 1785 * cond(var) false, which would violate the guarantee.
1746 * 1786 *
1747 * A consequence is that futex_wait() can return zero and absorb 1787 * On the other hand, we insert q and release the hash-bucket only
1748 * a wakeup when *uaddr != val on entry to the syscall. This is 1788 * after testing *uaddr. This guarantees that futex_wait() will NOT
1749 * rare, but normal. 1789 * absorb a wakeup if *uaddr does not match the desired values
1790 * while the syscall executes.
1750 */ 1791 */
1751retry: 1792retry:
1752 q->key = FUTEX_KEY_INIT; 1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
1753 ret = get_futex_key(uaddr, fshared, &q->key);
1754 if (unlikely(ret != 0)) 1794 if (unlikely(ret != 0))
1755 return ret; 1795 return ret;
1756 1796
@@ -1766,10 +1806,10 @@ retry_private:
1766 if (ret) 1806 if (ret)
1767 goto out; 1807 goto out;
1768 1808
1769 if (!fshared) 1809 if (!(flags & FLAGS_SHARED))
1770 goto retry_private; 1810 goto retry_private;
1771 1811
1772 put_futex_key(fshared, &q->key); 1812 put_futex_key(&q->key);
1773 goto retry; 1813 goto retry;
1774 } 1814 }
1775 1815
@@ -1780,40 +1820,40 @@ retry_private:
1780 1820
1781out: 1821out:
1782 if (ret) 1822 if (ret)
1783 put_futex_key(fshared, &q->key); 1823 put_futex_key(&q->key);
1784 return ret; 1824 return ret;
1785} 1825}
1786 1826
1787static int futex_wait(u32 __user *uaddr, int fshared, 1827static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
1788 u32 val, ktime_t *abs_time, u32 bitset, int clockrt) 1828 ktime_t *abs_time, u32 bitset)
1789{ 1829{
1790 struct hrtimer_sleeper timeout, *to = NULL; 1830 struct hrtimer_sleeper timeout, *to = NULL;
1791 struct restart_block *restart; 1831 struct restart_block *restart;
1792 struct futex_hash_bucket *hb; 1832 struct futex_hash_bucket *hb;
1793 struct futex_q q; 1833 struct futex_q q = futex_q_init;
1794 int ret; 1834 int ret;
1795 1835
1796 if (!bitset) 1836 if (!bitset)
1797 return -EINVAL; 1837 return -EINVAL;
1798
1799 q.pi_state = NULL;
1800 q.bitset = bitset; 1838 q.bitset = bitset;
1801 q.rt_waiter = NULL;
1802 q.requeue_pi_key = NULL;
1803 1839
1804 if (abs_time) { 1840 if (abs_time) {
1805 to = &timeout; 1841 to = &timeout;
1806 1842
1807 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 1843 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
1808 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1844 CLOCK_REALTIME : CLOCK_MONOTONIC,
1845 HRTIMER_MODE_ABS);
1809 hrtimer_init_sleeper(to, current); 1846 hrtimer_init_sleeper(to, current);
1810 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 1847 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1811 current->timer_slack_ns); 1848 current->timer_slack_ns);
1812 } 1849 }
1813 1850
1814retry: 1851retry:
1815 /* Prepare to wait on uaddr. */ 1852 /*
1816 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1853 * Prepare to wait on uaddr. On success, holds hb lock and increments
1854 * q.key refs.
1855 */
1856 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
1817 if (ret) 1857 if (ret)
1818 goto out; 1858 goto out;
1819 1859
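
Review note: in the futex_wait() hunk above the boolean clockrt parameter becomes a test of FLAGS_CLOCKRT when arming the absolute-expiry hrtimer. A small userspace analogue of picking the clock from a flag and waiting on an absolute deadline; it uses clock_nanosleep() rather than hrtimers, purely as an illustration of the CLOCK_REALTIME vs. CLOCK_MONOTONIC choice.

#define _GNU_SOURCE
#include <stdio.h>
#include <time.h>

#define FLAGS_CLOCKRT 0x02   /* same bit as in the patch above */

/* Sleep until an absolute deadline on the clock selected by @flags,
 * loosely mirroring the clock choice made for the hrtimer_sleeper. */
static int wait_until(unsigned int flags, long delta_ns)
{
	clockid_t clk = (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME
						: CLOCK_MONOTONIC;
	struct timespec deadline;

	clock_gettime(clk, &deadline);
	deadline.tv_nsec += delta_ns;
	if (deadline.tv_nsec >= 1000000000L) {
		deadline.tv_sec  += 1;
		deadline.tv_nsec -= 1000000000L;
	}
	/* TIMER_ABSTIME ~ HRTIMER_MODE_ABS: the deadline is absolute. */
	return clock_nanosleep(clk, TIMER_ABSTIME, &deadline, NULL);
}

int main(void)
{
	int err = wait_until(FLAGS_CLOCKRT, 1000000);  /* 1 ms, realtime clock */

	printf("clock_nanosleep returned %d\n", err);
	return 0;
}
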
@@ -1822,42 +1862,34 @@ retry:
1822 1862
1823 /* If we were woken (and unqueued), we succeeded, whatever. */ 1863 /* If we were woken (and unqueued), we succeeded, whatever. */
1824 ret = 0; 1864 ret = 0;
1865 /* unqueue_me() drops q.key ref */
1825 if (!unqueue_me(&q)) 1866 if (!unqueue_me(&q))
1826 goto out_put_key; 1867 goto out;
1827 ret = -ETIMEDOUT; 1868 ret = -ETIMEDOUT;
1828 if (to && !to->task) 1869 if (to && !to->task)
1829 goto out_put_key; 1870 goto out;
1830 1871
1831 /* 1872 /*
1832 * We expect signal_pending(current), but we might be the 1873 * We expect signal_pending(current), but we might be the
1833 * victim of a spurious wakeup as well. 1874 * victim of a spurious wakeup as well.
1834 */ 1875 */
1835 if (!signal_pending(current)) { 1876 if (!signal_pending(current))
1836 put_futex_key(fshared, &q.key);
1837 goto retry; 1877 goto retry;
1838 }
1839 1878
1840 ret = -ERESTARTSYS; 1879 ret = -ERESTARTSYS;
1841 if (!abs_time) 1880 if (!abs_time)
1842 goto out_put_key; 1881 goto out;
1843 1882
1844 restart = &current_thread_info()->restart_block; 1883 restart = &current_thread_info()->restart_block;
1845 restart->fn = futex_wait_restart; 1884 restart->fn = futex_wait_restart;
1846 restart->futex.uaddr = (u32 *)uaddr; 1885 restart->futex.uaddr = uaddr;
1847 restart->futex.val = val; 1886 restart->futex.val = val;
1848 restart->futex.time = abs_time->tv64; 1887 restart->futex.time = abs_time->tv64;
1849 restart->futex.bitset = bitset; 1888 restart->futex.bitset = bitset;
1850 restart->futex.flags = FLAGS_HAS_TIMEOUT; 1889 restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
1851
1852 if (fshared)
1853 restart->futex.flags |= FLAGS_SHARED;
1854 if (clockrt)
1855 restart->futex.flags |= FLAGS_CLOCKRT;
1856 1890
1857 ret = -ERESTART_RESTARTBLOCK; 1891 ret = -ERESTART_RESTARTBLOCK;
1858 1892
1859out_put_key:
1860 put_futex_key(fshared, &q.key);
1861out: 1893out:
1862 if (to) { 1894 if (to) {
1863 hrtimer_cancel(&to->timer); 1895 hrtimer_cancel(&to->timer);
@@ -1869,8 +1901,7 @@ out:
1869 1901
1870static long futex_wait_restart(struct restart_block *restart) 1902static long futex_wait_restart(struct restart_block *restart)
1871{ 1903{
1872 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1904 u32 __user *uaddr = restart->futex.uaddr;
1873 int fshared = 0;
1874 ktime_t t, *tp = NULL; 1905 ktime_t t, *tp = NULL;
1875 1906
1876 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { 1907 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
@@ -1878,11 +1909,9 @@ static long futex_wait_restart(struct restart_block *restart)
1878 tp = &t; 1909 tp = &t;
1879 } 1910 }
1880 restart->fn = do_no_restart_syscall; 1911 restart->fn = do_no_restart_syscall;
1881 if (restart->futex.flags & FLAGS_SHARED) 1912
1882 fshared = 1; 1913 return (long)futex_wait(uaddr, restart->futex.flags,
1883 return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, 1914 restart->futex.val, tp, restart->futex.bitset);
1884 restart->futex.bitset,
1885 restart->futex.flags & FLAGS_CLOCKRT);
1886} 1915}
1887 1916
1888 1917
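
Review note: with the whole flags word carried in restart->futex.flags (plus FLAGS_HAS_TIMEOUT), futex_wait_restart() above no longer has to re-derive fshared and clockrt bit by bit. A hedged sketch of saving call parameters for a restart and replaying them; struct wait_restart and do_wait() are invented for the example and only model the restart_block bookkeeping.

#include <stdio.h>

#define FLAGS_SHARED       0x01
#define FLAGS_CLOCKRT      0x02
#define FLAGS_HAS_TIMEOUT  0x04

/* Invented stand-in for the futex part of struct restart_block. */
struct wait_restart {
	unsigned int *uaddr;
	unsigned int  val;
	unsigned int  flags;
	long long     time;   /* absolute expiry, valid only with HAS_TIMEOUT */
};

static long do_wait(unsigned int *uaddr, unsigned int flags,
		    unsigned int val, const long long *abs_time)
{
	printf("wait on %p flags=%#x val=%u timeout=%s\n",
	       (void *)uaddr, flags, val, abs_time ? "yes" : "none");
	return 0;
}

/* Save everything needed to restart the call after a signal ... */
static void save_restart(struct wait_restart *r, unsigned int *uaddr,
			 unsigned int flags, unsigned int val,
			 const long long *abs_time)
{
	r->uaddr = uaddr;
	r->val   = val;
	r->time  = abs_time ? *abs_time : 0;
	r->flags = flags | (abs_time ? FLAGS_HAS_TIMEOUT : 0);
}

/* ... and replay it, the way futex_wait_restart() calls futex_wait(). */
static long replay(const struct wait_restart *r)
{
	long long t = r->time;

	return do_wait(r->uaddr, r->flags, r->val,
		       (r->flags & FLAGS_HAS_TIMEOUT) ? &t : NULL);
}

int main(void)
{
	unsigned int word = 0;
	long long deadline = 123456789;
	struct wait_restart r;

	save_restart(&r, &word, FLAGS_SHARED | FLAGS_CLOCKRT, 0, &deadline);
	return (int)replay(&r);
}
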
@@ -1892,12 +1921,12 @@ static long futex_wait_restart(struct restart_block *restart)
1892 * if there are waiters then it will block, it does PI, etc. (Due to 1921 * if there are waiters then it will block, it does PI, etc. (Due to
1893 * races the kernel might see a 0 value of the futex too.) 1922 * races the kernel might see a 0 value of the futex too.)
1894 */ 1923 */
1895static int futex_lock_pi(u32 __user *uaddr, int fshared, 1924static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
1896 int detect, ktime_t *time, int trylock) 1925 ktime_t *time, int trylock)
1897{ 1926{
1898 struct hrtimer_sleeper timeout, *to = NULL; 1927 struct hrtimer_sleeper timeout, *to = NULL;
1899 struct futex_hash_bucket *hb; 1928 struct futex_hash_bucket *hb;
1900 struct futex_q q; 1929 struct futex_q q = futex_q_init;
1901 int res, ret; 1930 int res, ret;
1902 1931
1903 if (refill_pi_state_cache()) 1932 if (refill_pi_state_cache())
@@ -1911,12 +1940,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1911 hrtimer_set_expires(&to->timer, *time); 1940 hrtimer_set_expires(&to->timer, *time);
1912 } 1941 }
1913 1942
1914 q.pi_state = NULL;
1915 q.rt_waiter = NULL;
1916 q.requeue_pi_key = NULL;
1917retry: 1943retry:
1918 q.key = FUTEX_KEY_INIT; 1944 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
1919 ret = get_futex_key(uaddr, fshared, &q.key);
1920 if (unlikely(ret != 0)) 1945 if (unlikely(ret != 0))
1921 goto out; 1946 goto out;
1922 1947
@@ -1938,7 +1963,7 @@ retry_private:
1938 * exit to complete. 1963 * exit to complete.
1939 */ 1964 */
1940 queue_unlock(&q, hb); 1965 queue_unlock(&q, hb);
1941 put_futex_key(fshared, &q.key); 1966 put_futex_key(&q.key);
1942 cond_resched(); 1967 cond_resched();
1943 goto retry; 1968 goto retry;
1944 default: 1969 default:
@@ -1968,7 +1993,7 @@ retry_private:
1968 * Fixup the pi_state owner and possibly acquire the lock if we 1993 * Fixup the pi_state owner and possibly acquire the lock if we
1969 * haven't already. 1994 * haven't already.
1970 */ 1995 */
1971 res = fixup_owner(uaddr, fshared, &q, !ret); 1996 res = fixup_owner(uaddr, &q, !ret);
1972 /* 1997 /*
1973 * If fixup_owner() returned an error, proprogate that. If it acquired 1998 * If fixup_owner() returned an error, proprogate that. If it acquired
1974 * the lock, clear our -ETIMEDOUT or -EINTR. 1999 * the lock, clear our -ETIMEDOUT or -EINTR.
@@ -1992,7 +2017,7 @@ out_unlock_put_key:
1992 queue_unlock(&q, hb); 2017 queue_unlock(&q, hb);
1993 2018
1994out_put_key: 2019out_put_key:
1995 put_futex_key(fshared, &q.key); 2020 put_futex_key(&q.key);
1996out: 2021out:
1997 if (to) 2022 if (to)
1998 destroy_hrtimer_on_stack(&to->timer); 2023 destroy_hrtimer_on_stack(&to->timer);
@@ -2005,10 +2030,10 @@ uaddr_faulted:
2005 if (ret) 2030 if (ret)
2006 goto out_put_key; 2031 goto out_put_key;
2007 2032
2008 if (!fshared) 2033 if (!(flags & FLAGS_SHARED))
2009 goto retry_private; 2034 goto retry_private;
2010 2035
2011 put_futex_key(fshared, &q.key); 2036 put_futex_key(&q.key);
2012 goto retry; 2037 goto retry;
2013} 2038}
2014 2039
@@ -2017,13 +2042,13 @@ uaddr_faulted:
2017 * This is the in-kernel slowpath: we look up the PI state (if any), 2042 * This is the in-kernel slowpath: we look up the PI state (if any),
2018 * and do the rt-mutex unlock. 2043 * and do the rt-mutex unlock.
2019 */ 2044 */
2020static int futex_unlock_pi(u32 __user *uaddr, int fshared) 2045static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2021{ 2046{
2022 struct futex_hash_bucket *hb; 2047 struct futex_hash_bucket *hb;
2023 struct futex_q *this, *next; 2048 struct futex_q *this, *next;
2024 u32 uval;
2025 struct plist_head *head; 2049 struct plist_head *head;
2026 union futex_key key = FUTEX_KEY_INIT; 2050 union futex_key key = FUTEX_KEY_INIT;
2051 u32 uval, vpid = task_pid_vnr(current);
2027 int ret; 2052 int ret;
2028 2053
2029retry: 2054retry:
@@ -2032,10 +2057,10 @@ retry:
2032 /* 2057 /*
2033 * We release only a lock we actually own: 2058 * We release only a lock we actually own:
2034 */ 2059 */
2035 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2060 if ((uval & FUTEX_TID_MASK) != vpid)
2036 return -EPERM; 2061 return -EPERM;
2037 2062
2038 ret = get_futex_key(uaddr, fshared, &key); 2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
2039 if (unlikely(ret != 0)) 2064 if (unlikely(ret != 0))
2040 goto out; 2065 goto out;
2041 2066
@@ -2047,17 +2072,14 @@ retry:
2047 * again. If it succeeds then we can return without waking 2072 * again. If it succeeds then we can return without waking
2048 * anyone else up: 2073 * anyone else up:
2049 */ 2074 */
2050 if (!(uval & FUTEX_OWNER_DIED)) 2075 if (!(uval & FUTEX_OWNER_DIED) &&
2051 uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); 2076 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2052
2053
2054 if (unlikely(uval == -EFAULT))
2055 goto pi_faulted; 2077 goto pi_faulted;
2056 /* 2078 /*
2057 * Rare case: we managed to release the lock atomically, 2079 * Rare case: we managed to release the lock atomically,
2058 * no need to wake anyone else up: 2080 * no need to wake anyone else up:
2059 */ 2081 */
2060 if (unlikely(uval == task_pid_vnr(current))) 2082 if (unlikely(uval == vpid))
2061 goto out_unlock; 2083 goto out_unlock;
2062 2084
2063 /* 2085 /*
@@ -2090,14 +2112,14 @@ retry:
2090 2112
2091out_unlock: 2113out_unlock:
2092 spin_unlock(&hb->lock); 2114 spin_unlock(&hb->lock);
2093 put_futex_key(fshared, &key); 2115 put_futex_key(&key);
2094 2116
2095out: 2117out:
2096 return ret; 2118 return ret;
2097 2119
2098pi_faulted: 2120pi_faulted:
2099 spin_unlock(&hb->lock); 2121 spin_unlock(&hb->lock);
2100 put_futex_key(fshared, &key); 2122 put_futex_key(&key);
2101 2123
2102 ret = fault_in_user_writeable(uaddr); 2124 ret = fault_in_user_writeable(uaddr);
2103 if (!ret) 2125 if (!ret)
@@ -2142,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2142 * We were woken prior to requeue by a timeout or a signal. 2164 * We were woken prior to requeue by a timeout or a signal.
2143 * Unqueue the futex_q and determine which it was. 2165 * Unqueue the futex_q and determine which it was.
2144 */ 2166 */
2145 plist_del(&q->list, &q->list.plist); 2167 plist_del(&q->list, &hb->chain);
2146 2168
2147 /* Handle spurious wakeups gracefully */ 2169 /* Handle spurious wakeups gracefully */
2148 ret = -EWOULDBLOCK; 2170 ret = -EWOULDBLOCK;
@@ -2157,7 +2179,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2157/** 2179/**
2158 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2180 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2159 * @uaddr: the futex we initially wait on (non-pi) 2181 * @uaddr: the futex we initially wait on (non-pi)
2160 * @fshared: whether the futexes are shared (1) or not (0). They must be 2182 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2161 * the same type, no requeueing from private to shared, etc. 2183 * the same type, no requeueing from private to shared, etc.
2162 * @val: the expected value of uaddr 2184 * @val: the expected value of uaddr
2163 * @abs_time: absolute timeout 2185 * @abs_time: absolute timeout
@@ -2195,16 +2217,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2195 * 0 - On success 2217 * 0 - On success
2196 * <0 - On error 2218 * <0 - On error
2197 */ 2219 */
2198static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, 2220static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2199 u32 val, ktime_t *abs_time, u32 bitset, 2221 u32 val, ktime_t *abs_time, u32 bitset,
2200 int clockrt, u32 __user *uaddr2) 2222 u32 __user *uaddr2)
2201{ 2223{
2202 struct hrtimer_sleeper timeout, *to = NULL; 2224 struct hrtimer_sleeper timeout, *to = NULL;
2203 struct rt_mutex_waiter rt_waiter; 2225 struct rt_mutex_waiter rt_waiter;
2204 struct rt_mutex *pi_mutex = NULL; 2226 struct rt_mutex *pi_mutex = NULL;
2205 struct futex_hash_bucket *hb; 2227 struct futex_hash_bucket *hb;
2206 union futex_key key2; 2228 union futex_key key2 = FUTEX_KEY_INIT;
2207 struct futex_q q; 2229 struct futex_q q = futex_q_init;
2208 int res, ret; 2230 int res, ret;
2209 2231
2210 if (!bitset) 2232 if (!bitset)
@@ -2212,8 +2234,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2212 2234
2213 if (abs_time) { 2235 if (abs_time) {
2214 to = &timeout; 2236 to = &timeout;
2215 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 2237 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2216 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 2238 CLOCK_REALTIME : CLOCK_MONOTONIC,
2239 HRTIMER_MODE_ABS);
2217 hrtimer_init_sleeper(to, current); 2240 hrtimer_init_sleeper(to, current);
2218 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 2241 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2219 current->timer_slack_ns); 2242 current->timer_slack_ns);
@@ -2226,18 +2249,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2226 debug_rt_mutex_init_waiter(&rt_waiter); 2249 debug_rt_mutex_init_waiter(&rt_waiter);
2227 rt_waiter.task = NULL; 2250 rt_waiter.task = NULL;
2228 2251
2229 key2 = FUTEX_KEY_INIT; 2252 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
2230 ret = get_futex_key(uaddr2, fshared, &key2);
2231 if (unlikely(ret != 0)) 2253 if (unlikely(ret != 0))
2232 goto out; 2254 goto out;
2233 2255
2234 q.pi_state = NULL;
2235 q.bitset = bitset; 2256 q.bitset = bitset;
2236 q.rt_waiter = &rt_waiter; 2257 q.rt_waiter = &rt_waiter;
2237 q.requeue_pi_key = &key2; 2258 q.requeue_pi_key = &key2;
2238 2259
2239 /* Prepare to wait on uaddr. */ 2260 /*
2240 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2261 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2262 * count.
2263 */
2264 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2241 if (ret) 2265 if (ret)
2242 goto out_key2; 2266 goto out_key2;
2243 2267
@@ -2254,7 +2278,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2254 * In order for us to be here, we know our q.key == key2, and since 2278 * In order for us to be here, we know our q.key == key2, and since
2255 * we took the hb->lock above, we also know that futex_requeue() has 2279 * we took the hb->lock above, we also know that futex_requeue() has
2256 * completed and we no longer have to concern ourselves with a wakeup 2280 * completed and we no longer have to concern ourselves with a wakeup
2257 * race with the atomic proxy lock acquition by the requeue code. 2281 * race with the atomic proxy lock acquisition by the requeue code. The
2282 * futex_requeue dropped our key1 reference and incremented our key2
2283 * reference count.
2258 */ 2284 */
2259 2285
2260 /* Check if the requeue code acquired the second futex for us. */ 2286 /* Check if the requeue code acquired the second futex for us. */
@@ -2265,8 +2291,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2265 */ 2291 */
2266 if (q.pi_state && (q.pi_state->owner != current)) { 2292 if (q.pi_state && (q.pi_state->owner != current)) {
2267 spin_lock(q.lock_ptr); 2293 spin_lock(q.lock_ptr);
2268 ret = fixup_pi_state_owner(uaddr2, &q, current, 2294 ret = fixup_pi_state_owner(uaddr2, &q, current);
2269 fshared);
2270 spin_unlock(q.lock_ptr); 2295 spin_unlock(q.lock_ptr);
2271 } 2296 }
2272 } else { 2297 } else {
@@ -2285,7 +2310,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2285 * Fixup the pi_state owner and possibly acquire the lock if we 2310 * Fixup the pi_state owner and possibly acquire the lock if we
2286 * haven't already. 2311 * haven't already.
2287 */ 2312 */
2288 res = fixup_owner(uaddr2, fshared, &q, !ret); 2313 res = fixup_owner(uaddr2, &q, !ret);
2289 /* 2314 /*
2290 * If fixup_owner() returned an error, proprogate that. If it 2315 * If fixup_owner() returned an error, proprogate that. If it
2291 * acquired the lock, clear -ETIMEDOUT or -EINTR. 2316 * acquired the lock, clear -ETIMEDOUT or -EINTR.
@@ -2316,9 +2341,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2316 } 2341 }
2317 2342
2318out_put_keys: 2343out_put_keys:
2319 put_futex_key(fshared, &q.key); 2344 put_futex_key(&q.key);
2320out_key2: 2345out_key2:
2321 put_futex_key(fshared, &key2); 2346 put_futex_key(&key2);
2322 2347
2323out: 2348out:
2324 if (to) { 2349 if (to) {
@@ -2393,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
2393 goto err_unlock; 2418 goto err_unlock;
2394 ret = -EPERM; 2419 ret = -EPERM;
2395 pcred = __task_cred(p); 2420 pcred = __task_cred(p);
2421 /* If victim is in different user_ns, then uids are not
2422 comparable, so we must have CAP_SYS_PTRACE */
2423 if (cred->user->user_ns != pcred->user->user_ns) {
2424 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2425 goto err_unlock;
2426 goto ok;
2427 }
2428 /* If victim is in same user_ns, then uids are comparable */
2396 if (cred->euid != pcred->euid && 2429 if (cred->euid != pcred->euid &&
2397 cred->euid != pcred->uid && 2430 cred->euid != pcred->uid &&
2398 !capable(CAP_SYS_PTRACE)) 2431 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2399 goto err_unlock; 2432 goto err_unlock;
2433ok:
2400 head = p->robust_list; 2434 head = p->robust_list;
2401 rcu_read_unlock(); 2435 rcu_read_unlock();
2402 } 2436 }
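
Review note: the get_robust_list() hunk above only falls back to UID comparison when both credentials live in the same user namespace; across namespaces it requires CAP_SYS_PTRACE in the target's namespace (ns_capable() rather than the global capable()). A simplified model of that decision, with integers standing in for struct user_namespace and a stubbed capability check.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative credential model; the kernel uses struct cred. */
struct creds {
	unsigned int uid, euid;
	int user_ns;                     /* namespace identity */
};

/* Stand-in for ns_capable(target_ns, CAP_SYS_PTRACE). */
static bool has_ptrace_cap_in(int target_ns)
{
	(void)target_ns;
	return false;                    /* unprivileged for the demo */
}

/* May @cred inspect @pcred's robust list? Mirrors the patched check. */
static bool may_inspect(const struct creds *cred, const struct creds *pcred)
{
	if (cred->user_ns != pcred->user_ns)
		/* Different namespaces: uids are not comparable at all. */
		return has_ptrace_cap_in(pcred->user_ns);

	/* Same namespace: uid match suffices, capability is the fallback. */
	return cred->euid == pcred->euid ||
	       cred->euid == pcred->uid  ||
	       has_ptrace_cap_in(pcred->user_ns);
}

int main(void)
{
	struct creds me     = { .uid = 1000, .euid = 1000, .user_ns = 1 };
	struct creds victim = { .uid = 1000, .euid = 1000, .user_ns = 2 };

	printf("allowed: %d\n", may_inspect(&me, &victim));   /* 0 -> EPERM */
	victim.user_ns = 1;
	printf("allowed: %d\n", may_inspect(&me, &victim));   /* 1 */
	return 0;
}
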
@@ -2435,11 +2469,20 @@ retry:
2435 * userspace. 2469 * userspace.
2436 */ 2470 */
2437 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 2471 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2438 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); 2472 /*
2439 2473 * We are not holding a lock here, but we want to have
2440 if (nval == -EFAULT) 2474 * the pagefault_disable/enable() protection because
2441 return -1; 2475 * we want to handle the fault gracefully. If the
2442 2476 * access fails we try to fault in the futex with R/W
2477 * verification via get_user_pages. get_user() above
2478 * does not guarantee R/W access. If that fails we
2479 * give up and leave the futex locked.
2480 */
2481 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2482 if (fault_in_user_writeable(uaddr))
2483 return -1;
2484 goto retry;
2485 }
2443 if (nval != uval) 2486 if (nval != uval)
2444 goto retry; 2487 goto retry;
2445 2488
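
Review note: the handle_futex_death() hunk above turns a single cmpxchg attempt into a retry loop: on a value mismatch it re-reads and retries, and on a fault it tries to fault the page in writeably before retrying, giving up only if that fails. Below is a compile-able userspace model of the value-retry half using a C11 compare-exchange loop that preserves FUTEX_WAITERS while setting FUTEX_OWNER_DIED; the fault path has no userspace equivalent and is only marked in a comment.

#include <stdatomic.h>
#include <stdio.h>

#define FUTEX_WAITERS    0x80000000u
#define FUTEX_OWNER_DIED 0x40000000u

static atomic_uint futex_word;

/* Mark the futex of a dying owner: keep the WAITERS bit, set OWNER_DIED,
 * clear the TID - retrying as long as other threads change the word.   */
static int mark_owner_died(atomic_uint *uaddr)
{
	unsigned int uval = atomic_load(uaddr);

	for (;;) {
		unsigned int mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;

		/* In the kernel, a fault here would call
		 * fault_in_user_writeable() and retry instead of failing. */
		if (atomic_compare_exchange_weak(uaddr, &uval, mval))
			return 0;
		/* On failure uval was refreshed with the current value;
		 * loop and recompute mval from it, like "goto retry".    */
	}
}

int main(void)
{
	atomic_store(&futex_word, 1234u | FUTEX_WAITERS);  /* TID + waiters */
	mark_owner_died(&futex_word);
	printf("word now %#x\n", atomic_load(&futex_word));
	return 0;
}
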
@@ -2458,7 +2501,7 @@ retry:
2458 */ 2501 */
2459static inline int fetch_robust_entry(struct robust_list __user **entry, 2502static inline int fetch_robust_entry(struct robust_list __user **entry,
2460 struct robust_list __user * __user *head, 2503 struct robust_list __user * __user *head,
2461 int *pi) 2504 unsigned int *pi)
2462{ 2505{
2463 unsigned long uentry; 2506 unsigned long uentry;
2464 2507
@@ -2481,7 +2524,8 @@ void exit_robust_list(struct task_struct *curr)
2481{ 2524{
2482 struct robust_list_head __user *head = curr->robust_list; 2525 struct robust_list_head __user *head = curr->robust_list;
2483 struct robust_list __user *entry, *next_entry, *pending; 2526 struct robust_list __user *entry, *next_entry, *pending;
2484 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; 2527 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
2528 unsigned int uninitialized_var(next_pi);
2485 unsigned long futex_offset; 2529 unsigned long futex_offset;
2486 int rc; 2530 int rc;
2487 2531
@@ -2542,58 +2586,57 @@ void exit_robust_list(struct task_struct *curr)
2542long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2586long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2543 u32 __user *uaddr2, u32 val2, u32 val3) 2587 u32 __user *uaddr2, u32 val2, u32 val3)
2544{ 2588{
2545 int clockrt, ret = -ENOSYS; 2589 int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
2546 int cmd = op & FUTEX_CMD_MASK; 2590 unsigned int flags = 0;
2547 int fshared = 0;
2548 2591
2549 if (!(op & FUTEX_PRIVATE_FLAG)) 2592 if (!(op & FUTEX_PRIVATE_FLAG))
2550 fshared = 1; 2593 flags |= FLAGS_SHARED;
2551 2594
2552 clockrt = op & FUTEX_CLOCK_REALTIME; 2595 if (op & FUTEX_CLOCK_REALTIME) {
2553 if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) 2596 flags |= FLAGS_CLOCKRT;
2554 return -ENOSYS; 2597 if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2598 return -ENOSYS;
2599 }
2555 2600
2556 switch (cmd) { 2601 switch (cmd) {
2557 case FUTEX_WAIT: 2602 case FUTEX_WAIT:
2558 val3 = FUTEX_BITSET_MATCH_ANY; 2603 val3 = FUTEX_BITSET_MATCH_ANY;
2559 case FUTEX_WAIT_BITSET: 2604 case FUTEX_WAIT_BITSET:
2560 ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); 2605 ret = futex_wait(uaddr, flags, val, timeout, val3);
2561 break; 2606 break;
2562 case FUTEX_WAKE: 2607 case FUTEX_WAKE:
2563 val3 = FUTEX_BITSET_MATCH_ANY; 2608 val3 = FUTEX_BITSET_MATCH_ANY;
2564 case FUTEX_WAKE_BITSET: 2609 case FUTEX_WAKE_BITSET:
2565 ret = futex_wake(uaddr, fshared, val, val3); 2610 ret = futex_wake(uaddr, flags, val, val3);
2566 break; 2611 break;
2567 case FUTEX_REQUEUE: 2612 case FUTEX_REQUEUE:
2568 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); 2613 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2569 break; 2614 break;
2570 case FUTEX_CMP_REQUEUE: 2615 case FUTEX_CMP_REQUEUE:
2571 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2616 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2572 0);
2573 break; 2617 break;
2574 case FUTEX_WAKE_OP: 2618 case FUTEX_WAKE_OP:
2575 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2619 ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2576 break; 2620 break;
2577 case FUTEX_LOCK_PI: 2621 case FUTEX_LOCK_PI:
2578 if (futex_cmpxchg_enabled) 2622 if (futex_cmpxchg_enabled)
2579 ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); 2623 ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2580 break; 2624 break;
2581 case FUTEX_UNLOCK_PI: 2625 case FUTEX_UNLOCK_PI:
2582 if (futex_cmpxchg_enabled) 2626 if (futex_cmpxchg_enabled)
2583 ret = futex_unlock_pi(uaddr, fshared); 2627 ret = futex_unlock_pi(uaddr, flags);
2584 break; 2628 break;
2585 case FUTEX_TRYLOCK_PI: 2629 case FUTEX_TRYLOCK_PI:
2586 if (futex_cmpxchg_enabled) 2630 if (futex_cmpxchg_enabled)
2587 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2631 ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2588 break; 2632 break;
2589 case FUTEX_WAIT_REQUEUE_PI: 2633 case FUTEX_WAIT_REQUEUE_PI:
2590 val3 = FUTEX_BITSET_MATCH_ANY; 2634 val3 = FUTEX_BITSET_MATCH_ANY;
2591 ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, 2635 ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2592 clockrt, uaddr2); 2636 uaddr2);
2593 break; 2637 break;
2594 case FUTEX_CMP_REQUEUE_PI: 2638 case FUTEX_CMP_REQUEUE_PI:
2595 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2639 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2596 1);
2597 break; 2640 break;
2598 default: 2641 default:
2599 ret = -ENOSYS; 2642 ret = -ENOSYS;
@@ -2647,11 +2690,10 @@ static int __init futex_init(void)
2647 * of the complex code paths. Also we want to prevent 2690 * of the complex code paths. Also we want to prevent
2648 * registration of robust lists in that case. NULL is 2691 * registration of robust lists in that case. NULL is
2649 * guaranteed to fault and we get -EFAULT on functional 2692 * guaranteed to fault and we get -EFAULT on functional
2650 * implementation, the non functional ones will return 2693 * implementation, the non-functional ones will return
2651 * -ENOSYS. 2694 * -ENOSYS.
2652 */ 2695 */
2653 curval = cmpxchg_futex_value_locked(NULL, 0, 0); 2696 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2654 if (curval == -EFAULT)
2655 futex_cmpxchg_enabled = 1; 2697 futex_cmpxchg_enabled = 1;
2656 2698
2657 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2699 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d49afb2395e5..5f9e689dc8f0 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -19,7 +19,7 @@
19 */ 19 */
20static inline int 20static inline int
21fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, 21fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
22 compat_uptr_t __user *head, int *pi) 22 compat_uptr_t __user *head, unsigned int *pi)
23{ 23{
24 if (get_user(*uentry, head)) 24 if (get_user(*uentry, head))
25 return -EFAULT; 25 return -EFAULT;
@@ -49,7 +49,8 @@ void compat_exit_robust_list(struct task_struct *curr)
49{ 49{
50 struct compat_robust_list_head __user *head = curr->compat_robust_list; 50 struct compat_robust_list_head __user *head = curr->compat_robust_list;
51 struct robust_list __user *entry, *next_entry, *pending; 51 struct robust_list __user *entry, *next_entry, *pending;
52 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; 52 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
53 unsigned int uninitialized_var(next_pi);
53 compat_uptr_t uentry, next_uentry, upending; 54 compat_uptr_t uentry, next_uentry, upending;
54 compat_long_t futex_offset; 55 compat_long_t futex_offset;
55 int rc; 56 int rc;
@@ -152,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
152 goto err_unlock; 153 goto err_unlock;
153 ret = -EPERM; 154 ret = -EPERM;
154 pcred = __task_cred(p); 155 pcred = __task_cred(p);
156 /* If victim is in different user_ns, then uids are not
157 comparable, so we must have CAP_SYS_PTRACE */
158 if (cred->user->user_ns != pcred->user->user_ns) {
159 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
160 goto err_unlock;
161 goto ok;
162 }
163 /* If victim is in same user_ns, then uids are comparable */
155 if (cred->euid != pcred->euid && 164 if (cred->euid != pcred->euid &&
156 cred->euid != pcred->uid && 165 cred->euid != pcred->uid &&
157 !capable(CAP_SYS_PTRACE)) 166 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
158 goto err_unlock; 167 goto err_unlock;
168ok:
159 head = p->compat_robust_list; 169 head = p->compat_robust_list;
160 rcu_read_unlock(); 170 rcu_read_unlock();
161 } 171 }
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 70a298d6da71..5bf924d80b5c 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -2,7 +2,8 @@ menu "GCOV-based kernel profiling"
2 2
3config GCOV_KERNEL 3config GCOV_KERNEL
4 bool "Enable gcov-based kernel profiling" 4 bool "Enable gcov-based kernel profiling"
5 depends on DEBUG_FS && CONSTRUCTORS 5 depends on DEBUG_FS
6 select CONSTRUCTORS
6 default n 7 default n
7 ---help--- 8 ---help---
8 This option enables gcov-based code profiling (e.g. for code coverage 9 This option enables gcov-based code profiling (e.g. for code coverage
@@ -34,7 +35,7 @@ config GCOV_KERNEL
34config GCOV_PROFILE_ALL 35config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel" 36 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL 37 depends on GCOV_KERNEL
37 depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE 38 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
38 default n 39 default n
39 ---help--- 40 ---help---
40 This options activates profiling for the entire kernel. 41 This options activates profiling for the entire kernel.
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 3f761001d517..e97ca59e2520 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,3 @@
1EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' 1ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
2 2
3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o 3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index f83972b16564..9bd0934f6c33 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -561,6 +561,7 @@ static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
561static const struct file_operations gcov_reset_fops = { 561static const struct file_operations gcov_reset_fops = {
562 .write = reset_write, 562 .write = reset_write,
563 .read = reset_read, 563 .read = reset_read,
564 .llseek = noop_llseek,
564}; 565};
565 566
566/* 567/*
diff --git a/kernel/groups.c b/kernel/groups.c
index 253dc0f35cf4..1cc476d52dd3 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
233 struct group_info *group_info; 233 struct group_info *group_info;
234 int retval; 234 int retval;
235 235
236 if (!capable(CAP_SETGID)) 236 if (!nsown_capable(CAP_SETGID))
237 return -EPERM; 237 return -EPERM;
238 if ((unsigned)gidsetsize > NGROUPS_MAX) 238 if ((unsigned)gidsetsize > NGROUPS_MAX)
239 return -EINVAL; 239 return -EINVAL;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cb49883b64e5..11e896903828 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -55,11 +55,10 @@
55/* 55/*
56 * The timer bases: 56 * The timer bases:
57 * 57 *
58 * Note: If we want to add new timer bases, we have to skip the two 58 * There are more clockids then hrtimer bases. Thus, we index
59 * clock ids captured by the cpu-timers. We do this by holding empty 59 * into the timer bases by the hrtimer_base_type enum. When trying
60 * entries rather than doing math adjustment of the clock ids. 60 * to reach a base using a clockid, hrtimer_clockid_to_base()
61 * This ensures that we capture erroneous accesses to these clock ids 61 * is used to convert from clockid to the proper hrtimer_base_type.
62 * rather than moving them into the range of valid clock id's.
63 */ 62 */
64DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = 63DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
65{ 64{
@@ -67,39 +66,55 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
67 .clock_base = 66 .clock_base =
68 { 67 {
69 { 68 {
70 .index = CLOCK_REALTIME, 69 .index = HRTIMER_BASE_MONOTONIC,
70 .clockid = CLOCK_MONOTONIC,
71 .get_time = &ktime_get,
72 .resolution = KTIME_LOW_RES,
73 },
74 {
75 .index = HRTIMER_BASE_REALTIME,
76 .clockid = CLOCK_REALTIME,
71 .get_time = &ktime_get_real, 77 .get_time = &ktime_get_real,
72 .resolution = KTIME_LOW_RES, 78 .resolution = KTIME_LOW_RES,
73 }, 79 },
74 { 80 {
75 .index = CLOCK_MONOTONIC, 81 .index = HRTIMER_BASE_BOOTTIME,
76 .get_time = &ktime_get, 82 .clockid = CLOCK_BOOTTIME,
83 .get_time = &ktime_get_boottime,
77 .resolution = KTIME_LOW_RES, 84 .resolution = KTIME_LOW_RES,
78 }, 85 },
79 } 86 }
80}; 87};
81 88
89static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
90 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
91 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
92 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
93};
94
95static inline int hrtimer_clockid_to_base(clockid_t clock_id)
96{
97 return hrtimer_clock_to_base_table[clock_id];
98}
99
100
82/* 101/*
83 * Get the coarse grained time at the softirq based on xtime and 102 * Get the coarse grained time at the softirq based on xtime and
84 * wall_to_monotonic. 103 * wall_to_monotonic.
85 */ 104 */
86static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) 105static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
87{ 106{
88 ktime_t xtim, tomono; 107 ktime_t xtim, mono, boot;
89 struct timespec xts, tom; 108 struct timespec xts, tom, slp;
90 unsigned long seq;
91 109
92 do { 110 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
93 seq = read_seqbegin(&xtime_lock);
94 xts = __current_kernel_time();
95 tom = __get_wall_to_monotonic();
96 } while (read_seqretry(&xtime_lock, seq));
97 111
98 xtim = timespec_to_ktime(xts); 112 xtim = timespec_to_ktime(xts);
99 tomono = timespec_to_ktime(tom); 113 mono = ktime_add(xtim, timespec_to_ktime(tom));
100 base->clock_base[CLOCK_REALTIME].softirq_time = xtim; 114 boot = ktime_add(mono, timespec_to_ktime(slp));
101 base->clock_base[CLOCK_MONOTONIC].softirq_time = 115 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
102 ktime_add(xtim, tomono); 116 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
117 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
103} 118}
104 119
105/* 120/*
@@ -186,10 +201,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
186 struct hrtimer_cpu_base *new_cpu_base; 201 struct hrtimer_cpu_base *new_cpu_base;
187 int this_cpu = smp_processor_id(); 202 int this_cpu = smp_processor_id();
188 int cpu = hrtimer_get_target(this_cpu, pinned); 203 int cpu = hrtimer_get_target(this_cpu, pinned);
204 int basenum = base->index;
189 205
190again: 206again:
191 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 207 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
192 new_base = &new_cpu_base->clock_base[base->index]; 208 new_base = &new_cpu_base->clock_base[basenum];
193 209
194 if (base != new_base) { 210 if (base != new_base) {
195 /* 211 /*
@@ -336,6 +352,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe);
336 352
337static struct debug_obj_descr hrtimer_debug_descr; 353static struct debug_obj_descr hrtimer_debug_descr;
338 354
355static void *hrtimer_debug_hint(void *addr)
356{
357 return ((struct hrtimer *) addr)->function;
358}
359
339/* 360/*
340 * fixup_init is called when: 361 * fixup_init is called when:
341 * - an active object is initialized 362 * - an active object is initialized
@@ -395,6 +416,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
395 416
396static struct debug_obj_descr hrtimer_debug_descr = { 417static struct debug_obj_descr hrtimer_debug_descr = {
397 .name = "hrtimer", 418 .name = "hrtimer",
419 .debug_hint = hrtimer_debug_hint,
398 .fixup_init = hrtimer_fixup_init, 420 .fixup_init = hrtimer_fixup_init,
399 .fixup_activate = hrtimer_fixup_activate, 421 .fixup_activate = hrtimer_fixup_activate,
400 .fixup_free = hrtimer_fixup_free, 422 .fixup_free = hrtimer_fixup_free,
@@ -499,7 +521,7 @@ static inline int hrtimer_is_hres_enabled(void)
499 */ 521 */
500static inline int hrtimer_hres_active(void) 522static inline int hrtimer_hres_active(void)
501{ 523{
502 return __get_cpu_var(hrtimer_bases).hres_active; 524 return __this_cpu_read(hrtimer_bases.hres_active);
503} 525}
504 526
505/* 527/*
@@ -518,10 +540,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
518 540
519 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 541 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
520 struct hrtimer *timer; 542 struct hrtimer *timer;
543 struct timerqueue_node *next;
521 544
522 if (!base->first) 545 next = timerqueue_getnext(&base->active);
546 if (!next)
523 continue; 547 continue;
524 timer = rb_entry(base->first, struct hrtimer, node); 548 timer = container_of(next, struct hrtimer, node);
549
525 expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 550 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
526 /* 551 /*
527 * clock_was_set() has changed base->offset so the 552 * clock_was_set() has changed base->offset so the
@@ -601,67 +626,6 @@ static int hrtimer_reprogram(struct hrtimer *timer,
601 return res; 626 return res;
602} 627}
603 628
604
605/*
606 * Retrigger next event is called after clock was set
607 *
608 * Called with interrupts disabled via on_each_cpu()
609 */
610static void retrigger_next_event(void *arg)
611{
612 struct hrtimer_cpu_base *base;
613 struct timespec realtime_offset, wtm;
614 unsigned long seq;
615
616 if (!hrtimer_hres_active())
617 return;
618
619 do {
620 seq = read_seqbegin(&xtime_lock);
621 wtm = __get_wall_to_monotonic();
622 } while (read_seqretry(&xtime_lock, seq));
623 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
624
625 base = &__get_cpu_var(hrtimer_bases);
626
627 /* Adjust CLOCK_REALTIME offset */
628 raw_spin_lock(&base->lock);
629 base->clock_base[CLOCK_REALTIME].offset =
630 timespec_to_ktime(realtime_offset);
631
632 hrtimer_force_reprogram(base, 0);
633 raw_spin_unlock(&base->lock);
634}
635
636/*
637 * Clock realtime was set
638 *
639 * Change the offset of the realtime clock vs. the monotonic
640 * clock.
641 *
642 * We might have to reprogram the high resolution timer interrupt. On
643 * SMP we call the architecture specific code to retrigger _all_ high
644 * resolution timer interrupts. On UP we just disable interrupts and
645 * call the high resolution interrupt code.
646 */
647void clock_was_set(void)
648{
649 /* Retrigger the CPU local events everywhere */
650 on_each_cpu(retrigger_next_event, NULL, 1);
651}
652
653/*
654 * During resume we might have to reprogram the high resolution timer
655 * interrupt (on the local CPU):
656 */
657void hres_timers_resume(void)
658{
659 WARN_ONCE(!irqs_disabled(),
660 KERN_INFO "hres_timers_resume() called with IRQs enabled!");
661
662 retrigger_next_event(NULL);
663}
664
665/* 629/*
666 * Initialize the high resolution related parts of cpu_base 630 * Initialize the high resolution related parts of cpu_base
667 */ 631 */
@@ -672,14 +636,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
672} 636}
673 637
674/* 638/*
675 * Initialize the high resolution related parts of a hrtimer
676 */
677static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
678{
679}
680
681
682/*
683 * When High resolution timers are active, try to reprogram. Note, that in case 639 * When High resolution timers are active, try to reprogram. Note, that in case
684 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry 640 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
685 * check happens. The timer gets enqueued into the rbtree. The reprogramming 641 * check happens. The timer gets enqueued into the rbtree. The reprogramming
@@ -704,11 +660,39 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
704} 660}
705 661
706/* 662/*
663 * Retrigger next event is called after clock was set
664 *
665 * Called with interrupts disabled via on_each_cpu()
666 */
667static void retrigger_next_event(void *arg)
668{
669 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
670 struct timespec realtime_offset, xtim, wtm, sleep;
671
672 if (!hrtimer_hres_active())
673 return;
674
675 /* Optimized out for !HIGH_RES */
676 get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
677 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
678
679 /* Adjust CLOCK_REALTIME offset */
680 raw_spin_lock(&base->lock);
681 base->clock_base[HRTIMER_BASE_REALTIME].offset =
682 timespec_to_ktime(realtime_offset);
683 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
684 timespec_to_ktime(sleep);
685
686 hrtimer_force_reprogram(base, 0);
687 raw_spin_unlock(&base->lock);
688}
689
690/*
707 * Switch to high resolution mode 691 * Switch to high resolution mode
708 */ 692 */
709static int hrtimer_switch_to_hres(void) 693static int hrtimer_switch_to_hres(void)
710{ 694{
711 int cpu = smp_processor_id(); 695 int i, cpu = smp_processor_id();
712 struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); 696 struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
713 unsigned long flags; 697 unsigned long flags;
714 698
@@ -724,8 +708,8 @@ static int hrtimer_switch_to_hres(void)
724 return 0; 708 return 0;
725 } 709 }
726 base->hres_active = 1; 710 base->hres_active = 1;
727 base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; 711 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
728 base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; 712 base->clock_base[i].resolution = KTIME_HIGH_RES;
729 713
730 tick_setup_sched_timer(); 714 tick_setup_sched_timer();
731 715
@@ -749,10 +733,43 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
749 return 0; 733 return 0;
750} 734}
751static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 735static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
752static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } 736static inline void retrigger_next_event(void *arg) { }
753 737
754#endif /* CONFIG_HIGH_RES_TIMERS */ 738#endif /* CONFIG_HIGH_RES_TIMERS */
755 739
740/*
741 * Clock realtime was set
742 *
743 * Change the offset of the realtime clock vs. the monotonic
744 * clock.
745 *
746 * We might have to reprogram the high resolution timer interrupt. On
747 * SMP we call the architecture specific code to retrigger _all_ high
748 * resolution timer interrupts. On UP we just disable interrupts and
749 * call the high resolution interrupt code.
750 */
751void clock_was_set(void)
752{
753#ifdef CONFIG_HIGH_RES_TIMERS
754 /* Retrigger the CPU local events everywhere */
755 on_each_cpu(retrigger_next_event, NULL, 1);
756#endif
757 timerfd_clock_was_set();
758}
759
760/*
761 * During resume we might have to reprogram the high resolution timer
762 * interrupt (on the local CPU):
763 */
764void hrtimers_resume(void)
765{
766 WARN_ONCE(!irqs_disabled(),
767 KERN_INFO "hrtimers_resume() called with IRQs enabled!");
768
769 retrigger_next_event(NULL);
770 timerfd_clock_was_set();
771}
772
756static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) 773static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
757{ 774{
758#ifdef CONFIG_TIMER_STATS 775#ifdef CONFIG_TIMER_STATS
@@ -842,48 +859,18 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
842static int enqueue_hrtimer(struct hrtimer *timer, 859static int enqueue_hrtimer(struct hrtimer *timer,
843 struct hrtimer_clock_base *base) 860 struct hrtimer_clock_base *base)
844{ 861{
845 struct rb_node **link = &base->active.rb_node;
846 struct rb_node *parent = NULL;
847 struct hrtimer *entry;
848 int leftmost = 1;
849
850 debug_activate(timer); 862 debug_activate(timer);
851 863
852 /* 864 timerqueue_add(&base->active, &timer->node);
853 * Find the right place in the rbtree: 865 base->cpu_base->active_bases |= 1 << base->index;
854 */
855 while (*link) {
856 parent = *link;
857 entry = rb_entry(parent, struct hrtimer, node);
858 /*
859 * We dont care about collisions. Nodes with
860 * the same expiry time stay together.
861 */
862 if (hrtimer_get_expires_tv64(timer) <
863 hrtimer_get_expires_tv64(entry)) {
864 link = &(*link)->rb_left;
865 } else {
866 link = &(*link)->rb_right;
867 leftmost = 0;
868 }
869 }
870
871 /*
872 * Insert the timer to the rbtree and check whether it
873 * replaces the first pending timer
874 */
875 if (leftmost)
876 base->first = &timer->node;
877 866
878 rb_link_node(&timer->node, parent, link);
879 rb_insert_color(&timer->node, &base->active);
880 /* 867 /*
881 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the 868 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
882 * state of a possibly running callback. 869 * state of a possibly running callback.
883 */ 870 */
884 timer->state |= HRTIMER_STATE_ENQUEUED; 871 timer->state |= HRTIMER_STATE_ENQUEUED;
885 872
886 return leftmost; 873 return (&timer->node == base->active.next);
887} 874}
888 875
889/* 876/*
@@ -903,12 +890,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
903 if (!(timer->state & HRTIMER_STATE_ENQUEUED)) 890 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
904 goto out; 891 goto out;
905 892
906 /* 893 if (&timer->node == timerqueue_getnext(&base->active)) {
907 * Remove the timer from the rbtree and replace the first
908 * entry pointer if necessary.
909 */
910 if (base->first == &timer->node) {
911 base->first = rb_next(&timer->node);
912#ifdef CONFIG_HIGH_RES_TIMERS 894#ifdef CONFIG_HIGH_RES_TIMERS
913 /* Reprogram the clock event device. if enabled */ 895 /* Reprogram the clock event device. if enabled */
914 if (reprogram && hrtimer_hres_active()) { 896 if (reprogram && hrtimer_hres_active()) {
@@ -921,7 +903,9 @@ static void __remove_hrtimer(struct hrtimer *timer,
921 } 903 }
922#endif 904#endif
923 } 905 }
924 rb_erase(&timer->node, &base->active); 906 timerqueue_del(&base->active, &timer->node);
907 if (!timerqueue_getnext(&base->active))
908 base->cpu_base->active_bases &= ~(1 << base->index);
925out: 909out:
926 timer->state = newstate; 910 timer->state = newstate;
927} 911}
@@ -1222,11 +1206,13 @@ ktime_t hrtimer_get_next_event(void)
1222 if (!hrtimer_hres_active()) { 1206 if (!hrtimer_hres_active()) {
1223 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1207 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
1224 struct hrtimer *timer; 1208 struct hrtimer *timer;
1209 struct timerqueue_node *next;
1225 1210
1226 if (!base->first) 1211 next = timerqueue_getnext(&base->active);
1212 if (!next)
1227 continue; 1213 continue;
1228 1214
1229 timer = rb_entry(base->first, struct hrtimer, node); 1215 timer = container_of(next, struct hrtimer, node);
1230 delta.tv64 = hrtimer_get_expires_tv64(timer); 1216 delta.tv64 = hrtimer_get_expires_tv64(timer);
1231 delta = ktime_sub(delta, base->get_time()); 1217 delta = ktime_sub(delta, base->get_time());
1232 if (delta.tv64 < mindelta.tv64) 1218 if (delta.tv64 < mindelta.tv64)
@@ -1246,6 +1232,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1246 enum hrtimer_mode mode) 1232 enum hrtimer_mode mode)
1247{ 1233{
1248 struct hrtimer_cpu_base *cpu_base; 1234 struct hrtimer_cpu_base *cpu_base;
1235 int base;
1249 1236
1250 memset(timer, 0, sizeof(struct hrtimer)); 1237 memset(timer, 0, sizeof(struct hrtimer));
1251 1238
@@ -1254,8 +1241,9 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1254 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) 1241 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
1255 clock_id = CLOCK_MONOTONIC; 1242 clock_id = CLOCK_MONOTONIC;
1256 1243
1257 timer->base = &cpu_base->clock_base[clock_id]; 1244 base = hrtimer_clockid_to_base(clock_id);
1258 hrtimer_init_timer_hres(timer); 1245 timer->base = &cpu_base->clock_base[base];
1246 timerqueue_init(&timer->node);
1259 1247
1260#ifdef CONFIG_TIMER_STATS 1248#ifdef CONFIG_TIMER_STATS
1261 timer->start_site = NULL; 1249 timer->start_site = NULL;
@@ -1289,9 +1277,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
1289int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) 1277int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1290{ 1278{
1291 struct hrtimer_cpu_base *cpu_base; 1279 struct hrtimer_cpu_base *cpu_base;
1280 int base = hrtimer_clockid_to_base(which_clock);
1292 1281
1293 cpu_base = &__raw_get_cpu_var(hrtimer_bases); 1282 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
1294 *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); 1283 *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
1295 1284
1296 return 0; 1285 return 0;
1297} 1286}
@@ -1346,7 +1335,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1346void hrtimer_interrupt(struct clock_event_device *dev) 1335void hrtimer_interrupt(struct clock_event_device *dev)
1347{ 1336{
1348 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1337 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1349 struct hrtimer_clock_base *base;
1350 ktime_t expires_next, now, entry_time, delta; 1338 ktime_t expires_next, now, entry_time, delta;
1351 int i, retries = 0; 1339 int i, retries = 0;
1352 1340
@@ -1368,18 +1356,21 @@ retry:
1368 */ 1356 */
1369 cpu_base->expires_next.tv64 = KTIME_MAX; 1357 cpu_base->expires_next.tv64 = KTIME_MAX;
1370 1358
1371 base = cpu_base->clock_base;
1372
1373 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1359 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1360 struct hrtimer_clock_base *base;
1361 struct timerqueue_node *node;
1374 ktime_t basenow; 1362 ktime_t basenow;
1375 struct rb_node *node;
1376 1363
1364 if (!(cpu_base->active_bases & (1 << i)))
1365 continue;
1366
1367 base = cpu_base->clock_base + i;
1377 basenow = ktime_add(now, base->offset); 1368 basenow = ktime_add(now, base->offset);
1378 1369
1379 while ((node = base->first)) { 1370 while ((node = timerqueue_getnext(&base->active))) {
1380 struct hrtimer *timer; 1371 struct hrtimer *timer;
1381 1372
1382 timer = rb_entry(node, struct hrtimer, node); 1373 timer = container_of(node, struct hrtimer, node);
1383 1374
1384 /* 1375 /*
1385 * The immediate goal for using the softexpires is 1376 * The immediate goal for using the softexpires is
@@ -1406,7 +1397,6 @@ retry:
1406 1397
1407 __run_hrtimer(timer, &basenow); 1398 __run_hrtimer(timer, &basenow);
1408 } 1399 }
1409 base++;
1410 } 1400 }
1411 1401
1412 /* 1402 /*
@@ -1535,7 +1525,7 @@ void hrtimer_run_pending(void)
1535 */ 1525 */
1536void hrtimer_run_queues(void) 1526void hrtimer_run_queues(void)
1537{ 1527{
1538 struct rb_node *node; 1528 struct timerqueue_node *node;
1539 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1529 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1540 struct hrtimer_clock_base *base; 1530 struct hrtimer_clock_base *base;
1541 int index, gettime = 1; 1531 int index, gettime = 1;
@@ -1545,8 +1535,7 @@ void hrtimer_run_queues(void)
1545 1535
1546 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { 1536 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
1547 base = &cpu_base->clock_base[index]; 1537 base = &cpu_base->clock_base[index];
1548 1538 if (!timerqueue_getnext(&base->active))
1549 if (!base->first)
1550 continue; 1539 continue;
1551 1540
1552 if (gettime) { 1541 if (gettime) {
@@ -1556,10 +1545,10 @@ void hrtimer_run_queues(void)
1556 1545
1557 raw_spin_lock(&cpu_base->lock); 1546 raw_spin_lock(&cpu_base->lock);
1558 1547
1559 while ((node = base->first)) { 1548 while ((node = timerqueue_getnext(&base->active))) {
1560 struct hrtimer *timer; 1549 struct hrtimer *timer;
1561 1550
1562 timer = rb_entry(node, struct hrtimer, node); 1551 timer = container_of(node, struct hrtimer, node);
1563 if (base->softirq_time.tv64 <= 1552 if (base->softirq_time.tv64 <=
1564 hrtimer_get_expires_tv64(timer)) 1553 hrtimer_get_expires_tv64(timer))
1565 break; 1554 break;
@@ -1638,7 +1627,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1638 struct timespec __user *rmtp; 1627 struct timespec __user *rmtp;
1639 int ret = 0; 1628 int ret = 0;
1640 1629
1641 hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, 1630 hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
1642 HRTIMER_MODE_ABS); 1631 HRTIMER_MODE_ABS);
1643 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); 1632 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
1644 1633
@@ -1690,7 +1679,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1690 1679
1691 restart = &current_thread_info()->restart_block; 1680 restart = &current_thread_info()->restart_block;
1692 restart->fn = hrtimer_nanosleep_restart; 1681 restart->fn = hrtimer_nanosleep_restart;
1693 restart->nanosleep.index = t.timer.base->index; 1682 restart->nanosleep.clockid = t.timer.base->clockid;
1694 restart->nanosleep.rmtp = rmtp; 1683 restart->nanosleep.rmtp = rmtp;
1695 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); 1684 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
1696 1685
@@ -1724,8 +1713,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1724 1713
1725 raw_spin_lock_init(&cpu_base->lock); 1714 raw_spin_lock_init(&cpu_base->lock);
1726 1715
1727 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1716 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1728 cpu_base->clock_base[i].cpu_base = cpu_base; 1717 cpu_base->clock_base[i].cpu_base = cpu_base;
1718 timerqueue_init_head(&cpu_base->clock_base[i].active);
1719 }
1729 1720
1730 hrtimer_init_hres(cpu_base); 1721 hrtimer_init_hres(cpu_base);
1731 INIT_LIST_HEAD(&cpu_base->to_pull); 1722 INIT_LIST_HEAD(&cpu_base->to_pull);
@@ -1737,10 +1728,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1737 struct hrtimer_clock_base *new_base) 1728 struct hrtimer_clock_base *new_base)
1738{ 1729{
1739 struct hrtimer *timer; 1730 struct hrtimer *timer;
1740 struct rb_node *node; 1731 struct timerqueue_node *node;
1741 1732
1742 while ((node = rb_first(&old_base->active))) { 1733 while ((node = timerqueue_getnext(&old_base->active))) {
1743 timer = rb_entry(node, struct hrtimer, node); 1734 timer = container_of(node, struct hrtimer, node);
1744 BUG_ON(hrtimer_callback_running(timer)); 1735 BUG_ON(hrtimer_callback_running(timer));
1745 debug_deactivate(timer); 1736 debug_deactivate(timer);
1746 1737
@@ -1869,7 +1860,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1869 } 1860 }
1870 1861
1871 /* 1862 /*
1872 * A NULL parameter means "inifinte" 1863 * A NULL parameter means "infinite"
1873 */ 1864 */
1874 if (!expires) { 1865 if (!expires) {
1875 schedule(); 1866 schedule();
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 0c642d51aac2..ea640120ab86 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -33,7 +33,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
33/* 33/*
34 * Zero means infinite timeout - no checking done: 34 * Zero means infinite timeout - no checking done:
35 */ 35 */
36unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; 36unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
37 37
38unsigned long __read_mostly sysctl_hung_task_warnings = 10; 38unsigned long __read_mostly sysctl_hung_task_warnings = 10;
39 39
@@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
99 " disables this message.\n"); 99 " disables this message.\n");
100 sched_show_task(t); 100 sched_show_task(t);
101 __debug_show_held_locks(t); 101 debug_show_held_locks(t);
102 102
103 touch_nmi_watchdog(); 103 touch_nmi_watchdog();
104 104
@@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
111 * periodically exit the critical section and enter a new one. 111 * periodically exit the critical section and enter a new one.
112 * 112 *
113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order 113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
114 * exit the grace period. For classic RCU, a reschedule is required. 114 * to exit the grace period. For classic RCU, a reschedule is required.
115 */ 115 */
116static void rcu_lock_break(struct task_struct *g, struct task_struct *t) 116static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
117{ 117{
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
new file mode 100644
index 000000000000..d1d051b38e0b
--- /dev/null
+++ b/kernel/irq/Kconfig
@@ -0,0 +1,74 @@
1# Select this to activate the generic irq options below
2config HAVE_GENERIC_HARDIRQS
3 bool
4
5if HAVE_GENERIC_HARDIRQS
6menu "IRQ subsystem"
7#
8# Interrupt subsystem related configuration options
9#
10config GENERIC_HARDIRQS
11 def_bool y
12
13# Options selectable by the architecture code
14
15# Make sparse irq Kconfig switch below available
16config HAVE_SPARSE_IRQ
17 bool
18
19# Enable the generic irq autoprobe mechanism
20config GENERIC_IRQ_PROBE
21 bool
22
23# Use the generic /proc/interrupts implementation
24config GENERIC_IRQ_SHOW
25 bool
26
27# Print level/edge extra information
28config GENERIC_IRQ_SHOW_LEVEL
29 bool
30
31# Support for delayed migration from interrupt context
32config GENERIC_PENDING_IRQ
33 bool
34
35# Alpha specific irq affinity mechanism
36config AUTO_IRQ_AFFINITY
37 bool
38
39# Tasklet based software resend for pending interrupts on enable_irq()
40config HARDIRQS_SW_RESEND
41 bool
42
43# Preflow handler support for fasteoi (sparc64)
44config IRQ_PREFLOW_FASTEOI
45 bool
46
47# Edge style eoi based handler (cell)
48config IRQ_EDGE_EOI_HANDLER
49 bool
50
51# Generic configurable interrupt chip implementation
52config GENERIC_IRQ_CHIP
53 bool
54
55# Support forced irq threading
56config IRQ_FORCED_THREADING
57 bool
58
59config SPARSE_IRQ
60 bool "Support sparse irq numbering"
61 depends on HAVE_SPARSE_IRQ
62 ---help---
63
64 Sparse irq numbering is useful for distro kernels that want
65 to define a high CONFIG_NR_CPUS value but still want to have
66 low kernel memory footprint on smaller machines.
67
68 ( Sparse irqs can also be beneficial on NUMA boxes, as they spread
69 out the interrupt descriptors in a more NUMA-friendly way. )
70
71 If you don't know what to do here, say N.
72
73endmenu
74endif
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 7d047808419d..73290056cfb6 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,7 +1,7 @@
1 1
2obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o 2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 4obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 5obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 6obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
7obj-$(CONFIG_PM_SLEEP) += pm.o 7obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 2295a31ef110..342d8f44e401 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -17,7 +17,7 @@
17/* 17/*
18 * Autodetection depends on the fact that any interrupt that 18 * Autodetection depends on the fact that any interrupt that
19 * comes in on to an unassigned handler will get stuck with 19 * comes in on to an unassigned handler will get stuck with
20 * "IRQ_WAITING" cleared and the interrupt disabled. 20 * "IRQS_WAITING" cleared and the interrupt disabled.
21 */ 21 */
22static DEFINE_MUTEX(probing_active); 22static DEFINE_MUTEX(probing_active);
23 23
@@ -32,7 +32,6 @@ unsigned long probe_irq_on(void)
32{ 32{
33 struct irq_desc *desc; 33 struct irq_desc *desc;
34 unsigned long mask = 0; 34 unsigned long mask = 0;
35 unsigned int status;
36 int i; 35 int i;
37 36
38 /* 37 /*
@@ -46,20 +45,15 @@ unsigned long probe_irq_on(void)
46 */ 45 */
47 for_each_irq_desc_reverse(i, desc) { 46 for_each_irq_desc_reverse(i, desc) {
48 raw_spin_lock_irq(&desc->lock); 47 raw_spin_lock_irq(&desc->lock);
49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 48 if (!desc->action && irq_settings_can_probe(desc)) {
50 /*
51 * An old-style architecture might still have
52 * the handle_bad_irq handler there:
53 */
54 compat_irq_chip_set_default_handler(desc);
55
56 /* 49 /*
57 * Some chips need to know about probing in 50 * Some chips need to know about probing in
58 * progress: 51 * progress:
59 */ 52 */
60 if (desc->chip->set_type) 53 if (desc->irq_data.chip->irq_set_type)
61 desc->chip->set_type(i, IRQ_TYPE_PROBE); 54 desc->irq_data.chip->irq_set_type(&desc->irq_data,
62 desc->chip->startup(i); 55 IRQ_TYPE_PROBE);
56 irq_startup(desc);
63 } 57 }
64 raw_spin_unlock_irq(&desc->lock); 58 raw_spin_unlock_irq(&desc->lock);
65 } 59 }
@@ -74,10 +68,10 @@ unsigned long probe_irq_on(void)
74 */ 68 */
75 for_each_irq_desc_reverse(i, desc) { 69 for_each_irq_desc_reverse(i, desc) {
76 raw_spin_lock_irq(&desc->lock); 70 raw_spin_lock_irq(&desc->lock);
77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 71 if (!desc->action && irq_settings_can_probe(desc)) {
78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
79 if (desc->chip->startup(i)) 73 if (irq_startup(desc))
80 desc->status |= IRQ_PENDING; 74 desc->istate |= IRQS_PENDING;
81 } 75 }
82 raw_spin_unlock_irq(&desc->lock); 76 raw_spin_unlock_irq(&desc->lock);
83 } 77 }
@@ -92,13 +86,12 @@ unsigned long probe_irq_on(void)
92 */ 86 */
93 for_each_irq_desc(i, desc) { 87 for_each_irq_desc(i, desc) {
94 raw_spin_lock_irq(&desc->lock); 88 raw_spin_lock_irq(&desc->lock);
95 status = desc->status;
96 89
97 if (status & IRQ_AUTODETECT) { 90 if (desc->istate & IRQS_AUTODETECT) {
98 /* It triggered already - consider it spurious. */ 91 /* It triggered already - consider it spurious. */
99 if (!(status & IRQ_WAITING)) { 92 if (!(desc->istate & IRQS_WAITING)) {
100 desc->status = status & ~IRQ_AUTODETECT; 93 desc->istate &= ~IRQS_AUTODETECT;
101 desc->chip->shutdown(i); 94 irq_shutdown(desc);
102 } else 95 } else
103 if (i < 32) 96 if (i < 32)
104 mask |= 1 << i; 97 mask |= 1 << i;
@@ -124,20 +117,18 @@ EXPORT_SYMBOL(probe_irq_on);
124 */ 117 */
125unsigned int probe_irq_mask(unsigned long val) 118unsigned int probe_irq_mask(unsigned long val)
126{ 119{
127 unsigned int status, mask = 0; 120 unsigned int mask = 0;
128 struct irq_desc *desc; 121 struct irq_desc *desc;
129 int i; 122 int i;
130 123
131 for_each_irq_desc(i, desc) { 124 for_each_irq_desc(i, desc) {
132 raw_spin_lock_irq(&desc->lock); 125 raw_spin_lock_irq(&desc->lock);
133 status = desc->status; 126 if (desc->istate & IRQS_AUTODETECT) {
134 127 if (i < 16 && !(desc->istate & IRQS_WAITING))
135 if (status & IRQ_AUTODETECT) {
136 if (i < 16 && !(status & IRQ_WAITING))
137 mask |= 1 << i; 128 mask |= 1 << i;
138 129
139 desc->status = status & ~IRQ_AUTODETECT; 130 desc->istate &= ~IRQS_AUTODETECT;
140 desc->chip->shutdown(i); 131 irq_shutdown(desc);
141 } 132 }
142 raw_spin_unlock_irq(&desc->lock); 133 raw_spin_unlock_irq(&desc->lock);
143 } 134 }
@@ -168,20 +159,18 @@ int probe_irq_off(unsigned long val)
168{ 159{
169 int i, irq_found = 0, nr_of_irqs = 0; 160 int i, irq_found = 0, nr_of_irqs = 0;
170 struct irq_desc *desc; 161 struct irq_desc *desc;
171 unsigned int status;
172 162
173 for_each_irq_desc(i, desc) { 163 for_each_irq_desc(i, desc) {
174 raw_spin_lock_irq(&desc->lock); 164 raw_spin_lock_irq(&desc->lock);
175 status = desc->status;
176 165
177 if (status & IRQ_AUTODETECT) { 166 if (desc->istate & IRQS_AUTODETECT) {
178 if (!(status & IRQ_WAITING)) { 167 if (!(desc->istate & IRQS_WAITING)) {
179 if (!nr_of_irqs) 168 if (!nr_of_irqs)
180 irq_found = i; 169 irq_found = i;
181 nr_of_irqs++; 170 nr_of_irqs++;
182 } 171 }
183 desc->status = status & ~IRQ_AUTODETECT; 172 desc->istate &= ~IRQS_AUTODETECT;
184 desc->chip->shutdown(i); 173 irq_shutdown(desc);
185 } 174 }
186 raw_spin_unlock_irq(&desc->lock); 175 raw_spin_unlock_irq(&desc->lock);
187 } 176 }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b7091d5ca2f8..d5a3009da71a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,363 +18,217 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22{
23 struct irq_desc *desc;
24 unsigned long flags;
25
26 desc = irq_to_desc(irq);
27 if (!desc) {
28 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
29 return;
30 }
31
32 /* Ensure we don't have left over values from a previous use of this irq */
33 raw_spin_lock_irqsave(&desc->lock, flags);
34 desc->status = IRQ_DISABLED;
35 desc->chip = &no_irq_chip;
36 desc->handle_irq = handle_bad_irq;
37 desc->depth = 1;
38 desc->msi_desc = NULL;
39 desc->handler_data = NULL;
40 if (!keep_chip_data)
41 desc->chip_data = NULL;
42 desc->action = NULL;
43 desc->irq_count = 0;
44 desc->irqs_unhandled = 0;
45#ifdef CONFIG_SMP
46 cpumask_setall(desc->affinity);
47#ifdef CONFIG_GENERIC_PENDING_IRQ
48 cpumask_clear(desc->pending_mask);
49#endif
50#endif
51 raw_spin_unlock_irqrestore(&desc->lock, flags);
52}
53
54/** 21/**
55 * dynamic_irq_init - initialize a dynamically allocated irq 22 * irq_set_chip - set the irq chip for an irq
56 * @irq: irq number to initialize
57 */
58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
75{
76 struct irq_desc *desc = irq_to_desc(irq);
77 unsigned long flags;
78
79 if (!desc) {
80 WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
81 return;
82 }
83
84 raw_spin_lock_irqsave(&desc->lock, flags);
85 if (desc->action) {
86 raw_spin_unlock_irqrestore(&desc->lock, flags);
87 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
88 irq);
89 return;
90 }
91 desc->msi_desc = NULL;
92 desc->handler_data = NULL;
93 if (!keep_chip_data)
94 desc->chip_data = NULL;
95 desc->handle_irq = handle_bad_irq;
96 desc->chip = &no_irq_chip;
97 desc->name = NULL;
98 clear_kstat_irqs(desc);
99 raw_spin_unlock_irqrestore(&desc->lock, flags);
100}
101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to initialize
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to initialize
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
120}
121
122
123/**
124 * set_irq_chip - set the irq chip for an irq
125 * @irq: irq number 23 * @irq: irq number
126 * @chip: pointer to irq chip description structure 24 * @chip: pointer to irq chip description structure
127 */ 25 */
128int set_irq_chip(unsigned int irq, struct irq_chip *chip) 26int irq_set_chip(unsigned int irq, struct irq_chip *chip)
129{ 27{
130 struct irq_desc *desc = irq_to_desc(irq);
131 unsigned long flags; 28 unsigned long flags;
29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
132 30
133 if (!desc) { 31 if (!desc)
134 WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
135 return -EINVAL; 32 return -EINVAL;
136 }
137 33
138 if (!chip) 34 if (!chip)
139 chip = &no_irq_chip; 35 chip = &no_irq_chip;
140 36
141 raw_spin_lock_irqsave(&desc->lock, flags); 37 desc->irq_data.chip = chip;
142 irq_chip_set_defaults(chip); 38 irq_put_desc_unlock(desc, flags);
143 desc->chip = chip; 39 /*
144 raw_spin_unlock_irqrestore(&desc->lock, flags); 40 * For !CONFIG_SPARSE_IRQ make the irq show up in
145 41 * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is
42 * already marked, and this call is harmless.
43 */
44 irq_reserve_irq(irq);
146 return 0; 45 return 0;
147} 46}
148EXPORT_SYMBOL(set_irq_chip); 47EXPORT_SYMBOL(irq_set_chip);
149 48
150/** 49/**
151 * set_irq_type - set the irq trigger type for an irq 50 * irq_set_type - set the irq trigger type for an irq
152 * @irq: irq number 51 * @irq: irq number
153 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h 52 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
154 */ 53 */
155int set_irq_type(unsigned int irq, unsigned int type) 54int irq_set_irq_type(unsigned int irq, unsigned int type)
156{ 55{
157 struct irq_desc *desc = irq_to_desc(irq);
158 unsigned long flags; 56 unsigned long flags;
159 int ret = -ENXIO; 57 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
58 int ret = 0;
160 59
161 if (!desc) { 60 if (!desc)
162 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); 61 return -EINVAL;
163 return -ENODEV;
164 }
165 62
166 type &= IRQ_TYPE_SENSE_MASK; 63 type &= IRQ_TYPE_SENSE_MASK;
167 if (type == IRQ_TYPE_NONE) 64 if (type != IRQ_TYPE_NONE)
168 return 0; 65 ret = __irq_set_trigger(desc, irq, type);
169 66 irq_put_desc_busunlock(desc, flags);
170 raw_spin_lock_irqsave(&desc->lock, flags);
171 ret = __irq_set_trigger(desc, irq, type);
172 raw_spin_unlock_irqrestore(&desc->lock, flags);
173 return ret; 67 return ret;
174} 68}
175EXPORT_SYMBOL(set_irq_type); 69EXPORT_SYMBOL(irq_set_irq_type);
176 70
177/** 71/**
178 * set_irq_data - set irq type data for an irq 72 * irq_set_handler_data - set irq handler data for an irq
179 * @irq: Interrupt number 73 * @irq: Interrupt number
180 * @data: Pointer to interrupt specific data 74 * @data: Pointer to interrupt specific data
181 * 75 *
182 * Set the hardware irq controller data for an irq 76 * Set the hardware irq controller data for an irq
183 */ 77 */
184int set_irq_data(unsigned int irq, void *data) 78int irq_set_handler_data(unsigned int irq, void *data)
185{ 79{
186 struct irq_desc *desc = irq_to_desc(irq);
187 unsigned long flags; 80 unsigned long flags;
81 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
188 82
189 if (!desc) { 83 if (!desc)
190 printk(KERN_ERR
191 "Trying to install controller data for IRQ%d\n", irq);
192 return -EINVAL; 84 return -EINVAL;
193 } 85 desc->irq_data.handler_data = data;
194 86 irq_put_desc_unlock(desc, flags);
195 raw_spin_lock_irqsave(&desc->lock, flags);
196 desc->handler_data = data;
197 raw_spin_unlock_irqrestore(&desc->lock, flags);
198 return 0; 87 return 0;
199} 88}
200EXPORT_SYMBOL(set_irq_data); 89EXPORT_SYMBOL(irq_set_handler_data);
201 90
202/** 91/**
203 * set_irq_msi - set MSI descriptor data for an irq 92 * irq_set_msi_desc - set MSI descriptor data for an irq
204 * @irq: Interrupt number 93 * @irq: Interrupt number
205 * @entry: Pointer to MSI descriptor data 94 * @entry: Pointer to MSI descriptor data
206 * 95 *
207 * Set the MSI descriptor entry for an irq 96 * Set the MSI descriptor entry for an irq
208 */ 97 */
209int set_irq_msi(unsigned int irq, struct msi_desc *entry) 98int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
210{ 99{
211 struct irq_desc *desc = irq_to_desc(irq);
212 unsigned long flags; 100 unsigned long flags;
101 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
213 102
214 if (!desc) { 103 if (!desc)
215 printk(KERN_ERR
216 "Trying to install msi data for IRQ%d\n", irq);
217 return -EINVAL; 104 return -EINVAL;
218 } 105 desc->irq_data.msi_desc = entry;
219
220 raw_spin_lock_irqsave(&desc->lock, flags);
221 desc->msi_desc = entry;
222 if (entry) 106 if (entry)
223 entry->irq = irq; 107 entry->irq = irq;
224 raw_spin_unlock_irqrestore(&desc->lock, flags); 108 irq_put_desc_unlock(desc, flags);
225 return 0; 109 return 0;
226} 110}
227 111
228/** 112/**
229 * set_irq_chip_data - set irq chip data for an irq 113 * irq_set_chip_data - set irq chip data for an irq
230 * @irq: Interrupt number 114 * @irq: Interrupt number
231 * @data: Pointer to chip specific data 115 * @data: Pointer to chip specific data
232 * 116 *
233 * Set the hardware irq chip data for an irq 117 * Set the hardware irq chip data for an irq
234 */ 118 */
235int set_irq_chip_data(unsigned int irq, void *data) 119int irq_set_chip_data(unsigned int irq, void *data)
236{ 120{
237 struct irq_desc *desc = irq_to_desc(irq);
238 unsigned long flags; 121 unsigned long flags;
122 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
239 123
240 if (!desc) { 124 if (!desc)
241 printk(KERN_ERR
242 "Trying to install chip data for IRQ%d\n", irq);
243 return -EINVAL;
244 }
245
246 if (!desc->chip) {
247 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
248 return -EINVAL; 125 return -EINVAL;
249 } 126 desc->irq_data.chip_data = data;
250 127 irq_put_desc_unlock(desc, flags);
251 raw_spin_lock_irqsave(&desc->lock, flags);
252 desc->chip_data = data;
253 raw_spin_unlock_irqrestore(&desc->lock, flags);
254
255 return 0; 128 return 0;
256} 129}
257EXPORT_SYMBOL(set_irq_chip_data); 130EXPORT_SYMBOL(irq_set_chip_data);
258 131
259/** 132struct irq_data *irq_get_irq_data(unsigned int irq)
260 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
261 *
262 * @irq: Interrupt number
263 * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
264 *
265 * The IRQ_NESTED_THREAD flag indicates that on
266 * request_threaded_irq() no separate interrupt thread should be
267 * created for the irq as the handler are called nested in the
268 * context of a demultiplexing interrupt handler thread.
269 */
270void set_irq_nested_thread(unsigned int irq, int nest)
271{ 133{
272 struct irq_desc *desc = irq_to_desc(irq); 134 struct irq_desc *desc = irq_to_desc(irq);
273 unsigned long flags;
274 135
275 if (!desc) 136 return desc ? &desc->irq_data : NULL;
276 return; 137}
138EXPORT_SYMBOL_GPL(irq_get_irq_data);
277 139
278 raw_spin_lock_irqsave(&desc->lock, flags); 140static void irq_state_clr_disabled(struct irq_desc *desc)
279 if (nest) 141{
280 desc->status |= IRQ_NESTED_THREAD; 142 irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED);
281 else
282 desc->status &= ~IRQ_NESTED_THREAD;
283 raw_spin_unlock_irqrestore(&desc->lock, flags);
284} 143}
285EXPORT_SYMBOL_GPL(set_irq_nested_thread);
286 144
287/* 145static void irq_state_set_disabled(struct irq_desc *desc)
288 * default enable function
289 */
290static void default_enable(unsigned int irq)
291{ 146{
292 struct irq_desc *desc = irq_to_desc(irq); 147 irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
148}
293 149
294 desc->chip->unmask(irq); 150static void irq_state_clr_masked(struct irq_desc *desc)
295 desc->status &= ~IRQ_MASKED; 151{
152 irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED);
296} 153}
297 154
298/* 155static void irq_state_set_masked(struct irq_desc *desc)
299 * default disable function
300 */
301static void default_disable(unsigned int irq)
302{ 156{
157 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
303} 158}
304 159
305/* 160int irq_startup(struct irq_desc *desc)
306 * default startup function
307 */
308static unsigned int default_startup(unsigned int irq)
309{ 161{
310 struct irq_desc *desc = irq_to_desc(irq); 162 irq_state_clr_disabled(desc);
163 desc->depth = 0;
311 164
312 desc->chip->enable(irq); 165 if (desc->irq_data.chip->irq_startup) {
166 int ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
167 irq_state_clr_masked(desc);
168 return ret;
169 }
170
171 irq_enable(desc);
313 return 0; 172 return 0;
314} 173}
315 174
316/* 175void irq_shutdown(struct irq_desc *desc)
317 * default shutdown function
318 */
319static void default_shutdown(unsigned int irq)
320{ 176{
321 struct irq_desc *desc = irq_to_desc(irq); 177 irq_state_set_disabled(desc);
178 desc->depth = 1;
179 if (desc->irq_data.chip->irq_shutdown)
180 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
181 if (desc->irq_data.chip->irq_disable)
182 desc->irq_data.chip->irq_disable(&desc->irq_data);
183 else
184 desc->irq_data.chip->irq_mask(&desc->irq_data);
185 irq_state_set_masked(desc);
186}
322 187
323 desc->chip->mask(irq); 188void irq_enable(struct irq_desc *desc)
324 desc->status |= IRQ_MASKED; 189{
190 irq_state_clr_disabled(desc);
191 if (desc->irq_data.chip->irq_enable)
192 desc->irq_data.chip->irq_enable(&desc->irq_data);
193 else
194 desc->irq_data.chip->irq_unmask(&desc->irq_data);
195 irq_state_clr_masked(desc);
325} 196}
326 197
327/* 198void irq_disable(struct irq_desc *desc)
328 * Fixup enable/disable function pointers
329 */
330void irq_chip_set_defaults(struct irq_chip *chip)
331{ 199{
332 if (!chip->enable) 200 irq_state_set_disabled(desc);
333 chip->enable = default_enable; 201 if (desc->irq_data.chip->irq_disable) {
334 if (!chip->disable) 202 desc->irq_data.chip->irq_disable(&desc->irq_data);
335 chip->disable = default_disable; 203 irq_state_set_masked(desc);
336 if (!chip->startup) 204 }
337 chip->startup = default_startup;
338 /*
339 * We use chip->disable, when the user provided its own. When
340 * we have default_disable set for chip->disable, then we need
341 * to use default_shutdown, otherwise the irq line is not
342 * disabled on free_irq():
343 */
344 if (!chip->shutdown)
345 chip->shutdown = chip->disable != default_disable ?
346 chip->disable : default_shutdown;
347 if (!chip->name)
348 chip->name = chip->typename;
349 if (!chip->end)
350 chip->end = dummy_irq_chip.end;
351} 205}
352 206
353static inline void mask_ack_irq(struct irq_desc *desc, int irq) 207static inline void mask_ack_irq(struct irq_desc *desc)
354{ 208{
355 if (desc->chip->mask_ack) 209 if (desc->irq_data.chip->irq_mask_ack)
356 desc->chip->mask_ack(irq); 210 desc->irq_data.chip->irq_mask_ack(&desc->irq_data);
357 else { 211 else {
358 desc->chip->mask(irq); 212 desc->irq_data.chip->irq_mask(&desc->irq_data);
359 if (desc->chip->ack) 213 if (desc->irq_data.chip->irq_ack)
360 desc->chip->ack(irq); 214 desc->irq_data.chip->irq_ack(&desc->irq_data);
361 } 215 }
362 desc->status |= IRQ_MASKED; 216 irq_state_set_masked(desc);
363} 217}
364 218
365static inline void mask_irq(struct irq_desc *desc, int irq) 219void mask_irq(struct irq_desc *desc)
366{ 220{
367 if (desc->chip->mask) { 221 if (desc->irq_data.chip->irq_mask) {
368 desc->chip->mask(irq); 222 desc->irq_data.chip->irq_mask(&desc->irq_data);
369 desc->status |= IRQ_MASKED; 223 irq_state_set_masked(desc);
370 } 224 }
371} 225}
372 226
373static inline void unmask_irq(struct irq_desc *desc, int irq) 227void unmask_irq(struct irq_desc *desc)
374{ 228{
375 if (desc->chip->unmask) { 229 if (desc->irq_data.chip->irq_unmask) {
376 desc->chip->unmask(irq); 230 desc->irq_data.chip->irq_unmask(&desc->irq_data);
377 desc->status &= ~IRQ_MASKED; 231 irq_state_clr_masked(desc);
378 } 232 }
379} 233}
380 234
@@ -399,10 +253,10 @@ void handle_nested_irq(unsigned int irq)
399 kstat_incr_irqs_this_cpu(irq, desc); 253 kstat_incr_irqs_this_cpu(irq, desc);
400 254
401 action = desc->action; 255 action = desc->action;
402 if (unlikely(!action || (desc->status & IRQ_DISABLED))) 256 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data)))
403 goto out_unlock; 257 goto out_unlock;
404 258
405 desc->status |= IRQ_INPROGRESS; 259 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
406 raw_spin_unlock_irq(&desc->lock); 260 raw_spin_unlock_irq(&desc->lock);
407 261
408 action_ret = action->thread_fn(action->irq, action->dev_id); 262 action_ret = action->thread_fn(action->irq, action->dev_id);
@@ -410,13 +264,20 @@ void handle_nested_irq(unsigned int irq)
410 note_interrupt(irq, desc, action_ret); 264 note_interrupt(irq, desc, action_ret);
411 265
412 raw_spin_lock_irq(&desc->lock); 266 raw_spin_lock_irq(&desc->lock);
413 desc->status &= ~IRQ_INPROGRESS; 267 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
414 268
415out_unlock: 269out_unlock:
416 raw_spin_unlock_irq(&desc->lock); 270 raw_spin_unlock_irq(&desc->lock);
417} 271}
418EXPORT_SYMBOL_GPL(handle_nested_irq); 272EXPORT_SYMBOL_GPL(handle_nested_irq);
419 273
274static bool irq_check_poll(struct irq_desc *desc)
275{
276 if (!(desc->istate & IRQS_POLL_INPROGRESS))
277 return false;
278 return irq_wait_for_poll(desc);
279}
280
420/** 281/**
421 * handle_simple_irq - Simple and software-decoded IRQs. 282 * handle_simple_irq - Simple and software-decoded IRQs.
422 * @irq: the interrupt number 283 * @irq: the interrupt number
@@ -432,32 +293,24 @@ EXPORT_SYMBOL_GPL(handle_nested_irq);
432void 293void
433handle_simple_irq(unsigned int irq, struct irq_desc *desc) 294handle_simple_irq(unsigned int irq, struct irq_desc *desc)
434{ 295{
435 struct irqaction *action;
436 irqreturn_t action_ret;
437
438 raw_spin_lock(&desc->lock); 296 raw_spin_lock(&desc->lock);
439 297
440 if (unlikely(desc->status & IRQ_INPROGRESS)) 298 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
441 goto out_unlock; 299 if (!irq_check_poll(desc))
442 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 300 goto out_unlock;
301
302 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
443 kstat_incr_irqs_this_cpu(irq, desc); 303 kstat_incr_irqs_this_cpu(irq, desc);
444 304
445 action = desc->action; 305 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
446 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
447 goto out_unlock; 306 goto out_unlock;
448 307
449 desc->status |= IRQ_INPROGRESS; 308 handle_irq_event(desc);
450 raw_spin_unlock(&desc->lock);
451 309
452 action_ret = handle_IRQ_event(irq, action);
453 if (!noirqdebug)
454 note_interrupt(irq, desc, action_ret);
455
456 raw_spin_lock(&desc->lock);
457 desc->status &= ~IRQ_INPROGRESS;
458out_unlock: 310out_unlock:
459 raw_spin_unlock(&desc->lock); 311 raw_spin_unlock(&desc->lock);
460} 312}
313EXPORT_SYMBOL_GPL(handle_simple_irq);
461 314
462/** 315/**
463 * handle_level_irq - Level type irq handler 316 * handle_level_irq - Level type irq handler
@@ -472,42 +325,42 @@ out_unlock:
472void 325void
473handle_level_irq(unsigned int irq, struct irq_desc *desc) 326handle_level_irq(unsigned int irq, struct irq_desc *desc)
474{ 327{
475 struct irqaction *action;
476 irqreturn_t action_ret;
477
478 raw_spin_lock(&desc->lock); 328 raw_spin_lock(&desc->lock);
479 mask_ack_irq(desc, irq); 329 mask_ack_irq(desc);
480 330
481 if (unlikely(desc->status & IRQ_INPROGRESS)) 331 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
482 goto out_unlock; 332 if (!irq_check_poll(desc))
483 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 333 goto out_unlock;
334
335 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
484 kstat_incr_irqs_this_cpu(irq, desc); 336 kstat_incr_irqs_this_cpu(irq, desc);
485 337
486 /* 338 /*
487 * If its disabled or no action available 339 * If its disabled or no action available
488 * keep it masked and get out of here 340 * keep it masked and get out of here
489 */ 341 */
490 action = desc->action; 342 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
491 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
492 goto out_unlock; 343 goto out_unlock;
493 344
494 desc->status |= IRQ_INPROGRESS; 345 handle_irq_event(desc);
495 raw_spin_unlock(&desc->lock);
496
497 action_ret = handle_IRQ_event(irq, action);
498 if (!noirqdebug)
499 note_interrupt(irq, desc, action_ret);
500
501 raw_spin_lock(&desc->lock);
502 desc->status &= ~IRQ_INPROGRESS;
503 346
504 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) 347 if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT))
505 unmask_irq(desc, irq); 348 unmask_irq(desc);
506out_unlock: 349out_unlock:
507 raw_spin_unlock(&desc->lock); 350 raw_spin_unlock(&desc->lock);
508} 351}
509EXPORT_SYMBOL_GPL(handle_level_irq); 352EXPORT_SYMBOL_GPL(handle_level_irq);
510 353
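handle_level_irq() and the other flow handlers in this file are installed per interrupt by the platform or driver that knows the trigger type. A hedged sketch of that wiring, using irq_set_chip_and_handler_name() and irq_modify_status() defined later in this file; my_map_level_irq and my_chip are illustrative names:

#include <linux/irq.h>

/* Sketch only: my_map_level_irq and my_chip are illustrative. */
static void my_map_level_irq(unsigned int irq, struct irq_chip *my_chip)
{
        irq_set_chip_and_handler_name(irq, my_chip, handle_level_irq, "level");
        /* Clear IRQ_NOREQUEST so drivers may request_irq() this line. */
        irq_modify_status(irq, IRQ_NOREQUEST, 0);
}
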
354#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
355static inline void preflow_handler(struct irq_desc *desc)
356{
357 if (desc->preflow_handler)
358 desc->preflow_handler(&desc->irq_data);
359}
360#else
361static inline void preflow_handler(struct irq_desc *desc) { }
362#endif
363
511/** 364/**
512 * handle_fasteoi_irq - irq handler for transparent controllers 365 * handle_fasteoi_irq - irq handler for transparent controllers
513 * @irq: the interrupt number 366 * @irq: the interrupt number
@@ -521,42 +374,40 @@ EXPORT_SYMBOL_GPL(handle_level_irq);
521void 374void
522handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) 375handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
523{ 376{
524 struct irqaction *action;
525 irqreturn_t action_ret;
526
527 raw_spin_lock(&desc->lock); 377 raw_spin_lock(&desc->lock);
528 378
529 if (unlikely(desc->status & IRQ_INPROGRESS)) 379 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
530 goto out; 380 if (!irq_check_poll(desc))
381 goto out;
531 382
532 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 383 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
533 kstat_incr_irqs_this_cpu(irq, desc); 384 kstat_incr_irqs_this_cpu(irq, desc);
534 385
535 /* 386 /*
536 * If its disabled or no action available 387 * If its disabled or no action available
537 * then mask it and get out of here: 388 * then mask it and get out of here:
538 */ 389 */
539 action = desc->action; 390 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
540 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 391 desc->istate |= IRQS_PENDING;
541 desc->status |= IRQ_PENDING; 392 mask_irq(desc);
542 mask_irq(desc, irq);
543 goto out; 393 goto out;
544 } 394 }
545 395
546 desc->status |= IRQ_INPROGRESS; 396 if (desc->istate & IRQS_ONESHOT)
547 desc->status &= ~IRQ_PENDING; 397 mask_irq(desc);
548 raw_spin_unlock(&desc->lock);
549
550 action_ret = handle_IRQ_event(irq, action);
551 if (!noirqdebug)
552 note_interrupt(irq, desc, action_ret);
553 398
554 raw_spin_lock(&desc->lock); 399 preflow_handler(desc);
555 desc->status &= ~IRQ_INPROGRESS; 400 handle_irq_event(desc);
556out:
557 desc->chip->eoi(irq);
558 401
402out_eoi:
403 desc->irq_data.chip->irq_eoi(&desc->irq_data);
404out_unlock:
559 raw_spin_unlock(&desc->lock); 405 raw_spin_unlock(&desc->lock);
406 return;
407out:
408 if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED))
409 goto out_eoi;
410 goto out_unlock;
560} 411}
561 412
562/** 413/**
@@ -565,7 +416,7 @@ out:
565 * @desc: the interrupt description structure for this irq 416 * @desc: the interrupt description structure for this irq
566 * 417 *
567 * Interrupt occures on the falling and/or rising edge of a hardware 418 * Interrupt occures on the falling and/or rising edge of a hardware
568 * signal. The occurence is latched into the irq controller hardware 419 * signal. The occurrence is latched into the irq controller hardware
569 * and must be acked in order to be reenabled. After the ack another 420 * and must be acked in order to be reenabled. After the ack another
570 * interrupt can happen on the same source even before the first one 421 * interrupt can happen on the same source even before the first one
571 * is handled by the associated event handler. If this happens it 422 * is handled by the associated event handler. If this happens it
@@ -580,34 +431,28 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
580{ 431{
581 raw_spin_lock(&desc->lock); 432 raw_spin_lock(&desc->lock);
582 433
583 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 434 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
584
585 /* 435 /*
586 * If we're currently running this IRQ, or its disabled, 436 * If we're currently running this IRQ, or its disabled,
587 * we shouldn't process the IRQ. Mark it pending, handle 437 * we shouldn't process the IRQ. Mark it pending, handle
588 * the necessary masking and go out 438 * the necessary masking and go out
589 */ 439 */
590 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || 440 if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
591 !desc->action)) { 441 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
592 desc->status |= (IRQ_PENDING | IRQ_MASKED); 442 if (!irq_check_poll(desc)) {
593 mask_ack_irq(desc, irq); 443 desc->istate |= IRQS_PENDING;
594 goto out_unlock; 444 mask_ack_irq(desc);
445 goto out_unlock;
446 }
595 } 447 }
596 kstat_incr_irqs_this_cpu(irq, desc); 448 kstat_incr_irqs_this_cpu(irq, desc);
597 449
598 /* Start handling the irq */ 450 /* Start handling the irq */
599 if (desc->chip->ack) 451 desc->irq_data.chip->irq_ack(&desc->irq_data);
600 desc->chip->ack(irq);
601
602 /* Mark the IRQ currently in progress.*/
603 desc->status |= IRQ_INPROGRESS;
604 452
605 do { 453 do {
606 struct irqaction *action = desc->action; 454 if (unlikely(!desc->action)) {
607 irqreturn_t action_ret; 455 mask_irq(desc);
608
609 if (unlikely(!action)) {
610 mask_irq(desc, irq);
611 goto out_unlock; 456 goto out_unlock;
612 } 457 }
613 458
@@ -616,26 +461,66 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
616 * one, we could have masked the irq. 461 * one, we could have masked the irq.
617 * Renable it, if it was not disabled in meantime. 462 * Renable it, if it was not disabled in meantime.
618 */ 463 */
619 if (unlikely((desc->status & 464 if (unlikely(desc->istate & IRQS_PENDING)) {
620 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 465 if (!irqd_irq_disabled(&desc->irq_data) &&
621 (IRQ_PENDING | IRQ_MASKED))) { 466 irqd_irq_masked(&desc->irq_data))
622 unmask_irq(desc, irq); 467 unmask_irq(desc);
623 } 468 }
624 469
625 desc->status &= ~IRQ_PENDING; 470 handle_irq_event(desc);
626 raw_spin_unlock(&desc->lock);
627 action_ret = handle_IRQ_event(irq, action);
628 if (!noirqdebug)
629 note_interrupt(irq, desc, action_ret);
630 raw_spin_lock(&desc->lock);
631 471
632 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); 472 } while ((desc->istate & IRQS_PENDING) &&
473 !irqd_irq_disabled(&desc->irq_data));
633 474
634 desc->status &= ~IRQ_INPROGRESS;
635out_unlock: 475out_unlock:
636 raw_spin_unlock(&desc->lock); 476 raw_spin_unlock(&desc->lock);
637} 477}
638 478
479#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
480/**
481 * handle_edge_eoi_irq - edge eoi type IRQ handler
482 * @irq: the interrupt number
483 * @desc: the interrupt description structure for this irq
484 *
485 * Similar to the above handle_edge_irq, but using eoi and w/o the
486 * mask/unmask logic.
487 */
488void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
489{
490 struct irq_chip *chip = irq_desc_get_chip(desc);
491
492 raw_spin_lock(&desc->lock);
493
494 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
495 /*
496 * If we're currently running this IRQ, or it's disabled,
497 * we shouldn't process the IRQ. Mark it pending, handle
498 * the necessary masking and go out
499 */
500 if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
501 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
502 if (!irq_check_poll(desc)) {
503 desc->istate |= IRQS_PENDING;
504 goto out_eoi;
505 }
506 }
507 kstat_incr_irqs_this_cpu(irq, desc);
508
509 do {
510 if (unlikely(!desc->action))
511 goto out_eoi;
512
513 handle_irq_event(desc);
514
515 } while ((desc->istate & IRQS_PENDING) &&
516 !irqd_irq_disabled(&desc->irq_data));
517
518out_eoi:
519 chip->irq_eoi(&desc->irq_data);
520 raw_spin_unlock(&desc->lock);
521}
522#endif
523
639/** 524/**
640 * handle_percpu_irq - Per CPU local irq handler 525 * handle_percpu_irq - Per CPU local irq handler
641 * @irq: the interrupt number 526 * @irq: the interrupt number
@@ -646,115 +531,147 @@ out_unlock:
646void 531void
647handle_percpu_irq(unsigned int irq, struct irq_desc *desc) 532handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
648{ 533{
649 irqreturn_t action_ret; 534 struct irq_chip *chip = irq_desc_get_chip(desc);
650 535
651 kstat_incr_irqs_this_cpu(irq, desc); 536 kstat_incr_irqs_this_cpu(irq, desc);
652 537
653 if (desc->chip->ack) 538 if (chip->irq_ack)
654 desc->chip->ack(irq); 539 chip->irq_ack(&desc->irq_data);
655 540
656 action_ret = handle_IRQ_event(irq, desc->action); 541 handle_irq_event_percpu(desc, desc->action);
657 if (!noirqdebug)
658 note_interrupt(irq, desc, action_ret);
659 542
660 if (desc->chip->eoi) 543 if (chip->irq_eoi)
661 desc->chip->eoi(irq); 544 chip->irq_eoi(&desc->irq_data);
662} 545}
663 546
664void 547void
665__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 548__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
666 const char *name) 549 const char *name)
667{ 550{
668 struct irq_desc *desc = irq_to_desc(irq);
669 unsigned long flags; 551 unsigned long flags;
552 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
670 553
671 if (!desc) { 554 if (!desc)
672 printk(KERN_ERR
673 "Trying to install type control for IRQ%d\n", irq);
674 return; 555 return;
675 }
676 556
677 if (!handle) 557 if (!handle) {
678 handle = handle_bad_irq; 558 handle = handle_bad_irq;
679 else if (desc->chip == &no_irq_chip) { 559 } else {
680 printk(KERN_WARNING "Trying to install %sinterrupt handler " 560 if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
681 "for IRQ%d\n", is_chained ? "chained " : "", irq); 561 goto out;
682 /*
683 * Some ARM implementations install a handler for really dumb
684 * interrupt hardware without setting an irq_chip. This worked
685 * with the ARM no_irq_chip but the check in setup_irq would
686 * prevent us to setup the interrupt at all. Switch it to
687 * dummy_irq_chip for easy transition.
688 */
689 desc->chip = &dummy_irq_chip;
690 } 562 }
691 563
692 chip_bus_lock(irq, desc);
693 raw_spin_lock_irqsave(&desc->lock, flags);
694
695 /* Uninstall? */ 564 /* Uninstall? */
696 if (handle == handle_bad_irq) { 565 if (handle == handle_bad_irq) {
697 if (desc->chip != &no_irq_chip) 566 if (desc->irq_data.chip != &no_irq_chip)
698 mask_ack_irq(desc, irq); 567 mask_ack_irq(desc);
699 desc->status |= IRQ_DISABLED; 568 irq_state_set_disabled(desc);
700 desc->depth = 1; 569 desc->depth = 1;
701 } 570 }
702 desc->handle_irq = handle; 571 desc->handle_irq = handle;
703 desc->name = name; 572 desc->name = name;
704 573
705 if (handle != handle_bad_irq && is_chained) { 574 if (handle != handle_bad_irq && is_chained) {
706 desc->status &= ~IRQ_DISABLED; 575 irq_settings_set_noprobe(desc);
707 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; 576 irq_settings_set_norequest(desc);
708 desc->depth = 0; 577 irq_settings_set_nothread(desc);
709 desc->chip->startup(irq); 578 irq_startup(desc);
710 } 579 }
711 raw_spin_unlock_irqrestore(&desc->lock, flags); 580out:
712 chip_bus_sync_unlock(irq, desc); 581 irq_put_desc_busunlock(desc, flags);
713} 582}
714EXPORT_SYMBOL_GPL(__set_irq_handler); 583EXPORT_SYMBOL_GPL(__irq_set_handler);
715 584
716void 585void
717set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, 586irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
718 irq_flow_handler_t handle) 587 irq_flow_handler_t handle, const char *name)
719{ 588{
720 set_irq_chip(irq, chip); 589 irq_set_chip(irq, chip);
721 __set_irq_handler(irq, handle, 0, NULL); 590 __irq_set_handler(irq, handle, 0, name);
722} 591}
723 592
724void 593void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
725set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
726 irq_flow_handler_t handle, const char *name)
727{ 594{
728 set_irq_chip(irq, chip); 595 unsigned long flags;
729 __set_irq_handler(irq, handle, 0, name); 596 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
597
598 if (!desc)
599 return;
600 irq_settings_clr_and_set(desc, clr, set);
601
602 irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
603 IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
604 if (irq_settings_has_no_balance_set(desc))
605 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
606 if (irq_settings_is_per_cpu(desc))
607 irqd_set(&desc->irq_data, IRQD_PER_CPU);
608 if (irq_settings_can_move_pcntxt(desc))
609 irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
610 if (irq_settings_is_level(desc))
611 irqd_set(&desc->irq_data, IRQD_LEVEL);
612
613 irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
614
615 irq_put_desc_unlock(desc, flags);
730} 616}
617EXPORT_SYMBOL_GPL(irq_modify_status);
731 618
732void set_irq_noprobe(unsigned int irq) 619/**
620 * irq_cpu_online - Invoke all irq_cpu_online functions.
621 *
622 * Iterate through all irqs and invoke the chip.irq_cpu_online()
623 * for each.
624 */
625void irq_cpu_online(void)
733{ 626{
734 struct irq_desc *desc = irq_to_desc(irq); 627 struct irq_desc *desc;
628 struct irq_chip *chip;
735 unsigned long flags; 629 unsigned long flags;
630 unsigned int irq;
736 631
737 if (!desc) { 632 for_each_active_irq(irq) {
738 printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); 633 desc = irq_to_desc(irq);
739 return; 634 if (!desc)
740 } 635 continue;
636
637 raw_spin_lock_irqsave(&desc->lock, flags);
741 638
742 raw_spin_lock_irqsave(&desc->lock, flags); 639 chip = irq_data_get_irq_chip(&desc->irq_data);
743 desc->status |= IRQ_NOPROBE; 640 if (chip && chip->irq_cpu_online &&
744 raw_spin_unlock_irqrestore(&desc->lock, flags); 641 (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
642 !irqd_irq_disabled(&desc->irq_data)))
643 chip->irq_cpu_online(&desc->irq_data);
644
645 raw_spin_unlock_irqrestore(&desc->lock, flags);
646 }
745} 647}
746 648
747void set_irq_probe(unsigned int irq) 649/**
650 * irq_cpu_offline - Invoke all irq_cpu_offline functions.
651 *
652 * Iterate through all irqs and invoke the chip.irq_cpu_offline()
653 * for each.
654 */
655void irq_cpu_offline(void)
748{ 656{
749 struct irq_desc *desc = irq_to_desc(irq); 657 struct irq_desc *desc;
658 struct irq_chip *chip;
750 unsigned long flags; 659 unsigned long flags;
660 unsigned int irq;
751 661
752 if (!desc) { 662 for_each_active_irq(irq) {
753 printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); 663 desc = irq_to_desc(irq);
754 return; 664 if (!desc)
755 } 665 continue;
666
667 raw_spin_lock_irqsave(&desc->lock, flags);
756 668
757 raw_spin_lock_irqsave(&desc->lock, flags); 669 chip = irq_data_get_irq_chip(&desc->irq_data);
758 desc->status &= ~IRQ_NOPROBE; 670 if (chip && chip->irq_cpu_offline &&
759 raw_spin_unlock_irqrestore(&desc->lock, flags); 671 (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
672 !irqd_irq_disabled(&desc->irq_data)))
673 chip->irq_cpu_offline(&desc->irq_data);
674
675 raw_spin_unlock_irqrestore(&desc->lock, flags);
676 }
760} 677}
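
The file above moves every chip callback from the old unsigned-int-irq prototypes (chip->mask(irq), chip->ack(irq)) to callbacks that receive a struct irq_data pointer. A minimal sketch of what a converted driver-side irq_chip looks like against this interface; the register layout and all my_*/MY_* names are placeholders, not from this commit:

#include <linux/io.h>
#include <linux/irq.h>

/* Illustrative register layout; not from this commit. */
#define MY_MASK_SET     0x00            /* write 1 to mask a line */
#define MY_MASK_CLR     0x04            /* write 1 to unmask a line */

static void __iomem *my_base;           /* mapped controller registers */

static void my_irq_mask(struct irq_data *d)
{
        writel(1 << (d->irq & 31), my_base + MY_MASK_SET);
}

static void my_irq_unmask(struct irq_data *d)
{
        writel(1 << (d->irq & 31), my_base + MY_MASK_CLR);
}

static struct irq_chip my_irq_chip = {
        .name           = "my-intc",
        .irq_mask       = my_irq_mask,
        .irq_unmask     = my_irq_unmask,
};
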
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
new file mode 100644
index 000000000000..97a8bfadc88a
--- /dev/null
+++ b/kernel/irq/debug.h
@@ -0,0 +1,45 @@
1/*
2 * Debugging printout:
3 */
4
5#include <linux/kallsyms.h>
6
7#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
8#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
9/* FIXME */
10#define PD(f) do { } while (0)
11
12static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
13{
14 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
15 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
16 printk("->handle_irq(): %p, ", desc->handle_irq);
17 print_symbol("%s\n", (unsigned long)desc->handle_irq);
18 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
19 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
20 printk("->action(): %p\n", desc->action);
21 if (desc->action) {
22 printk("->action->handler(): %p, ", desc->action->handler);
23 print_symbol("%s\n", (unsigned long)desc->action->handler);
24 }
25
26 P(IRQ_LEVEL);
27 P(IRQ_PER_CPU);
28 P(IRQ_NOPROBE);
29 P(IRQ_NOREQUEST);
30 P(IRQ_NOTHREAD);
31 P(IRQ_NOAUTOEN);
32
33 PS(IRQS_AUTODETECT);
34 PS(IRQS_REPLAY);
35 PS(IRQS_WAITING);
36 PS(IRQS_PENDING);
37
38 PD(IRQS_INPROGRESS);
39 PD(IRQS_DISABLED);
40 PD(IRQS_MASKED);
41}
42
43#undef P
44#undef PS
45#undef PD
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
new file mode 100644
index 000000000000..b5fcd96c7102
--- /dev/null
+++ b/kernel/irq/dummychip.c
@@ -0,0 +1,59 @@
1/*
2 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
3 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
4 *
5 * This file contains the dummy interrupt chip implementation
6 */
7#include <linux/interrupt.h>
8#include <linux/irq.h>
9
10#include "internals.h"
11
12/*
13 * What should we do if we get a hw irq event on an illegal vector?
14 * Each architecture has to answer this itself.
15 */
16static void ack_bad(struct irq_data *data)
17{
18 struct irq_desc *desc = irq_data_to_desc(data);
19
20 print_irq_desc(data->irq, desc);
21 ack_bad_irq(data->irq);
22}
23
24/*
25 * NOP functions
26 */
27static void noop(struct irq_data *data) { }
28
29static unsigned int noop_ret(struct irq_data *data)
30{
31 return 0;
32}
33
34/*
35 * Generic no controller implementation
36 */
37struct irq_chip no_irq_chip = {
38 .name = "none",
39 .irq_startup = noop_ret,
40 .irq_shutdown = noop,
41 .irq_enable = noop,
42 .irq_disable = noop,
43 .irq_ack = ack_bad,
44};
45
46/*
47 * Generic dummy implementation which can be used for
48 * real dumb interrupt sources
49 */
50struct irq_chip dummy_irq_chip = {
51 .name = "dummy",
52 .irq_startup = noop_ret,
53 .irq_shutdown = noop,
54 .irq_enable = noop,
55 .irq_disable = noop,
56 .irq_ack = noop,
57 .irq_mask = noop,
58 .irq_unmask = noop,
59};
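
dummy_irq_chip is intended for demultiplexed child interrupts whose "controller" has no mask/ack hardware of its own; the parent's chained flow handler does the real work and typically forwards each child via generic_handle_irq(). A hedged sketch of the child-side setup; my_setup_child is illustrative:

#include <linux/irq.h>

/* Sketch only: my_setup_child and child_irq are illustrative. */
static void my_setup_child(unsigned int child_irq)
{
        irq_set_chip_and_handler(child_irq, &dummy_irq_chip, handle_simple_irq);
        /* Allow request_irq() on the demultiplexed child line. */
        irq_modify_status(child_irq, IRQ_NOREQUEST, 0);
}
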
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
new file mode 100644
index 000000000000..3a2cab407b93
--- /dev/null
+++ b/kernel/irq/generic-chip.c
@@ -0,0 +1,368 @@
1/*
2 * Library implementing the most common irq chip callback functions
3 *
4 * Copyright (C) 2011, Thomas Gleixner
5 */
6#include <linux/io.h>
7#include <linux/irq.h>
8#include <linux/slab.h>
9#include <linux/interrupt.h>
10#include <linux/kernel_stat.h>
11#include <linux/syscore_ops.h>
12
13#include "internals.h"
14
15static LIST_HEAD(gc_list);
16static DEFINE_RAW_SPINLOCK(gc_lock);
17
18static inline struct irq_chip_regs *cur_regs(struct irq_data *d)
19{
20 return &container_of(d->chip, struct irq_chip_type, chip)->regs;
21}
22
23/**
24 * irq_gc_noop - NOOP function
25 * @d: irq_data
26 */
27void irq_gc_noop(struct irq_data *d)
28{
29}
30
31/**
32 * irq_gc_mask_disable_reg - Mask chip via disable register
33 * @d: irq_data
34 *
35 * Chip has separate enable/disable registers instead of a single mask
36 * register.
37 */
38void irq_gc_mask_disable_reg(struct irq_data *d)
39{
40 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
41 u32 mask = 1 << (d->irq - gc->irq_base);
42
43 irq_gc_lock(gc);
44 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable);
45 gc->mask_cache &= ~mask;
46 irq_gc_unlock(gc);
47}
48
49/**
50 * irq_gc_mask_set_bit - Mask chip via setting bit in mask register
51 * @d: irq_data
52 *
53 * Chip has a single mask register. Values of this register are cached
54 * and protected by gc->lock
55 */
56void irq_gc_mask_set_bit(struct irq_data *d)
57{
58 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
59 u32 mask = 1 << (d->irq - gc->irq_base);
60
61 irq_gc_lock(gc);
62 gc->mask_cache |= mask;
63 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
64 irq_gc_unlock(gc);
65}
66
67/**
68 * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register
69 * @d: irq_data
70 *
71 * Chip has a single mask register. Values of this register are cached
72 * and protected by gc->lock
73 */
74void irq_gc_mask_clr_bit(struct irq_data *d)
75{
76 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
77 u32 mask = 1 << (d->irq - gc->irq_base);
78
79 irq_gc_lock(gc);
80 gc->mask_cache &= ~mask;
81 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
82 irq_gc_unlock(gc);
83}
84
85/**
86 * irq_gc_unmask_enable_reg - Unmask chip via enable register
87 * @d: irq_data
88 *
89 * Chip has separate enable/disable registers instead of a single mask
90 * register.
91 */
92void irq_gc_unmask_enable_reg(struct irq_data *d)
93{
94 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
95 u32 mask = 1 << (d->irq - gc->irq_base);
96
97 irq_gc_lock(gc);
98 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable);
99 gc->mask_cache |= mask;
100 irq_gc_unlock(gc);
101}
102
103/**
104 * irq_gc_ack_set_bit - Ack pending interrupt via setting bit
105 * @d: irq_data
106 */
107void irq_gc_ack_set_bit(struct irq_data *d)
108{
109 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
110 u32 mask = 1 << (d->irq - gc->irq_base);
111
112 irq_gc_lock(gc);
113 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
114 irq_gc_unlock(gc);
115}
116
117/**
118 * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
119 * @d: irq_data
120 */
121void irq_gc_ack_clr_bit(struct irq_data *d)
122{
123 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
124 u32 mask = ~(1 << (d->irq - gc->irq_base));
125
126 irq_gc_lock(gc);
127 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
128 irq_gc_unlock(gc);
129}
130
131/**
132 * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt
133 * @d: irq_data
134 */
135void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
136{
137 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
138 u32 mask = 1 << (d->irq - gc->irq_base);
139
140 irq_gc_lock(gc);
141 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask);
142 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
143 irq_gc_unlock(gc);
144}
145
146/**
147 * irq_gc_eoi - EOI interrupt
148 * @d: irq_data
149 */
150void irq_gc_eoi(struct irq_data *d)
151{
152 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
153 u32 mask = 1 << (d->irq - gc->irq_base);
154
155 irq_gc_lock(gc);
156 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi);
157 irq_gc_unlock(gc);
158}
159
160/**
161 * irq_gc_set_wake - Set/clr wake bit for an interrupt
162 * @d: irq_data
163 *
164 * For chips where the wake from suspend functionality is not
165 * configured in a separate register and the wakeup active state is
166 * just stored in a bitmask.
167 */
168int irq_gc_set_wake(struct irq_data *d, unsigned int on)
169{
170 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
171 u32 mask = 1 << (d->irq - gc->irq_base);
172
173 if (!(mask & gc->wake_enabled))
174 return -EINVAL;
175
176 irq_gc_lock(gc);
177 if (on)
178 gc->wake_active |= mask;
179 else
180 gc->wake_active &= ~mask;
181 irq_gc_unlock(gc);
182 return 0;
183}
184
185/**
186 * irq_alloc_generic_chip - Allocate a generic chip and initialize it
187 * @name: Name of the irq chip
188 * @num_ct: Number of irq_chip_type instances associated with this chip
189 * @irq_base: Interrupt base nr for this chip
190 * @reg_base: Register base address (virtual)
191 * @handler: Default flow handler associated with this chip
192 *
193 * Returns an initialized irq_chip_generic structure. The chip defaults
194 * to the primary (index 0) irq_chip_type and @handler
195 */
196struct irq_chip_generic *
197irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
198 void __iomem *reg_base, irq_flow_handler_t handler)
199{
200 struct irq_chip_generic *gc;
201 unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
202
203 gc = kzalloc(sz, GFP_KERNEL);
204 if (gc) {
205 raw_spin_lock_init(&gc->lock);
206 gc->num_ct = num_ct;
207 gc->irq_base = irq_base;
208 gc->reg_base = reg_base;
209 gc->chip_types->chip.name = name;
210 gc->chip_types->handler = handler;
211 }
212 return gc;
213}
214
215/*
216 * Separate lockdep class for interrupt chip which can nest irq_desc
217 * lock.
218 */
219static struct lock_class_key irq_nested_lock_class;
220
221/**
222 * irq_setup_generic_chip - Setup a range of interrupts with a generic chip
223 * @gc: Generic irq chip holding all data
224 * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
225 * @flags: Flags for initialization
226 * @clr: IRQ_* bits to clear
227 * @set: IRQ_* bits to set
228 *
229 * Set up max. 32 interrupts starting from gc->irq_base. Note, this
230 * initializes all interrupts to the primary irq_chip_type and its
231 * associated handler.
232 */
233void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
234 enum irq_gc_flags flags, unsigned int clr,
235 unsigned int set)
236{
237 struct irq_chip_type *ct = gc->chip_types;
238 unsigned int i;
239
240 raw_spin_lock(&gc_lock);
241 list_add_tail(&gc->list, &gc_list);
242 raw_spin_unlock(&gc_lock);
243
244 /* Init mask cache ? */
245 if (flags & IRQ_GC_INIT_MASK_CACHE)
246 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
247
248 for (i = gc->irq_base; msk; msk >>= 1, i++) {
249 if (!(msk & 0x01))
250 continue;
251
252 if (flags & IRQ_GC_INIT_NESTED_LOCK)
253 irq_set_lockdep_class(i, &irq_nested_lock_class);
254
255 irq_set_chip_and_handler(i, &ct->chip, ct->handler);
256 irq_set_chip_data(i, gc);
257 irq_modify_status(i, clr, set);
258 }
259 gc->irq_cnt = i - gc->irq_base;
260}
261
262/**
263 * irq_setup_alt_chip - Switch to alternative chip
264 * @d: irq_data for this interrupt
265 * @type: Flow type to be initialized
266 *
267 * Only to be called from chip->irq_set_type() callbacks.
268 */
269int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
270{
271 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
272 struct irq_chip_type *ct = gc->chip_types;
273 unsigned int i;
274
275 for (i = 0; i < gc->num_ct; i++, ct++) {
276 if (ct->type & type) {
277 d->chip = &ct->chip;
278 irq_data_to_desc(d)->handle_irq = ct->handler;
279 return 0;
280 }
281 }
282 return -EINVAL;
283}
284
285/**
286 * irq_remove_generic_chip - Remove a chip
287 * @gc: Generic irq chip holding all data
288 * @msk: Bitmask holding the irqs to remove relative to gc->irq_base
289 * @clr: IRQ_* bits to clear
290 * @set: IRQ_* bits to set
291 *
292 * Remove up to 32 interrupts starting from gc->irq_base.
293 */
294void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
295 unsigned int clr, unsigned int set)
296{
297 unsigned int i = gc->irq_base;
298
299 raw_spin_lock(&gc_lock);
300 list_del(&gc->list);
301 raw_spin_unlock(&gc_lock);
302
303 for (; msk; msk >>= 1, i++) {
304 if (!(msk & 0x01))
305 continue;
306
307 /* Remove handler first. That will mask the irq line */
308 irq_set_handler(i, NULL);
309 irq_set_chip(i, &no_irq_chip);
310 irq_set_chip_data(i, NULL);
311 irq_modify_status(i, clr, set);
312 }
313}
314
315#ifdef CONFIG_PM
316static int irq_gc_suspend(void)
317{
318 struct irq_chip_generic *gc;
319
320 list_for_each_entry(gc, &gc_list, list) {
321 struct irq_chip_type *ct = gc->chip_types;
322
323 if (ct->chip.irq_suspend)
324 ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base));
325 }
326 return 0;
327}
328
329static void irq_gc_resume(void)
330{
331 struct irq_chip_generic *gc;
332
333 list_for_each_entry(gc, &gc_list, list) {
334 struct irq_chip_type *ct = gc->chip_types;
335
336 if (ct->chip.irq_resume)
337 ct->chip.irq_resume(irq_get_irq_data(gc->irq_base));
338 }
339}
340#else
341#define irq_gc_suspend NULL
342#define irq_gc_resume NULL
343#endif
344
345static void irq_gc_shutdown(void)
346{
347 struct irq_chip_generic *gc;
348
349 list_for_each_entry(gc, &gc_list, list) {
350 struct irq_chip_type *ct = gc->chip_types;
351
352 if (ct->chip.irq_pm_shutdown)
353 ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base));
354 }
355}
356
357static struct syscore_ops irq_gc_syscore_ops = {
358 .suspend = irq_gc_suspend,
359 .resume = irq_gc_resume,
360 .shutdown = irq_gc_shutdown,
361};
362
363static int __init irq_gc_init_ops(void)
364{
365 register_syscore_ops(&irq_gc_syscore_ops);
366 return 0;
367}
368device_initcall(irq_gc_init_ops);
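
Putting the pieces of this new library together, a driver typically allocates one generic chip per register block, points the primary irq_chip_type at the irq_gc_* helpers that match its registers, and then sets up the whole block. A sketch under assumed hardware (a single 32-bit mask register plus a write-to-ack register); the register offsets and my_* names are illustrative only:

#include <linux/init.h>
#include <linux/irq.h>

/* Illustrative hardware: one 32-bit mask register and one ack register. */
static void __init my_init_intc(void __iomem *reg_base, unsigned int irq_base)
{
        struct irq_chip_generic *gc;
        struct irq_chip_type *ct;

        gc = irq_alloc_generic_chip("my-intc", 1, irq_base, reg_base,
                                    handle_level_irq);
        if (!gc)
                return;

        ct = gc->chip_types;
        ct->chip.irq_mask       = irq_gc_mask_set_bit;
        ct->chip.irq_unmask     = irq_gc_mask_clr_bit;
        ct->chip.irq_ack        = irq_gc_ack_set_bit;
        ct->regs.mask           = 0x10;         /* illustrative offsets */
        ct->regs.ack            = 0x14;

        /* 32 interrupts, prime the mask cache, make the lines requestable */
        irq_setup_generic_chip(gc, 0xffffffff, IRQ_GC_INIT_MASK_CACHE,
                               IRQ_NOREQUEST, 0);
}
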
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 27e5c6911223..470d08c82bbe 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,24 +11,15 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/sched.h>
15#include <linux/slab.h>
16#include <linux/module.h>
17#include <linux/random.h> 14#include <linux/random.h>
15#include <linux/sched.h>
18#include <linux/interrupt.h> 16#include <linux/interrupt.h>
19#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 18
21#include <linux/hash.h>
22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 19#include <trace/events/irq.h>
24 20
25#include "internals.h" 21#include "internals.h"
26 22
27/*
28 * lockdep: we want to handle all irq_desc locks as a single lock-class:
29 */
30struct lock_class_key irq_desc_lock_class;
31
32/** 23/**
33 * handle_bad_irq - handle spurious and unhandled irqs 24 * handle_bad_irq - handle spurious and unhandled irqs
34 * @irq: the interrupt number 25 * @irq: the interrupt number
@@ -43,304 +34,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
43 ack_bad_irq(irq); 34 ack_bad_irq(irq);
44} 35}
45 36
46#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
47static void __init init_irq_default_affinity(void)
48{
49 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
50 cpumask_setall(irq_default_affinity);
51}
52#else
53static void __init init_irq_default_affinity(void)
54{
55}
56#endif
57
58/*
59 * Linux has a controller-independent interrupt architecture.
60 * Every controller has a 'controller-template', that is used
61 * by the main code to do the right thing. Each driver-visible
62 * interrupt source is transparently wired to the appropriate
63 * controller. Thus drivers need not be aware of the
64 * interrupt-controller.
65 *
66 * The code is designed to be easily extended with new/different
67 * interrupt controllers, without having to do assembly magic or
68 * having to touch the generic code.
69 *
70 * Controller mappings for all interrupt sources:
71 */
72int nr_irqs = NR_IRQS;
73EXPORT_SYMBOL_GPL(nr_irqs);
74
75#ifdef CONFIG_SPARSE_IRQ
76
77static struct irq_desc irq_desc_init = {
78 .irq = -1,
79 .status = IRQ_DISABLED,
80 .chip = &no_irq_chip,
81 .handle_irq = handle_bad_irq,
82 .depth = 1,
83 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
84};
85
86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{
88 void *ptr;
89
90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 GFP_ATOMIC, node);
92
93 /*
94 * don't overwite if can not get new one
95 * init_copy_kstat_irqs() could still use old one
96 */
97 if (ptr) {
98 printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node);
99 desc->kstat_irqs = ptr;
100 }
101}
102
103static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
104{
105 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
106
107 raw_spin_lock_init(&desc->lock);
108 desc->irq = irq;
109#ifdef CONFIG_SMP
110 desc->node = node;
111#endif
112 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
113 init_kstat_irqs(desc, node, nr_cpu_ids);
114 if (!desc->kstat_irqs) {
115 printk(KERN_ERR "can not alloc kstat_irqs\n");
116 BUG_ON(1);
117 }
118 if (!alloc_desc_masks(desc, node, false)) {
119 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
120 BUG_ON(1);
121 }
122 init_desc_masks(desc);
123 arch_init_chip_data(desc, node);
124}
125
126/*
127 * Protect the sparse_irqs:
128 */
129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
130
131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
151
152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
153 [0 ... NR_IRQS_LEGACY-1] = {
154 .irq = -1,
155 .status = IRQ_DISABLED,
156 .chip = &no_irq_chip,
157 .handle_irq = handle_bad_irq,
158 .depth = 1,
159 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
160 }
161};
162
163static unsigned int *kstat_irqs_legacy;
164
165int __init early_irq_init(void)
166{
167 struct irq_desc *desc;
168 int legacy_count;
169 int node;
170 int i;
171
172 init_irq_default_affinity();
173
174 /* initialize nr_irqs based on nr_cpu_ids */
175 arch_probe_nr_irqs();
176 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
177
178 desc = irq_desc_legacy;
179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
180 node = first_online_node;
181
182 /* allocate based on nr_cpu_ids */
183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
184 sizeof(int), GFP_NOWAIT, node);
185
186 for (i = 0; i < legacy_count; i++) {
187 desc[i].irq = i;
188#ifdef CONFIG_SMP
189 desc[i].node = node;
190#endif
191 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
193 alloc_desc_masks(&desc[i], node, true);
194 init_desc_masks(&desc[i]);
195 set_irq_desc(i, &desc[i]);
196 }
197
198 return arch_early_irq_init();
199}
200
201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
202{
203 struct irq_desc *desc;
204 unsigned long flags;
205
206 if (irq >= nr_irqs) {
207 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
208 irq, nr_irqs);
209 return NULL;
210 }
211
212 desc = irq_to_desc(irq);
213 if (desc)
214 return desc;
215
216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
217
218 /* We have to check it to avoid races with another CPU */
219 desc = irq_to_desc(irq);
220 if (desc)
221 goto out_unlock;
222
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224
225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
226 if (!desc) {
227 printk(KERN_ERR "can not alloc irq_desc\n");
228 BUG_ON(1);
229 }
230 init_one_irq_desc(irq, desc, node);
231
232 set_irq_desc(irq, desc);
233
234out_unlock:
235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
236
237 return desc;
238}
239
240#else /* !CONFIG_SPARSE_IRQ */
241
242struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
243 [0 ... NR_IRQS-1] = {
244 .status = IRQ_DISABLED,
245 .chip = &no_irq_chip,
246 .handle_irq = handle_bad_irq,
247 .depth = 1,
248 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
249 }
250};
251
252static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
253int __init early_irq_init(void)
254{
255 struct irq_desc *desc;
256 int count;
257 int i;
258
259 init_irq_default_affinity();
260
261 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
262
263 desc = irq_desc;
264 count = ARRAY_SIZE(irq_desc);
265
266 for (i = 0; i < count; i++) {
267 desc[i].irq = i;
268 alloc_desc_masks(&desc[i], 0, true);
269 init_desc_masks(&desc[i]);
270 desc[i].kstat_irqs = kstat_irqs_all[i];
271 }
272 return arch_early_irq_init();
273}
274
275struct irq_desc *irq_to_desc(unsigned int irq)
276{
277 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
278}
279
280struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
281{
282 return irq_to_desc(irq);
283}
284#endif /* !CONFIG_SPARSE_IRQ */
285
286void clear_kstat_irqs(struct irq_desc *desc)
287{
288 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
289}
290
291/*
292 * What should we do if we get a hw irq event on an illegal vector?
293 * Each architecture has to answer this themself.
294 */
295static void ack_bad(unsigned int irq)
296{
297 struct irq_desc *desc = irq_to_desc(irq);
298
299 print_irq_desc(irq, desc);
300 ack_bad_irq(irq);
301}
302
303/*
304 * NOP functions
305 */
306static void noop(unsigned int irq)
307{
308}
309
310static unsigned int noop_ret(unsigned int irq)
311{
312 return 0;
313}
314
315/*
316 * Generic no controller implementation
317 */
318struct irq_chip no_irq_chip = {
319 .name = "none",
320 .startup = noop_ret,
321 .shutdown = noop,
322 .enable = noop,
323 .disable = noop,
324 .ack = ack_bad,
325 .end = noop,
326};
327
328/*
329 * Generic dummy implementation which can be used for
330 * real dumb interrupt sources
331 */
332struct irq_chip dummy_irq_chip = {
333 .name = "dummy",
334 .startup = noop_ret,
335 .shutdown = noop,
336 .enable = noop,
337 .disable = noop,
338 .ack = noop,
339 .mask = noop,
340 .unmask = noop,
341 .end = noop,
342};
343
344/* 37/*
345 * Special, empty irq handler: 38 * Special, empty irq handler:
346 */ 39 */
@@ -358,31 +51,87 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
358 "but no thread function available.", irq, action->name); 51 "but no thread function available.", irq, action->name);
359} 52}
360 53
361/** 54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
362 * handle_IRQ_event - irq action chain handler 55{
363 * @irq: the interrupt number 56 /*
364 * @action: the interrupt action chain for this irq 57 * Wake up the handler thread for this action. In case the
365 * 58 * thread crashed and was killed we just pretend that we
366 * Handles the action chain of an irq event 59 * handled the interrupt. The hardirq handler has disabled the
367 */ 60 * device interrupt, so no irq storm is lurking. If the
368irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) 61 * RUNTHREAD bit is already set, nothing to do.
62 */
63 if (test_bit(IRQTF_DIED, &action->thread_flags) ||
64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
65 return;
66
67 /*
68 * It's safe to OR the mask lockless here. We have only two
69 * places which write to threads_oneshot: This code and the
70 * irq thread.
71 *
72 * This code is the hard irq context and can never run on two
73 * cpus in parallel. If it ever does we have more serious
74 * problems than this bitmask.
75 *
76 * The irq threads of this irq which clear their "running" bit
77 * in threads_oneshot are serialized via desc->lock against
78 * each other and they are serialized against this code by
79 * IRQS_INPROGRESS.
80 *
81 * Hard irq handler:
82 *
83 * spin_lock(desc->lock);
84 * desc->state |= IRQS_INPROGRESS;
85 * spin_unlock(desc->lock);
86 * set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
87 * desc->threads_oneshot |= mask;
88 * spin_lock(desc->lock);
89 * desc->state &= ~IRQS_INPROGRESS;
90 * spin_unlock(desc->lock);
91 *
92 * irq thread:
93 *
94 * again:
95 * spin_lock(desc->lock);
96 * if (desc->state & IRQS_INPROGRESS) {
97 * spin_unlock(desc->lock);
98 * while(desc->state & IRQS_INPROGRESS)
99 * cpu_relax();
100 * goto again;
101 * }
102 * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
103 * desc->threads_oneshot &= ~mask;
104 * spin_unlock(desc->lock);
105 *
106 * So either the thread waits for us to clear IRQS_INPROGRESS
107 * or we are waiting in the flow handler for desc->lock to be
108 * released before we reach this point. The thread also checks
109 * IRQTF_RUNTHREAD under desc->lock. If set it leaves
110 * threads_oneshot untouched and runs the thread another time.
111 */
112 desc->threads_oneshot |= action->thread_mask;
113 wake_up_process(action->thread);
114}
115
116irqreturn_t
117handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
369{ 118{
370 irqreturn_t ret, retval = IRQ_NONE; 119 irqreturn_t retval = IRQ_NONE;
371 unsigned int status = 0; 120 unsigned int random = 0, irq = desc->irq_data.irq;
372 121
373 do { 122 do {
123 irqreturn_t res;
124
374 trace_irq_handler_entry(irq, action); 125 trace_irq_handler_entry(irq, action);
375 ret = action->handler(irq, action->dev_id); 126 res = action->handler(irq, action->dev_id);
376 trace_irq_handler_exit(irq, action, ret); 127 trace_irq_handler_exit(irq, action, res);
377 128
378 switch (ret) { 129 if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
379 case IRQ_WAKE_THREAD: 130 irq, action->handler))
380 /* 131 local_irq_disable();
381 * Set result to handled so the spurious check
382 * does not trigger.
383 */
384 ret = IRQ_HANDLED;
385 132
133 switch (res) {
134 case IRQ_WAKE_THREAD:
386 /* 135 /*
387 * Catch drivers which return WAKE_THREAD but 136 * Catch drivers which return WAKE_THREAD but
388 * did not set up a thread function 137 * did not set up a thread function
@@ -392,165 +141,41 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
392 break; 141 break;
393 } 142 }
394 143
395 /* 144 irq_wake_thread(desc, action);
396 * Wake up the handler thread for this
397 * action. In case the thread crashed and was
398 * killed we just pretend that we handled the
399 * interrupt. The hardirq handler above has
400 * disabled the device interrupt, so no irq
401 * storm is lurking.
402 */
403 if (likely(!test_bit(IRQTF_DIED,
404 &action->thread_flags))) {
405 set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
406 wake_up_process(action->thread);
407 }
408 145
409 /* Fall through to add to randomness */ 146 /* Fall through to add to randomness */
410 case IRQ_HANDLED: 147 case IRQ_HANDLED:
411 status |= action->flags; 148 random |= action->flags;
412 break; 149 break;
413 150
414 default: 151 default:
415 break; 152 break;
416 } 153 }
417 154
418 retval |= ret; 155 retval |= res;
419 action = action->next; 156 action = action->next;
420 } while (action); 157 } while (action);
421 158
422 if (status & IRQF_SAMPLE_RANDOM) 159 if (random & IRQF_SAMPLE_RANDOM)
423 add_interrupt_randomness(irq); 160 add_interrupt_randomness(irq);
424 local_irq_disable();
425 161
162 if (!noirqdebug)
163 note_interrupt(irq, desc, retval);
426 return retval; 164 return retval;
427} 165}
428 166
429#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ 167irqreturn_t handle_irq_event(struct irq_desc *desc)
430
431#ifdef CONFIG_ENABLE_WARN_DEPRECATED
432# warning __do_IRQ is deprecated. Please convert to proper flow handlers
433#endif
434
435/**
436 * __do_IRQ - original all in one highlevel IRQ handler
437 * @irq: the interrupt number
438 *
439 * __do_IRQ handles all normal device IRQ's (the special
440 * SMP cross-CPU interrupts have their own specific
441 * handlers).
442 *
443 * This is the original x86 implementation which is used for every
444 * interrupt type.
445 */
446unsigned int __do_IRQ(unsigned int irq)
447{ 168{
448 struct irq_desc *desc = irq_to_desc(irq); 169 struct irqaction *action = desc->action;
449 struct irqaction *action; 170 irqreturn_t ret;
450 unsigned int status;
451
452 kstat_incr_irqs_this_cpu(irq, desc);
453
454 if (CHECK_IRQ_PER_CPU(desc->status)) {
455 irqreturn_t action_ret;
456
457 /*
458 * No locking required for CPU-local interrupts:
459 */
460 if (desc->chip->ack)
461 desc->chip->ack(irq);
462 if (likely(!(desc->status & IRQ_DISABLED))) {
463 action_ret = handle_IRQ_event(irq, desc->action);
464 if (!noirqdebug)
465 note_interrupt(irq, desc, action_ret);
466 }
467 desc->chip->end(irq);
468 return 1;
469 }
470
471 raw_spin_lock(&desc->lock);
472 if (desc->chip->ack)
473 desc->chip->ack(irq);
474 /*
475 * REPLAY is when Linux resends an IRQ that was dropped earlier
476 * WAITING is used by probe to mark irqs that are being tested
477 */
478 status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
479 status |= IRQ_PENDING; /* we _want_ to handle it */
480
481 /*
482 * If the IRQ is disabled for whatever reason, we cannot
483 * use the action we have.
484 */
485 action = NULL;
486 if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
487 action = desc->action;
488 status &= ~IRQ_PENDING; /* we commit to handling */
489 status |= IRQ_INPROGRESS; /* we are handling it */
490 }
491 desc->status = status;
492 171
493 /* 172 desc->istate &= ~IRQS_PENDING;
494 * If there is no IRQ handler or it was disabled, exit early. 173 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
495 * Since we set PENDING, if another processor is handling
496 * a different instance of this same irq, the other processor
497 * will take care of it.
498 */
499 if (unlikely(!action))
500 goto out;
501
502 /*
503 * Edge triggered interrupts need to remember
504 * pending events.
505 * This applies to any hw interrupts that allow a second
506 * instance of the same irq to arrive while we are in do_IRQ
507 * or in the handler. But the code here only handles the _second_
508 * instance of the irq, not the third or fourth. So it is mostly
509 * useful for irq hardware that does not mask cleanly in an
510 * SMP environment.
511 */
512 for (;;) {
513 irqreturn_t action_ret;
514
515 raw_spin_unlock(&desc->lock);
516
517 action_ret = handle_IRQ_event(irq, action);
518 if (!noirqdebug)
519 note_interrupt(irq, desc, action_ret);
520
521 raw_spin_lock(&desc->lock);
522 if (likely(!(desc->status & IRQ_PENDING)))
523 break;
524 desc->status &= ~IRQ_PENDING;
525 }
526 desc->status &= ~IRQ_INPROGRESS;
527
528out:
529 /*
530 * The ->end() handler has to deal with interrupts which got
531 * disabled while the handler was running.
532 */
533 desc->chip->end(irq);
534 raw_spin_unlock(&desc->lock); 174 raw_spin_unlock(&desc->lock);
535 175
536 return 1; 176 ret = handle_irq_event_percpu(desc, action);
537}
538#endif
539
540void early_init_irq_lock_class(void)
541{
542 struct irq_desc *desc;
543 int i;
544
545 for_each_irq_desc(i, desc) {
546 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
547 }
548}
549 177
550unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 178 raw_spin_lock(&desc->lock);
551{ 179 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
552 struct irq_desc *desc = irq_to_desc(irq); 180 return ret;
553 return desc ? desc->kstat_irqs[cpu] : 0;
554} 181}
555EXPORT_SYMBOL(kstat_irqs_cpu);
556
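handle_irq_event_percpu() above is also where an IRQ_WAKE_THREAD return is turned into a wakeup of the handler thread via irq_wake_thread(). For context, a hedged sketch of the driver side that exercises this path through request_threaded_irq(); struct my_dev and the my_* helpers are illustrative stubs, not from this commit:

#include <linux/interrupt.h>

struct my_dev { void __iomem *regs; };                          /* illustrative */

static bool my_quick_check(struct my_dev *dev) { return true; } /* stub */
static void my_mask_device(struct my_dev *dev) { }              /* stub */
static void my_slow_work(struct my_dev *dev)   { }              /* stub */

/* Primary handler: runs in hard irq context and must not sleep. */
static irqreturn_t my_hardirq(int irq, void *dev_id)
{
        struct my_dev *dev = dev_id;

        if (!my_quick_check(dev))
                return IRQ_NONE;                /* not our interrupt */

        my_mask_device(dev);                    /* silence the source */
        return IRQ_WAKE_THREAD;                 /* defer to my_thread_fn() */
}

/* Threaded handler: runs in process context and may sleep. */
static irqreturn_t my_thread_fn(int irq, void *dev_id)
{
        my_slow_work(dev_id);
        return IRQ_HANDLED;
}

static int my_request(unsigned int irq, struct my_dev *dev)
{
        return request_threaded_irq(irq, my_hardirq, my_thread_fn, 0,
                                    "my-dev", dev);
}
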
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index c63f3bc88f0b..6546431447d7 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,95 +1,171 @@
1/* 1/*
2 * IRQ subsystem internal functions and variables: 2 * IRQ subsystem internal functions and variables:
3 *
4 * Do not ever include this file from anything else than
5 * kernel/irq/. Do not even think about using any information outside
6 * of this file for your non core code.
3 */ 7 */
8#include <linux/irqdesc.h>
9
10#ifdef CONFIG_SPARSE_IRQ
11# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
12#else
13# define IRQ_BITMAP_BITS NR_IRQS
14#endif
15
16#define istate core_internal_state__do_not_mess_with_it
4 17
5extern int noirqdebug; 18extern int noirqdebug;
6 19
7/* Set default functions for irq_chip structures: */ 20/*
8extern void irq_chip_set_defaults(struct irq_chip *chip); 21 * Bits used by threaded handlers:
22 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
23 * IRQTF_DIED - handler thread died
24 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
25 * IRQTF_AFFINITY - irq thread is requested to adjust affinity
26 * IRQTF_FORCED_THREAD - irq action is force threaded
27 */
28enum {
29 IRQTF_RUNTHREAD,
30 IRQTF_DIED,
31 IRQTF_WARNED,
32 IRQTF_AFFINITY,
33 IRQTF_FORCED_THREAD,
34};
9 35
10/* Set default handler: */ 36/*
11extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); 37 * Bit masks for desc->state
38 *
39 * IRQS_AUTODETECT - autodetection in progress
40 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
41 * detection
42 * IRQS_POLL_INPROGRESS - polling in progress
43 * IRQS_ONESHOT - irq is not unmasked in primary handler
44 * IRQS_REPLAY - irq is replayed
45 * IRQS_WAITING - irq is waiting
46 * IRQS_PENDING - irq is pending and replayed later
47 * IRQS_SUSPENDED - irq is suspended
48 */
49enum {
50 IRQS_AUTODETECT = 0x00000001,
51 IRQS_SPURIOUS_DISABLED = 0x00000002,
52 IRQS_POLL_INPROGRESS = 0x00000008,
53 IRQS_ONESHOT = 0x00000020,
54 IRQS_REPLAY = 0x00000040,
55 IRQS_WAITING = 0x00000080,
56 IRQS_PENDING = 0x00000200,
57 IRQS_SUSPENDED = 0x00000800,
58};
59
60#include "debug.h"
61#include "settings.h"
62
63#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
12 64
13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 65extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
14 unsigned long flags); 66 unsigned long flags);
15extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 67extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 68extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
17 69
18extern struct lock_class_key irq_desc_lock_class; 70extern int irq_startup(struct irq_desc *desc);
71extern void irq_shutdown(struct irq_desc *desc);
72extern void irq_enable(struct irq_desc *desc);
73extern void irq_disable(struct irq_desc *desc);
74extern void mask_irq(struct irq_desc *desc);
75extern void unmask_irq(struct irq_desc *desc);
76
19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 77extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern raw_spinlock_t sparse_irq_lock;
22 78
23#ifdef CONFIG_SPARSE_IRQ 79irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
24void replace_irq_desc(unsigned int irq, struct irq_desc *desc); 80irqreturn_t handle_irq_event(struct irq_desc *desc);
25#endif 81
82/* Resending of interrupts :*/
83void check_irq_resend(struct irq_desc *desc, unsigned int irq);
84bool irq_wait_for_poll(struct irq_desc *desc);
26 85
27#ifdef CONFIG_PROC_FS 86#ifdef CONFIG_PROC_FS
28extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 87extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
88extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
29extern void register_handler_proc(unsigned int irq, struct irqaction *action); 89extern void register_handler_proc(unsigned int irq, struct irqaction *action);
30extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); 90extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
31#else 91#else
32static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } 92static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
93static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { }
33static inline void register_handler_proc(unsigned int irq, 94static inline void register_handler_proc(unsigned int irq,
34 struct irqaction *action) { } 95 struct irqaction *action) { }
35static inline void unregister_handler_proc(unsigned int irq, 96static inline void unregister_handler_proc(unsigned int irq,
36 struct irqaction *action) { } 97 struct irqaction *action) { }
37#endif 98#endif
38 99
39extern int irq_select_affinity_usr(unsigned int irq); 100extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
40 101
41extern void irq_set_thread_affinity(struct irq_desc *desc); 102extern void irq_set_thread_affinity(struct irq_desc *desc);
42 103
43/* Inline functions for support of irq chips on slow busses */ 104/* Inline functions for support of irq chips on slow busses */
44static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) 105static inline void chip_bus_lock(struct irq_desc *desc)
106{
107 if (unlikely(desc->irq_data.chip->irq_bus_lock))
108 desc->irq_data.chip->irq_bus_lock(&desc->irq_data);
109}
110
111static inline void chip_bus_sync_unlock(struct irq_desc *desc)
112{
113 if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock))
114 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
115}
116
117struct irq_desc *
118__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus);
119void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
120
121static inline struct irq_desc *
122irq_get_desc_buslock(unsigned int irq, unsigned long *flags)
123{
124 return __irq_get_desc_lock(irq, flags, true);
125}
126
127static inline void
128irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
45{ 129{
46 if (unlikely(desc->chip->bus_lock)) 130 __irq_put_desc_unlock(desc, flags, true);
47 desc->chip->bus_lock(irq);
48} 131}
49 132
50static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) 133static inline struct irq_desc *
134irq_get_desc_lock(unsigned int irq, unsigned long *flags)
51{ 135{
52 if (unlikely(desc->chip->bus_sync_unlock)) 136 return __irq_get_desc_lock(irq, flags, false);
53 desc->chip->bus_sync_unlock(irq); 137}
138
139static inline void
140irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
141{
142 __irq_put_desc_unlock(desc, flags, false);
54} 143}
55 144
56/* 145/*
57 * Debugging printout: 146 * Manipulation functions for irq_data.state
58 */ 147 */
148static inline void irqd_set_move_pending(struct irq_data *d)
149{
150 d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
151}
59 152
60#include <linux/kallsyms.h> 153static inline void irqd_clr_move_pending(struct irq_data *d)
61 154{
62#define P(f) if (desc->status & f) printk("%14s set\n", #f) 155 d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
156}
63 157
64static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) 158static inline void irqd_clear(struct irq_data *d, unsigned int mask)
65{ 159{
66 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", 160 d->state_use_accessors &= ~mask;
67 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
68 printk("->handle_irq(): %p, ", desc->handle_irq);
69 print_symbol("%s\n", (unsigned long)desc->handle_irq);
70 printk("->chip(): %p, ", desc->chip);
71 print_symbol("%s\n", (unsigned long)desc->chip);
72 printk("->action(): %p\n", desc->action);
73 if (desc->action) {
74 printk("->action->handler(): %p, ", desc->action->handler);
75 print_symbol("%s\n", (unsigned long)desc->action->handler);
76 }
77
78 P(IRQ_INPROGRESS);
79 P(IRQ_DISABLED);
80 P(IRQ_PENDING);
81 P(IRQ_REPLAY);
82 P(IRQ_AUTODETECT);
83 P(IRQ_WAITING);
84 P(IRQ_LEVEL);
85 P(IRQ_MASKED);
86#ifdef CONFIG_IRQ_PER_CPU
87 P(IRQ_PER_CPU);
88#endif
89 P(IRQ_NOPROBE);
90 P(IRQ_NOREQUEST);
91 P(IRQ_NOAUTOEN);
92} 161}
93 162
94#undef P 163static inline void irqd_set(struct irq_data *d, unsigned int mask)
164{
165 d->state_use_accessors |= mask;
166}
95 167
168static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
169{
170 return d->state_use_accessors & mask;
171}
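
The irq_get_desc_*/irq_put_desc_* helpers above bracket every setter in this directory: look up the descriptor, take the (optionally bus-) lock, make the change, and drop everything in reverse order. A sketch of the pattern as it would appear in another kernel/irq/ file; my_irq_set_quirk is illustrative, and these helpers are internal, not usable outside kernel/irq/:

#include "internals.h"

static void my_irq_set_quirk(unsigned int irq)
{
        unsigned long flags;
        struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);

        if (!desc)
                return;

        /* desc->lock is held here; the chip's bus lock was taken if needed */
        irqd_set(&desc->irq_data, IRQD_LEVEL);

        irq_put_desc_busunlock(desc, flags);
}
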
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
new file mode 100644
index 000000000000..4c60a50e66b2
--- /dev/null
+++ b/kernel/irq/irqdesc.c
@@ -0,0 +1,466 @@
1/*
2 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
3 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
4 *
5 * This file contains the interrupt descriptor management code
6 *
7 * Detailed information is available in Documentation/DocBook/genericirq
8 *
9 */
10#include <linux/irq.h>
11#include <linux/slab.h>
12#include <linux/module.h>
13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h>
15#include <linux/radix-tree.h>
16#include <linux/bitmap.h>
17
18#include "internals.h"
19
20/*
21 * lockdep: we want to handle all irq_desc locks as a single lock-class:
22 */
23static struct lock_class_key irq_desc_lock_class;
24
25#if defined(CONFIG_SMP)
26static void __init init_irq_default_affinity(void)
27{
28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
29 cpumask_setall(irq_default_affinity);
30}
31#else
32static void __init init_irq_default_affinity(void)
33{
34}
35#endif
36
37#ifdef CONFIG_SMP
38static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
39{
40 if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node))
41 return -ENOMEM;
42
43#ifdef CONFIG_GENERIC_PENDING_IRQ
44 if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
45 free_cpumask_var(desc->irq_data.affinity);
46 return -ENOMEM;
47 }
48#endif
49 return 0;
50}
51
52static void desc_smp_init(struct irq_desc *desc, int node)
53{
54 desc->irq_data.node = node;
55 cpumask_copy(desc->irq_data.affinity, irq_default_affinity);
56#ifdef CONFIG_GENERIC_PENDING_IRQ
57 cpumask_clear(desc->pending_mask);
58#endif
59}
60
61static inline int desc_node(struct irq_desc *desc)
62{
63 return desc->irq_data.node;
64}
65
66#else
67static inline int
68alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
69static inline void desc_smp_init(struct irq_desc *desc, int node) { }
70static inline int desc_node(struct irq_desc *desc) { return 0; }
71#endif
72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
74{
75 int cpu;
76
77 desc->irq_data.irq = irq;
78 desc->irq_data.chip = &no_irq_chip;
79 desc->irq_data.chip_data = NULL;
80 desc->irq_data.handler_data = NULL;
81 desc->irq_data.msi_desc = NULL;
82 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
83 irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
84 desc->handle_irq = handle_bad_irq;
85 desc->depth = 1;
86 desc->irq_count = 0;
87 desc->irqs_unhandled = 0;
88 desc->name = NULL;
89 for_each_possible_cpu(cpu)
90 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
91 desc_smp_init(desc, node);
92}
93
94int nr_irqs = NR_IRQS;
95EXPORT_SYMBOL_GPL(nr_irqs);
96
97static DEFINE_MUTEX(sparse_irq_lock);
98static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
99
100#ifdef CONFIG_SPARSE_IRQ
101
102static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
103
104static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
105{
106 radix_tree_insert(&irq_desc_tree, irq, desc);
107}
108
109struct irq_desc *irq_to_desc(unsigned int irq)
110{
111 return radix_tree_lookup(&irq_desc_tree, irq);
112}
113
114static void delete_irq_desc(unsigned int irq)
115{
116 radix_tree_delete(&irq_desc_tree, irq);
117}
118
119#ifdef CONFIG_SMP
120static void free_masks(struct irq_desc *desc)
121{
122#ifdef CONFIG_GENERIC_PENDING_IRQ
123 free_cpumask_var(desc->pending_mask);
124#endif
125 free_cpumask_var(desc->irq_data.affinity);
126}
127#else
128static inline void free_masks(struct irq_desc *desc) { }
129#endif
130
131static struct irq_desc *alloc_desc(int irq, int node)
132{
133 struct irq_desc *desc;
134 gfp_t gfp = GFP_KERNEL;
135
136 desc = kzalloc_node(sizeof(*desc), gfp, node);
137 if (!desc)
138 return NULL;
139 /* allocate based on nr_cpu_ids */
140 desc->kstat_irqs = alloc_percpu(unsigned int);
141 if (!desc->kstat_irqs)
142 goto err_desc;
143
144 if (alloc_masks(desc, gfp, node))
145 goto err_kstat;
146
147 raw_spin_lock_init(&desc->lock);
148 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
149
150 desc_set_defaults(irq, desc, node);
151
152 return desc;
153
154err_kstat:
155 free_percpu(desc->kstat_irqs);
156err_desc:
157 kfree(desc);
158 return NULL;
159}
160
161static void free_desc(unsigned int irq)
162{
163 struct irq_desc *desc = irq_to_desc(irq);
164
165 unregister_irq_proc(irq, desc);
166
167 mutex_lock(&sparse_irq_lock);
168 delete_irq_desc(irq);
169 mutex_unlock(&sparse_irq_lock);
170
171 free_masks(desc);
172 free_percpu(desc->kstat_irqs);
173 kfree(desc);
174}
175
176static int alloc_descs(unsigned int start, unsigned int cnt, int node)
177{
178 struct irq_desc *desc;
179 int i;
180
181 for (i = 0; i < cnt; i++) {
182 desc = alloc_desc(start + i, node);
183 if (!desc)
184 goto err;
185 mutex_lock(&sparse_irq_lock);
186 irq_insert_desc(start + i, desc);
187 mutex_unlock(&sparse_irq_lock);
188 }
189 return start;
190
191err:
192 for (i--; i >= 0; i--)
193 free_desc(start + i);
194
195 mutex_lock(&sparse_irq_lock);
196 bitmap_clear(allocated_irqs, start, cnt);
197 mutex_unlock(&sparse_irq_lock);
198 return -ENOMEM;
199}
200
201static int irq_expand_nr_irqs(unsigned int nr)
202{
203 if (nr > IRQ_BITMAP_BITS)
204 return -ENOMEM;
205 nr_irqs = nr;
206 return 0;
207}
208
209int __init early_irq_init(void)
210{
211 int i, initcnt, node = first_online_node;
212 struct irq_desc *desc;
213
214 init_irq_default_affinity();
215
216 /* Let arch update nr_irqs and return the nr of preallocated irqs */
217 initcnt = arch_probe_nr_irqs();
218 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
219
220 if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
221 nr_irqs = IRQ_BITMAP_BITS;
222
223 if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
224 initcnt = IRQ_BITMAP_BITS;
225
226 if (initcnt > nr_irqs)
227 nr_irqs = initcnt;
228
229 for (i = 0; i < initcnt; i++) {
230 desc = alloc_desc(i, node);
231 set_bit(i, allocated_irqs);
232 irq_insert_desc(i, desc);
233 }
234 return arch_early_irq_init();
235}
236
237#else /* !CONFIG_SPARSE_IRQ */
238
239struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
240 [0 ... NR_IRQS-1] = {
241 .handle_irq = handle_bad_irq,
242 .depth = 1,
243 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
244 }
245};
246
247int __init early_irq_init(void)
248{
249 int count, i, node = first_online_node;
250 struct irq_desc *desc;
251
252 init_irq_default_affinity();
253
254 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
255
256 desc = irq_desc;
257 count = ARRAY_SIZE(irq_desc);
258
259 for (i = 0; i < count; i++) {
260 desc[i].kstat_irqs = alloc_percpu(unsigned int);
261 alloc_masks(&desc[i], GFP_KERNEL, node);
262 raw_spin_lock_init(&desc[i].lock);
263 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
264 desc_set_defaults(i, &desc[i], node);
265 }
266 return arch_early_irq_init();
267}
268
269struct irq_desc *irq_to_desc(unsigned int irq)
270{
271 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
272}
273
274static void free_desc(unsigned int irq)
275{
276 dynamic_irq_cleanup(irq);
277}
278
279static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
280{
281 return start;
282}
283
284static int irq_expand_nr_irqs(unsigned int nr)
285{
286 return -ENOMEM;
287}
288
289#endif /* !CONFIG_SPARSE_IRQ */
290
291/**
292 * generic_handle_irq - Invoke the handler for a particular irq
293 * @irq: The irq number to handle
294 *
295 */
296int generic_handle_irq(unsigned int irq)
297{
298 struct irq_desc *desc = irq_to_desc(irq);
299
300 if (!desc)
301 return -EINVAL;
302 generic_handle_irq_desc(irq, desc);
303 return 0;
304}
305EXPORT_SYMBOL_GPL(generic_handle_irq);
306
307/* Dynamic interrupt handling */
308
309/**
310 * irq_free_descs - free irq descriptors
311 * @from: Start of descriptor range
312 * @cnt: Number of consecutive irqs to free
313 */
314void irq_free_descs(unsigned int from, unsigned int cnt)
315{
316 int i;
317
318 if (from >= nr_irqs || (from + cnt) > nr_irqs)
319 return;
320
321 for (i = 0; i < cnt; i++)
322 free_desc(from + i);
323
324 mutex_lock(&sparse_irq_lock);
325 bitmap_clear(allocated_irqs, from, cnt);
326 mutex_unlock(&sparse_irq_lock);
327}
328EXPORT_SYMBOL_GPL(irq_free_descs);
329
330/**
331 * irq_alloc_descs - allocate and initialize a range of irq descriptors
332 * @irq: Allocate for specific irq number if irq >= 0
333 * @from: Start the search from this irq number
334 * @cnt: Number of consecutive irqs to allocate.
335 * @node: Preferred node on which the irq descriptor should be allocated
336 *
337 * Returns the first irq number or error code
338 */
339int __ref
340irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
341{
342 int start, ret;
343
344 if (!cnt)
345 return -EINVAL;
346
347 if (irq >= 0) {
348 if (from > irq)
349 return -EINVAL;
350 from = irq;
351 }
352
353 mutex_lock(&sparse_irq_lock);
354
355 start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
356 from, cnt, 0);
357 ret = -EEXIST;
358	if (irq >= 0 && start != irq)
359 goto err;
360
361 if (start + cnt > nr_irqs) {
362 ret = irq_expand_nr_irqs(start + cnt);
363 if (ret)
364 goto err;
365 }
366
367 bitmap_set(allocated_irqs, start, cnt);
368 mutex_unlock(&sparse_irq_lock);
369 return alloc_descs(start, cnt, node);
370
371err:
372 mutex_unlock(&sparse_irq_lock);
373 return ret;
374}
375EXPORT_SYMBOL_GPL(irq_alloc_descs);
376
377/**
378 * irq_reserve_irqs - mark irqs allocated
379 * @from: mark from irq number
380 * @cnt: number of irqs to mark
381 *
382 * Returns 0 on success or an appropriate error code
383 */
384int irq_reserve_irqs(unsigned int from, unsigned int cnt)
385{
386 unsigned int start;
387 int ret = 0;
388
389 if (!cnt || (from + cnt) > nr_irqs)
390 return -EINVAL;
391
392 mutex_lock(&sparse_irq_lock);
393 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
394 if (start == from)
395 bitmap_set(allocated_irqs, start, cnt);
396 else
397 ret = -EEXIST;
398 mutex_unlock(&sparse_irq_lock);
399 return ret;
400}
401
402/**
403 * irq_get_next_irq - get next allocated irq number
404 * @offset: where to start the search
405 *
406 * Returns next irq number after offset or nr_irqs if none is found.
407 */
408unsigned int irq_get_next_irq(unsigned int offset)
409{
410 return find_next_bit(allocated_irqs, nr_irqs, offset);
411}
412
413struct irq_desc *
414__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus)
415{
416 struct irq_desc *desc = irq_to_desc(irq);
417
418 if (desc) {
419 if (bus)
420 chip_bus_lock(desc);
421 raw_spin_lock_irqsave(&desc->lock, *flags);
422 }
423 return desc;
424}
425
426void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
427{
428 raw_spin_unlock_irqrestore(&desc->lock, flags);
429 if (bus)
430 chip_bus_sync_unlock(desc);
431}
432
433/**
434 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
435 * @irq: irq number to initialize
436 */
437void dynamic_irq_cleanup(unsigned int irq)
438{
439 struct irq_desc *desc = irq_to_desc(irq);
440 unsigned long flags;
441
442 raw_spin_lock_irqsave(&desc->lock, flags);
443 desc_set_defaults(irq, desc, desc_node(desc));
444 raw_spin_unlock_irqrestore(&desc->lock, flags);
445}
446
447unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
448{
449 struct irq_desc *desc = irq_to_desc(irq);
450
451 return desc && desc->kstat_irqs ?
452 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
453}
454
455unsigned int kstat_irqs(unsigned int irq)
456{
457 struct irq_desc *desc = irq_to_desc(irq);
458 int cpu;
459 int sum = 0;
460
461 if (!desc || !desc->kstat_irqs)
462 return 0;
463 for_each_possible_cpu(cpu)
464 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
465 return sum;
466}
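The new irqdesc.c above exposes irq_alloc_descs()/irq_free_descs() as the dynamic descriptor interface. A hedged usage sketch, not taken from this patch (the demo_* names and the DEMO_NR_IRQS count are invented), of how an interrupt controller driver might consume it:

#include <linux/irq.h>
#include <linux/topology.h>

#define DEMO_NR_IRQS	4	/* illustrative block size */

static int demo_setup_irq_range(void)
{
	/* irq = -1: no fixed number requested, search upwards from 16 */
	int irq_base = irq_alloc_descs(-1, 16, DEMO_NR_IRQS, numa_node_id());

	if (irq_base < 0)
		return irq_base;	/* -EEXIST or -ENOMEM from the code above */

	/* per-irq chip/handler setup would follow here */
	return irq_base;
}

static void demo_teardown_irq_range(unsigned int irq_base)
{
	irq_free_descs(irq_base, DEMO_NR_IRQS);
}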
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c3003e9d91a3..0a7840aeb0fb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,17 @@
17 17
18#include "internals.h" 18#include "internals.h"
19 19
20#ifdef CONFIG_IRQ_FORCED_THREADING
21__read_mostly bool force_irqthreads;
22
23static int __init setup_forced_irqthreads(char *arg)
24{
25 force_irqthreads = true;
26 return 0;
27}
28early_param("threadirqs", setup_forced_irqthreads);
29#endif
30
20/** 31/**
21 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 32 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
22 * @irq: interrupt number to wait for 33 * @irq: interrupt number to wait for
@@ -30,7 +41,7 @@
30void synchronize_irq(unsigned int irq) 41void synchronize_irq(unsigned int irq)
31{ 42{
32 struct irq_desc *desc = irq_to_desc(irq); 43 struct irq_desc *desc = irq_to_desc(irq);
33 unsigned int status; 44 bool inprogress;
34 45
35 if (!desc) 46 if (!desc)
36 return; 47 return;
@@ -42,16 +53,16 @@ void synchronize_irq(unsigned int irq)
42 * Wait until we're out of the critical section. This might 53 * Wait until we're out of the critical section. This might
43 * give the wrong answer due to the lack of memory barriers. 54 * give the wrong answer due to the lack of memory barriers.
44 */ 55 */
45 while (desc->status & IRQ_INPROGRESS) 56 while (irqd_irq_inprogress(&desc->irq_data))
46 cpu_relax(); 57 cpu_relax();
47 58
48 /* Ok, that indicated we're done: double-check carefully. */ 59 /* Ok, that indicated we're done: double-check carefully. */
49 raw_spin_lock_irqsave(&desc->lock, flags); 60 raw_spin_lock_irqsave(&desc->lock, flags);
50 status = desc->status; 61 inprogress = irqd_irq_inprogress(&desc->irq_data);
51 raw_spin_unlock_irqrestore(&desc->lock, flags); 62 raw_spin_unlock_irqrestore(&desc->lock, flags);
52 63
53 /* Oops, that failed? */ 64 /* Oops, that failed? */
54 } while (status & IRQ_INPROGRESS); 65 } while (inprogress);
55 66
56 /* 67 /*
57 * We made sure that no hardirq handler is running. Now verify 68 * We made sure that no hardirq handler is running. Now verify
@@ -73,8 +84,8 @@ int irq_can_set_affinity(unsigned int irq)
73{ 84{
74 struct irq_desc *desc = irq_to_desc(irq); 85 struct irq_desc *desc = irq_to_desc(irq);
75 86
76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || 87 if (!desc || !irqd_can_balance(&desc->irq_data) ||
77 !desc->chip->set_affinity) 88 !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
78 return 0; 89 return 0;
79 90
80 return 1; 91 return 1;
@@ -100,66 +111,180 @@ void irq_set_thread_affinity(struct irq_desc *desc)
100 } 111 }
101} 112}
102 113
114#ifdef CONFIG_GENERIC_PENDING_IRQ
115static inline bool irq_can_move_pcntxt(struct irq_data *data)
116{
117 return irqd_can_move_in_process_context(data);
118}
119static inline bool irq_move_pending(struct irq_data *data)
120{
121 return irqd_is_setaffinity_pending(data);
122}
123static inline void
124irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
125{
126 cpumask_copy(desc->pending_mask, mask);
127}
128static inline void
129irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
130{
131 cpumask_copy(mask, desc->pending_mask);
132}
133#else
134static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; }
135static inline bool irq_move_pending(struct irq_data *data) { return false; }
136static inline void
137irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
138static inline void
139irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
140#endif
141
142int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
143{
144 struct irq_chip *chip = irq_data_get_irq_chip(data);
145 struct irq_desc *desc = irq_data_to_desc(data);
146 int ret = 0;
147
148 if (!chip || !chip->irq_set_affinity)
149 return -EINVAL;
150
151 if (irq_can_move_pcntxt(data)) {
152 ret = chip->irq_set_affinity(data, mask, false);
153 switch (ret) {
154 case IRQ_SET_MASK_OK:
155 cpumask_copy(data->affinity, mask);
156 case IRQ_SET_MASK_OK_NOCOPY:
157 irq_set_thread_affinity(desc);
158 ret = 0;
159 }
160 } else {
161 irqd_set_move_pending(data);
162 irq_copy_pending(desc, mask);
163 }
164
165 if (desc->affinity_notify) {
166 kref_get(&desc->affinity_notify->kref);
167 schedule_work(&desc->affinity_notify->work);
168 }
169 irqd_set(data, IRQD_AFFINITY_SET);
170
171 return ret;
172}
173
103/** 174/**
104 * irq_set_affinity - Set the irq affinity of a given irq 175 * irq_set_affinity - Set the irq affinity of a given irq
105 * @irq: Interrupt to set affinity 176 * @irq: Interrupt to set affinity
106 * @cpumask: cpumask 177 * @mask: cpumask
107 * 178 *
108 */ 179 */
109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) 180int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
110{ 181{
111 struct irq_desc *desc = irq_to_desc(irq); 182 struct irq_desc *desc = irq_to_desc(irq);
112 unsigned long flags; 183 unsigned long flags;
184 int ret;
113 185
114 if (!desc->chip->set_affinity) 186 if (!desc)
115 return -EINVAL; 187 return -EINVAL;
116 188
117 raw_spin_lock_irqsave(&desc->lock, flags); 189 raw_spin_lock_irqsave(&desc->lock, flags);
118 190 ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask);
119#ifdef CONFIG_GENERIC_PENDING_IRQ
120 if (desc->status & IRQ_MOVE_PCNTXT) {
121 if (!desc->chip->set_affinity(irq, cpumask)) {
122 cpumask_copy(desc->affinity, cpumask);
123 irq_set_thread_affinity(desc);
124 }
125 }
126 else {
127 desc->status |= IRQ_MOVE_PENDING;
128 cpumask_copy(desc->pending_mask, cpumask);
129 }
130#else
131 if (!desc->chip->set_affinity(irq, cpumask)) {
132 cpumask_copy(desc->affinity, cpumask);
133 irq_set_thread_affinity(desc);
134 }
135#endif
136 desc->status |= IRQ_AFFINITY_SET;
137 raw_spin_unlock_irqrestore(&desc->lock, flags); 191 raw_spin_unlock_irqrestore(&desc->lock, flags);
138 return 0; 192 return ret;
139} 193}
140 194
141int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) 195int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
142{ 196{
197 unsigned long flags;
198 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
199
200 if (!desc)
201 return -EINVAL;
202 desc->affinity_hint = m;
203 irq_put_desc_unlock(desc, flags);
204 return 0;
205}
206EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
207
208static void irq_affinity_notify(struct work_struct *work)
209{
210 struct irq_affinity_notify *notify =
211 container_of(work, struct irq_affinity_notify, work);
212 struct irq_desc *desc = irq_to_desc(notify->irq);
213 cpumask_var_t cpumask;
214 unsigned long flags;
215
216 if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
217 goto out;
218
219 raw_spin_lock_irqsave(&desc->lock, flags);
220 if (irq_move_pending(&desc->irq_data))
221 irq_get_pending(cpumask, desc);
222 else
223 cpumask_copy(cpumask, desc->irq_data.affinity);
224 raw_spin_unlock_irqrestore(&desc->lock, flags);
225
226 notify->notify(notify, cpumask);
227
228 free_cpumask_var(cpumask);
229out:
230 kref_put(&notify->kref, notify->release);
231}
232
233/**
234 * irq_set_affinity_notifier - control notification of IRQ affinity changes
235 * @irq: Interrupt for which to enable/disable notification
236 * @notify: Context for notification, or %NULL to disable
237 * notification. Function pointers must be initialised;
238 * the other fields will be initialised by this function.
239 *
240 * Must be called in process context. Notification may only be enabled
241 * after the IRQ is allocated and must be disabled before the IRQ is
242 * freed using free_irq().
243 */
244int
245irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
246{
143 struct irq_desc *desc = irq_to_desc(irq); 247 struct irq_desc *desc = irq_to_desc(irq);
248 struct irq_affinity_notify *old_notify;
144 unsigned long flags; 249 unsigned long flags;
145 250
251 /* The release function is promised process context */
252 might_sleep();
253
146 if (!desc) 254 if (!desc)
147 return -EINVAL; 255 return -EINVAL;
148 256
257 /* Complete initialisation of *notify */
258 if (notify) {
259 notify->irq = irq;
260 kref_init(&notify->kref);
261 INIT_WORK(&notify->work, irq_affinity_notify);
262 }
263
149 raw_spin_lock_irqsave(&desc->lock, flags); 264 raw_spin_lock_irqsave(&desc->lock, flags);
150 desc->affinity_hint = m; 265 old_notify = desc->affinity_notify;
266 desc->affinity_notify = notify;
151 raw_spin_unlock_irqrestore(&desc->lock, flags); 267 raw_spin_unlock_irqrestore(&desc->lock, flags);
152 268
269 if (old_notify)
270 kref_put(&old_notify->kref, old_notify->release);
271
153 return 0; 272 return 0;
154} 273}
155EXPORT_SYMBOL_GPL(irq_set_affinity_hint); 274EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
156 275
157#ifndef CONFIG_AUTO_IRQ_AFFINITY 276#ifndef CONFIG_AUTO_IRQ_AFFINITY
158/* 277/*
159 * Generic version of the affinity autoselector. 278 * Generic version of the affinity autoselector.
160 */ 279 */
161static int setup_affinity(unsigned int irq, struct irq_desc *desc) 280static int
281setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
162{ 282{
283 struct irq_chip *chip = irq_desc_get_chip(desc);
284 struct cpumask *set = irq_default_affinity;
285 int ret;
286
287 /* Excludes PER_CPU and NO_BALANCE interrupts */
163 if (!irq_can_set_affinity(irq)) 288 if (!irq_can_set_affinity(irq))
164 return 0; 289 return 0;
165 290
@@ -167,22 +292,27 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
167 * Preserve an userspace affinity setup, but make sure that 292 * Preserve an userspace affinity setup, but make sure that
168 * one of the targets is online. 293 * one of the targets is online.
169 */ 294 */
170 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 295 if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
171 if (cpumask_any_and(desc->affinity, cpu_online_mask) 296 if (cpumask_intersects(desc->irq_data.affinity,
172 < nr_cpu_ids) 297 cpu_online_mask))
173 goto set_affinity; 298 set = desc->irq_data.affinity;
174 else 299 else
175 desc->status &= ~IRQ_AFFINITY_SET; 300 irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
176 } 301 }
177 302
178 cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); 303 cpumask_and(mask, cpu_online_mask, set);
179set_affinity: 304 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
180 desc->chip->set_affinity(irq, desc->affinity); 305 switch (ret) {
181 306 case IRQ_SET_MASK_OK:
307 cpumask_copy(desc->irq_data.affinity, mask);
308 case IRQ_SET_MASK_OK_NOCOPY:
309 irq_set_thread_affinity(desc);
310 }
182 return 0; 311 return 0;
183} 312}
184#else 313#else
185static inline int setup_affinity(unsigned int irq, struct irq_desc *d) 314static inline int
315setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask)
186{ 316{
187 return irq_select_affinity(irq); 317 return irq_select_affinity(irq);
188} 318}
@@ -191,23 +321,21 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
191/* 321/*
192 * Called when affinity is set via /proc/irq 322 * Called when affinity is set via /proc/irq
193 */ 323 */
194int irq_select_affinity_usr(unsigned int irq) 324int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
195{ 325{
196 struct irq_desc *desc = irq_to_desc(irq); 326 struct irq_desc *desc = irq_to_desc(irq);
197 unsigned long flags; 327 unsigned long flags;
198 int ret; 328 int ret;
199 329
200 raw_spin_lock_irqsave(&desc->lock, flags); 330 raw_spin_lock_irqsave(&desc->lock, flags);
201 ret = setup_affinity(irq, desc); 331 ret = setup_affinity(irq, desc, mask);
202 if (!ret)
203 irq_set_thread_affinity(desc);
204 raw_spin_unlock_irqrestore(&desc->lock, flags); 332 raw_spin_unlock_irqrestore(&desc->lock, flags);
205
206 return ret; 333 return ret;
207} 334}
208 335
209#else 336#else
210static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) 337static inline int
338setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
211{ 339{
212 return 0; 340 return 0;
213} 341}
@@ -218,13 +346,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
218 if (suspend) { 346 if (suspend) {
219 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) 347 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
220 return; 348 return;
221 desc->status |= IRQ_SUSPENDED; 349 desc->istate |= IRQS_SUSPENDED;
222 } 350 }
223 351
224 if (!desc->depth++) { 352 if (!desc->depth++)
225 desc->status |= IRQ_DISABLED; 353 irq_disable(desc);
226 desc->chip->disable(irq); 354}
227 } 355
356static int __disable_irq_nosync(unsigned int irq)
357{
358 unsigned long flags;
359 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
360
361 if (!desc)
362 return -EINVAL;
363 __disable_irq(desc, irq, false);
364 irq_put_desc_busunlock(desc, flags);
365 return 0;
228} 366}
229 367
230/** 368/**
@@ -240,17 +378,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
240 */ 378 */
241void disable_irq_nosync(unsigned int irq) 379void disable_irq_nosync(unsigned int irq)
242{ 380{
243 struct irq_desc *desc = irq_to_desc(irq); 381 __disable_irq_nosync(irq);
244 unsigned long flags;
245
246 if (!desc)
247 return;
248
249 chip_bus_lock(irq, desc);
250 raw_spin_lock_irqsave(&desc->lock, flags);
251 __disable_irq(desc, irq, false);
252 raw_spin_unlock_irqrestore(&desc->lock, flags);
253 chip_bus_sync_unlock(irq, desc);
254} 382}
255EXPORT_SYMBOL(disable_irq_nosync); 383EXPORT_SYMBOL(disable_irq_nosync);
256 384
@@ -268,21 +396,24 @@ EXPORT_SYMBOL(disable_irq_nosync);
268 */ 396 */
269void disable_irq(unsigned int irq) 397void disable_irq(unsigned int irq)
270{ 398{
271 struct irq_desc *desc = irq_to_desc(irq); 399 if (!__disable_irq_nosync(irq))
272
273 if (!desc)
274 return;
275
276 disable_irq_nosync(irq);
277 if (desc->action)
278 synchronize_irq(irq); 400 synchronize_irq(irq);
279} 401}
280EXPORT_SYMBOL(disable_irq); 402EXPORT_SYMBOL(disable_irq);
281 403
282void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) 404void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
283{ 405{
284 if (resume) 406 if (resume) {
285 desc->status &= ~IRQ_SUSPENDED; 407 if (!(desc->istate & IRQS_SUSPENDED)) {
408 if (!desc->action)
409 return;
410 if (!(desc->action->flags & IRQF_FORCE_RESUME))
411 return;
 412 /* Pretend that it got disabled! */
413 desc->depth++;
414 }
415 desc->istate &= ~IRQS_SUSPENDED;
416 }
286 417
287 switch (desc->depth) { 418 switch (desc->depth) {
288 case 0: 419 case 0:
@@ -290,12 +421,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
290 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); 421 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
291 break; 422 break;
292 case 1: { 423 case 1: {
293 unsigned int status = desc->status & ~IRQ_DISABLED; 424 if (desc->istate & IRQS_SUSPENDED)
294
295 if (desc->status & IRQ_SUSPENDED)
296 goto err_out; 425 goto err_out;
297 /* Prevent probing on this irq: */ 426 /* Prevent probing on this irq: */
298 desc->status = status | IRQ_NOPROBE; 427 irq_settings_set_noprobe(desc);
428 irq_enable(desc);
299 check_irq_resend(desc, irq); 429 check_irq_resend(desc, irq);
300 /* fall-through */ 430 /* fall-through */
301 } 431 }
@@ -313,21 +443,22 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
313 * IRQ line is re-enabled. 443 * IRQ line is re-enabled.
314 * 444 *
315 * This function may be called from IRQ context only when 445 * This function may be called from IRQ context only when
316 * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! 446 * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
317 */ 447 */
318void enable_irq(unsigned int irq) 448void enable_irq(unsigned int irq)
319{ 449{
320 struct irq_desc *desc = irq_to_desc(irq);
321 unsigned long flags; 450 unsigned long flags;
451 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
322 452
323 if (!desc) 453 if (!desc)
324 return; 454 return;
455 if (WARN(!desc->irq_data.chip,
456 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
457 goto out;
325 458
326 chip_bus_lock(irq, desc);
327 raw_spin_lock_irqsave(&desc->lock, flags);
328 __enable_irq(desc, irq, false); 459 __enable_irq(desc, irq, false);
329 raw_spin_unlock_irqrestore(&desc->lock, flags); 460out:
330 chip_bus_sync_unlock(irq, desc); 461 irq_put_desc_busunlock(desc, flags);
331} 462}
332EXPORT_SYMBOL(enable_irq); 463EXPORT_SYMBOL(enable_irq);
333 464
@@ -336,14 +467,14 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
336 struct irq_desc *desc = irq_to_desc(irq); 467 struct irq_desc *desc = irq_to_desc(irq);
337 int ret = -ENXIO; 468 int ret = -ENXIO;
338 469
339 if (desc->chip->set_wake) 470 if (desc->irq_data.chip->irq_set_wake)
340 ret = desc->chip->set_wake(irq, on); 471 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
341 472
342 return ret; 473 return ret;
343} 474}
344 475
345/** 476/**
346 * set_irq_wake - control irq power management wakeup 477 * irq_set_irq_wake - control irq power management wakeup
347 * @irq: interrupt to control 478 * @irq: interrupt to control
348 * @on: enable/disable power management wakeup 479 * @on: enable/disable power management wakeup
349 * 480 *
@@ -354,23 +485,25 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
354 * Wakeup mode lets this IRQ wake the system from sleep 485 * Wakeup mode lets this IRQ wake the system from sleep
355 * states like "suspend to RAM". 486 * states like "suspend to RAM".
356 */ 487 */
357int set_irq_wake(unsigned int irq, unsigned int on) 488int irq_set_irq_wake(unsigned int irq, unsigned int on)
358{ 489{
359 struct irq_desc *desc = irq_to_desc(irq);
360 unsigned long flags; 490 unsigned long flags;
491 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
361 int ret = 0; 492 int ret = 0;
362 493
494 if (!desc)
495 return -EINVAL;
496
363 /* wakeup-capable irqs can be shared between drivers that 497 /* wakeup-capable irqs can be shared between drivers that
364 * don't need to have the same sleep mode behaviors. 498 * don't need to have the same sleep mode behaviors.
365 */ 499 */
366 raw_spin_lock_irqsave(&desc->lock, flags);
367 if (on) { 500 if (on) {
368 if (desc->wake_depth++ == 0) { 501 if (desc->wake_depth++ == 0) {
369 ret = set_irq_wake_real(irq, on); 502 ret = set_irq_wake_real(irq, on);
370 if (ret) 503 if (ret)
371 desc->wake_depth = 0; 504 desc->wake_depth = 0;
372 else 505 else
373 desc->status |= IRQ_WAKEUP; 506 irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
374 } 507 }
375 } else { 508 } else {
376 if (desc->wake_depth == 0) { 509 if (desc->wake_depth == 0) {
@@ -380,14 +513,13 @@ int set_irq_wake(unsigned int irq, unsigned int on)
380 if (ret) 513 if (ret)
381 desc->wake_depth = 1; 514 desc->wake_depth = 1;
382 else 515 else
383 desc->status &= ~IRQ_WAKEUP; 516 irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
384 } 517 }
385 } 518 }
386 519 irq_put_desc_busunlock(desc, flags);
387 raw_spin_unlock_irqrestore(&desc->lock, flags);
388 return ret; 520 return ret;
389} 521}
390EXPORT_SYMBOL(set_irq_wake); 522EXPORT_SYMBOL(irq_set_irq_wake);
391 523
392/* 524/*
393 * Internal function that tells the architecture code whether a 525 * Internal function that tells the architecture code whether a
@@ -396,45 +528,29 @@ EXPORT_SYMBOL(set_irq_wake);
396 */ 528 */
397int can_request_irq(unsigned int irq, unsigned long irqflags) 529int can_request_irq(unsigned int irq, unsigned long irqflags)
398{ 530{
399 struct irq_desc *desc = irq_to_desc(irq);
400 struct irqaction *action;
401 unsigned long flags; 531 unsigned long flags;
532 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
533 int canrequest = 0;
402 534
403 if (!desc) 535 if (!desc)
404 return 0; 536 return 0;
405 537
406 if (desc->status & IRQ_NOREQUEST) 538 if (irq_settings_can_request(desc)) {
407 return 0; 539 if (desc->action)
408 540 if (irqflags & desc->action->flags & IRQF_SHARED)
409 raw_spin_lock_irqsave(&desc->lock, flags); 541 canrequest = 1;
410 action = desc->action; 542 }
411 if (action) 543 irq_put_desc_unlock(desc, flags);
412 if (irqflags & action->flags & IRQF_SHARED) 544 return canrequest;
413 action = NULL;
414
415 raw_spin_unlock_irqrestore(&desc->lock, flags);
416
417 return !action;
418}
419
420void compat_irq_chip_set_default_handler(struct irq_desc *desc)
421{
422 /*
423 * If the architecture still has not overriden
424 * the flow handler then zap the default. This
425 * should catch incorrect flow-type setting.
426 */
427 if (desc->handle_irq == &handle_bad_irq)
428 desc->handle_irq = NULL;
429} 545}
430 546
431int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 547int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
432 unsigned long flags) 548 unsigned long flags)
433{ 549{
434 int ret; 550 struct irq_chip *chip = desc->irq_data.chip;
435 struct irq_chip *chip = desc->chip; 551 int ret, unmask = 0;
436 552
437 if (!chip || !chip->set_type) { 553 if (!chip || !chip->irq_set_type) {
438 /* 554 /*
439 * IRQF_TRIGGER_* but the PIC does not support multiple 555 * IRQF_TRIGGER_* but the PIC does not support multiple
440 * flow-types? 556 * flow-types?
@@ -444,23 +560,41 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
444 return 0; 560 return 0;
445 } 561 }
446 562
447 /* caller masked out all except trigger mode flags */ 563 flags &= IRQ_TYPE_SENSE_MASK;
448 ret = chip->set_type(irq, flags); 564
449 565 if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
450 if (ret) 566 if (!irqd_irq_masked(&desc->irq_data))
451 pr_err("setting trigger mode %d for irq %u failed (%pF)\n", 567 mask_irq(desc);
452 (int)flags, irq, chip->set_type); 568 if (!irqd_irq_disabled(&desc->irq_data))
453 else { 569 unmask = 1;
454 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
455 flags |= IRQ_LEVEL;
456 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
457 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
458 desc->status |= flags;
459
460 if (chip != desc->chip)
461 irq_chip_set_defaults(desc->chip);
462 } 570 }
463 571
572 /* caller masked out all except trigger mode flags */
573 ret = chip->irq_set_type(&desc->irq_data, flags);
574
575 switch (ret) {
576 case IRQ_SET_MASK_OK:
577 irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
578 irqd_set(&desc->irq_data, flags);
579
580 case IRQ_SET_MASK_OK_NOCOPY:
581 flags = irqd_get_trigger_type(&desc->irq_data);
582 irq_settings_set_trigger_mask(desc, flags);
583 irqd_clear(&desc->irq_data, IRQD_LEVEL);
584 irq_settings_clr_level(desc);
585 if (flags & IRQ_TYPE_LEVEL_MASK) {
586 irq_settings_set_level(desc);
587 irqd_set(&desc->irq_data, IRQD_LEVEL);
588 }
589
590 ret = 0;
591 break;
592 default:
593 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
594 flags, irq, chip->irq_set_type);
595 }
596 if (unmask)
597 unmask_irq(desc);
464 return ret; 598 return ret;
465} 599}
466 600
@@ -504,10 +638,13 @@ static int irq_wait_for_interrupt(struct irqaction *action)
504 * handler finished. unmask if the interrupt has not been disabled and 638 * handler finished. unmask if the interrupt has not been disabled and
505 * is marked MASKED. 639 * is marked MASKED.
506 */ 640 */
507static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 641static void irq_finalize_oneshot(struct irq_desc *desc,
642 struct irqaction *action, bool force)
508{ 643{
644 if (!(desc->istate & IRQS_ONESHOT))
645 return;
509again: 646again:
510 chip_bus_lock(irq, desc); 647 chip_bus_lock(desc);
511 raw_spin_lock_irq(&desc->lock); 648 raw_spin_lock_irq(&desc->lock);
512 649
513 /* 650 /*
@@ -517,26 +654,42 @@ again:
517 * The thread is faster done than the hard interrupt handler 654 * The thread is faster done than the hard interrupt handler
518 * on the other CPU. If we unmask the irq line then the 655 * on the other CPU. If we unmask the irq line then the
519 * interrupt can come in again and masks the line, leaves due 656 * interrupt can come in again and masks the line, leaves due
520 * to IRQ_INPROGRESS and the irq line is masked forever. 657 * to IRQS_INPROGRESS and the irq line is masked forever.
658 *
659 * This also serializes the state of shared oneshot handlers
 660 * versus "desc->threads_oneshot |= action->thread_mask;" in
661 * irq_wake_thread(). See the comment there which explains the
662 * serialization.
521 */ 663 */
522 if (unlikely(desc->status & IRQ_INPROGRESS)) { 664 if (unlikely(irqd_irq_inprogress(&desc->irq_data))) {
523 raw_spin_unlock_irq(&desc->lock); 665 raw_spin_unlock_irq(&desc->lock);
524 chip_bus_sync_unlock(irq, desc); 666 chip_bus_sync_unlock(desc);
525 cpu_relax(); 667 cpu_relax();
526 goto again; 668 goto again;
527 } 669 }
528 670
529 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 671 /*
530 desc->status &= ~IRQ_MASKED; 672 * Now check again, whether the thread should run. Otherwise
531 desc->chip->unmask(irq); 673 * we would clear the threads_oneshot bit of this thread which
532 } 674 * was just set.
675 */
676 if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
677 goto out_unlock;
678
679 desc->threads_oneshot &= ~action->thread_mask;
680
681 if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) &&
682 irqd_irq_masked(&desc->irq_data))
683 unmask_irq(desc);
684
685out_unlock:
533 raw_spin_unlock_irq(&desc->lock); 686 raw_spin_unlock_irq(&desc->lock);
534 chip_bus_sync_unlock(irq, desc); 687 chip_bus_sync_unlock(desc);
535} 688}
536 689
537#ifdef CONFIG_SMP 690#ifdef CONFIG_SMP
538/* 691/*
539 * Check whether we need to change the affinity of the interrupt thread. 692 * Check whether we need to chasnge the affinity of the interrupt thread.
540 */ 693 */
541static void 694static void
542irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) 695irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
@@ -556,7 +709,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
556 } 709 }
557 710
558 raw_spin_lock_irq(&desc->lock); 711 raw_spin_lock_irq(&desc->lock);
559 cpumask_copy(mask, desc->affinity); 712 cpumask_copy(mask, desc->irq_data.affinity);
560 raw_spin_unlock_irq(&desc->lock); 713 raw_spin_unlock_irq(&desc->lock);
561 714
562 set_cpus_allowed_ptr(current, mask); 715 set_cpus_allowed_ptr(current, mask);
@@ -568,14 +721,57 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
568#endif 721#endif
569 722
570/* 723/*
 724 * Interrupts which are not explicitly requested as threaded
725 * interrupts rely on the implicit bh/preempt disable of the hard irq
726 * context. So we need to disable bh here to avoid deadlocks and other
727 * side effects.
728 */
729static irqreturn_t
730irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
731{
732 irqreturn_t ret;
733
734 local_bh_disable();
735 ret = action->thread_fn(action->irq, action->dev_id);
736 irq_finalize_oneshot(desc, action, false);
737 local_bh_enable();
738 return ret;
739}
740
741/*
 742 * Interrupts explicitly requested as threaded interrupts want to be
 743 * preemptible - many of them need to sleep and wait for slow buses to
744 * complete.
745 */
746static irqreturn_t irq_thread_fn(struct irq_desc *desc,
747 struct irqaction *action)
748{
749 irqreturn_t ret;
750
751 ret = action->thread_fn(action->irq, action->dev_id);
752 irq_finalize_oneshot(desc, action, false);
753 return ret;
754}
755
756/*
571 * Interrupt handler thread 757 * Interrupt handler thread
572 */ 758 */
573static int irq_thread(void *data) 759static int irq_thread(void *data)
574{ 760{
575 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 761 static const struct sched_param param = {
762 .sched_priority = MAX_USER_RT_PRIO/2,
763 };
576 struct irqaction *action = data; 764 struct irqaction *action = data;
577 struct irq_desc *desc = irq_to_desc(action->irq); 765 struct irq_desc *desc = irq_to_desc(action->irq);
578 int wake, oneshot = desc->status & IRQ_ONESHOT; 766 irqreturn_t (*handler_fn)(struct irq_desc *desc,
767 struct irqaction *action);
768 int wake;
769
 770 if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
771 &action->thread_flags))
772 handler_fn = irq_forced_thread_fn;
773 else
774 handler_fn = irq_thread_fn;
579 775
580 sched_setscheduler(current, SCHED_FIFO, &param); 776 sched_setscheduler(current, SCHED_FIFO, &param);
581 current->irqaction = action; 777 current->irqaction = action;
@@ -587,23 +783,23 @@ static int irq_thread(void *data)
587 atomic_inc(&desc->threads_active); 783 atomic_inc(&desc->threads_active);
588 784
589 raw_spin_lock_irq(&desc->lock); 785 raw_spin_lock_irq(&desc->lock);
590 if (unlikely(desc->status & IRQ_DISABLED)) { 786 if (unlikely(irqd_irq_disabled(&desc->irq_data))) {
591 /* 787 /*
592 * CHECKME: We might need a dedicated 788 * CHECKME: We might need a dedicated
593 * IRQ_THREAD_PENDING flag here, which 789 * IRQ_THREAD_PENDING flag here, which
594 * retriggers the thread in check_irq_resend() 790 * retriggers the thread in check_irq_resend()
595 * but AFAICT IRQ_PENDING should be fine as it 791 * but AFAICT IRQS_PENDING should be fine as it
596 * retriggers the interrupt itself --- tglx 792 * retriggers the interrupt itself --- tglx
597 */ 793 */
598 desc->status |= IRQ_PENDING; 794 desc->istate |= IRQS_PENDING;
599 raw_spin_unlock_irq(&desc->lock); 795 raw_spin_unlock_irq(&desc->lock);
600 } else { 796 } else {
601 raw_spin_unlock_irq(&desc->lock); 797 irqreturn_t action_ret;
602
603 action->thread_fn(action->irq, action->dev_id);
604 798
605 if (oneshot) 799 raw_spin_unlock_irq(&desc->lock);
606 irq_finalize_oneshot(action->irq, desc); 800 action_ret = handler_fn(desc, action);
801 if (!noirqdebug)
802 note_interrupt(action->irq, desc, action_ret);
607 } 803 }
608 804
609 wake = atomic_dec_and_test(&desc->threads_active); 805 wake = atomic_dec_and_test(&desc->threads_active);
@@ -612,6 +808,9 @@ static int irq_thread(void *data)
612 wake_up(&desc->wait_for_threads); 808 wake_up(&desc->wait_for_threads);
613 } 809 }
614 810
811 /* Prevent a stale desc->threads_oneshot */
812 irq_finalize_oneshot(desc, action, true);
813
615 /* 814 /*
616 * Clear irqaction. Otherwise exit_irq_thread() would make 815 * Clear irqaction. Otherwise exit_irq_thread() would make
617 * fuzz about an active irq thread going into nirvana. 816 * fuzz about an active irq thread going into nirvana.
@@ -626,6 +825,7 @@ static int irq_thread(void *data)
626void exit_irq_thread(void) 825void exit_irq_thread(void)
627{ 826{
628 struct task_struct *tsk = current; 827 struct task_struct *tsk = current;
828 struct irq_desc *desc;
629 829
630 if (!tsk->irqaction) 830 if (!tsk->irqaction)
631 return; 831 return;
@@ -634,6 +834,14 @@ void exit_irq_thread(void)
634 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 834 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
635 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); 835 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
636 836
837 desc = irq_to_desc(tsk->irqaction->irq);
838
839 /*
840 * Prevent a stale desc->threads_oneshot. Must be called
841 * before setting the IRQTF_DIED flag.
842 */
843 irq_finalize_oneshot(desc, tsk->irqaction, true);
844
637 /* 845 /*
638 * Set the THREAD DIED flag to prevent further wakeups of the 846 * Set the THREAD DIED flag to prevent further wakeups of the
639 * soon to be gone threaded handler. 847 * soon to be gone threaded handler.
@@ -641,6 +849,22 @@ void exit_irq_thread(void)
641 set_bit(IRQTF_DIED, &tsk->irqaction->flags); 849 set_bit(IRQTF_DIED, &tsk->irqaction->flags);
642} 850}
643 851
852static void irq_setup_forced_threading(struct irqaction *new)
853{
854 if (!force_irqthreads)
855 return;
856 if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
857 return;
858
859 new->flags |= IRQF_ONESHOT;
860
861 if (!new->thread_fn) {
862 set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
863 new->thread_fn = new->handler;
864 new->handler = irq_default_primary_handler;
865 }
866}
867
644/* 868/*
645 * Internal function to register an irqaction - typically used to 869 * Internal function to register an irqaction - typically used to
646 * allocate special interrupts that are part of the architecture. 870 * allocate special interrupts that are part of the architecture.
@@ -650,14 +874,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
650{ 874{
651 struct irqaction *old, **old_ptr; 875 struct irqaction *old, **old_ptr;
652 const char *old_name = NULL; 876 const char *old_name = NULL;
653 unsigned long flags; 877 unsigned long flags, thread_mask = 0;
654 int nested, shared = 0; 878 int ret, nested, shared = 0;
655 int ret; 879 cpumask_var_t mask;
656 880
657 if (!desc) 881 if (!desc)
658 return -EINVAL; 882 return -EINVAL;
659 883
660 if (desc->chip == &no_irq_chip) 884 if (desc->irq_data.chip == &no_irq_chip)
661 return -ENOSYS; 885 return -ENOSYS;
662 /* 886 /*
663 * Some drivers like serial.c use request_irq() heavily, 887 * Some drivers like serial.c use request_irq() heavily,
@@ -676,15 +900,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
676 rand_initialize_irq(irq); 900 rand_initialize_irq(irq);
677 } 901 }
678 902
679 /* Oneshot interrupts are not allowed with shared */
680 if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
681 return -EINVAL;
682
683 /* 903 /*
684 * Check whether the interrupt nests into another interrupt 904 * Check whether the interrupt nests into another interrupt
685 * thread. 905 * thread.
686 */ 906 */
687 nested = desc->status & IRQ_NESTED_THREAD; 907 nested = irq_settings_is_nested_thread(desc);
688 if (nested) { 908 if (nested) {
689 if (!new->thread_fn) 909 if (!new->thread_fn)
690 return -EINVAL; 910 return -EINVAL;
@@ -694,6 +914,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
694 * dummy function which warns when called. 914 * dummy function which warns when called.
695 */ 915 */
696 new->handler = irq_nested_primary_handler; 916 new->handler = irq_nested_primary_handler;
917 } else {
918 if (irq_settings_can_thread(desc))
919 irq_setup_forced_threading(new);
697 } 920 }
698 921
699 /* 922 /*
@@ -717,6 +940,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
717 new->thread = t; 940 new->thread = t;
718 } 941 }
719 942
943 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
944 ret = -ENOMEM;
945 goto out_thread;
946 }
947
720 /* 948 /*
721 * The following block of code has to be executed atomically 949 * The following block of code has to be executed atomically
722 */ 950 */
@@ -728,32 +956,41 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
728 * Can't share interrupts unless both agree to and are 956 * Can't share interrupts unless both agree to and are
729 * the same type (level, edge, polarity). So both flag 957 * the same type (level, edge, polarity). So both flag
730 * fields must have IRQF_SHARED set and the bits which 958 * fields must have IRQF_SHARED set and the bits which
731 * set the trigger type must match. 959 * set the trigger type must match. Also all must
960 * agree on ONESHOT.
732 */ 961 */
733 if (!((old->flags & new->flags) & IRQF_SHARED) || 962 if (!((old->flags & new->flags) & IRQF_SHARED) ||
734 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { 963 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
964 ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
735 old_name = old->name; 965 old_name = old->name;
736 goto mismatch; 966 goto mismatch;
737 } 967 }
738 968
739#if defined(CONFIG_IRQ_PER_CPU)
740 /* All handlers must agree on per-cpuness */ 969 /* All handlers must agree on per-cpuness */
741 if ((old->flags & IRQF_PERCPU) != 970 if ((old->flags & IRQF_PERCPU) !=
742 (new->flags & IRQF_PERCPU)) 971 (new->flags & IRQF_PERCPU))
743 goto mismatch; 972 goto mismatch;
744#endif
745 973
746 /* add new interrupt at end of irq queue */ 974 /* add new interrupt at end of irq queue */
747 do { 975 do {
976 thread_mask |= old->thread_mask;
748 old_ptr = &old->next; 977 old_ptr = &old->next;
749 old = *old_ptr; 978 old = *old_ptr;
750 } while (old); 979 } while (old);
751 shared = 1; 980 shared = 1;
752 } 981 }
753 982
754 if (!shared) { 983 /*
755 irq_chip_set_defaults(desc->chip); 984 * Setup the thread mask for this irqaction. Unlikely to have
 985 * 32 or 64 irqs sharing one line, but who knows.
986 */
987 if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) {
988 ret = -EBUSY;
989 goto out_mask;
990 }
991 new->thread_mask = 1 << ffz(thread_mask);
756 992
993 if (!shared) {
757 init_waitqueue_head(&desc->wait_for_threads); 994 init_waitqueue_head(&desc->wait_for_threads);
758 995
759 /* Setup the type (level, edge polarity) if configured: */ 996 /* Setup the type (level, edge polarity) if configured: */
@@ -762,42 +999,44 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
762 new->flags & IRQF_TRIGGER_MASK); 999 new->flags & IRQF_TRIGGER_MASK);
763 1000
764 if (ret) 1001 if (ret)
765 goto out_thread; 1002 goto out_mask;
766 } else 1003 }
767 compat_irq_chip_set_default_handler(desc); 1004
768#if defined(CONFIG_IRQ_PER_CPU) 1005 desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
769 if (new->flags & IRQF_PERCPU) 1006 IRQS_ONESHOT | IRQS_WAITING);
770 desc->status |= IRQ_PER_CPU; 1007 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
771#endif
772 1008
773 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | 1009 if (new->flags & IRQF_PERCPU) {
774 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 1010 irqd_set(&desc->irq_data, IRQD_PER_CPU);
1011 irq_settings_set_per_cpu(desc);
1012 }
775 1013
776 if (new->flags & IRQF_ONESHOT) 1014 if (new->flags & IRQF_ONESHOT)
777 desc->status |= IRQ_ONESHOT; 1015 desc->istate |= IRQS_ONESHOT;
778 1016
779 if (!(desc->status & IRQ_NOAUTOEN)) { 1017 if (irq_settings_can_autoenable(desc))
780 desc->depth = 0; 1018 irq_startup(desc);
781 desc->status &= ~IRQ_DISABLED; 1019 else
782 desc->chip->startup(irq);
783 } else
784 /* Undo nested disables: */ 1020 /* Undo nested disables: */
785 desc->depth = 1; 1021 desc->depth = 1;
786 1022
787 /* Exclude IRQ from balancing if requested */ 1023 /* Exclude IRQ from balancing if requested */
788 if (new->flags & IRQF_NOBALANCING) 1024 if (new->flags & IRQF_NOBALANCING) {
789 desc->status |= IRQ_NO_BALANCING; 1025 irq_settings_set_no_balancing(desc);
1026 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
1027 }
790 1028
791 /* Set default affinity mask once everything is setup */ 1029 /* Set default affinity mask once everything is setup */
792 setup_affinity(irq, desc); 1030 setup_affinity(irq, desc, mask);
793 1031
794 } else if ((new->flags & IRQF_TRIGGER_MASK) 1032 } else if (new->flags & IRQF_TRIGGER_MASK) {
795 && (new->flags & IRQF_TRIGGER_MASK) 1033 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
796 != (desc->status & IRQ_TYPE_SENSE_MASK)) { 1034 unsigned int omsk = irq_settings_get_trigger_mask(desc);
797 /* hope the handler works with the actual trigger mode... */ 1035
798 pr_warning("IRQ %d uses trigger mode %d; requested %d\n", 1036 if (nmsk != omsk)
799 irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), 1037 /* hope the handler works with current trigger mode */
800 (int)(new->flags & IRQF_TRIGGER_MASK)); 1038 pr_warning("IRQ %d uses trigger mode %u; requested %u\n",
1039 irq, nmsk, omsk);
801 } 1040 }
802 1041
803 new->irq = irq; 1042 new->irq = irq;
@@ -811,8 +1050,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
811 * Check whether we disabled the irq via the spurious handler 1050 * Check whether we disabled the irq via the spurious handler
812 * before. Reenable it and give it another chance. 1051 * before. Reenable it and give it another chance.
813 */ 1052 */
814 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { 1053 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
815 desc->status &= ~IRQ_SPURIOUS_DISABLED; 1054 desc->istate &= ~IRQS_SPURIOUS_DISABLED;
816 __enable_irq(desc, irq, false); 1055 __enable_irq(desc, irq, false);
817 } 1056 }
818 1057
@@ -828,6 +1067,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
828 register_irq_proc(irq, desc); 1067 register_irq_proc(irq, desc);
829 new->dir = NULL; 1068 new->dir = NULL;
830 register_handler_proc(irq, new); 1069 register_handler_proc(irq, new);
1070 free_cpumask_var(mask);
831 1071
832 return 0; 1072 return 0;
833 1073
@@ -842,8 +1082,11 @@ mismatch:
842#endif 1082#endif
843 ret = -EBUSY; 1083 ret = -EBUSY;
844 1084
845out_thread: 1085out_mask:
846 raw_spin_unlock_irqrestore(&desc->lock, flags); 1086 raw_spin_unlock_irqrestore(&desc->lock, flags);
1087 free_cpumask_var(mask);
1088
1089out_thread:
847 if (new->thread) { 1090 if (new->thread) {
848 struct task_struct *t = new->thread; 1091 struct task_struct *t = new->thread;
849 1092
@@ -864,9 +1107,14 @@ out_thread:
864 */ 1107 */
865int setup_irq(unsigned int irq, struct irqaction *act) 1108int setup_irq(unsigned int irq, struct irqaction *act)
866{ 1109{
1110 int retval;
867 struct irq_desc *desc = irq_to_desc(irq); 1111 struct irq_desc *desc = irq_to_desc(irq);
868 1112
869 return __setup_irq(irq, desc, act); 1113 chip_bus_lock(desc);
1114 retval = __setup_irq(irq, desc, act);
1115 chip_bus_sync_unlock(desc);
1116
1117 return retval;
870} 1118}
871EXPORT_SYMBOL_GPL(setup_irq); 1119EXPORT_SYMBOL_GPL(setup_irq);
872 1120
@@ -912,18 +1160,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
912 1160
913 /* Currently used only by UML, might disappear one day: */ 1161 /* Currently used only by UML, might disappear one day: */
914#ifdef CONFIG_IRQ_RELEASE_METHOD 1162#ifdef CONFIG_IRQ_RELEASE_METHOD
915 if (desc->chip->release) 1163 if (desc->irq_data.chip->release)
916 desc->chip->release(irq, dev_id); 1164 desc->irq_data.chip->release(irq, dev_id);
917#endif 1165#endif
918 1166
919 /* If this was the last handler, shut down the IRQ line: */ 1167 /* If this was the last handler, shut down the IRQ line: */
920 if (!desc->action) { 1168 if (!desc->action)
921 desc->status |= IRQ_DISABLED; 1169 irq_shutdown(desc);
922 if (desc->chip->shutdown)
923 desc->chip->shutdown(irq);
924 else
925 desc->chip->disable(irq);
926 }
927 1170
928#ifdef CONFIG_SMP 1171#ifdef CONFIG_SMP
929 /* make sure affinity_hint is cleaned up */ 1172 /* make sure affinity_hint is cleaned up */
@@ -997,9 +1240,14 @@ void free_irq(unsigned int irq, void *dev_id)
997 if (!desc) 1240 if (!desc)
998 return; 1241 return;
999 1242
1000 chip_bus_lock(irq, desc); 1243#ifdef CONFIG_SMP
1244 if (WARN_ON(desc->affinity_notify))
1245 desc->affinity_notify = NULL;
1246#endif
1247
1248 chip_bus_lock(desc);
1001 kfree(__free_irq(irq, dev_id)); 1249 kfree(__free_irq(irq, dev_id));
1002 chip_bus_sync_unlock(irq, desc); 1250 chip_bus_sync_unlock(desc);
1003} 1251}
1004EXPORT_SYMBOL(free_irq); 1252EXPORT_SYMBOL(free_irq);
1005 1253
@@ -1067,7 +1315,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1067 if (!desc) 1315 if (!desc)
1068 return -EINVAL; 1316 return -EINVAL;
1069 1317
1070 if (desc->status & IRQ_NOREQUEST) 1318 if (!irq_settings_can_request(desc))
1071 return -EINVAL; 1319 return -EINVAL;
1072 1320
1073 if (!handler) { 1321 if (!handler) {
@@ -1086,14 +1334,14 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1086 action->name = devname; 1334 action->name = devname;
1087 action->dev_id = dev_id; 1335 action->dev_id = dev_id;
1088 1336
1089 chip_bus_lock(irq, desc); 1337 chip_bus_lock(desc);
1090 retval = __setup_irq(irq, desc, action); 1338 retval = __setup_irq(irq, desc, action);
1091 chip_bus_sync_unlock(irq, desc); 1339 chip_bus_sync_unlock(desc);
1092 1340
1093 if (retval) 1341 if (retval)
1094 kfree(action); 1342 kfree(action);
1095 1343
1096#ifdef CONFIG_DEBUG_SHIRQ 1344#ifdef CONFIG_DEBUG_SHIRQ_FIXME
1097 if (!retval && (irqflags & IRQF_SHARED)) { 1345 if (!retval && (irqflags & IRQF_SHARED)) {
1098 /* 1346 /*
1099 * It's a shared IRQ -- the driver ought to be prepared for it 1347 * It's a shared IRQ -- the driver ought to be prepared for it
@@ -1142,7 +1390,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1142 if (!desc) 1390 if (!desc)
1143 return -EINVAL; 1391 return -EINVAL;
1144 1392
1145 if (desc->status & IRQ_NESTED_THREAD) { 1393 if (irq_settings_is_nested_thread(desc)) {
1146 ret = request_threaded_irq(irq, NULL, handler, 1394 ret = request_threaded_irq(irq, NULL, handler,
1147 flags, name, dev_id); 1395 flags, name, dev_id);
1148 return !ret ? IRQC_IS_NESTED : ret; 1396 return !ret ? IRQC_IS_NESTED : ret;
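The manage.c rework above (forced threading, oneshot thread_mask accounting, bus-locked descriptor helpers) is ultimately exercised through request_threaded_irq(). Below is a sketch of a driver-side call that goes through those paths; the demo_* identifiers are invented and nothing here is mandated by the patch.

#include <linux/interrupt.h>

static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
	/* quick check in hard irq context, defer the heavy work */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
	/* runs in the per-action thread created by __setup_irq() */
	return IRQ_HANDLED;
}

static int demo_request(unsigned int irq, void *dev)
{
	/*
	 * IRQF_ONESHOT keeps the line masked until demo_thread_fn()
	 * returns; the new thread_mask bookkeeping lets such actions
	 * share one line as long as all of them agree on ONESHOT.
	 */
	return request_threaded_irq(irq, demo_hardirq, demo_thread_fn,
				    IRQF_SHARED | IRQF_ONESHOT, "demo", dev);
}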
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 241962280836..47420908fba0 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,27 +4,28 @@
4 4
5#include "internals.h" 5#include "internals.h"
6 6
7void move_masked_irq(int irq) 7void irq_move_masked_irq(struct irq_data *idata)
8{ 8{
9 struct irq_desc *desc = irq_to_desc(irq); 9 struct irq_desc *desc = irq_data_to_desc(idata);
10 struct irq_chip *chip = idata->chip;
10 11
11 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 12 if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
12 return; 13 return;
13 14
14 /* 15 /*
15 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. 16 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
16 */ 17 */
17 if (CHECK_IRQ_PER_CPU(desc->status)) { 18 if (!irqd_can_balance(&desc->irq_data)) {
18 WARN_ON(1); 19 WARN_ON(1);
19 return; 20 return;
20 } 21 }
21 22
22 desc->status &= ~IRQ_MOVE_PENDING; 23 irqd_clr_move_pending(&desc->irq_data);
23 24
24 if (unlikely(cpumask_empty(desc->pending_mask))) 25 if (unlikely(cpumask_empty(desc->pending_mask)))
25 return; 26 return;
26 27
27 if (!desc->chip->set_affinity) 28 if (!chip->irq_set_affinity)
28 return; 29 return;
29 30
30 assert_raw_spin_locked(&desc->lock); 31 assert_raw_spin_locked(&desc->lock);
@@ -34,7 +35,7 @@ void move_masked_irq(int irq)
34 * do the disable, re-program, enable sequence. 35 * do the disable, re-program, enable sequence.
35 * This is *not* particularly important for level triggered 36 * This is *not* particularly important for level triggered
36 * but in a edge trigger case, we might be setting rte 37 * but in a edge trigger case, we might be setting rte
37 * when an active trigger is comming in. This could 38 * when an active trigger is coming in. This could
38 * cause some ioapics to mal-function. 39 * cause some ioapics to mal-function.
39 * Being paranoid i guess! 40 * Being paranoid i guess!
40 * 41 *
@@ -43,26 +44,34 @@ void move_masked_irq(int irq)
43 */ 44 */
44 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
45 < nr_cpu_ids)) 46 < nr_cpu_ids))
46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) { 47 if (!chip->irq_set_affinity(&desc->irq_data,
47 cpumask_copy(desc->affinity, desc->pending_mask); 48 desc->pending_mask, false)) {
49 cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
48 irq_set_thread_affinity(desc); 50 irq_set_thread_affinity(desc);
49 } 51 }
50 52
51 cpumask_clear(desc->pending_mask); 53 cpumask_clear(desc->pending_mask);
52} 54}
53 55
54void move_native_irq(int irq) 56void irq_move_irq(struct irq_data *idata)
55{ 57{
56 struct irq_desc *desc = irq_to_desc(irq); 58 bool masked;
57 59
58 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 60 if (likely(!irqd_is_setaffinity_pending(idata)))
59 return; 61 return;
60 62
61 if (unlikely(desc->status & IRQ_DISABLED)) 63 if (unlikely(irqd_irq_disabled(idata)))
62 return; 64 return;
63 65
64 desc->chip->mask(irq); 66 /*
65 move_masked_irq(irq); 67 * Be careful vs. already masked interrupts. If this is a
66 desc->chip->unmask(irq); 68 * threaded interrupt with ONESHOT set, we can end up with an
69 * interrupt storm.
70 */
71 masked = irqd_irq_masked(idata);
72 if (!masked)
73 idata->chip->irq_mask(idata);
74 irq_move_masked_irq(idata);
75 if (!masked)
76 idata->chip->irq_unmask(idata);
67} 77}
68
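
The rewritten irq_move_irq() above only masks the line if it was not already masked, so a threaded ONESHOT interrupt (already masked by its flow handler) is not unmasked early and turned into an interrupt storm. A standalone restatement of that guard pattern, using only the irq_data accessors visible in this hunk; the helper itself is hypothetical:

#include <linux/irq.h>

/* Hypothetical helper: run @fn with the line masked, restoring the
 * previous mask state afterwards (the pattern irq_move_irq() now uses). */
static void with_line_masked(struct irq_data *idata,
			     void (*fn)(struct irq_data *idata))
{
	bool was_masked = irqd_irq_masked(idata);

	if (!was_masked)
		idata->chip->irq_mask(idata);
	fn(idata);
	if (!was_masked)
		idata->chip->irq_unmask(idata);
}
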
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
deleted file mode 100644
index 65d3845665ac..000000000000
--- a/kernel/irq/numa_migrate.c
+++ /dev/null
@@ -1,120 +0,0 @@
1/*
2 * NUMA irq-desc migration code
3 *
4 * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to
5 * the new "home node" of the IRQ.
6 */
7
8#include <linux/irq.h>
9#include <linux/slab.h>
10#include <linux/module.h>
11#include <linux/random.h>
12#include <linux/interrupt.h>
13#include <linux/kernel_stat.h>
14
15#include "internals.h"
16
17static void init_copy_kstat_irqs(struct irq_desc *old_desc,
18 struct irq_desc *desc,
19 int node, int nr)
20{
21 init_kstat_irqs(desc, node, nr);
22
23 if (desc->kstat_irqs != old_desc->kstat_irqs)
24 memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
25 nr * sizeof(*desc->kstat_irqs));
26}
27
28static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
29{
30 if (old_desc->kstat_irqs == desc->kstat_irqs)
31 return;
32
33 kfree(old_desc->kstat_irqs);
34 old_desc->kstat_irqs = NULL;
35}
36
37static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
38 struct irq_desc *desc, int node)
39{
40 memcpy(desc, old_desc, sizeof(struct irq_desc));
41 if (!alloc_desc_masks(desc, node, false)) {
42 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
43 "for migration.\n", irq);
44 return false;
45 }
46 raw_spin_lock_init(&desc->lock);
47 desc->node = node;
48 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
49 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
50 init_copy_desc_masks(old_desc, desc);
51 arch_init_copy_chip_data(old_desc, desc, node);
52 return true;
53}
54
55static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
56{
57 free_kstat_irqs(old_desc, desc);
58 free_desc_masks(old_desc, desc);
59 arch_free_chip_data(old_desc, desc);
60}
61
62static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
63 int node)
64{
65 struct irq_desc *desc;
66 unsigned int irq;
67 unsigned long flags;
68
69 irq = old_desc->irq;
70
71 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
72
73 /* We have to check it to avoid races with another CPU */
74 desc = irq_to_desc(irq);
75
76 if (desc && old_desc != desc)
77 goto out_unlock;
78
79 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
80 if (!desc) {
81 printk(KERN_ERR "irq %d: can not get new irq_desc "
82 "for migration.\n", irq);
83 /* still use old one */
84 desc = old_desc;
85 goto out_unlock;
86 }
87 if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
88 /* still use old one */
89 kfree(desc);
90 desc = old_desc;
91 goto out_unlock;
92 }
93
94 replace_irq_desc(irq, desc);
95 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
96
97 /* free the old one */
98 free_one_irq_desc(old_desc, desc);
99 kfree(old_desc);
100
101 return desc;
102
103out_unlock:
104 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
105
106 return desc;
107}
108
109struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
110{
111 /* those static or target node is -1, do not move them */
112 if (desc->irq < NR_IRQS_LEGACY || node == -1)
113 return desc;
114
115 if (desc->node != node)
116 desc = __real_move_irq_desc(desc, node);
117
118 return desc;
119}
120
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 0d4005d85b03..f76fc00c9877 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -18,7 +18,7 @@
18 * During system-wide suspend or hibernation device drivers need to be prevented 18 * During system-wide suspend or hibernation device drivers need to be prevented
19 * from receiving interrupts and this function is provided for this purpose. 19 * from receiving interrupts and this function is provided for this purpose.
20 * It marks all interrupt lines in use, except for the timer ones, as disabled 20 * It marks all interrupt lines in use, except for the timer ones, as disabled
21 * and sets the IRQ_SUSPENDED flag for each of them. 21 * and sets the IRQS_SUSPENDED flag for each of them.
22 */ 22 */
23void suspend_device_irqs(void) 23void suspend_device_irqs(void)
24{ 24{
@@ -34,7 +34,7 @@ void suspend_device_irqs(void)
34 } 34 }
35 35
36 for_each_irq_desc(irq, desc) 36 for_each_irq_desc(irq, desc)
37 if (desc->status & IRQ_SUSPENDED) 37 if (desc->istate & IRQS_SUSPENDED)
38 synchronize_irq(irq); 38 synchronize_irq(irq);
39} 39}
40EXPORT_SYMBOL_GPL(suspend_device_irqs); 40EXPORT_SYMBOL_GPL(suspend_device_irqs);
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs);
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() 43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 * 44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that 45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQ_SUSPENDED flag set. 46 * have the IRQS_SUSPENDED flag set.
47 */ 47 */
48void resume_device_irqs(void) 48void resume_device_irqs(void)
49{ 49{
@@ -53,9 +53,6 @@ void resume_device_irqs(void)
53 for_each_irq_desc(irq, desc) { 53 for_each_irq_desc(irq, desc) {
54 unsigned long flags; 54 unsigned long flags;
55 55
56 if (!(desc->status & IRQ_SUSPENDED))
57 continue;
58
59 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true); 57 __enable_irq(desc, irq, true);
61 raw_spin_unlock_irqrestore(&desc->lock, flags); 58 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -71,9 +68,24 @@ int check_wakeup_irqs(void)
71 struct irq_desc *desc; 68 struct irq_desc *desc;
72 int irq; 69 int irq;
73 70
74 for_each_irq_desc(irq, desc) 71 for_each_irq_desc(irq, desc) {
75 if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) 72 if (irqd_is_wakeup_set(&desc->irq_data)) {
76 return -EBUSY; 73 if (desc->istate & IRQS_PENDING)
74 return -EBUSY;
75 continue;
76 }
77 /*
78 * Check the non wakeup interrupts whether they need
79 * to be masked before finally going into suspend
80 * state. That's for hardware which has no wakeup
81 * source configuration facility. The chip
82 * implementation indicates that with
83 * IRQCHIP_MASK_ON_SUSPEND.
84 */
85 if (desc->istate & IRQS_SUSPENDED &&
86 irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
87 mask_irq(desc);
88 }
77 89
78 return 0; 90 return 0;
79} 91}
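
For the IRQCHIP_MASK_ON_SUSPEND branch added to check_wakeup_irqs() above, a sketch of the two sides involved: an irqchip with no wakeup-source register that asks the core to mask its non-wakeup lines before suspend, and a driver that marks its interrupt as a wakeup source with enable_irq_wake(). All names other than the flag and the two kernel calls are hypothetical.

#include <linux/irq.h>
#include <linux/interrupt.h>

static void my_chip_mask(struct irq_data *d)   { /* hypothetical hw access */ }
static void my_chip_unmask(struct irq_data *d) { /* hypothetical hw access */ }

static struct irq_chip my_chip = {
	.name		= "my-chip",
	.irq_mask	= my_chip_mask,
	.irq_unmask	= my_chip_unmask,
	/* No wakeup configuration register in hardware: let the core mask
	 * non-wakeup lines in check_wakeup_irqs() before suspend. */
	.flags		= IRQCHIP_MASK_ON_SUSPEND,
};

static int my_driver_suspend_prepare(int irq)
{
	/* Sets the wakeup state tested by irqd_is_wakeup_set() above. */
	return enable_irq_wake(irq);
}
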
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 09a2ee540bd2..4bd4faa6323a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -11,6 +11,7 @@
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h>
14 15
15#include "internals.h" 16#include "internals.h"
16 17
@@ -18,16 +19,19 @@ static struct proc_dir_entry *root_irq_dir;
18 19
19#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
20 21
21static int irq_affinity_proc_show(struct seq_file *m, void *v) 22static int show_irq_affinity(int type, struct seq_file *m, void *v)
22{ 23{
23 struct irq_desc *desc = irq_to_desc((long)m->private); 24 struct irq_desc *desc = irq_to_desc((long)m->private);
24 const struct cpumask *mask = desc->affinity; 25 const struct cpumask *mask = desc->irq_data.affinity;
25 26
26#ifdef CONFIG_GENERIC_PENDING_IRQ 27#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING) 28 if (irqd_is_setaffinity_pending(&desc->irq_data))
28 mask = desc->pending_mask; 29 mask = desc->pending_mask;
29#endif 30#endif
30 seq_cpumask(m, mask); 31 if (type)
32 seq_cpumask_list(m, mask);
33 else
34 seq_cpumask(m, mask);
31 seq_putc(m, '\n'); 35 seq_putc(m, '\n');
32 return 0; 36 return 0;
33} 37}
@@ -58,21 +62,34 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
58#endif 62#endif
59 63
60int no_irq_affinity; 64int no_irq_affinity;
61static ssize_t irq_affinity_proc_write(struct file *file, 65static int irq_affinity_proc_show(struct seq_file *m, void *v)
66{
67 return show_irq_affinity(0, m, v);
68}
69
70static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
71{
72 return show_irq_affinity(1, m, v);
73}
74
75
76static ssize_t write_irq_affinity(int type, struct file *file,
62 const char __user *buffer, size_t count, loff_t *pos) 77 const char __user *buffer, size_t count, loff_t *pos)
63{ 78{
64 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; 79 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
65 cpumask_var_t new_value; 80 cpumask_var_t new_value;
66 int err; 81 int err;
67 82
68 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || 83 if (!irq_can_set_affinity(irq) || no_irq_affinity)
69 irq_balancing_disabled(irq))
70 return -EIO; 84 return -EIO;
71 85
72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 86 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
73 return -ENOMEM; 87 return -ENOMEM;
74 88
75 err = cpumask_parse_user(buffer, count, new_value); 89 if (type)
90 err = cpumask_parselist_user(buffer, count, new_value);
91 else
92 err = cpumask_parse_user(buffer, count, new_value);
76 if (err) 93 if (err)
77 goto free_cpumask; 94 goto free_cpumask;
78 95
@@ -89,7 +106,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
89 if (!cpumask_intersects(new_value, cpu_online_mask)) { 106 if (!cpumask_intersects(new_value, cpu_online_mask)) {
90 /* Special case for empty set - allow the architecture 107 /* Special case for empty set - allow the architecture
91 code to set default SMP affinity. */ 108 code to set default SMP affinity. */
92 err = irq_select_affinity_usr(irq) ? -EINVAL : count; 109 err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count;
93 } else { 110 } else {
94 irq_set_affinity(irq, new_value); 111 irq_set_affinity(irq, new_value);
95 err = count; 112 err = count;
@@ -100,11 +117,28 @@ free_cpumask:
100 return err; 117 return err;
101} 118}
102 119
120static ssize_t irq_affinity_proc_write(struct file *file,
121 const char __user *buffer, size_t count, loff_t *pos)
122{
123 return write_irq_affinity(0, file, buffer, count, pos);
124}
125
126static ssize_t irq_affinity_list_proc_write(struct file *file,
127 const char __user *buffer, size_t count, loff_t *pos)
128{
129 return write_irq_affinity(1, file, buffer, count, pos);
130}
131
103static int irq_affinity_proc_open(struct inode *inode, struct file *file) 132static int irq_affinity_proc_open(struct inode *inode, struct file *file)
104{ 133{
105 return single_open(file, irq_affinity_proc_show, PDE(inode)->data); 134 return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
106} 135}
107 136
137static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
138{
139 return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data);
140}
141
108static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) 142static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
109{ 143{
110 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); 144 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
@@ -125,6 +159,14 @@ static const struct file_operations irq_affinity_hint_proc_fops = {
125 .release = single_release, 159 .release = single_release,
126}; 160};
127 161
162static const struct file_operations irq_affinity_list_proc_fops = {
163 .open = irq_affinity_list_proc_open,
164 .read = seq_read,
165 .llseek = seq_lseek,
166 .release = single_release,
167 .write = irq_affinity_list_proc_write,
168};
169
128static int default_affinity_show(struct seq_file *m, void *v) 170static int default_affinity_show(struct seq_file *m, void *v)
129{ 171{
130 seq_cpumask(m, irq_default_affinity); 172 seq_cpumask(m, irq_default_affinity);
@@ -185,7 +227,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
185{ 227{
186 struct irq_desc *desc = irq_to_desc((long) m->private); 228 struct irq_desc *desc = irq_to_desc((long) m->private);
187 229
188 seq_printf(m, "%d\n", desc->node); 230 seq_printf(m, "%d\n", desc->irq_data.node);
189 return 0; 231 return 0;
190} 232}
191 233
@@ -214,7 +256,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
214 256
215static int irq_spurious_proc_open(struct inode *inode, struct file *file) 257static int irq_spurious_proc_open(struct inode *inode, struct file *file)
216{ 258{
217 return single_open(file, irq_spurious_proc_show, NULL); 259 return single_open(file, irq_spurious_proc_show, PDE(inode)->data);
218} 260}
219 261
220static const struct file_operations irq_spurious_proc_fops = { 262static const struct file_operations irq_spurious_proc_fops = {
@@ -269,7 +311,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
269{ 311{
270 char name [MAX_NAMELEN]; 312 char name [MAX_NAMELEN];
271 313
272 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) 314 if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
273 return; 315 return;
274 316
275 memset(name, 0, MAX_NAMELEN); 317 memset(name, 0, MAX_NAMELEN);
@@ -289,6 +331,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
289 proc_create_data("affinity_hint", 0400, desc->dir, 331 proc_create_data("affinity_hint", 0400, desc->dir,
290 &irq_affinity_hint_proc_fops, (void *)(long)irq); 332 &irq_affinity_hint_proc_fops, (void *)(long)irq);
291 333
334 /* create /proc/irq/<irq>/smp_affinity_list */
335 proc_create_data("smp_affinity_list", 0600, desc->dir,
336 &irq_affinity_list_proc_fops, (void *)(long)irq);
337
292 proc_create_data("node", 0444, desc->dir, 338 proc_create_data("node", 0444, desc->dir,
293 &irq_node_proc_fops, (void *)(long)irq); 339 &irq_node_proc_fops, (void *)(long)irq);
294#endif 340#endif
@@ -297,6 +343,25 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
297 &irq_spurious_proc_fops, (void *)(long)irq); 343 &irq_spurious_proc_fops, (void *)(long)irq);
298} 344}
299 345
346void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
347{
348 char name [MAX_NAMELEN];
349
350 if (!root_irq_dir || !desc->dir)
351 return;
352#ifdef CONFIG_SMP
353 remove_proc_entry("smp_affinity", desc->dir);
354 remove_proc_entry("affinity_hint", desc->dir);
355 remove_proc_entry("smp_affinity_list", desc->dir);
356 remove_proc_entry("node", desc->dir);
357#endif
358 remove_proc_entry("spurious", desc->dir);
359
360 memset(name, 0, MAX_NAMELEN);
361 sprintf(name, "%u", irq);
362 remove_proc_entry(name, root_irq_dir);
363}
364
300#undef MAX_NAMELEN 365#undef MAX_NAMELEN
301 366
302void unregister_handler_proc(unsigned int irq, struct irqaction *action) 367void unregister_handler_proc(unsigned int irq, struct irqaction *action)
@@ -339,3 +404,83 @@ void init_irq_proc(void)
339 } 404 }
340} 405}
341 406
407#ifdef CONFIG_GENERIC_IRQ_SHOW
408
409int __weak arch_show_interrupts(struct seq_file *p, int prec)
410{
411 return 0;
412}
413
414#ifndef ACTUAL_NR_IRQS
415# define ACTUAL_NR_IRQS nr_irqs
416#endif
417
418int show_interrupts(struct seq_file *p, void *v)
419{
420 static int prec;
421
422 unsigned long flags, any_count = 0;
423 int i = *(loff_t *) v, j;
424 struct irqaction *action;
425 struct irq_desc *desc;
426
427 if (i > ACTUAL_NR_IRQS)
428 return 0;
429
430 if (i == ACTUAL_NR_IRQS)
431 return arch_show_interrupts(p, prec);
432
433 /* print header and calculate the width of the first column */
434 if (i == 0) {
435 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
436 j *= 10;
437
438 seq_printf(p, "%*s", prec + 8, "");
439 for_each_online_cpu(j)
440 seq_printf(p, "CPU%-8d", j);
441 seq_putc(p, '\n');
442 }
443
444 desc = irq_to_desc(i);
445 if (!desc)
446 return 0;
447
448 raw_spin_lock_irqsave(&desc->lock, flags);
449 for_each_online_cpu(j)
450 any_count |= kstat_irqs_cpu(i, j);
451 action = desc->action;
452 if (!action && !any_count)
453 goto out;
454
455 seq_printf(p, "%*d: ", prec, i);
456 for_each_online_cpu(j)
457 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
458
459 if (desc->irq_data.chip) {
460 if (desc->irq_data.chip->irq_print_chip)
461 desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
462 else if (desc->irq_data.chip->name)
463 seq_printf(p, " %8s", desc->irq_data.chip->name);
464 else
465 seq_printf(p, " %8s", "-");
466 } else {
467 seq_printf(p, " %8s", "None");
468 }
469#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
470 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
471#endif
472 if (desc->name)
473 seq_printf(p, "-%-8s", desc->name);
474
475 if (action) {
476 seq_printf(p, " %s", action->name);
477 while ((action = action->next) != NULL)
478 seq_printf(p, ", %s", action->name);
479 }
480
481 seq_putc(p, '\n');
482out:
483 raw_spin_unlock_irqrestore(&desc->lock, flags);
484 return 0;
485}
486#endif
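
The new /proc/irq/<irq>/smp_affinity_list file accepts and reports a human-readable CPU list (parsed with cpumask_parselist_user()), while the existing smp_affinity file keeps its hex-mask format. A small userspace sketch of the difference; IRQ number 19 is just an example:

#include <stdio.h>

int main(void)
{
	/* Pin IRQ 19 to CPUs 2-3.  The list file takes "2-3"; the older
	 * smp_affinity file would need the equivalent hex mask "c". */
	FILE *f = fopen("/proc/irq/19/smp_affinity_list", "w");

	if (!f)
		return 1;
	fprintf(f, "2-3\n");
	return fclose(f) ? 1 : 0;
}
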
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 090c3763f3a2..14dd5761e8c9 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -23,7 +23,7 @@
23#ifdef CONFIG_HARDIRQS_SW_RESEND 23#ifdef CONFIG_HARDIRQS_SW_RESEND
24 24
25/* Bitmap to handle software resend of interrupts: */ 25/* Bitmap to handle software resend of interrupts: */
26static DECLARE_BITMAP(irqs_resend, NR_IRQS); 26static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
27 27
28/* 28/*
29 * Run software resends of IRQ's 29 * Run software resends of IRQ's
@@ -55,22 +55,21 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
55 */ 55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq) 56void check_irq_resend(struct irq_desc *desc, unsigned int irq)
57{ 57{
58 unsigned int status = desc->status;
59
60 /*
61 * Make sure the interrupt is enabled, before resending it:
62 */
63 desc->chip->enable(irq);
64
65 /* 58 /*
66 * We do not resend level type interrupts. Level type 59 * We do not resend level type interrupts. Level type
67 * interrupts are resent by hardware when they are still 60 * interrupts are resent by hardware when they are still
68 * active. 61 * active.
69 */ 62 */
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 63 if (irq_settings_is_level(desc))
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 64 return;
65 if (desc->istate & IRQS_REPLAY)
66 return;
67 if (desc->istate & IRQS_PENDING) {
68 desc->istate &= ~IRQS_PENDING;
69 desc->istate |= IRQS_REPLAY;
72 70
73 if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) { 71 if (!desc->irq_data.chip->irq_retrigger ||
72 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
74#ifdef CONFIG_HARDIRQS_SW_RESEND 73#ifdef CONFIG_HARDIRQS_SW_RESEND
75 /* Set it pending and activate the softirq: */ 74 /* Set it pending and activate the softirq: */
76 set_bit(irq, irqs_resend); 75 set_bit(irq, irqs_resend);
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
new file mode 100644
index 000000000000..f1667833d444
--- /dev/null
+++ b/kernel/irq/settings.h
@@ -0,0 +1,142 @@
1/*
2 * Internal header to deal with irq_desc->status which will be renamed
3 * to irq_desc->settings.
4 */
5enum {
6 _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS,
7 _IRQ_PER_CPU = IRQ_PER_CPU,
8 _IRQ_LEVEL = IRQ_LEVEL,
9 _IRQ_NOPROBE = IRQ_NOPROBE,
10 _IRQ_NOREQUEST = IRQ_NOREQUEST,
11 _IRQ_NOTHREAD = IRQ_NOTHREAD,
12 _IRQ_NOAUTOEN = IRQ_NOAUTOEN,
13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
16 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
17};
18
19#define IRQ_PER_CPU GOT_YOU_MORON
20#define IRQ_NO_BALANCING GOT_YOU_MORON
21#define IRQ_LEVEL GOT_YOU_MORON
22#define IRQ_NOPROBE GOT_YOU_MORON
23#define IRQ_NOREQUEST GOT_YOU_MORON
24#define IRQ_NOTHREAD GOT_YOU_MORON
25#define IRQ_NOAUTOEN GOT_YOU_MORON
26#define IRQ_NESTED_THREAD GOT_YOU_MORON
27#undef IRQF_MODIFY_MASK
28#define IRQF_MODIFY_MASK GOT_YOU_MORON
29
30static inline void
31irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
32{
33 desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK);
34 desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
35}
36
37static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
38{
39 return desc->status_use_accessors & _IRQ_PER_CPU;
40}
41
42static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
43{
44 desc->status_use_accessors |= _IRQ_PER_CPU;
45}
46
47static inline void irq_settings_set_no_balancing(struct irq_desc *desc)
48{
49 desc->status_use_accessors |= _IRQ_NO_BALANCING;
50}
51
52static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
53{
54 return desc->status_use_accessors & _IRQ_NO_BALANCING;
55}
56
57static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc)
58{
59 return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK;
60}
61
62static inline void
63irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask)
64{
65 desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK;
66 desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK;
67}
68
69static inline bool irq_settings_is_level(struct irq_desc *desc)
70{
71 return desc->status_use_accessors & _IRQ_LEVEL;
72}
73
74static inline void irq_settings_clr_level(struct irq_desc *desc)
75{
76 desc->status_use_accessors &= ~_IRQ_LEVEL;
77}
78
79static inline void irq_settings_set_level(struct irq_desc *desc)
80{
81 desc->status_use_accessors |= _IRQ_LEVEL;
82}
83
84static inline bool irq_settings_can_request(struct irq_desc *desc)
85{
86 return !(desc->status_use_accessors & _IRQ_NOREQUEST);
87}
88
89static inline void irq_settings_clr_norequest(struct irq_desc *desc)
90{
91 desc->status_use_accessors &= ~_IRQ_NOREQUEST;
92}
93
94static inline void irq_settings_set_norequest(struct irq_desc *desc)
95{
96 desc->status_use_accessors |= _IRQ_NOREQUEST;
97}
98
99static inline bool irq_settings_can_thread(struct irq_desc *desc)
100{
101 return !(desc->status_use_accessors & _IRQ_NOTHREAD);
102}
103
104static inline void irq_settings_clr_nothread(struct irq_desc *desc)
105{
106 desc->status_use_accessors &= ~_IRQ_NOTHREAD;
107}
108
109static inline void irq_settings_set_nothread(struct irq_desc *desc)
110{
111 desc->status_use_accessors |= _IRQ_NOTHREAD;
112}
113
114static inline bool irq_settings_can_probe(struct irq_desc *desc)
115{
116 return !(desc->status_use_accessors & _IRQ_NOPROBE);
117}
118
119static inline void irq_settings_clr_noprobe(struct irq_desc *desc)
120{
121 desc->status_use_accessors &= ~_IRQ_NOPROBE;
122}
123
124static inline void irq_settings_set_noprobe(struct irq_desc *desc)
125{
126 desc->status_use_accessors |= _IRQ_NOPROBE;
127}
128
129static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
130{
131 return desc->status_use_accessors & _IRQ_MOVE_PCNTXT;
132}
133
134static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
135{
136 return !(desc->status_use_accessors & _IRQ_NOAUTOEN);
137}
138
139static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
140{
141 return desc->status_use_accessors & _IRQ_NESTED_THREAD;
142}
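
The point of the GOT_YOU_MORON redefinitions above is compile-time enforcement: core irq code that includes this header can no longer touch the raw flag names and must go through the accessors, which read the preserved _IRQ_* copies out of status_use_accessors. A hypothetical illustration of core code after the conversion (the usual kernel/irq core includes are assumed):

/* Hypothetical user inside kernel/irq/ */
#include "settings.h"

static bool core_checks_level(struct irq_desc *desc)
{
	/*
	 * Writing "desc->status_use_accessors & IRQ_LEVEL" here would no
	 * longer compile: IRQ_LEVEL now expands to GOT_YOU_MORON.  The
	 * accessor reads the preserved _IRQ_LEVEL value instead.
	 */
	return irq_settings_is_level(desc);
}
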
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 89fb90ae534f..aa57d5da18c1 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -14,75 +14,100 @@
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16 16
17#include "internals.h"
18
17static int irqfixup __read_mostly; 19static int irqfixup __read_mostly;
18 20
19#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) 21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
20static void poll_spurious_irqs(unsigned long dummy); 22static void poll_spurious_irqs(unsigned long dummy);
21static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); 23static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
24static int irq_poll_cpu;
25static atomic_t irq_poll_active;
26
27/*
28 * We wait here for a poller to finish.
29 *
30 * If the poll runs on this CPU, then we yell loudly and return
31 * false. That will leave the interrupt line disabled in the worst
32 * case, but it should never happen.
33 *
34 * We wait until the poller is done and then recheck disabled and
35 * action (about to be disabled). Only if it's still active, we return
36 * true and let the handler run.
37 */
38bool irq_wait_for_poll(struct irq_desc *desc)
39{
40 if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
41 "irq poll in progress on cpu %d for irq %d\n",
42 smp_processor_id(), desc->irq_data.irq))
43 return false;
44
45#ifdef CONFIG_SMP
46 do {
47 raw_spin_unlock(&desc->lock);
48 while (irqd_irq_inprogress(&desc->irq_data))
49 cpu_relax();
50 raw_spin_lock(&desc->lock);
51 } while (irqd_irq_inprogress(&desc->irq_data));
52 /* Might have been disabled in meantime */
53 return !irqd_irq_disabled(&desc->irq_data) && desc->action;
54#else
55 return false;
56#endif
57}
58
22 59
23/* 60/*
24 * Recovery handler for misrouted interrupts. 61 * Recovery handler for misrouted interrupts.
25 */ 62 */
26static int try_one_irq(int irq, struct irq_desc *desc) 63static int try_one_irq(int irq, struct irq_desc *desc, bool force)
27{ 64{
65 irqreturn_t ret = IRQ_NONE;
28 struct irqaction *action; 66 struct irqaction *action;
29 int ok = 0, work = 0;
30 67
31 raw_spin_lock(&desc->lock); 68 raw_spin_lock(&desc->lock);
32 /* Already running on another processor */
33 if (desc->status & IRQ_INPROGRESS) {
34 /*
35 * Already running: If it is shared get the other
36 * CPU to go looking for our mystery interrupt too
37 */
38 if (desc->action && (desc->action->flags & IRQF_SHARED))
39 desc->status |= IRQ_PENDING;
40 raw_spin_unlock(&desc->lock);
41 return ok;
42 }
43 /* Honour the normal IRQ locking */
44 desc->status |= IRQ_INPROGRESS;
45 action = desc->action;
46 raw_spin_unlock(&desc->lock);
47 69
48 while (action) { 70 /* PER_CPU and nested thread interrupts are never polled */
49 /* Only shared IRQ handlers are safe to call */ 71 if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc))
50 if (action->flags & IRQF_SHARED) { 72 goto out;
51 if (action->handler(irq, action->dev_id) ==
52 IRQ_HANDLED)
53 ok = 1;
54 }
55 action = action->next;
56 }
57 local_irq_disable();
58 /* Now clean up the flags */
59 raw_spin_lock(&desc->lock);
60 action = desc->action;
61 73
62 /* 74 /*
63 * While we were looking for a fixup someone queued a real 75 * Do not poll disabled interrupts unless the spurious
64 * IRQ clashing with our walk: 76 * disabled poller asks explicitely.
65 */ 77 */
66 while ((desc->status & IRQ_PENDING) && action) { 78 if (irqd_irq_disabled(&desc->irq_data) && !force)
79 goto out;
80
81 /*
82 * All handlers must agree on IRQF_SHARED, so we test just the
83 * first. Check for action->next as well.
84 */
85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || !action->next)
88 goto out;
89
90 /* Already running on another processor */
91 if (irqd_irq_inprogress(&desc->irq_data)) {
67 /* 92 /*
68 * Perform real IRQ processing for the IRQ we deferred 93 * Already running: If it is shared get the other
94 * CPU to go looking for our mystery interrupt too
69 */ 95 */
70 work = 1; 96 desc->istate |= IRQS_PENDING;
71 raw_spin_unlock(&desc->lock); 97 goto out;
72 handle_IRQ_event(irq, action);
73 raw_spin_lock(&desc->lock);
74 desc->status &= ~IRQ_PENDING;
75 } 98 }
76 desc->status &= ~IRQ_INPROGRESS;
77 /*
78 * If we did actual work for the real IRQ line we must let the
79 * IRQ controller clean up too
80 */
81 if (work && desc->chip && desc->chip->end)
82 desc->chip->end(irq);
83 raw_spin_unlock(&desc->lock);
84 99
85 return ok; 100 /* Mark it poll in progress */
101 desc->istate |= IRQS_POLL_INPROGRESS;
102 do {
103 if (handle_irq_event(desc) == IRQ_HANDLED)
104 ret = IRQ_HANDLED;
105 action = desc->action;
106 } while ((desc->istate & IRQS_PENDING) && action);
107 desc->istate &= ~IRQS_POLL_INPROGRESS;
108out:
109 raw_spin_unlock(&desc->lock);
110 return ret == IRQ_HANDLED;
86} 111}
87 112
88static int misrouted_irq(int irq) 113static int misrouted_irq(int irq)
@@ -90,6 +115,11 @@ static int misrouted_irq(int irq)
90 struct irq_desc *desc; 115 struct irq_desc *desc;
91 int i, ok = 0; 116 int i, ok = 0;
92 117
118 if (atomic_inc_return(&irq_poll_active) == 1)
119 goto out;
120
121 irq_poll_cpu = smp_processor_id();
122
93 for_each_irq_desc(i, desc) { 123 for_each_irq_desc(i, desc) {
94 if (!i) 124 if (!i)
95 continue; 125 continue;
@@ -97,9 +127,11 @@ static int misrouted_irq(int irq)
97 if (i == irq) /* Already tried */ 127 if (i == irq) /* Already tried */
98 continue; 128 continue;
99 129
100 if (try_one_irq(i, desc)) 130 if (try_one_irq(i, desc, false))
101 ok = 1; 131 ok = 1;
102 } 132 }
133out:
134 atomic_dec(&irq_poll_active);
103 /* So the caller can adjust the irq error counts */ 135 /* So the caller can adjust the irq error counts */
104 return ok; 136 return ok;
105} 137}
@@ -109,27 +141,39 @@ static void poll_spurious_irqs(unsigned long dummy)
109 struct irq_desc *desc; 141 struct irq_desc *desc;
110 int i; 142 int i;
111 143
144 if (atomic_inc_return(&irq_poll_active) != 1)
145 goto out;
146 irq_poll_cpu = smp_processor_id();
147
112 for_each_irq_desc(i, desc) { 148 for_each_irq_desc(i, desc) {
113 unsigned int status; 149 unsigned int state;
114 150
115 if (!i) 151 if (!i)
116 continue; 152 continue;
117 153
118 /* Racy but it doesn't matter */ 154 /* Racy but it doesn't matter */
119 status = desc->status; 155 state = desc->istate;
120 barrier(); 156 barrier();
121 if (!(status & IRQ_SPURIOUS_DISABLED)) 157 if (!(state & IRQS_SPURIOUS_DISABLED))
122 continue; 158 continue;
123 159
124 local_irq_disable(); 160 local_irq_disable();
125 try_one_irq(i, desc); 161 try_one_irq(i, desc, true);
126 local_irq_enable(); 162 local_irq_enable();
127 } 163 }
128 164out:
165 atomic_dec(&irq_poll_active);
129 mod_timer(&poll_spurious_irq_timer, 166 mod_timer(&poll_spurious_irq_timer,
130 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 167 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
131} 168}
132 169
170static inline int bad_action_ret(irqreturn_t action_ret)
171{
172 if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
173 return 0;
174 return 1;
175}
176
133/* 177/*
134 * If 99,900 of the previous 100,000 interrupts have not been handled 178 * If 99,900 of the previous 100,000 interrupts have not been handled
135 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 179 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -137,17 +181,15 @@ static void poll_spurious_irqs(unsigned long dummy)
137 * 181 *
138 * (The other 100-of-100,000 interrupts may have been a correctly 182 * (The other 100-of-100,000 interrupts may have been a correctly
139 * functioning device sharing an IRQ with the failing one) 183 * functioning device sharing an IRQ with the failing one)
140 *
141 * Called under desc->lock
142 */ 184 */
143
144static void 185static void
145__report_bad_irq(unsigned int irq, struct irq_desc *desc, 186__report_bad_irq(unsigned int irq, struct irq_desc *desc,
146 irqreturn_t action_ret) 187 irqreturn_t action_ret)
147{ 188{
148 struct irqaction *action; 189 struct irqaction *action;
190 unsigned long flags;
149 191
150 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { 192 if (bad_action_ret(action_ret)) {
151 printk(KERN_ERR "irq event %d: bogus return value %x\n", 193 printk(KERN_ERR "irq event %d: bogus return value %x\n",
152 irq, action_ret); 194 irq, action_ret);
153 } else { 195 } else {
@@ -157,14 +199,23 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
157 dump_stack(); 199 dump_stack();
158 printk(KERN_ERR "handlers:\n"); 200 printk(KERN_ERR "handlers:\n");
159 201
202 /*
203 * We need to take desc->lock here. note_interrupt() is called
204 * w/o desc->lock held, but IRQ_PROGRESS set. We might race
205 * with something else removing an action. It's ok to take
206 * desc->lock here. See synchronize_irq().
207 */
208 raw_spin_lock_irqsave(&desc->lock, flags);
160 action = desc->action; 209 action = desc->action;
161 while (action) { 210 while (action) {
162 printk(KERN_ERR "[<%p>]", action->handler); 211 printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
163 print_symbol(" (%s)", 212 if (action->thread_fn)
164 (unsigned long)action->handler); 213 printk(KERN_CONT " threaded [<%p>] %pf",
165 printk("\n"); 214 action->thread_fn, action->thread_fn);
215 printk(KERN_CONT "\n");
166 action = action->next; 216 action = action->next;
167 } 217 }
218 raw_spin_unlock_irqrestore(&desc->lock, flags);
168} 219}
169 220
170static void 221static void
@@ -216,7 +267,19 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
216void note_interrupt(unsigned int irq, struct irq_desc *desc, 267void note_interrupt(unsigned int irq, struct irq_desc *desc,
217 irqreturn_t action_ret) 268 irqreturn_t action_ret)
218{ 269{
219 if (unlikely(action_ret != IRQ_HANDLED)) { 270 if (desc->istate & IRQS_POLL_INPROGRESS)
271 return;
272
273 /* we get here again via the threaded handler */
274 if (action_ret == IRQ_WAKE_THREAD)
275 return;
276
277 if (bad_action_ret(action_ret)) {
278 report_bad_irq(irq, desc, action_ret);
279 return;
280 }
281
282 if (unlikely(action_ret == IRQ_NONE)) {
220 /* 283 /*
221 * If we are seeing only the odd spurious IRQ caused by 284 * If we are seeing only the odd spurious IRQ caused by
222 * bus asynchronicity then don't eventually trigger an error, 285 * bus asynchronicity then don't eventually trigger an error,
@@ -228,8 +291,6 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
228 else 291 else
229 desc->irqs_unhandled++; 292 desc->irqs_unhandled++;
230 desc->last_unhandled = jiffies; 293 desc->last_unhandled = jiffies;
231 if (unlikely(action_ret != IRQ_NONE))
232 report_bad_irq(irq, desc, action_ret);
233 } 294 }
234 295
235 if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { 296 if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
@@ -252,9 +313,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
252 * Now kill the IRQ 313 * Now kill the IRQ
253 */ 314 */
254 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 315 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
255 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; 316 desc->istate |= IRQS_SPURIOUS_DISABLED;
256 desc->depth++; 317 desc->depth++;
257 desc->chip->disable(irq); 318 irq_disable(desc);
258 319
259 mod_timer(&poll_spurious_irq_timer, 320 mod_timer(&poll_spurious_irq_timer,
260 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 321 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
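
note_interrupt() above now distinguishes three cases: IRQ_WAKE_THREAD is ignored (the threaded handler reports later), return values outside IRQ_NONE/IRQ_HANDLED/IRQ_WAKE_THREAD are reported via bad_action_ret(), and IRQ_NONE counts toward the 99,900-in-100,000 unhandled threshold. A driver-side sketch of the contract this enforces; the device structure and status check are hypothetical:

#include <linux/interrupt.h>

struct my_dev { int id; };			/* hypothetical */

static bool my_dev_raised_irq(struct my_dev *dev)
{
	return false;				/* hypothetical status-register read */
}

static irqreturn_t my_shared_handler(int irq, void *dev_id)
{
	struct my_dev *dev = dev_id;

	if (!my_dev_raised_irq(dev))
		return IRQ_NONE;	/* counted in desc->irqs_unhandled */

	/* ... service the device ... */
	return IRQ_HANDLED;		/* anything other than IRQ_NONE,
					 * IRQ_HANDLED or IRQ_WAKE_THREAD is
					 * flagged as a bogus return value */
}
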
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
new file mode 100644
index 000000000000..c58fa7da8aef
--- /dev/null
+++ b/kernel/irq_work.c
@@ -0,0 +1,166 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
3 *
4 * Provides a framework for enqueueing and running callbacks from hardirq
5 * context. The enqueueing is NMI-safe.
6 */
7
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/irq_work.h>
11#include <linux/hardirq.h>
12
13/*
14 * An entry can be in one of four states:
15 *
16 * free NULL, 0 -> {claimed} : free to be used
17 * claimed NULL, 3 -> {pending} : claimed to be enqueued
18 * pending next, 3 -> {busy} : queued, pending callback
19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
20 *
21 * We use the lower two bits of the next pointer to keep PENDING and BUSY
22 * flags.
23 */
24
25#define IRQ_WORK_PENDING 1UL
26#define IRQ_WORK_BUSY 2UL
27#define IRQ_WORK_FLAGS 3UL
28
29static inline bool irq_work_is_set(struct irq_work *entry, int flags)
30{
31 return (unsigned long)entry->next & flags;
32}
33
34static inline struct irq_work *irq_work_next(struct irq_work *entry)
35{
36 unsigned long next = (unsigned long)entry->next;
37 next &= ~IRQ_WORK_FLAGS;
38 return (struct irq_work *)next;
39}
40
41static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
42{
43 unsigned long next = (unsigned long)entry;
44 next |= flags;
45 return (struct irq_work *)next;
46}
47
48static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
49
50/*
51 * Claim the entry so that no one else will poke at it.
52 */
53static bool irq_work_claim(struct irq_work *entry)
54{
55 struct irq_work *next, *nflags;
56
57 do {
58 next = entry->next;
59 if ((unsigned long)next & IRQ_WORK_PENDING)
60 return false;
61 nflags = next_flags(next, IRQ_WORK_FLAGS);
62 } while (cmpxchg(&entry->next, next, nflags) != next);
63
64 return true;
65}
66
67
68void __weak arch_irq_work_raise(void)
69{
70 /*
71 * Lame architectures will get the timer tick callback
72 */
73}
74
75/*
76 * Queue the entry and raise the IPI if needed.
77 */
78static void __irq_work_queue(struct irq_work *entry)
79{
80 struct irq_work *next;
81
82 preempt_disable();
83
84 do {
85 next = __this_cpu_read(irq_work_list);
86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
89
90 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry))
92 arch_irq_work_raise();
93
94 preempt_enable();
95}
96
97/*
98 * Enqueue the irq_work @entry, returns true on success, failure when the
99 * @entry was already enqueued by someone else.
100 *
101 * Can be re-enqueued while the callback is still in progress.
102 */
103bool irq_work_queue(struct irq_work *entry)
104{
105 if (!irq_work_claim(entry)) {
106 /*
107 * Already enqueued, can't do!
108 */
109 return false;
110 }
111
112 __irq_work_queue(entry);
113 return true;
114}
115EXPORT_SYMBOL_GPL(irq_work_queue);
116
117/*
118 * Run the irq_work entries on this cpu. Requires to be ran from hardirq
119 * context with local IRQs disabled.
120 */
121void irq_work_run(void)
122{
123 struct irq_work *list;
124
125 if (this_cpu_read(irq_work_list) == NULL)
126 return;
127
128 BUG_ON(!in_irq());
129 BUG_ON(!irqs_disabled());
130
131 list = this_cpu_xchg(irq_work_list, NULL);
132
133 while (list != NULL) {
134 struct irq_work *entry = list;
135
136 list = irq_work_next(list);
137
138 /*
139 * Clear the PENDING bit, after this point the @entry
140 * can be re-used.
141 */
142 entry->next = next_flags(NULL, IRQ_WORK_BUSY);
143 entry->func(entry);
144 /*
145 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile.
147 */
148 (void)cmpxchg(&entry->next,
149 next_flags(NULL, IRQ_WORK_BUSY),
150 NULL);
151 }
152}
153EXPORT_SYMBOL_GPL(irq_work_run);
154
155/*
156 * Synchronize against the irq_work @entry, ensures the entry is not
157 * currently in use.
158 */
159void irq_work_sync(struct irq_work *entry)
160{
161 WARN_ON_ONCE(irqs_disabled());
162
163 while (irq_work_is_set(entry, IRQ_WORK_BUSY))
164 cpu_relax();
165}
166EXPORT_SYMBOL_GPL(irq_work_sync);
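
A usage sketch for the new irq_work facility: the callback and the NMI-context caller are hypothetical, but the struct irq_work layout (func plus a flag-carrying next pointer) and irq_work_queue()/irq_work_sync() are the ones defined above.

#include <linux/kernel.h>
#include <linux/irq_work.h>

static void my_irq_work_func(struct irq_work *work)
{
	/* Runs later from hard-irq context, outside the NMI that queued it. */
	pr_info("deferred work ran\n");
}

/* .next left NULL: the "free" state in the table at the top of the file. */
static struct irq_work my_irq_work = {
	.func = my_irq_work_func,
};

static void called_from_nmi(void)
{
	/* NMI-safe enqueue; returns false if the entry is already pending. */
	if (!irq_work_queue(&my_irq_work))
		return;
}

static void my_teardown(void)
{
	/* Must not be called with IRQs disabled (see the WARN_ON_ONCE above). */
	irq_work_sync(&my_irq_work);
}
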
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
new file mode 100644
index 000000000000..a8ce45097f3d
--- /dev/null
+++ b/kernel/jump_label.c
@@ -0,0 +1,393 @@
1/*
2 * jump label support
3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com>
6 *
7 */
8#include <linux/memory.h>
9#include <linux/uaccess.h>
10#include <linux/module.h>
11#include <linux/list.h>
12#include <linux/slab.h>
13#include <linux/sort.h>
14#include <linux/err.h>
15#include <linux/jump_label.h>
16
17#ifdef HAVE_JUMP_LABEL
18
19/* mutex to protect coming/going of the the jump_label table */
20static DEFINE_MUTEX(jump_label_mutex);
21
22void jump_label_lock(void)
23{
24 mutex_lock(&jump_label_mutex);
25}
26
27void jump_label_unlock(void)
28{
29 mutex_unlock(&jump_label_mutex);
30}
31
32bool jump_label_enabled(struct jump_label_key *key)
33{
34 return !!atomic_read(&key->enabled);
35}
36
37static int jump_label_cmp(const void *a, const void *b)
38{
39 const struct jump_entry *jea = a;
40 const struct jump_entry *jeb = b;
41
42 if (jea->key < jeb->key)
43 return -1;
44
45 if (jea->key > jeb->key)
46 return 1;
47
48 return 0;
49}
50
51static void
52jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
53{
54 unsigned long size;
55
56 size = (((unsigned long)stop - (unsigned long)start)
57 / sizeof(struct jump_entry));
58 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
59}
60
61static void jump_label_update(struct jump_label_key *key, int enable);
62
63void jump_label_inc(struct jump_label_key *key)
64{
65 if (atomic_inc_not_zero(&key->enabled))
66 return;
67
68 jump_label_lock();
69 if (atomic_add_return(1, &key->enabled) == 1)
70 jump_label_update(key, JUMP_LABEL_ENABLE);
71 jump_label_unlock();
72}
73
74void jump_label_dec(struct jump_label_key *key)
75{
76 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
77 return;
78
79 jump_label_update(key, JUMP_LABEL_DISABLE);
80 jump_label_unlock();
81}
82
83static int addr_conflict(struct jump_entry *entry, void *start, void *end)
84{
85 if (entry->code <= (unsigned long)end &&
86 entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
87 return 1;
88
89 return 0;
90}
91
92static int __jump_label_text_reserved(struct jump_entry *iter_start,
93 struct jump_entry *iter_stop, void *start, void *end)
94{
95 struct jump_entry *iter;
96
97 iter = iter_start;
98 while (iter < iter_stop) {
99 if (addr_conflict(iter, start, end))
100 return 1;
101 iter++;
102 }
103
104 return 0;
105}
106
107static void __jump_label_update(struct jump_label_key *key,
108 struct jump_entry *entry,
109 struct jump_entry *stop, int enable)
110{
111 for (; (entry < stop) &&
112 (entry->key == (jump_label_t)(unsigned long)key);
113 entry++) {
114 /*
115 * entry->code set to 0 invalidates module init text sections
116 * kernel_text_address() verifies we are not in core kernel
117 * init code, see jump_label_invalidate_module_init().
118 */
119 if (entry->code && kernel_text_address(entry->code))
120 arch_jump_label_transform(entry, enable);
121 }
122}
123
124/*
125 * Not all archs need this.
126 */
127void __weak arch_jump_label_text_poke_early(jump_label_t addr)
128{
129}
130
131static __init int jump_label_init(void)
132{
133 struct jump_entry *iter_start = __start___jump_table;
134 struct jump_entry *iter_stop = __stop___jump_table;
135 struct jump_label_key *key = NULL;
136 struct jump_entry *iter;
137
138 jump_label_lock();
139 jump_label_sort_entries(iter_start, iter_stop);
140
141 for (iter = iter_start; iter < iter_stop; iter++) {
142 arch_jump_label_text_poke_early(iter->code);
143 if (iter->key == (jump_label_t)(unsigned long)key)
144 continue;
145
146 key = (struct jump_label_key *)(unsigned long)iter->key;
147 atomic_set(&key->enabled, 0);
148 key->entries = iter;
149#ifdef CONFIG_MODULES
150 key->next = NULL;
151#endif
152 }
153 jump_label_unlock();
154
155 return 0;
156}
157early_initcall(jump_label_init);
158
159#ifdef CONFIG_MODULES
160
161struct jump_label_mod {
162 struct jump_label_mod *next;
163 struct jump_entry *entries;
164 struct module *mod;
165};
166
167static int __jump_label_mod_text_reserved(void *start, void *end)
168{
169 struct module *mod;
170
171 mod = __module_text_address((unsigned long)start);
172 if (!mod)
173 return 0;
174
175 WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
176
177 return __jump_label_text_reserved(mod->jump_entries,
178 mod->jump_entries + mod->num_jump_entries,
179 start, end);
180}
181
182static void __jump_label_mod_update(struct jump_label_key *key, int enable)
183{
184 struct jump_label_mod *mod = key->next;
185
186 while (mod) {
187 struct module *m = mod->mod;
188
189 __jump_label_update(key, mod->entries,
190 m->jump_entries + m->num_jump_entries,
191 enable);
192 mod = mod->next;
193 }
194}
195
196/***
197 * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
198 * @mod: module to patch
199 *
200 * Allow for run-time selection of the optimal nops. Before the module
201 * loads patch these with arch_get_jump_label_nop(), which is specified by
202 * the arch specific jump label code.
203 */
204void jump_label_apply_nops(struct module *mod)
205{
206 struct jump_entry *iter_start = mod->jump_entries;
207 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
208 struct jump_entry *iter;
209
210 /* if the module doesn't have jump label entries, just return */
211 if (iter_start == iter_stop)
212 return;
213
214 for (iter = iter_start; iter < iter_stop; iter++)
215 arch_jump_label_text_poke_early(iter->code);
216}
217
218static int jump_label_add_module(struct module *mod)
219{
220 struct jump_entry *iter_start = mod->jump_entries;
221 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
222 struct jump_entry *iter;
223 struct jump_label_key *key = NULL;
224 struct jump_label_mod *jlm;
225
226 /* if the module doesn't have jump label entries, just return */
227 if (iter_start == iter_stop)
228 return 0;
229
230 jump_label_sort_entries(iter_start, iter_stop);
231
232 for (iter = iter_start; iter < iter_stop; iter++) {
233 if (iter->key == (jump_label_t)(unsigned long)key)
234 continue;
235
236 key = (struct jump_label_key *)(unsigned long)iter->key;
237
238 if (__module_address(iter->key) == mod) {
239 atomic_set(&key->enabled, 0);
240 key->entries = iter;
241 key->next = NULL;
242 continue;
243 }
244
245 jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL);
246 if (!jlm)
247 return -ENOMEM;
248
249 jlm->mod = mod;
250 jlm->entries = iter;
251 jlm->next = key->next;
252 key->next = jlm;
253
254 if (jump_label_enabled(key))
255 __jump_label_update(key, iter, iter_stop,
256 JUMP_LABEL_ENABLE);
257 }
258
259 return 0;
260}
261
262static void jump_label_del_module(struct module *mod)
263{
264 struct jump_entry *iter_start = mod->jump_entries;
265 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
266 struct jump_entry *iter;
267 struct jump_label_key *key = NULL;
268 struct jump_label_mod *jlm, **prev;
269
270 for (iter = iter_start; iter < iter_stop; iter++) {
271 if (iter->key == (jump_label_t)(unsigned long)key)
272 continue;
273
274 key = (struct jump_label_key *)(unsigned long)iter->key;
275
276 if (__module_address(iter->key) == mod)
277 continue;
278
279 prev = &key->next;
280 jlm = key->next;
281
282 while (jlm && jlm->mod != mod) {
283 prev = &jlm->next;
284 jlm = jlm->next;
285 }
286
287 if (jlm) {
288 *prev = jlm->next;
289 kfree(jlm);
290 }
291 }
292}
293
294static void jump_label_invalidate_module_init(struct module *mod)
295{
296 struct jump_entry *iter_start = mod->jump_entries;
297 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
298 struct jump_entry *iter;
299
300 for (iter = iter_start; iter < iter_stop; iter++) {
301 if (within_module_init(iter->code, mod))
302 iter->code = 0;
303 }
304}
305
306static int
307jump_label_module_notify(struct notifier_block *self, unsigned long val,
308 void *data)
309{
310 struct module *mod = data;
311 int ret = 0;
312
313 switch (val) {
314 case MODULE_STATE_COMING:
315 jump_label_lock();
316 ret = jump_label_add_module(mod);
317 if (ret)
318 jump_label_del_module(mod);
319 jump_label_unlock();
320 break;
321 case MODULE_STATE_GOING:
322 jump_label_lock();
323 jump_label_del_module(mod);
324 jump_label_unlock();
325 break;
326 case MODULE_STATE_LIVE:
327 jump_label_lock();
328 jump_label_invalidate_module_init(mod);
329 jump_label_unlock();
330 break;
331 }
332
333 return notifier_from_errno(ret);
334}
335
336struct notifier_block jump_label_module_nb = {
337 .notifier_call = jump_label_module_notify,
338 .priority = 1, /* higher than tracepoints */
339};
340
341static __init int jump_label_init_module(void)
342{
343 return register_module_notifier(&jump_label_module_nb);
344}
345early_initcall(jump_label_init_module);
346
347#endif /* CONFIG_MODULES */
348
349/***
350 * jump_label_text_reserved - check if addr range is reserved
351 * @start: start text addr
352 * @end: end text addr
353 *
354 * checks if the text addr located between @start and @end
355 * overlaps with any of the jump label patch addresses. Code
356 * that wants to modify kernel text should first verify that
357 * it does not overlap with any of the jump label addresses.
358 * Caller must hold jump_label_mutex.
359 *
360 * returns 1 if there is an overlap, 0 otherwise
361 */
362int jump_label_text_reserved(void *start, void *end)
363{
364 int ret = __jump_label_text_reserved(__start___jump_table,
365 __stop___jump_table, start, end);
366
367 if (ret)
368 return ret;
369
370#ifdef CONFIG_MODULES
371 ret = __jump_label_mod_text_reserved(start, end);
372#endif
373 return ret;
374}
375
376static void jump_label_update(struct jump_label_key *key, int enable)
377{
378 struct jump_entry *entry = key->entries, *stop = __stop___jump_table;
379
380#ifdef CONFIG_MODULES
381 struct module *mod = __module_address((jump_label_t)key);
382
383 __jump_label_mod_update(key, enable);
384
385 if (mod)
386 stop = mod->jump_entries + mod->num_jump_entries;
387#endif
388 /* if there are no users, entry can be NULL */
389 if (entry)
390 __jump_label_update(key, entry, stop, enable);
391}
392
393#endif
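
For reference, a usage sketch against the jump-label API of this kernel generation, assuming the static_branch() test macro from <linux/jump_label.h> alongside the struct jump_label_key and jump_label_inc()/jump_label_dec() referenced above; the tracing hook and key name are hypothetical. With HAVE_JUMP_LABEL set, the branch compiles to a patchable no-op and enabling the key rewrites every site.

#include <linux/kernel.h>
#include <linux/jump_label.h>

static struct jump_label_key my_trace_key;	/* starts with enabled == 0 */

static void maybe_trace(void)
{
	/* No-op by default; live-patched to a jump once the key is enabled. */
	if (static_branch(&my_trace_key))
		pr_info("trace hook hit\n");
}

static void trace_enable(void)
{
	jump_label_inc(&my_trace_key);	/* first increment patches all sites */
}

static void trace_disable(void)
{
	jump_label_dec(&my_trace_key);	/* last decrement patches them back */
}
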
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6f6d091b5757..079f1d39a8b8 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr)
64 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || 64 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
65 arch_is_kernel_text(addr)) 65 arch_is_kernel_text(addr))
66 return 1; 66 return 1;
67 return in_gate_area_no_task(addr); 67 return in_gate_area_no_mm(addr);
68} 68}
69 69
70static inline int is_kernel(unsigned long addr) 70static inline int is_kernel(unsigned long addr)
71{ 71{
72 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) 72 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
73 return 1; 73 return 1;
74 return in_gate_area_no_task(addr); 74 return in_gate_area_no_mm(addr);
75} 75}
76 76
77static int is_ksym_addr(unsigned long addr) 77static int is_ksym_addr(unsigned long addr)
@@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
342} 342}
343 343
344/* Look up a kernel symbol and return it in a text buffer. */ 344/* Look up a kernel symbol and return it in a text buffer. */
345int sprint_symbol(char *buffer, unsigned long address) 345static int __sprint_symbol(char *buffer, unsigned long address,
346 int symbol_offset)
346{ 347{
347 char *modname; 348 char *modname;
348 const char *name; 349 const char *name;
349 unsigned long offset, size; 350 unsigned long offset, size;
350 int len; 351 int len;
351 352
353 address += symbol_offset;
352 name = kallsyms_lookup(address, &size, &offset, &modname, buffer); 354 name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
353 if (!name) 355 if (!name)
354 return sprintf(buffer, "0x%lx", address); 356 return sprintf(buffer, "0x%lx", address);
@@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address)
357 strcpy(buffer, name); 359 strcpy(buffer, name);
358 len = strlen(buffer); 360 len = strlen(buffer);
359 buffer += len; 361 buffer += len;
362 offset -= symbol_offset;
360 363
361 if (modname) 364 if (modname)
362 len += sprintf(buffer, "+%#lx/%#lx [%s]", 365 len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname);
363 offset, size, modname);
364 else 366 else
365 len += sprintf(buffer, "+%#lx/%#lx", offset, size); 367 len += sprintf(buffer, "+%#lx/%#lx", offset, size);
366 368
367 return len; 369 return len;
368} 370}
371
372/**
373 * sprint_symbol - Look up a kernel symbol and return it in a text buffer
374 * @buffer: buffer to be stored
375 * @address: address to lookup
376 *
377 * This function looks up a kernel symbol with @address and stores its name,
378 * offset, size and module name to @buffer if possible. If no symbol was found,
379 * just saves its @address as is.
380 *
381 * This function returns the number of bytes stored in @buffer.
382 */
383int sprint_symbol(char *buffer, unsigned long address)
384{
385 return __sprint_symbol(buffer, address, 0);
386}
387
369EXPORT_SYMBOL_GPL(sprint_symbol); 388EXPORT_SYMBOL_GPL(sprint_symbol);
370 389
390/**
391 * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
392 * @buffer: buffer to be stored
393 * @address: address to lookup
394 *
395 * This function is for stack backtrace and does the same thing as
396 * sprint_symbol() but with modified/decreased @address. If there is a
397 * tail-call to the function marked "noreturn", gcc optimized out code after
398 * the call so that the stack-saved return address could point outside of the
399 * caller. This function ensures that kallsyms will find the original caller
400 * by decreasing @address.
401 *
402 * This function returns the number of bytes stored in @buffer.
403 */
404int sprint_backtrace(char *buffer, unsigned long address)
405{
406 return __sprint_symbol(buffer, address, -1);
407}
408
371/* Look up a kernel symbol and print it to the kernel messages. */ 409/* Look up a kernel symbol and print it to the kernel messages. */
372void __print_symbol(const char *fmt, unsigned long address) 410void __print_symbol(const char *fmt, unsigned long address)
373{ 411{
@@ -477,13 +515,11 @@ static int s_show(struct seq_file *m, void *p)
477 */ 515 */
478 type = iter->exported ? toupper(iter->type) : 516 type = iter->exported ? toupper(iter->type) :
479 tolower(iter->type); 517 tolower(iter->type);
480 seq_printf(m, "%0*lx %c %s\t[%s]\n", 518 seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
481 (int)(2 * sizeof(void *)), 519 type, iter->name, iter->module_name);
482 iter->value, type, iter->name, iter->module_name);
483 } else 520 } else
484 seq_printf(m, "%0*lx %c %s\n", 521 seq_printf(m, "%pK %c %s\n", (void *)iter->value,
485 (int)(2 * sizeof(void *)), 522 iter->type, iter->name);
486 iter->value, iter->type, iter->name);
487 return 0; 523 return 0;
488} 524}
489 525
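
A short sketch of where the new sprint_backtrace() helper is meant to be used: printing a stack-saved return address. The symbol is looked up at address - 1, so a tail call into a noreturn function still resolves to the caller, while the reported offset is adjusted back to the original address. KSYM_SYMBOL_LEN comes from <linux/kallsyms.h>; the caller shown is hypothetical.

#include <linux/kernel.h>
#include <linux/kallsyms.h>

static void print_return_address(unsigned long ret_addr)
{
	char buf[KSYM_SYMBOL_LEN];

	/* Lookup at ret_addr - 1; offset printed relative to ret_addr. */
	sprint_backtrace(buf, ret_addr);
	printk(KERN_INFO " [<%p>] %s\n", (void *)ret_addr, buf);
}
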
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c0613f7d6730..8d814cbc8109 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h> 35#include <linux/kmsg_dump.h>
36#include <linux/syscore_ops.h>
36 37
37#include <asm/page.h> 38#include <asm/page.h>
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
@@ -144,7 +145,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
144 /* Initialize the list of destination pages */ 145 /* Initialize the list of destination pages */
145 INIT_LIST_HEAD(&image->dest_pages); 146 INIT_LIST_HEAD(&image->dest_pages);
146 147
147 /* Initialize the list of unuseable pages */ 148 /* Initialize the list of unusable pages */
148 INIT_LIST_HEAD(&image->unuseable_pages); 149 INIT_LIST_HEAD(&image->unuseable_pages);
149 150
150 /* Read in the segments */ 151 /* Read in the segments */
@@ -163,7 +164,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
163 * just verifies it is an address we can use. 164 * just verifies it is an address we can use.
164 * 165 *
165 * Since the kernel does everything in page size chunks ensure 166 * Since the kernel does everything in page size chunks ensure
166 * the destination addreses are page aligned. Too many 167 * the destination addresses are page aligned. Too many
167 * special cases crop of when we don't do this. The most 168 * special cases crop of when we don't do this. The most
168 * insidious is getting overlapping destination addresses 169 * insidious is getting overlapping destination addresses
169 * simply because addresses are changed to page size 170 * simply because addresses are changed to page size
@@ -454,7 +455,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
454 /* Deal with the destination pages I have inadvertently allocated. 455 /* Deal with the destination pages I have inadvertently allocated.
455 * 456 *
456 * Ideally I would convert multi-page allocations into single 457 * Ideally I would convert multi-page allocations into single
457 * page allocations, and add everyting to image->dest_pages. 458 * page allocations, and add everything to image->dest_pages.
458 * 459 *
459 * For now it is simpler to just free the pages. 460 * For now it is simpler to just free the pages.
460 */ 461 */
@@ -602,7 +603,7 @@ static void kimage_free_extra_pages(struct kimage *image)
602 /* Walk through and free any extra destination pages I may have */ 603 /* Walk through and free any extra destination pages I may have */
603 kimage_free_page_list(&image->dest_pages); 604 kimage_free_page_list(&image->dest_pages);
604 605
605 /* Walk through and free any unuseable pages I have cached */ 606 /* Walk through and free any unusable pages I have cached */
606 kimage_free_page_list(&image->unuseable_pages); 607 kimage_free_page_list(&image->unuseable_pages);
607 608
608} 609}
@@ -816,7 +817,7 @@ static int kimage_load_normal_segment(struct kimage *image,
816 817
817 ptr = kmap(page); 818 ptr = kmap(page);
818 /* Start with a clear page */ 819 /* Start with a clear page */
819 memset(ptr, 0, PAGE_SIZE); 820 clear_page(ptr);
820 ptr += maddr & ~PAGE_MASK; 821 ptr += maddr & ~PAGE_MASK;
821 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 822 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
822 if (mchunk > mbytes) 823 if (mchunk > mbytes)
@@ -1099,7 +1100,8 @@ size_t crash_get_memory_size(void)
1099 return size; 1100 return size;
1100} 1101}
1101 1102
1102static void free_reserved_phys_range(unsigned long begin, unsigned long end) 1103void __weak crash_free_reserved_phys_range(unsigned long begin,
1104 unsigned long end)
1103{ 1105{
1104 unsigned long addr; 1106 unsigned long addr;
1105 1107
@@ -1135,7 +1137,7 @@ int crash_shrink_memory(unsigned long new_size)
1135 start = roundup(start, PAGE_SIZE); 1137 start = roundup(start, PAGE_SIZE);
1136 end = roundup(start + new_size, PAGE_SIZE); 1138 end = roundup(start + new_size, PAGE_SIZE);
1137 1139
1138 free_reserved_phys_range(end, crashk_res.end); 1140 crash_free_reserved_phys_range(end, crashk_res.end);
1139 1141
1140 if ((start == end) && (crashk_res.parent != NULL)) 1142 if ((start == end) && (crashk_res.parent != NULL))
1141 release_resource(&crashk_res); 1143 release_resource(&crashk_res);
@@ -1529,8 +1531,7 @@ int kernel_kexec(void)
1529 if (error) 1531 if (error)
1530 goto Enable_cpus; 1532 goto Enable_cpus;
1531 local_irq_disable(); 1533 local_irq_disable();
1532 /* Suspend system devices */ 1534 error = syscore_suspend();
1533 error = sysdev_suspend(PMSG_FREEZE);
1534 if (error) 1535 if (error)
1535 goto Enable_irqs; 1536 goto Enable_irqs;
1536 } else 1537 } else
@@ -1545,7 +1546,7 @@ int kernel_kexec(void)
1545 1546
1546#ifdef CONFIG_KEXEC_JUMP 1547#ifdef CONFIG_KEXEC_JUMP
1547 if (kexec_image->preserve_context) { 1548 if (kexec_image->preserve_context) {
1548 sysdev_resume(); 1549 syscore_resume();
1549 Enable_irqs: 1550 Enable_irqs:
1550 local_irq_enable(); 1551 local_irq_enable();
1551 Enable_cpus: 1552 Enable_cpus:
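
kernel_kexec() now suspends and resumes core system state through the syscore framework (syscore_suspend()/syscore_resume()) rather than the retired sysdev_suspend()/sysdev_resume() pair, and crash_free_reserved_phys_range() becomes a __weak symbol that architectures may override. A hedged sketch of the callback shape the kexec-jump path now exercises; the example_* names are invented, only struct syscore_ops and register_syscore_ops() come from <linux/syscore_ops.h>:

#include <linux/syscore_ops.h>

/* Runs late in suspend (and in kexec jump) with interrupts disabled
 * and only one CPU online; must not sleep. */
static int example_syscore_suspend(void)
{
	/* quiesce the subsystem's state here */
	return 0;	/* non-zero aborts the transition */
}

static void example_syscore_resume(void)
{
	/* restore whatever example_syscore_suspend() saved */
}

static struct syscore_ops example_syscore_ops = {
	.suspend = example_syscore_suspend,
	.resume  = example_syscore_resume,
};

/* register_syscore_ops(&example_syscore_ops) at init time is enough;
 * syscore_suspend() walks every registered entry. */
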
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9cd0591c96a2..47613dfb7b28 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,6 +25,7 @@
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/completion.h> 27#include <linux/completion.h>
28#include <linux/cred.h>
28#include <linux/file.h> 29#include <linux/file.h>
29#include <linux/fdtable.h> 30#include <linux/fdtable.h>
30#include <linux/workqueue.h> 31#include <linux/workqueue.h>
@@ -43,6 +44,13 @@ extern int max_threads;
43 44
44static struct workqueue_struct *khelper_wq; 45static struct workqueue_struct *khelper_wq;
45 46
47#define CAP_BSET (void *)1
48#define CAP_PI (void *)2
49
50static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
51static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
52static DEFINE_SPINLOCK(umh_sysctl_lock);
53
46#ifdef CONFIG_MODULES 54#ifdef CONFIG_MODULES
47 55
48/* 56/*
@@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module);
132static int ____call_usermodehelper(void *data) 140static int ____call_usermodehelper(void *data)
133{ 141{
134 struct subprocess_info *sub_info = data; 142 struct subprocess_info *sub_info = data;
143 struct cred *new;
135 int retval; 144 int retval;
136 145
137 spin_lock_irq(&current->sighand->siglock); 146 spin_lock_irq(&current->sighand->siglock);
@@ -147,12 +156,27 @@ static int ____call_usermodehelper(void *data)
147 */ 156 */
148 set_user_nice(current, 0); 157 set_user_nice(current, 0);
149 158
159 retval = -ENOMEM;
160 new = prepare_kernel_cred(current);
161 if (!new)
162 goto fail;
163
164 spin_lock(&umh_sysctl_lock);
165 new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
166 new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
167 new->cap_inheritable);
168 spin_unlock(&umh_sysctl_lock);
169
150 if (sub_info->init) { 170 if (sub_info->init) {
151 retval = sub_info->init(sub_info); 171 retval = sub_info->init(sub_info, new);
152 if (retval) 172 if (retval) {
173 abort_creds(new);
153 goto fail; 174 goto fail;
175 }
154 } 176 }
155 177
178 commit_creds(new);
179
156 retval = kernel_execve(sub_info->path, 180 retval = kernel_execve(sub_info->path,
157 (const char *const *)sub_info->argv, 181 (const char *const *)sub_info->argv,
158 (const char *const *)sub_info->envp); 182 (const char *const *)sub_info->envp);
@@ -245,7 +269,6 @@ static void __call_usermodehelper(struct work_struct *work)
245 } 269 }
246} 270}
247 271
248#ifdef CONFIG_PM_SLEEP
249/* 272/*
250 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 273 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
251 * (used for preventing user land processes from being created after the user 274 * (used for preventing user land processes from being created after the user
@@ -301,6 +324,15 @@ void usermodehelper_enable(void)
301 usermodehelper_disabled = 0; 324 usermodehelper_disabled = 0;
302} 325}
303 326
327/**
328 * usermodehelper_is_disabled - check if new helpers are allowed to be started
329 */
330bool usermodehelper_is_disabled(void)
331{
332 return usermodehelper_disabled;
333}
334EXPORT_SYMBOL_GPL(usermodehelper_is_disabled);
335
304static void helper_lock(void) 336static void helper_lock(void)
305{ 337{
306 atomic_inc(&running_helpers); 338 atomic_inc(&running_helpers);
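
usermodehelper_is_disabled() lets callers detect the window (suspend/hibernation and similar transitions) in which no new helpers may start, instead of queuing work that can never complete. A sketch of the usual consumer, firmware loading on a resume path; apart from usermodehelper_is_disabled() and request_firmware(), the names are illustrative:

#include <linux/kmod.h>
#include <linux/firmware.h>
#include <linux/device.h>

static int example_load_firmware(struct device *dev,
				 const struct firmware **fw)
{
	/* Userspace helpers are frozen: fail fast rather than block */
	if (usermodehelper_is_disabled())
		return -EBUSY;

	return request_firmware(fw, "example/fw.bin", dev);
}
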
@@ -312,12 +344,6 @@ static void helper_unlock(void)
312 if (atomic_dec_and_test(&running_helpers)) 344 if (atomic_dec_and_test(&running_helpers))
313 wake_up(&running_helpers_waitq); 345 wake_up(&running_helpers_waitq);
314} 346}
315#else /* CONFIG_PM_SLEEP */
316#define usermodehelper_disabled 0
317
318static inline void helper_lock(void) {}
319static inline void helper_unlock(void) {}
320#endif /* CONFIG_PM_SLEEP */
321 347
322/** 348/**
323 * call_usermodehelper_setup - prepare to call a usermode helper 349 * call_usermodehelper_setup - prepare to call a usermode helper
@@ -364,7 +390,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
364 * context in which call_usermodehelper_exec is called. 390 * context in which call_usermodehelper_exec is called.
365 */ 391 */
366void call_usermodehelper_setfns(struct subprocess_info *info, 392void call_usermodehelper_setfns(struct subprocess_info *info,
367 int (*init)(struct subprocess_info *info), 393 int (*init)(struct subprocess_info *info, struct cred *new),
368 void (*cleanup)(struct subprocess_info *info), 394 void (*cleanup)(struct subprocess_info *info),
369 void *data) 395 void *data)
370{ 396{
@@ -418,6 +444,84 @@ unlock:
418} 444}
419EXPORT_SYMBOL(call_usermodehelper_exec); 445EXPORT_SYMBOL(call_usermodehelper_exec);
420 446
447static int proc_cap_handler(struct ctl_table *table, int write,
448 void __user *buffer, size_t *lenp, loff_t *ppos)
449{
450 struct ctl_table t;
451 unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
452 kernel_cap_t new_cap;
453 int err, i;
454
455 if (write && (!capable(CAP_SETPCAP) ||
456 !capable(CAP_SYS_MODULE)))
457 return -EPERM;
458
459 /*
460 * convert from the global kernel_cap_t to the ulong array to print to
461 * userspace if this is a read.
462 */
463 spin_lock(&umh_sysctl_lock);
464 for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
465 if (table->data == CAP_BSET)
466 cap_array[i] = usermodehelper_bset.cap[i];
467 else if (table->data == CAP_PI)
468 cap_array[i] = usermodehelper_inheritable.cap[i];
469 else
470 BUG();
471 }
472 spin_unlock(&umh_sysctl_lock);
473
474 t = *table;
475 t.data = &cap_array;
476
477 /*
478 * actually read or write and array of ulongs from userspace. Remember
479 * these are least significant 32 bits first
480 */
481 err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
482 if (err < 0)
483 return err;
484
485 /*
486 * convert from the sysctl array of ulongs to the kernel_cap_t
487 * internal representation
488 */
489 for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
490 new_cap.cap[i] = cap_array[i];
491
492 /*
493 * Drop everything not in the new_cap (but don't add things)
494 */
495 spin_lock(&umh_sysctl_lock);
496 if (write) {
497 if (table->data == CAP_BSET)
498 usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
499 if (table->data == CAP_PI)
500 usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
501 }
502 spin_unlock(&umh_sysctl_lock);
503
504 return 0;
505}
506
507struct ctl_table usermodehelper_table[] = {
508 {
509 .procname = "bset",
510 .data = CAP_BSET,
511 .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
512 .mode = 0600,
513 .proc_handler = proc_cap_handler,
514 },
515 {
516 .procname = "inheritable",
517 .data = CAP_PI,
518 .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
519 .mode = 0600,
520 .proc_handler = proc_cap_handler,
521 },
522 { }
523};
524
421void __init usermodehelper_init(void) 525void __init usermodehelper_init(void)
422{ 526{
423 khelper_wq = create_singlethread_workqueue("khelper"); 527 khelper_wq = create_singlethread_workqueue("khelper");
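
Two things change for usermode-helper callers in this kmod.c diff: helpers now run with credentials built by prepare_kernel_cred() and clamped by the new kernel.usermodehelper.{bset,inheritable} sysctls (which can only ever shrink the capability sets, never widen them), and the per-call init() hook gains a struct cred * argument so a caller may tighten the creds further before exec. A hedged sketch built from the functions visible in this file plus cap_lower() from <linux/capability.h>; the example_* names, helper path, and the UMH_WAIT_PROC wait mode are illustrative:

#include <linux/kmod.h>
#include <linux/cred.h>
#include <linux/capability.h>

/* Called in the helper child before exec: drop one more capability. */
static int example_umh_init(struct subprocess_info *info, struct cred *new)
{
	cap_lower(new->cap_effective, CAP_SYS_ADMIN);
	cap_lower(new->cap_permitted, CAP_SYS_ADMIN);
	return 0;
}

static int example_run_helper(void)
{
	char *argv[] = { "/sbin/example-helper", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
	struct subprocess_info *info;

	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	call_usermodehelper_setfns(info, example_umh_init, NULL, NULL);
	return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}

The sysctl side is exposed as /proc/sys/kernel/usermodehelper/bset and .../inheritable, writable only with both CAP_SETPCAP and CAP_SYS_MODULE, as proc_cap_handler() above enforces.
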
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 282035f3ae96..77981813a1e7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,6 +47,7 @@
47#include <linux/memory.h> 47#include <linux/memory.h>
48#include <linux/ftrace.h> 48#include <linux/ftrace.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/jump_label.h>
50 51
51#include <asm-generic/sections.h> 52#include <asm-generic/sections.h>
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
@@ -73,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
73/* NOTE: change this value only with kprobe_mutex held */ 74/* NOTE: change this value only with kprobe_mutex held */
74static bool kprobes_all_disarmed; 75static bool kprobes_all_disarmed;
75 76
76static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 77/* This protects kprobe_table and optimizing_list */
78static DEFINE_MUTEX(kprobe_mutex);
77static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
78static struct { 80static struct {
79 spinlock_t lock ____cacheline_aligned_in_smp; 81 spinlock_t lock ____cacheline_aligned_in_smp;
@@ -315,12 +317,12 @@ void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
315/* We have preemption disabled.. so it is safe to use __ versions */ 317/* We have preemption disabled.. so it is safe to use __ versions */
316static inline void set_kprobe_instance(struct kprobe *kp) 318static inline void set_kprobe_instance(struct kprobe *kp)
317{ 319{
318 __get_cpu_var(kprobe_instance) = kp; 320 __this_cpu_write(kprobe_instance, kp);
319} 321}
320 322
321static inline void reset_kprobe_instance(void) 323static inline void reset_kprobe_instance(void)
322{ 324{
323 __get_cpu_var(kprobe_instance) = NULL; 325 __this_cpu_write(kprobe_instance, NULL);
324} 326}
325 327
326/* 328/*
@@ -352,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p)
352 return p->pre_handler == aggr_pre_handler; 354 return p->pre_handler == aggr_pre_handler;
353} 355}
354 356
357/* Return true(!0) if the kprobe is unused */
358static inline int kprobe_unused(struct kprobe *p)
359{
360 return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
361 list_empty(&p->list);
362}
363
355/* 364/*
356 * Keep all fields in the kprobe consistent 365 * Keep all fields in the kprobe consistent
357 */ 366 */
358static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) 367static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
359{ 368{
360 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); 369 memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
361 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); 370 memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
362} 371}
363 372
364#ifdef CONFIG_OPTPROBES 373#ifdef CONFIG_OPTPROBES
@@ -382,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
382 } 391 }
383} 392}
384 393
394/* Free optimized instructions and optimized_kprobe */
395static __kprobes void free_aggr_kprobe(struct kprobe *p)
396{
397 struct optimized_kprobe *op;
398
399 op = container_of(p, struct optimized_kprobe, kp);
400 arch_remove_optimized_kprobe(op);
401 arch_remove_kprobe(p);
402 kfree(op);
403}
404
385/* Return true(!0) if the kprobe is ready for optimization. */ 405/* Return true(!0) if the kprobe is ready for optimization. */
386static inline int kprobe_optready(struct kprobe *p) 406static inline int kprobe_optready(struct kprobe *p)
387{ 407{
@@ -395,11 +415,38 @@ static inline int kprobe_optready(struct kprobe *p)
395 return 0; 415 return 0;
396} 416}
397 417
418/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
419static inline int kprobe_disarmed(struct kprobe *p)
420{
421 struct optimized_kprobe *op;
422
423 /* If kprobe is not aggr/opt probe, just return kprobe is disabled */
424 if (!kprobe_aggrprobe(p))
425 return kprobe_disabled(p);
426
427 op = container_of(p, struct optimized_kprobe, kp);
428
429 return kprobe_disabled(p) && list_empty(&op->list);
430}
431
432/* Return true(!0) if the probe is queued on (un)optimizing lists */
433static int __kprobes kprobe_queued(struct kprobe *p)
434{
435 struct optimized_kprobe *op;
436
437 if (kprobe_aggrprobe(p)) {
438 op = container_of(p, struct optimized_kprobe, kp);
439 if (!list_empty(&op->list))
440 return 1;
441 }
442 return 0;
443}
444
398/* 445/*
399 * Return an optimized kprobe whose optimizing code replaces 446 * Return an optimized kprobe whose optimizing code replaces
400 * instructions including addr (exclude breakpoint). 447 * instructions including addr (exclude breakpoint).
401 */ 448 */
402struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) 449static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
403{ 450{
404 int i; 451 int i;
405 struct kprobe *p = NULL; 452 struct kprobe *p = NULL;
@@ -420,30 +467,23 @@ struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
420 467
421/* Optimization staging list, protected by kprobe_mutex */ 468/* Optimization staging list, protected by kprobe_mutex */
422static LIST_HEAD(optimizing_list); 469static LIST_HEAD(optimizing_list);
470static LIST_HEAD(unoptimizing_list);
423 471
424static void kprobe_optimizer(struct work_struct *work); 472static void kprobe_optimizer(struct work_struct *work);
425static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); 473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
474static DECLARE_COMPLETION(optimizer_comp);
426#define OPTIMIZE_DELAY 5 475#define OPTIMIZE_DELAY 5
427 476
428/* Kprobe jump optimizer */ 477/*
429static __kprobes void kprobe_optimizer(struct work_struct *work) 478 * Optimize (replace a breakpoint with a jump) kprobes listed on
479 * optimizing_list.
480 */
481static __kprobes void do_optimize_kprobes(void)
430{ 482{
431 struct optimized_kprobe *op, *tmp; 483 /* Optimization never be done when disarmed */
432 484 if (kprobes_all_disarmed || !kprobes_allow_optimization ||
433 /* Lock modules while optimizing kprobes */ 485 list_empty(&optimizing_list))
434 mutex_lock(&module_mutex); 486 return;
435 mutex_lock(&kprobe_mutex);
436 if (kprobes_all_disarmed || !kprobes_allow_optimization)
437 goto end;
438
439 /*
440 * Wait for quiesence period to ensure all running interrupts
441 * are done. Because optprobe may modify multiple instructions
442 * there is a chance that Nth instruction is interrupted. In that
443 * case, running interrupt can return to 2nd-Nth byte of jump
444 * instruction. This wait is for avoiding it.
445 */
446 synchronize_sched();
447 487
448 /* 488 /*
449 * The optimization/unoptimization refers online_cpus via 489 * The optimization/unoptimization refers online_cpus via
@@ -457,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
457 */ 497 */
458 get_online_cpus(); 498 get_online_cpus();
459 mutex_lock(&text_mutex); 499 mutex_lock(&text_mutex);
460 list_for_each_entry_safe(op, tmp, &optimizing_list, list) { 500 arch_optimize_kprobes(&optimizing_list);
461 WARN_ON(kprobe_disabled(&op->kp)); 501 mutex_unlock(&text_mutex);
462 if (arch_optimize_kprobe(op) < 0) 502 put_online_cpus();
463 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 503}
464 list_del_init(&op->list); 504
505/*
506 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
507 * if need) kprobes listed on unoptimizing_list.
508 */
509static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
510{
511 struct optimized_kprobe *op, *tmp;
512
513 /* Unoptimization must be done anytime */
514 if (list_empty(&unoptimizing_list))
515 return;
516
517 /* Ditto to do_optimize_kprobes */
518 get_online_cpus();
519 mutex_lock(&text_mutex);
520 arch_unoptimize_kprobes(&unoptimizing_list, free_list);
521 /* Loop free_list for disarming */
522 list_for_each_entry_safe(op, tmp, free_list, list) {
523 /* Disarm probes if marked disabled */
524 if (kprobe_disabled(&op->kp))
525 arch_disarm_kprobe(&op->kp);
526 if (kprobe_unused(&op->kp)) {
527 /*
528 * Remove unused probes from hash list. After waiting
529 * for synchronization, these probes are reclaimed.
530 * (reclaiming is done by do_free_cleaned_kprobes.)
531 */
532 hlist_del_rcu(&op->kp.hlist);
533 } else
534 list_del_init(&op->list);
465 } 535 }
466 mutex_unlock(&text_mutex); 536 mutex_unlock(&text_mutex);
467 put_online_cpus(); 537 put_online_cpus();
468end: 538}
539
540/* Reclaim all kprobes on the free_list */
541static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
542{
543 struct optimized_kprobe *op, *tmp;
544
545 list_for_each_entry_safe(op, tmp, free_list, list) {
546 BUG_ON(!kprobe_unused(&op->kp));
547 list_del_init(&op->list);
548 free_aggr_kprobe(&op->kp);
549 }
550}
551
552/* Start optimizer after OPTIMIZE_DELAY passed */
553static __kprobes void kick_kprobe_optimizer(void)
554{
555 if (!delayed_work_pending(&optimizing_work))
556 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
557}
558
559/* Kprobe jump optimizer */
560static __kprobes void kprobe_optimizer(struct work_struct *work)
561{
562 LIST_HEAD(free_list);
563
564 /* Lock modules while optimizing kprobes */
565 mutex_lock(&module_mutex);
566 mutex_lock(&kprobe_mutex);
567
568 /*
569 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
570 * kprobes before waiting for quiesence period.
571 */
572 do_unoptimize_kprobes(&free_list);
573
574 /*
575 * Step 2: Wait for quiesence period to ensure all running interrupts
576 * are done. Because optprobe may modify multiple instructions
577 * there is a chance that Nth instruction is interrupted. In that
578 * case, running interrupt can return to 2nd-Nth byte of jump
579 * instruction. This wait is for avoiding it.
580 */
581 synchronize_sched();
582
583 /* Step 3: Optimize kprobes after quiesence period */
584 do_optimize_kprobes();
585
586 /* Step 4: Free cleaned kprobes after quiesence period */
587 do_free_cleaned_kprobes(&free_list);
588
469 mutex_unlock(&kprobe_mutex); 589 mutex_unlock(&kprobe_mutex);
470 mutex_unlock(&module_mutex); 590 mutex_unlock(&module_mutex);
591
592 /* Step 5: Kick optimizer again if needed */
593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
594 kick_kprobe_optimizer();
595 else
596 /* Wake up all waiters */
597 complete_all(&optimizer_comp);
598}
599
600/* Wait for completing optimization and unoptimization */
601static __kprobes void wait_for_kprobe_optimizer(void)
602{
603 if (delayed_work_pending(&optimizing_work))
604 wait_for_completion(&optimizer_comp);
471} 605}
472 606
473/* Optimize kprobe if p is ready to be optimized */ 607/* Optimize kprobe if p is ready to be optimized */
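
The rework above batches jump optimization and unoptimization: probes queue on optimizing_list/unoptimizing_list and a single delayed worker pays the synchronize_sched() quiescence cost once per batch instead of once per probe. None of this changes the registration API; a minimal sketch of a probe that would flow through this machinery (symbol and names are illustrative):

#include <linux/kprobes.h>

static int example_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	/* Runs from the breakpoint, or from the jump once optimized */
	return 0;
}

static struct kprobe example_kp = {
	.symbol_name	= "do_fork",
	.pre_handler	= example_pre_handler,
};

/*
 * register_kprobe(&example_kp) arms a breakpoint right away and queues
 * the probe on optimizing_list; roughly OPTIMIZE_DELAY jiffies later
 * kprobe_optimizer() swaps it for a jump.  unregister_kprobe(&example_kp)
 * queues the reverse transition on unoptimizing_list.
 */
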
@@ -493,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
493 /* Check if it is already optimized. */ 627 /* Check if it is already optimized. */
494 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) 628 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
495 return; 629 return;
496
497 op->kp.flags |= KPROBE_FLAG_OPTIMIZED; 630 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
498 list_add(&op->list, &optimizing_list); 631
499 if (!delayed_work_pending(&optimizing_work)) 632 if (!list_empty(&op->list))
500 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); 633 /* This is under unoptimizing. Just dequeue the probe */
634 list_del_init(&op->list);
635 else {
636 list_add(&op->list, &optimizing_list);
637 kick_kprobe_optimizer();
638 }
639}
640
641/* Short cut to direct unoptimizing */
642static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
643{
644 get_online_cpus();
645 arch_unoptimize_kprobe(op);
646 put_online_cpus();
647 if (kprobe_disabled(&op->kp))
648 arch_disarm_kprobe(&op->kp);
501} 649}
502 650
503/* Unoptimize a kprobe if p is optimized */ 651/* Unoptimize a kprobe if p is optimized */
504static __kprobes void unoptimize_kprobe(struct kprobe *p) 652static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
505{ 653{
506 struct optimized_kprobe *op; 654 struct optimized_kprobe *op;
507 655
508 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { 656 if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
509 op = container_of(p, struct optimized_kprobe, kp); 657 return; /* This is not an optprobe nor optimized */
510 if (!list_empty(&op->list)) 658
511 /* Dequeue from the optimization queue */ 659 op = container_of(p, struct optimized_kprobe, kp);
660 if (!kprobe_optimized(p)) {
661 /* Unoptimized or unoptimizing case */
662 if (force && !list_empty(&op->list)) {
663 /*
664 * Only if this is unoptimizing kprobe and forced,
665 * forcibly unoptimize it. (No need to unoptimize
666 * unoptimized kprobe again :)
667 */
512 list_del_init(&op->list); 668 list_del_init(&op->list);
513 else 669 force_unoptimize_kprobe(op);
514 /* Replace jump with break */ 670 }
515 arch_unoptimize_kprobe(op); 671 return;
516 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 672 }
673
674 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
675 if (!list_empty(&op->list)) {
676 /* Dequeue from the optimization queue */
677 list_del_init(&op->list);
678 return;
679 }
680 /* Optimized kprobe case */
681 if (force)
682 /* Forcibly update the code: this is a special case */
683 force_unoptimize_kprobe(op);
684 else {
685 list_add(&op->list, &unoptimizing_list);
686 kick_kprobe_optimizer();
517 } 687 }
518} 688}
519 689
690/* Cancel unoptimizing for reusing */
691static void reuse_unused_kprobe(struct kprobe *ap)
692{
693 struct optimized_kprobe *op;
694
695 BUG_ON(!kprobe_unused(ap));
696 /*
697 * Unused kprobe MUST be on the way of delayed unoptimizing (means
698 * there is still a relative jump) and disabled.
699 */
700 op = container_of(ap, struct optimized_kprobe, kp);
701 if (unlikely(list_empty(&op->list)))
702 printk(KERN_WARNING "Warning: found a stray unused "
703 "aggrprobe@%p\n", ap->addr);
704 /* Enable the probe again */
705 ap->flags &= ~KPROBE_FLAG_DISABLED;
706 /* Optimize it again (remove from op->list) */
707 BUG_ON(!kprobe_optready(ap));
708 optimize_kprobe(ap);
709}
710
520/* Remove optimized instructions */ 711/* Remove optimized instructions */
521static void __kprobes kill_optimized_kprobe(struct kprobe *p) 712static void __kprobes kill_optimized_kprobe(struct kprobe *p)
522{ 713{
523 struct optimized_kprobe *op; 714 struct optimized_kprobe *op;
524 715
525 op = container_of(p, struct optimized_kprobe, kp); 716 op = container_of(p, struct optimized_kprobe, kp);
526 if (!list_empty(&op->list)) { 717 if (!list_empty(&op->list))
527 /* Dequeue from the optimization queue */ 718 /* Dequeue from the (un)optimization queue */
528 list_del_init(&op->list); 719 list_del_init(&op->list);
529 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 720
530 } 721 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
531 /* Don't unoptimize, because the target code will be freed. */ 722 /* Don't touch the code, because it is already freed. */
532 arch_remove_optimized_kprobe(op); 723 arch_remove_optimized_kprobe(op);
533} 724}
534 725
@@ -541,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
541 arch_prepare_optimized_kprobe(op); 732 arch_prepare_optimized_kprobe(op);
542} 733}
543 734
544/* Free optimized instructions and optimized_kprobe */
545static __kprobes void free_aggr_kprobe(struct kprobe *p)
546{
547 struct optimized_kprobe *op;
548
549 op = container_of(p, struct optimized_kprobe, kp);
550 arch_remove_optimized_kprobe(op);
551 kfree(op);
552}
553
554/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 735/* Allocate new optimized_kprobe and try to prepare optimized instructions */
555static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) 736static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
556{ 737{
@@ -585,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
585 op = container_of(ap, struct optimized_kprobe, kp); 766 op = container_of(ap, struct optimized_kprobe, kp);
586 if (!arch_prepared_optinsn(&op->optinsn)) { 767 if (!arch_prepared_optinsn(&op->optinsn)) {
587 /* If failed to setup optimizing, fallback to kprobe */ 768 /* If failed to setup optimizing, fallback to kprobe */
588 free_aggr_kprobe(ap); 769 arch_remove_optimized_kprobe(op);
770 kfree(op);
589 return; 771 return;
590 } 772 }
591 773
@@ -594,6 +776,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
594} 776}
595 777
596#ifdef CONFIG_SYSCTL 778#ifdef CONFIG_SYSCTL
779/* This should be called with kprobe_mutex locked */
597static void __kprobes optimize_all_kprobes(void) 780static void __kprobes optimize_all_kprobes(void)
598{ 781{
599 struct hlist_head *head; 782 struct hlist_head *head;
@@ -606,17 +789,16 @@ static void __kprobes optimize_all_kprobes(void)
606 return; 789 return;
607 790
608 kprobes_allow_optimization = true; 791 kprobes_allow_optimization = true;
609 mutex_lock(&text_mutex);
610 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 792 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
611 head = &kprobe_table[i]; 793 head = &kprobe_table[i];
612 hlist_for_each_entry_rcu(p, node, head, hlist) 794 hlist_for_each_entry_rcu(p, node, head, hlist)
613 if (!kprobe_disabled(p)) 795 if (!kprobe_disabled(p))
614 optimize_kprobe(p); 796 optimize_kprobe(p);
615 } 797 }
616 mutex_unlock(&text_mutex);
617 printk(KERN_INFO "Kprobes globally optimized\n"); 798 printk(KERN_INFO "Kprobes globally optimized\n");
618} 799}
619 800
801/* This should be called with kprobe_mutex locked */
620static void __kprobes unoptimize_all_kprobes(void) 802static void __kprobes unoptimize_all_kprobes(void)
621{ 803{
622 struct hlist_head *head; 804 struct hlist_head *head;
@@ -629,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void)
629 return; 811 return;
630 812
631 kprobes_allow_optimization = false; 813 kprobes_allow_optimization = false;
632 printk(KERN_INFO "Kprobes globally unoptimized\n");
633 get_online_cpus(); /* For avoiding text_mutex deadlock */
634 mutex_lock(&text_mutex);
635 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 814 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
636 head = &kprobe_table[i]; 815 head = &kprobe_table[i];
637 hlist_for_each_entry_rcu(p, node, head, hlist) { 816 hlist_for_each_entry_rcu(p, node, head, hlist) {
638 if (!kprobe_disabled(p)) 817 if (!kprobe_disabled(p))
639 unoptimize_kprobe(p); 818 unoptimize_kprobe(p, false);
640 } 819 }
641 } 820 }
642 821 /* Wait for unoptimizing completion */
643 mutex_unlock(&text_mutex); 822 wait_for_kprobe_optimizer();
644 put_online_cpus(); 823 printk(KERN_INFO "Kprobes globally unoptimized\n");
645 /* Allow all currently running kprobes to complete */
646 synchronize_sched();
647} 824}
648 825
649int sysctl_kprobes_optimization; 826int sysctl_kprobes_optimization;
@@ -667,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
667} 844}
668#endif /* CONFIG_SYSCTL */ 845#endif /* CONFIG_SYSCTL */
669 846
847/* Put a breakpoint for a probe. Must be called with text_mutex locked */
670static void __kprobes __arm_kprobe(struct kprobe *p) 848static void __kprobes __arm_kprobe(struct kprobe *p)
671{ 849{
672 struct kprobe *old_p; 850 struct kprobe *_p;
673 851
674 /* Check collision with other optimized kprobes */ 852 /* Check collision with other optimized kprobes */
675 old_p = get_optimized_kprobe((unsigned long)p->addr); 853 _p = get_optimized_kprobe((unsigned long)p->addr);
676 if (unlikely(old_p)) 854 if (unlikely(_p))
677 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ 855 /* Fallback to unoptimized kprobe */
856 unoptimize_kprobe(_p, true);
678 857
679 arch_arm_kprobe(p); 858 arch_arm_kprobe(p);
680 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ 859 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
681} 860}
682 861
683static void __kprobes __disarm_kprobe(struct kprobe *p) 862/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
863static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
684{ 864{
685 struct kprobe *old_p; 865 struct kprobe *_p;
686 866
687 unoptimize_kprobe(p); /* Try to unoptimize */ 867 unoptimize_kprobe(p, false); /* Try to unoptimize */
688 arch_disarm_kprobe(p);
689 868
690 /* If another kprobe was blocked, optimize it. */ 869 if (!kprobe_queued(p)) {
691 old_p = get_optimized_kprobe((unsigned long)p->addr); 870 arch_disarm_kprobe(p);
692 if (unlikely(old_p)) 871 /* If another kprobe was blocked, optimize it. */
693 optimize_kprobe(old_p); 872 _p = get_optimized_kprobe((unsigned long)p->addr);
873 if (unlikely(_p) && reopt)
874 optimize_kprobe(_p);
875 }
876 /* TODO: reoptimize others after unoptimized this probe */
694} 877}
695 878
696#else /* !CONFIG_OPTPROBES */ 879#else /* !CONFIG_OPTPROBES */
697 880
698#define optimize_kprobe(p) do {} while (0) 881#define optimize_kprobe(p) do {} while (0)
699#define unoptimize_kprobe(p) do {} while (0) 882#define unoptimize_kprobe(p, f) do {} while (0)
700#define kill_optimized_kprobe(p) do {} while (0) 883#define kill_optimized_kprobe(p) do {} while (0)
701#define prepare_optimized_kprobe(p) do {} while (0) 884#define prepare_optimized_kprobe(p) do {} while (0)
702#define try_to_optimize_kprobe(p) do {} while (0) 885#define try_to_optimize_kprobe(p) do {} while (0)
703#define __arm_kprobe(p) arch_arm_kprobe(p) 886#define __arm_kprobe(p) arch_arm_kprobe(p)
704#define __disarm_kprobe(p) arch_disarm_kprobe(p) 887#define __disarm_kprobe(p, o) arch_disarm_kprobe(p)
888#define kprobe_disarmed(p) kprobe_disabled(p)
889#define wait_for_kprobe_optimizer() do {} while (0)
890
891/* There should be no unused kprobes can be reused without optimization */
892static void reuse_unused_kprobe(struct kprobe *ap)
893{
894 printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
895 BUG_ON(kprobe_unused(ap));
896}
705 897
706static __kprobes void free_aggr_kprobe(struct kprobe *p) 898static __kprobes void free_aggr_kprobe(struct kprobe *p)
707{ 899{
900 arch_remove_kprobe(p);
708 kfree(p); 901 kfree(p);
709} 902}
710 903
@@ -730,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
730/* Disarm a kprobe with text_mutex */ 923/* Disarm a kprobe with text_mutex */
731static void __kprobes disarm_kprobe(struct kprobe *kp) 924static void __kprobes disarm_kprobe(struct kprobe *kp)
732{ 925{
733 get_online_cpus(); /* For avoiding text_mutex deadlock */ 926 /* Ditto */
734 mutex_lock(&text_mutex); 927 mutex_lock(&text_mutex);
735 __disarm_kprobe(kp); 928 __disarm_kprobe(kp, true);
736 mutex_unlock(&text_mutex); 929 mutex_unlock(&text_mutex);
737 put_online_cpus();
738} 930}
739 931
740/* 932/*
@@ -773,7 +965,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
773static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 965static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
774 int trapnr) 966 int trapnr)
775{ 967{
776 struct kprobe *cur = __get_cpu_var(kprobe_instance); 968 struct kprobe *cur = __this_cpu_read(kprobe_instance);
777 969
778 /* 970 /*
779 * if we faulted "during" the execution of a user specified 971 * if we faulted "during" the execution of a user specified
@@ -788,7 +980,7 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
788 980
789static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 981static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
790{ 982{
791 struct kprobe *cur = __get_cpu_var(kprobe_instance); 983 struct kprobe *cur = __this_cpu_read(kprobe_instance);
792 int ret = 0; 984 int ret = 0;
793 985
794 if (cur && cur->break_handler) { 986 if (cur && cur->break_handler) {
@@ -831,6 +1023,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
831 1023
832void __kprobes kretprobe_hash_lock(struct task_struct *tsk, 1024void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
833 struct hlist_head **head, unsigned long *flags) 1025 struct hlist_head **head, unsigned long *flags)
1026__acquires(hlist_lock)
834{ 1027{
835 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1028 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
836 spinlock_t *hlist_lock; 1029 spinlock_t *hlist_lock;
@@ -842,6 +1035,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
842 1035
843static void __kprobes kretprobe_table_lock(unsigned long hash, 1036static void __kprobes kretprobe_table_lock(unsigned long hash,
844 unsigned long *flags) 1037 unsigned long *flags)
1038__acquires(hlist_lock)
845{ 1039{
846 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1040 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
847 spin_lock_irqsave(hlist_lock, *flags); 1041 spin_lock_irqsave(hlist_lock, *flags);
@@ -849,6 +1043,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash,
849 1043
850void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 1044void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
851 unsigned long *flags) 1045 unsigned long *flags)
1046__releases(hlist_lock)
852{ 1047{
853 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1048 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
854 spinlock_t *hlist_lock; 1049 spinlock_t *hlist_lock;
@@ -857,7 +1052,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
857 spin_unlock_irqrestore(hlist_lock, *flags); 1052 spin_unlock_irqrestore(hlist_lock, *flags);
858} 1053}
859 1054
860void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) 1055static void __kprobes kretprobe_table_unlock(unsigned long hash,
1056 unsigned long *flags)
1057__releases(hlist_lock)
861{ 1058{
862 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1059 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
863 spin_unlock_irqrestore(hlist_lock, *flags); 1060 spin_unlock_irqrestore(hlist_lock, *flags);
@@ -935,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
935 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 1132 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
936 1133
937 if (p->break_handler || p->post_handler) 1134 if (p->break_handler || p->post_handler)
938 unoptimize_kprobe(ap); /* Fall back to normal kprobe */ 1135 unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */
939 1136
940 if (p->break_handler) { 1137 if (p->break_handler) {
941 if (ap->break_handler) 1138 if (ap->break_handler)
@@ -986,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
986 * This is the second or subsequent kprobe at the address - handle 1183 * This is the second or subsequent kprobe at the address - handle
987 * the intricacies 1184 * the intricacies
988 */ 1185 */
989static int __kprobes register_aggr_kprobe(struct kprobe *old_p, 1186static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
990 struct kprobe *p) 1187 struct kprobe *p)
991{ 1188{
992 int ret = 0; 1189 int ret = 0;
993 struct kprobe *ap = old_p; 1190 struct kprobe *ap = orig_p;
994 1191
995 if (!kprobe_aggrprobe(old_p)) { 1192 if (!kprobe_aggrprobe(orig_p)) {
996 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ 1193 /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
997 ap = alloc_aggr_kprobe(old_p); 1194 ap = alloc_aggr_kprobe(orig_p);
998 if (!ap) 1195 if (!ap)
999 return -ENOMEM; 1196 return -ENOMEM;
1000 init_aggr_kprobe(ap, old_p); 1197 init_aggr_kprobe(ap, orig_p);
1001 } 1198 } else if (kprobe_unused(ap))
1199 /* This probe is going to die. Rescue it */
1200 reuse_unused_kprobe(ap);
1002 1201
1003 if (kprobe_gone(ap)) { 1202 if (kprobe_gone(ap)) {
1004 /* 1203 /*
@@ -1032,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
1032 return add_new_kprobe(ap, p); 1231 return add_new_kprobe(ap, p);
1033} 1232}
1034 1233
1035/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
1036static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
1037{
1038 struct kprobe *kp;
1039
1040 list_for_each_entry_rcu(kp, &p->list, list) {
1041 if (!kprobe_disabled(kp))
1042 /*
1043 * There is an active probe on the list.
1044 * We can't disable aggr_kprobe.
1045 */
1046 return 0;
1047 }
1048 p->flags |= KPROBE_FLAG_DISABLED;
1049 return 1;
1050}
1051
1052static int __kprobes in_kprobes_functions(unsigned long addr) 1234static int __kprobes in_kprobes_functions(unsigned long addr)
1053{ 1235{
1054 struct kprobe_blackpoint *kb; 1236 struct kprobe_blackpoint *kb;
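
An aggrprobe, which register_aggr_kprobe() and reuse_unused_kprobe() juggle above, is simply what a probe point becomes once more than one kprobe is registered at the same address: the aggrprobe owns the breakpoint/jump and fans out to the child handlers. A small illustration of how one comes into being (the symbol and names are hypothetical):

#include <linux/kprobes.h>

static int first_pre(struct kprobe *p, struct pt_regs *regs)  { return 0; }
static int second_pre(struct kprobe *p, struct pt_regs *regs) { return 0; }

static struct kprobe first_kp = {
	.symbol_name	= "vfs_read",
	.pre_handler	= first_pre,
};
static struct kprobe second_kp = {
	.symbol_name	= "vfs_read",
	.pre_handler	= second_pre,
};

/*
 * register_kprobe(&first_kp) installs an ordinary kprobe;
 * register_kprobe(&second_kp) reaches register_aggr_kprobe(), which wraps
 * the first probe in an aggrprobe and links both onto its ->list.
 * If that aggrprobe was sitting on unoptimizing_list waiting to die,
 * reuse_unused_kprobe() above rescues and re-optimizes it instead.
 */
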
@@ -1091,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
1091/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1273/* Check passed kprobe is valid and return kprobe in kprobe_table. */
1092static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) 1274static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
1093{ 1275{
1094 struct kprobe *old_p, *list_p; 1276 struct kprobe *ap, *list_p;
1095 1277
1096 old_p = get_kprobe(p->addr); 1278 ap = get_kprobe(p->addr);
1097 if (unlikely(!old_p)) 1279 if (unlikely(!ap))
1098 return NULL; 1280 return NULL;
1099 1281
1100 if (p != old_p) { 1282 if (p != ap) {
1101 list_for_each_entry_rcu(list_p, &old_p->list, list) 1283 list_for_each_entry_rcu(list_p, &ap->list, list)
1102 if (list_p == p) 1284 if (list_p == p)
1103 /* kprobe p is a valid probe */ 1285 /* kprobe p is a valid probe */
1104 goto valid; 1286 goto valid;
1105 return NULL; 1287 return NULL;
1106 } 1288 }
1107valid: 1289valid:
1108 return old_p; 1290 return ap;
1109} 1291}
1110 1292
1111/* Return error if the kprobe is being re-registered */ 1293/* Return error if the kprobe is being re-registered */
1112static inline int check_kprobe_rereg(struct kprobe *p) 1294static inline int check_kprobe_rereg(struct kprobe *p)
1113{ 1295{
1114 int ret = 0; 1296 int ret = 0;
1115 struct kprobe *old_p;
1116 1297
1117 mutex_lock(&kprobe_mutex); 1298 mutex_lock(&kprobe_mutex);
1118 old_p = __get_valid_kprobe(p); 1299 if (__get_valid_kprobe(p))
1119 if (old_p)
1120 ret = -EINVAL; 1300 ret = -EINVAL;
1121 mutex_unlock(&kprobe_mutex); 1301 mutex_unlock(&kprobe_mutex);
1302
1122 return ret; 1303 return ret;
1123} 1304}
1124 1305
@@ -1138,13 +1319,13 @@ int __kprobes register_kprobe(struct kprobe *p)
1138 if (ret) 1319 if (ret)
1139 return ret; 1320 return ret;
1140 1321
1322 jump_label_lock();
1141 preempt_disable(); 1323 preempt_disable();
1142 if (!kernel_text_address((unsigned long) p->addr) || 1324 if (!kernel_text_address((unsigned long) p->addr) ||
1143 in_kprobes_functions((unsigned long) p->addr) || 1325 in_kprobes_functions((unsigned long) p->addr) ||
1144 ftrace_text_reserved(p->addr, p->addr)) { 1326 ftrace_text_reserved(p->addr, p->addr) ||
1145 preempt_enable(); 1327 jump_label_text_reserved(p->addr, p->addr))
1146 return -EINVAL; 1328 goto fail_with_jump_label;
1147 }
1148 1329
1149 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ 1330 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
1150 p->flags &= KPROBE_FLAG_DISABLED; 1331 p->flags &= KPROBE_FLAG_DISABLED;
@@ -1158,10 +1339,9 @@ int __kprobes register_kprobe(struct kprobe *p)
1158 * We must hold a refcount of the probed module while updating 1339 * We must hold a refcount of the probed module while updating
1159 * its code to prohibit unexpected unloading. 1340 * its code to prohibit unexpected unloading.
1160 */ 1341 */
1161 if (unlikely(!try_module_get(probed_mod))) { 1342 if (unlikely(!try_module_get(probed_mod)))
1162 preempt_enable(); 1343 goto fail_with_jump_label;
1163 return -EINVAL; 1344
1164 }
1165 /* 1345 /*
1166 * If the module freed .init.text, we couldn't insert 1346 * If the module freed .init.text, we couldn't insert
1167 * kprobes in there. 1347 * kprobes in there.
@@ -1169,16 +1349,18 @@ int __kprobes register_kprobe(struct kprobe *p)
1169 if (within_module_init((unsigned long)p->addr, probed_mod) && 1349 if (within_module_init((unsigned long)p->addr, probed_mod) &&
1170 probed_mod->state != MODULE_STATE_COMING) { 1350 probed_mod->state != MODULE_STATE_COMING) {
1171 module_put(probed_mod); 1351 module_put(probed_mod);
1172 preempt_enable(); 1352 goto fail_with_jump_label;
1173 return -EINVAL;
1174 } 1353 }
1175 } 1354 }
1176 preempt_enable(); 1355 preempt_enable();
1356 jump_label_unlock();
1177 1357
1178 p->nmissed = 0; 1358 p->nmissed = 0;
1179 INIT_LIST_HEAD(&p->list); 1359 INIT_LIST_HEAD(&p->list);
1180 mutex_lock(&kprobe_mutex); 1360 mutex_lock(&kprobe_mutex);
1181 1361
1362 jump_label_lock(); /* needed to call jump_label_text_reserved() */
1363
1182 get_online_cpus(); /* For avoiding text_mutex deadlock. */ 1364 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1183 mutex_lock(&text_mutex); 1365 mutex_lock(&text_mutex);
1184 1366
@@ -1206,76 +1388,136 @@ int __kprobes register_kprobe(struct kprobe *p)
1206out: 1388out:
1207 mutex_unlock(&text_mutex); 1389 mutex_unlock(&text_mutex);
1208 put_online_cpus(); 1390 put_online_cpus();
1391 jump_label_unlock();
1209 mutex_unlock(&kprobe_mutex); 1392 mutex_unlock(&kprobe_mutex);
1210 1393
1211 if (probed_mod) 1394 if (probed_mod)
1212 module_put(probed_mod); 1395 module_put(probed_mod);
1213 1396
1214 return ret; 1397 return ret;
1398
1399fail_with_jump_label:
1400 preempt_enable();
1401 jump_label_unlock();
1402 return -EINVAL;
1215} 1403}
1216EXPORT_SYMBOL_GPL(register_kprobe); 1404EXPORT_SYMBOL_GPL(register_kprobe);
1217 1405
1406/* Check if all probes on the aggrprobe are disabled */
1407static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
1408{
1409 struct kprobe *kp;
1410
1411 list_for_each_entry_rcu(kp, &ap->list, list)
1412 if (!kprobe_disabled(kp))
1413 /*
1414 * There is an active probe on the list.
1415 * We can't disable this ap.
1416 */
1417 return 0;
1418
1419 return 1;
1420}
1421
1422/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
1423static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
1424{
1425 struct kprobe *orig_p;
1426
1427 /* Get an original kprobe for return */
1428 orig_p = __get_valid_kprobe(p);
1429 if (unlikely(orig_p == NULL))
1430 return NULL;
1431
1432 if (!kprobe_disabled(p)) {
1433 /* Disable probe if it is a child probe */
1434 if (p != orig_p)
1435 p->flags |= KPROBE_FLAG_DISABLED;
1436
1437 /* Try to disarm and disable this/parent probe */
1438 if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
1439 disarm_kprobe(orig_p);
1440 orig_p->flags |= KPROBE_FLAG_DISABLED;
1441 }
1442 }
1443
1444 return orig_p;
1445}
1446
1218/* 1447/*
1219 * Unregister a kprobe without a scheduler synchronization. 1448 * Unregister a kprobe without a scheduler synchronization.
1220 */ 1449 */
1221static int __kprobes __unregister_kprobe_top(struct kprobe *p) 1450static int __kprobes __unregister_kprobe_top(struct kprobe *p)
1222{ 1451{
1223 struct kprobe *old_p, *list_p; 1452 struct kprobe *ap, *list_p;
1224 1453
1225 old_p = __get_valid_kprobe(p); 1454 /* Disable kprobe. This will disarm it if needed. */
1226 if (old_p == NULL) 1455 ap = __disable_kprobe(p);
1456 if (ap == NULL)
1227 return -EINVAL; 1457 return -EINVAL;
1228 1458
1229 if (old_p == p || 1459 if (ap == p)
1230 (kprobe_aggrprobe(old_p) &&
1231 list_is_singular(&old_p->list))) {
1232 /* 1460 /*
1233 * Only probe on the hash list. Disarm only if kprobes are 1461 * This probe is an independent(and non-optimized) kprobe
1234 * enabled and not gone - otherwise, the breakpoint would 1462 * (not an aggrprobe). Remove from the hash list.
1235 * already have been removed. We save on flushing icache.
1236 */ 1463 */
1237 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1464 goto disarmed;
1238 disarm_kprobe(old_p); 1465
1239 hlist_del_rcu(&old_p->hlist); 1466 /* Following process expects this probe is an aggrprobe */
1240 } else { 1467 WARN_ON(!kprobe_aggrprobe(ap));
1468
1469 if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
1470 /*
1471 * !disarmed could be happen if the probe is under delayed
1472 * unoptimizing.
1473 */
1474 goto disarmed;
1475 else {
1476 /* If disabling probe has special handlers, update aggrprobe */
1241 if (p->break_handler && !kprobe_gone(p)) 1477 if (p->break_handler && !kprobe_gone(p))
1242 old_p->break_handler = NULL; 1478 ap->break_handler = NULL;
1243 if (p->post_handler && !kprobe_gone(p)) { 1479 if (p->post_handler && !kprobe_gone(p)) {
1244 list_for_each_entry_rcu(list_p, &old_p->list, list) { 1480 list_for_each_entry_rcu(list_p, &ap->list, list) {
1245 if ((list_p != p) && (list_p->post_handler)) 1481 if ((list_p != p) && (list_p->post_handler))
1246 goto noclean; 1482 goto noclean;
1247 } 1483 }
1248 old_p->post_handler = NULL; 1484 ap->post_handler = NULL;
1249 } 1485 }
1250noclean: 1486noclean:
1487 /*
1488 * Remove from the aggrprobe: this path will do nothing in
1489 * __unregister_kprobe_bottom().
1490 */
1251 list_del_rcu(&p->list); 1491 list_del_rcu(&p->list);
1252 if (!kprobe_disabled(old_p)) { 1492 if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
1253 try_to_disable_aggr_kprobe(old_p); 1493 /*
1254 if (!kprobes_all_disarmed) { 1494 * Try to optimize this probe again, because post
1255 if (kprobe_disabled(old_p)) 1495 * handler may have been changed.
1256 disarm_kprobe(old_p); 1496 */
1257 else 1497 optimize_kprobe(ap);
1258 /* Try to optimize this probe again */
1259 optimize_kprobe(old_p);
1260 }
1261 }
1262 } 1498 }
1263 return 0; 1499 return 0;
1500
1501disarmed:
1502 BUG_ON(!kprobe_disarmed(ap));
1503 hlist_del_rcu(&ap->hlist);
1504 return 0;
1264} 1505}
1265 1506
1266static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) 1507static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
1267{ 1508{
1268 struct kprobe *old_p; 1509 struct kprobe *ap;
1269 1510
1270 if (list_empty(&p->list)) 1511 if (list_empty(&p->list))
1512 /* This is an independent kprobe */
1271 arch_remove_kprobe(p); 1513 arch_remove_kprobe(p);
1272 else if (list_is_singular(&p->list)) { 1514 else if (list_is_singular(&p->list)) {
1273 /* "p" is the last child of an aggr_kprobe */ 1515 /* This is the last child of an aggrprobe */
1274 old_p = list_entry(p->list.next, struct kprobe, list); 1516 ap = list_entry(p->list.next, struct kprobe, list);
1275 list_del(&p->list); 1517 list_del(&p->list);
1276 arch_remove_kprobe(old_p); 1518 free_aggr_kprobe(ap);
1277 free_aggr_kprobe(old_p);
1278 } 1519 }
1520 /* Otherwise, do nothing. */
1279} 1521}
1280 1522
1281int __kprobes register_kprobes(struct kprobe **kps, int num) 1523int __kprobes register_kprobes(struct kprobe **kps, int num)
@@ -1339,18 +1581,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
1339 if (num <= 0) 1581 if (num <= 0)
1340 return -EINVAL; 1582 return -EINVAL;
1341 for (i = 0; i < num; i++) { 1583 for (i = 0; i < num; i++) {
1342 unsigned long addr; 1584 unsigned long addr, offset;
1343 jp = jps[i]; 1585 jp = jps[i];
1344 addr = arch_deref_entry_point(jp->entry); 1586 addr = arch_deref_entry_point(jp->entry);
1345 1587
1346 if (!kernel_text_address(addr)) 1588 /* Verify probepoint is a function entry point */
1347 ret = -EINVAL; 1589 if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
1348 else { 1590 offset == 0) {
1349 /* Todo: Verify probepoint is a function entry point */
1350 jp->kp.pre_handler = setjmp_pre_handler; 1591 jp->kp.pre_handler = setjmp_pre_handler;
1351 jp->kp.break_handler = longjmp_break_handler; 1592 jp->kp.break_handler = longjmp_break_handler;
1352 ret = register_kprobe(&jp->kp); 1593 ret = register_kprobe(&jp->kp);
1353 } 1594 } else
1595 ret = -EINVAL;
1596
1354 if (ret < 0) { 1597 if (ret < 0) {
1355 if (i > 0) 1598 if (i > 0)
1356 unregister_jprobes(jps, i); 1599 unregister_jprobes(jps, i);
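
register_jprobes() now resolves the handler's target through kallsyms_lookup_size_offset() and insists on offset 0, i.e. a jprobe may only be planted on a real function entry point; the old TODO is resolved. For reference, a jprobe pairs a probe point with a handler that mirrors the probed function's signature, roughly as below (do_unlinkat and the names here are illustrative):

#include <linux/kprobes.h>
#include <linux/fs.h>

/* Handler signature must match the probed function exactly. */
static long jexample_do_unlinkat(int dfd, const char __user *pathname)
{
	/* inspect the arguments here */
	jprobe_return();	/* mandatory: never return normally */
	return 0;		/* unreachable, keeps the compiler happy */
}

static struct jprobe jexample = {
	.entry		= jexample_do_unlinkat,
	.kp.symbol_name	= "do_unlinkat",
};

/* register_jprobe(&jexample) succeeds only if the symbol resolves to
 * offset 0 of a function; probing mid-function now returns -EINVAL. */
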
@@ -1592,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1592int __kprobes disable_kprobe(struct kprobe *kp) 1835int __kprobes disable_kprobe(struct kprobe *kp)
1593{ 1836{
1594 int ret = 0; 1837 int ret = 0;
1595 struct kprobe *p;
1596 1838
1597 mutex_lock(&kprobe_mutex); 1839 mutex_lock(&kprobe_mutex);
1598 1840
1599 /* Check whether specified probe is valid. */ 1841 /* Disable this kprobe */
1600 p = __get_valid_kprobe(kp); 1842 if (__disable_kprobe(kp) == NULL)
1601 if (unlikely(p == NULL)) {
1602 ret = -EINVAL; 1843 ret = -EINVAL;
1603 goto out;
1604 }
1605 1844
1606 /* If the probe is already disabled (or gone), just return */
1607 if (kprobe_disabled(kp))
1608 goto out;
1609
1610 kp->flags |= KPROBE_FLAG_DISABLED;
1611 if (p != kp)
1612 /* When kp != p, p is always enabled. */
1613 try_to_disable_aggr_kprobe(p);
1614
1615 if (!kprobes_all_disarmed && kprobe_disabled(p))
1616 disarm_kprobe(p);
1617out:
1618 mutex_unlock(&kprobe_mutex); 1845 mutex_unlock(&kprobe_mutex);
1619 return ret; 1846 return ret;
1620} 1847}
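
disable_kprobe() is now a thin wrapper around the shared __disable_kprobe() helper introduced above, so direct callers and the unregister path disarm probes the same way. Usage is unchanged; a short reminder of the pattern (the names come from the earlier sketches and are illustrative):

#include <linux/kprobes.h>

static void example_pause_probe(struct kprobe *kp, bool pause)
{
	if (pause)
		disable_kprobe(kp);	/* handler stops firing; probe stays registered */
	else
		enable_kprobe(kp);	/* re-arms it (and re-queues optimization) */
}
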
@@ -1912,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void)
1912 mutex_lock(&kprobe_mutex); 2139 mutex_lock(&kprobe_mutex);
1913 2140
1914 /* If kprobes are already disarmed, just return */ 2141 /* If kprobes are already disarmed, just return */
1915 if (kprobes_all_disarmed) 2142 if (kprobes_all_disarmed) {
1916 goto already_disabled; 2143 mutex_unlock(&kprobe_mutex);
2144 return;
2145 }
1917 2146
1918 kprobes_all_disarmed = true; 2147 kprobes_all_disarmed = true;
1919 printk(KERN_INFO "Kprobes globally disabled\n"); 2148 printk(KERN_INFO "Kprobes globally disabled\n");
1920 2149
1921 /*
1922 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1923 * because disarming may also unoptimize kprobes.
1924 */
1925 get_online_cpus();
1926 mutex_lock(&text_mutex); 2150 mutex_lock(&text_mutex);
1927 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2151 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1928 head = &kprobe_table[i]; 2152 head = &kprobe_table[i];
1929 hlist_for_each_entry_rcu(p, node, head, hlist) { 2153 hlist_for_each_entry_rcu(p, node, head, hlist) {
1930 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 2154 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1931 __disarm_kprobe(p); 2155 __disarm_kprobe(p, false);
1932 } 2156 }
1933 } 2157 }
1934
1935 mutex_unlock(&text_mutex); 2158 mutex_unlock(&text_mutex);
1936 put_online_cpus();
1937 mutex_unlock(&kprobe_mutex); 2159 mutex_unlock(&kprobe_mutex);
1938 /* Allow all currently running kprobes to complete */
1939 synchronize_sched();
1940 return;
1941 2160
1942already_disabled: 2161 /* Wait for disarming all kprobes by optimizer */
1943 mutex_unlock(&kprobe_mutex); 2162 wait_for_kprobe_optimizer();
1944 return;
1945} 2163}
1946 2164
1947/* 2165/*
@@ -1992,6 +2210,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
1992static const struct file_operations fops_kp = { 2210static const struct file_operations fops_kp = {
1993 .read = read_enabled_file_bool, 2211 .read = read_enabled_file_bool,
1994 .write = write_enabled_file_bool, 2212 .write = write_enabled_file_bool,
2213 .llseek = default_llseek,
1995}; 2214};
1996 2215
1997static int __kprobes debugfs_kprobe_init(void) 2216static int __kprobes debugfs_kprobe_init(void)
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 0b624e791805..3b053c04dd86 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -16,6 +16,7 @@
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/capability.h>
19 20
20#define KERNEL_ATTR_RO(_name) \ 21#define KERNEL_ATTR_RO(_name) \
21static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 22static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -131,6 +132,14 @@ KERNEL_ATTR_RO(vmcoreinfo);
131 132
132#endif /* CONFIG_KEXEC */ 133#endif /* CONFIG_KEXEC */
133 134
135/* whether file capabilities are enabled */
136static ssize_t fscaps_show(struct kobject *kobj,
137 struct kobj_attribute *attr, char *buf)
138{
139 return sprintf(buf, "%d\n", file_caps_enabled);
140}
141KERNEL_ATTR_RO(fscaps);
142
134/* 143/*
135 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 144 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
136 */ 145 */
@@ -158,6 +167,7 @@ struct kobject *kernel_kobj;
158EXPORT_SYMBOL_GPL(kernel_kobj); 167EXPORT_SYMBOL_GPL(kernel_kobj);
159 168
160static struct attribute * kernel_attrs[] = { 169static struct attribute * kernel_attrs[] = {
170 &fscaps_attr.attr,
161#if defined(CONFIG_HOTPLUG) 171#if defined(CONFIG_HOTPLUG)
162 &uevent_seqnum_attr.attr, 172 &uevent_seqnum_attr.attr,
163 &uevent_helper_attr.attr, 173 &uevent_helper_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2dc3786349d1..4ba7cccb4994 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@ struct kthread_create_info
27 /* Information passed to kthread() from kthreadd. */ 27 /* Information passed to kthread() from kthreadd. */
28 int (*threadfn)(void *data); 28 int (*threadfn)(void *data);
29 void *data; 29 void *data;
30 int node;
30 31
31 /* Result passed back to kthread_create() from kthreadd. */ 32 /* Result passed back to kthread_create() from kthreadd. */
32 struct task_struct *result; 33 struct task_struct *result;
@@ -98,10 +99,23 @@ static int kthread(void *_create)
98 do_exit(ret); 99 do_exit(ret);
99} 100}
100 101
102/* called from do_fork() to get node information for about to be created task */
103int tsk_fork_get_node(struct task_struct *tsk)
104{
105#ifdef CONFIG_NUMA
106 if (tsk == kthreadd_task)
107 return tsk->pref_node_fork;
108#endif
109 return numa_node_id();
110}
111
101static void create_kthread(struct kthread_create_info *create) 112static void create_kthread(struct kthread_create_info *create)
102{ 113{
103 int pid; 114 int pid;
104 115
116#ifdef CONFIG_NUMA
117 current->pref_node_fork = create->node;
118#endif
105 /* We want our own signal handler (we take no signals by default). */ 119 /* We want our own signal handler (we take no signals by default). */
106 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); 120 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
107 if (pid < 0) { 121 if (pid < 0) {
@@ -111,33 +125,38 @@ static void create_kthread(struct kthread_create_info *create)
111} 125}
112 126
113/** 127/**
114 * kthread_create - create a kthread. 128 * kthread_create_on_node - create a kthread.
115 * @threadfn: the function to run until signal_pending(current). 129 * @threadfn: the function to run until signal_pending(current).
116 * @data: data ptr for @threadfn. 130 * @data: data ptr for @threadfn.
131 * @node: memory node number.
117 * @namefmt: printf-style name for the thread. 132 * @namefmt: printf-style name for the thread.
118 * 133 *
119 * Description: This helper function creates and names a kernel 134 * Description: This helper function creates and names a kernel
120 * thread. The thread will be stopped: use wake_up_process() to start 135 * thread. The thread will be stopped: use wake_up_process() to start
121 * it. See also kthread_run(). 136 * it. See also kthread_run().
122 * 137 *
138 * If thread is going to be bound on a particular cpu, give its node
139 * in @node, to get NUMA affinity for kthread stack, or else give -1.
123 * When woken, the thread will run @threadfn() with @data as its 140 * When woken, the thread will run @threadfn() with @data as its
124 * argument. @threadfn() can either call do_exit() directly if it is a 141 * argument. @threadfn() can either call do_exit() directly if it is a
125 * standalone thread for which noone will call kthread_stop(), or 142 * standalone thread for which no one will call kthread_stop(), or
126 * return when 'kthread_should_stop()' is true (which means 143 * return when 'kthread_should_stop()' is true (which means
127 * kthread_stop() has been called). The return value should be zero 144 * kthread_stop() has been called). The return value should be zero
128 * or a negative error number; it will be passed to kthread_stop(). 145 * or a negative error number; it will be passed to kthread_stop().
129 * 146 *
130 * Returns a task_struct or ERR_PTR(-ENOMEM). 147 * Returns a task_struct or ERR_PTR(-ENOMEM).
131 */ 148 */
132struct task_struct *kthread_create(int (*threadfn)(void *data), 149struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
133 void *data, 150 void *data,
134 const char namefmt[], 151 int node,
135 ...) 152 const char namefmt[],
153 ...)
136{ 154{
137 struct kthread_create_info create; 155 struct kthread_create_info create;
138 156
139 create.threadfn = threadfn; 157 create.threadfn = threadfn;
140 create.data = data; 158 create.data = data;
159 create.node = node;
141 init_completion(&create.done); 160 init_completion(&create.done);
142 161
143 spin_lock(&kthread_create_lock); 162 spin_lock(&kthread_create_lock);
@@ -148,7 +167,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
148 wait_for_completion(&create.done); 167 wait_for_completion(&create.done);
149 168
150 if (!IS_ERR(create.result)) { 169 if (!IS_ERR(create.result)) {
151 struct sched_param param = { .sched_priority = 0 }; 170 static const struct sched_param param = { .sched_priority = 0 };
152 va_list args; 171 va_list args;
153 172
154 va_start(args, namefmt); 173 va_start(args, namefmt);
@@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
164 } 183 }
165 return create.result; 184 return create.result;
166} 185}
167EXPORT_SYMBOL(kthread_create); 186EXPORT_SYMBOL(kthread_create_on_node);
168 187
169/** 188/**
170 * kthread_bind - bind a just-created kthread to a cpu. 189 * kthread_bind - bind a just-created kthread to a cpu.
@@ -183,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
183 return; 202 return;
184 } 203 }
185 204
186 p->cpus_allowed = cpumask_of_cpu(cpu); 205 /* It's safe because the task is inactive. */
187 p->rt.nr_cpus_allowed = 1; 206 do_set_cpus_allowed(p, cpumask_of(cpu));
188 p->flags |= PF_THREAD_BOUND; 207 p->flags |= PF_THREAD_BOUND;
189} 208}
190EXPORT_SYMBOL(kthread_bind); 209EXPORT_SYMBOL(kthread_bind);
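The renamed kthread_create_on_node() takes the NUMA node up front so the thread's stack can be allocated close to the CPU it will later be bound to, and kthread_bind() now goes through do_set_cpus_allowed(). A hedged usage sketch follows; the worker body and names are illustrative, the calls themselves are the ones shown in these hunks.

#include <linux/cpumask.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/topology.h>

static int my_worker(void *data)		/* hypothetical thread body */
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static struct task_struct *start_worker_on(unsigned int cpu)
{
	struct task_struct *tsk;

	/* pass the CPU's node so the kthread stack gets NUMA-local placement */
	tsk = kthread_create_on_node(my_worker, NULL, cpu_to_node(cpu),
				     "my_worker/%u", cpu);
	if (IS_ERR(tsk))
		return tsk;

	kthread_bind(tsk, cpu);		/* safe: the task has not run yet */
	wake_up_process(tsk);
	return tsk;
}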
@@ -265,6 +284,17 @@ int kthreadd(void *unused)
265 return 0; 284 return 0;
266} 285}
267 286
287void __init_kthread_worker(struct kthread_worker *worker,
288 const char *name,
289 struct lock_class_key *key)
290{
291 spin_lock_init(&worker->lock);
292 lockdep_set_class_and_name(&worker->lock, key, name);
293 INIT_LIST_HEAD(&worker->work_list);
294 worker->task = NULL;
295}
296EXPORT_SYMBOL_GPL(__init_kthread_worker);
297
268/** 298/**
269 * kthread_worker_fn - kthread function to process kthread_worker 299 * kthread_worker_fn - kthread function to process kthread_worker
270 * @worker_ptr: pointer to initialized kthread_worker 300 * @worker_ptr: pointer to initialized kthread_worker
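__init_kthread_worker() above backs the worker initializers in <linux/kthread.h>. The sketch below shows how a kthread_worker is typically driven, assuming the companion helpers init_kthread_worker(), init_kthread_work(), queue_kthread_work() and flush_kthread_worker() from that header, which are not part of this hunk; the names of the demo work item and thread are made up.

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/module.h>

static struct kthread_worker my_worker;
static struct kthread_work my_work;
static struct task_struct *my_worker_task;

static void my_work_fn(struct kthread_work *work)	/* hypothetical job */
{
	pr_info("kthread_worker job ran\n");
}

static int __init worker_demo_init(void)
{
	init_kthread_worker(&my_worker);	/* ends up in __init_kthread_worker() */
	init_kthread_work(&my_work, my_work_fn);

	/* kthread_worker_fn() loops over my_worker.work_list until stopped */
	my_worker_task = kthread_run(kthread_worker_fn, &my_worker, "demo_worker");
	if (IS_ERR(my_worker_task))
		return PTR_ERR(my_worker_task);

	queue_kthread_work(&my_worker, &my_work);
	return 0;
}
module_init(worker_demo_init);

static void __exit worker_demo_exit(void)
{
	flush_kthread_worker(&my_worker);
	kthread_stop(my_worker_task);
}
module_exit(worker_demo_exit);
MODULE_LICENSE("GPL");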
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 877fb306d415..376066e10413 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -153,7 +153,7 @@ static inline void store_stacktrace(struct task_struct *tsk,
153} 153}
154 154
155/** 155/**
156 * __account_scheduler_latency - record an occured latency 156 * __account_scheduler_latency - record an occurred latency
157 * @tsk - the task struct of the task hitting the latency 157 * @tsk - the task struct of the task hitting the latency
158 * @usecs - the duration of the latency in microseconds 158 * @usecs - the duration of the latency in microseconds
159 * @inter - 1 if the sleep was interruptible, 0 if uninterruptible 159 * @inter - 1 if the sleep was interruptible, 0 if uninterruptible
@@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
194 194
195 account_global_scheduler_latency(tsk, &lat); 195 account_global_scheduler_latency(tsk, &lat);
196 196
197 /* 197 for (i = 0; i < tsk->latency_record_count; i++) {
198 * short term hack; if we're > 32 we stop; future we recycle:
199 */
200 tsk->latency_record_count++;
201 if (tsk->latency_record_count >= LT_SAVECOUNT)
202 goto out_unlock;
203
204 for (i = 0; i < LT_SAVECOUNT; i++) {
205 struct latency_record *mylat; 198 struct latency_record *mylat;
206 int same = 1; 199 int same = 1;
207 200
@@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
227 } 220 }
228 } 221 }
229 222
223 /*
224 * short term hack; if we're > 32 we stop; future we recycle:
225 */
226 if (tsk->latency_record_count >= LT_SAVECOUNT)
227 goto out_unlock;
228
230 /* Allocated a new one: */ 229 /* Allocated a new one: */
231 i = tsk->latency_record_count; 230 i = tsk->latency_record_count++;
232 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); 231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
233 232
234out_unlock: 233out_unlock:
@@ -242,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v)
242 seq_puts(m, "Latency Top version : v0.1\n"); 241 seq_puts(m, "Latency Top version : v0.1\n");
243 242
244 for (i = 0; i < MAXLR; i++) { 243 for (i = 0; i < MAXLR; i++) {
245 if (latency_record[i].backtrace[0]) { 244 struct latency_record *lr = &latency_record[i];
245
246 if (lr->backtrace[0]) {
246 int q; 247 int q;
247 seq_printf(m, "%i %lu %lu ", 248 seq_printf(m, "%i %lu %lu",
248 latency_record[i].count, 249 lr->count, lr->time, lr->max);
249 latency_record[i].time,
250 latency_record[i].max);
251 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 250 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
252 char sym[KSYM_SYMBOL_LEN]; 251 unsigned long bt = lr->backtrace[q];
253 char *c; 252 if (!bt)
254 if (!latency_record[i].backtrace[q])
255 break; 253 break;
256 if (latency_record[i].backtrace[q] == ULONG_MAX) 254 if (bt == ULONG_MAX)
257 break; 255 break;
258 sprint_symbol(sym, latency_record[i].backtrace[q]); 256 seq_printf(m, " %ps", (void *)bt);
259 c = strchr(sym, '+');
260 if (c)
261 *c = 0;
262 seq_printf(m, "%s ", sym);
263 } 257 }
264 seq_printf(m, "\n"); 258 seq_printf(m, "\n");
265 } 259 }
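lstats_show() now leans on the %ps printk extension instead of sprint_symbol() plus manual stripping of the "+offset" suffix. A minimal sketch of the difference, with a made-up seq_file helper and address, assuming the standard vsprintf %ps/%pS conversions:

#include <linux/kernel.h>
#include <linux/seq_file.h>

static void show_backtrace_entry(struct seq_file *m, unsigned long bt)
{
	/* %ps prints just the symbol name, which is what the old code
	 * emulated by cutting the sprint_symbol() string at '+';
	 * %pS would also append the +offset/size suffix. */
	seq_printf(m, " %ps", (void *)bt);		/* e.g. " vfs_read" */
	pr_debug("full form: %pS\n", (void *)bt);	/* e.g. "vfs_read+0x10/0x130" */
}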
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f2852a510232..298c9276dfdb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -490,6 +490,18 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
490 usage[i] = '\0'; 490 usage[i] = '\0';
491} 491}
492 492
493static int __print_lock_name(struct lock_class *class)
494{
495 char str[KSYM_NAME_LEN];
496 const char *name;
497
498 name = class->name;
499 if (!name)
500 name = __get_key_name(class->key, str);
501
502 return printk("%s", name);
503}
504
493static void print_lock_name(struct lock_class *class) 505static void print_lock_name(struct lock_class *class)
494{ 506{
495 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; 507 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
@@ -639,6 +651,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
639 } 651 }
640#endif 652#endif
641 653
654 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
655 debug_locks_off();
656 printk(KERN_ERR
657 "BUG: looking up invalid subclass: %u\n", subclass);
658 printk(KERN_ERR
659 "turning off the locking correctness validator.\n");
660 dump_stack();
661 return NULL;
662 }
663
642 /* 664 /*
643 * Static locks do not have their class-keys yet - for them the key 665 * Static locks do not have their class-keys yet - for them the key
644 * is the lock object itself: 666 * is the lock object itself:
@@ -774,7 +796,9 @@ out_unlock_set:
774 raw_local_irq_restore(flags); 796 raw_local_irq_restore(flags);
775 797
776 if (!subclass || force) 798 if (!subclass || force)
777 lock->class_cache = class; 799 lock->class_cache[0] = class;
800 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
801 lock->class_cache[subclass] = class;
778 802
779 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) 803 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
780 return NULL; 804 return NULL;
@@ -1041,6 +1065,56 @@ print_circular_bug_entry(struct lock_list *target, int depth)
1041 return 0; 1065 return 0;
1042} 1066}
1043 1067
1068static void
1069print_circular_lock_scenario(struct held_lock *src,
1070 struct held_lock *tgt,
1071 struct lock_list *prt)
1072{
1073 struct lock_class *source = hlock_class(src);
1074 struct lock_class *target = hlock_class(tgt);
1075 struct lock_class *parent = prt->class;
1076
1077 /*
1078 * A direct locking problem where unsafe_class lock is taken
1079 * directly by safe_class lock, then all we need to show
1080 * is the deadlock scenario, as it is obvious that the
1081 * unsafe lock is taken under the safe lock.
1082 *
1083 * But if there is a chain instead, where the safe lock takes
1084 * an intermediate lock (middle_class) where this lock is
1085 * not the same as the safe lock, then the lock chain is
1086 * used to describe the problem. Otherwise we would need
1087 * to show a different CPU case for each link in the chain
1088 * from the safe_class lock to the unsafe_class lock.
1089 */
1090 if (parent != source) {
1091 printk("Chain exists of:\n ");
1092 __print_lock_name(source);
1093 printk(" --> ");
1094 __print_lock_name(parent);
1095 printk(" --> ");
1096 __print_lock_name(target);
1097 printk("\n\n");
1098 }
1099
1100 printk(" Possible unsafe locking scenario:\n\n");
1101 printk(" CPU0 CPU1\n");
1102 printk(" ---- ----\n");
1103 printk(" lock(");
1104 __print_lock_name(target);
1105 printk(");\n");
1106 printk(" lock(");
1107 __print_lock_name(parent);
1108 printk(");\n");
1109 printk(" lock(");
1110 __print_lock_name(target);
1111 printk(");\n");
1112 printk(" lock(");
1113 __print_lock_name(source);
1114 printk(");\n");
1115 printk("\n *** DEADLOCK ***\n\n");
1116}
1117
1044/* 1118/*
1045 * When a circular dependency is detected, print the 1119 * When a circular dependency is detected, print the
1046 * header first: 1120 * header first:
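print_circular_lock_scenario() above condenses an AB-BA inversion into a two-CPU table. For reference, a hedged sketch of the kind of code that produces such a report; the two mutexes and both call paths are made up.

#include <linux/mutex.h>

static DEFINE_MUTEX(lock_a);	/* hypothetical locks */
static DEFINE_MUTEX(lock_b);

static void path_one(void)		/* e.g. runs on CPU0 */
{
	mutex_lock(&lock_a);
	mutex_lock(&lock_b);		/* records the dependency A --> B */
	mutex_unlock(&lock_b);
	mutex_unlock(&lock_a);
}

static void path_two(void)		/* e.g. runs on CPU1 */
{
	mutex_lock(&lock_b);
	mutex_lock(&lock_a);		/* B --> A closes the cycle: lockdep
					 * emits the circular-bug report with
					 * the scenario table shown above */
	mutex_unlock(&lock_a);
	mutex_unlock(&lock_b);
}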
@@ -1084,6 +1158,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1084{ 1158{
1085 struct task_struct *curr = current; 1159 struct task_struct *curr = current;
1086 struct lock_list *parent; 1160 struct lock_list *parent;
1161 struct lock_list *first_parent;
1087 int depth; 1162 int depth;
1088 1163
1089 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1164 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
@@ -1097,6 +1172,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1097 print_circular_bug_header(target, depth, check_src, check_tgt); 1172 print_circular_bug_header(target, depth, check_src, check_tgt);
1098 1173
1099 parent = get_lock_parent(target); 1174 parent = get_lock_parent(target);
1175 first_parent = parent;
1100 1176
1101 while (parent) { 1177 while (parent) {
1102 print_circular_bug_entry(parent, --depth); 1178 print_circular_bug_entry(parent, --depth);
@@ -1104,6 +1180,9 @@ static noinline int print_circular_bug(struct lock_list *this,
1104 } 1180 }
1105 1181
1106 printk("\nother info that might help us debug this:\n\n"); 1182 printk("\nother info that might help us debug this:\n\n");
1183 print_circular_lock_scenario(check_src, check_tgt,
1184 first_parent);
1185
1107 lockdep_print_held_locks(curr); 1186 lockdep_print_held_locks(curr);
1108 1187
1109 printk("\nstack backtrace:\n"); 1188 printk("\nstack backtrace:\n");
@@ -1302,7 +1381,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1302 printk("\n"); 1381 printk("\n");
1303 1382
1304 if (depth == 0 && (entry != root)) { 1383 if (depth == 0 && (entry != root)) {
1305 printk("lockdep:%s bad BFS generated tree\n", __func__); 1384 printk("lockdep:%s bad path found in chain graph\n", __func__);
1306 break; 1385 break;
1307 } 1386 }
1308 1387
@@ -1313,6 +1392,62 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1313 return; 1392 return;
1314} 1393}
1315 1394
1395static void
1396print_irq_lock_scenario(struct lock_list *safe_entry,
1397 struct lock_list *unsafe_entry,
1398 struct lock_class *prev_class,
1399 struct lock_class *next_class)
1400{
1401 struct lock_class *safe_class = safe_entry->class;
1402 struct lock_class *unsafe_class = unsafe_entry->class;
1403 struct lock_class *middle_class = prev_class;
1404
1405 if (middle_class == safe_class)
1406 middle_class = next_class;
1407
1408 /*
1409 * A direct locking problem where unsafe_class lock is taken
1410 * directly by safe_class lock, then all we need to show
1411 * is the deadlock scenario, as it is obvious that the
1412 * unsafe lock is taken under the safe lock.
1413 *
1414 * But if there is a chain instead, where the safe lock takes
1415 * an intermediate lock (middle_class) where this lock is
1416 * not the same as the safe lock, then the lock chain is
1417 * used to describe the problem. Otherwise we would need
1418 * to show a different CPU case for each link in the chain
1419 * from the safe_class lock to the unsafe_class lock.
1420 */
1421 if (middle_class != unsafe_class) {
1422 printk("Chain exists of:\n ");
1423 __print_lock_name(safe_class);
1424 printk(" --> ");
1425 __print_lock_name(middle_class);
1426 printk(" --> ");
1427 __print_lock_name(unsafe_class);
1428 printk("\n\n");
1429 }
1430
1431 printk(" Possible interrupt unsafe locking scenario:\n\n");
1432 printk(" CPU0 CPU1\n");
1433 printk(" ---- ----\n");
1434 printk(" lock(");
1435 __print_lock_name(unsafe_class);
1436 printk(");\n");
1437 printk(" local_irq_disable();\n");
1438 printk(" lock(");
1439 __print_lock_name(safe_class);
1440 printk(");\n");
1441 printk(" lock(");
1442 __print_lock_name(middle_class);
1443 printk(");\n");
1444 printk(" <Interrupt>\n");
1445 printk(" lock(");
1446 __print_lock_name(safe_class);
1447 printk(");\n");
1448 printk("\n *** DEADLOCK ***\n\n");
1449}
1450
1316static int 1451static int
1317print_bad_irq_dependency(struct task_struct *curr, 1452print_bad_irq_dependency(struct task_struct *curr,
1318 struct lock_list *prev_root, 1453 struct lock_list *prev_root,
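print_irq_lock_scenario() describes a lock that is taken both with interrupts enabled and from interrupt context. A hedged sketch of a driver-style pattern that would trigger it, with a hypothetical spinlock and handler; the irq-saving variant at the end is the conventional fix.

#include <linux/interrupt.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(dev_lock);	/* hypothetical device lock */

static irqreturn_t dev_irq(int irq, void *dev_id)
{
	spin_lock(&dev_lock);		/* the lock is now hardirq-safe ... */
	/* ... touch device state ... */
	spin_unlock(&dev_lock);
	return IRQ_HANDLED;
}

static void dev_ioctl_path(void)
{
	unsigned long flags;

	/* ... but taking it with interrupts enabled also makes it
	 * hardirq-unsafe: the interrupt can arrive while the lock is
	 * held, deadlocking exactly as in the scenario table. */
	spin_lock(&dev_lock);
	/* ... */
	spin_unlock(&dev_lock);

	/* Correct form for a lock shared with an interrupt handler: */
	spin_lock_irqsave(&dev_lock, flags);
	/* ... */
	spin_unlock_irqrestore(&dev_lock, flags);
}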
@@ -1364,6 +1499,9 @@ print_bad_irq_dependency(struct task_struct *curr,
1364 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); 1499 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
1365 1500
1366 printk("\nother info that might help us debug this:\n\n"); 1501 printk("\nother info that might help us debug this:\n\n");
1502 print_irq_lock_scenario(backwards_entry, forwards_entry,
1503 hlock_class(prev), hlock_class(next));
1504
1367 lockdep_print_held_locks(curr); 1505 lockdep_print_held_locks(curr);
1368 1506
1369 printk("\nthe dependencies between %s-irq-safe lock", irqclass); 1507 printk("\nthe dependencies between %s-irq-safe lock", irqclass);
@@ -1527,6 +1665,26 @@ static inline void inc_chains(void)
1527 1665
1528#endif 1666#endif
1529 1667
1668static void
1669print_deadlock_scenario(struct held_lock *nxt,
1670 struct held_lock *prv)
1671{
1672 struct lock_class *next = hlock_class(nxt);
1673 struct lock_class *prev = hlock_class(prv);
1674
1675 printk(" Possible unsafe locking scenario:\n\n");
1676 printk(" CPU0\n");
1677 printk(" ----\n");
1678 printk(" lock(");
1679 __print_lock_name(prev);
1680 printk(");\n");
1681 printk(" lock(");
1682 __print_lock_name(next);
1683 printk(");\n");
1684 printk("\n *** DEADLOCK ***\n\n");
1685 printk(" May be due to missing lock nesting notation\n\n");
1686}
1687
1530static int 1688static int
1531print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, 1689print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1532 struct held_lock *next) 1690 struct held_lock *next)
@@ -1545,6 +1703,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1545 print_lock(prev); 1703 print_lock(prev);
1546 1704
1547 printk("\nother info that might help us debug this:\n"); 1705 printk("\nother info that might help us debug this:\n");
1706 print_deadlock_scenario(next, prev);
1548 lockdep_print_held_locks(curr); 1707 lockdep_print_held_locks(curr);
1549 1708
1550 printk("\nstack backtrace:\n"); 1709 printk("\nstack backtrace:\n");
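print_deadlock_scenario() covers the single-CPU case where two locks of the same class are taken recursively, and its last line points at the usual remedy: nesting annotations. A hedged sketch with a hypothetical structure, assuming mutex_lock_nested() and SINGLE_DEPTH_NESTING from <linux/mutex.h>.

#include <linux/mutex.h>

struct node {				/* hypothetical object */
	struct mutex lock;
	struct node *parent;
};

static void lock_child_then_parent_bad(struct node *n)
{
	mutex_lock(&n->lock);
	mutex_lock(&n->parent->lock);	/* same class twice: lockdep prints
					 * the deadlock scenario above */
	mutex_unlock(&n->parent->lock);
	mutex_unlock(&n->lock);
}

static void lock_child_then_parent_annotated(struct node *n)
{
	mutex_lock(&n->lock);
	/* Tell lockdep this is an intentional, ordered second level. */
	mutex_lock_nested(&n->parent->lock, SINGLE_DEPTH_NESTING);
	mutex_unlock(&n->parent->lock);
	mutex_unlock(&n->lock);
}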
@@ -1814,7 +1973,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1814 struct list_head *hash_head = chainhashentry(chain_key); 1973 struct list_head *hash_head = chainhashentry(chain_key);
1815 struct lock_chain *chain; 1974 struct lock_chain *chain;
1816 struct held_lock *hlock_curr, *hlock_next; 1975 struct held_lock *hlock_curr, *hlock_next;
1817 int i, j, n, cn; 1976 int i, j;
1818 1977
1819 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 1978 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1820 return 0; 1979 return 0;
@@ -1874,15 +2033,9 @@ cache_hit:
1874 } 2033 }
1875 i++; 2034 i++;
1876 chain->depth = curr->lockdep_depth + 1 - i; 2035 chain->depth = curr->lockdep_depth + 1 - i;
1877 cn = nr_chain_hlocks; 2036 if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1878 while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) { 2037 chain->base = nr_chain_hlocks;
1879 n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth); 2038 nr_chain_hlocks += chain->depth;
1880 if (n == cn)
1881 break;
1882 cn = n;
1883 }
1884 if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1885 chain->base = cn;
1886 for (j = 0; j < chain->depth - 1; j++, i++) { 2039 for (j = 0; j < chain->depth - 1; j++, i++) {
1887 int lock_id = curr->held_locks[i].class_idx - 1; 2040 int lock_id = curr->held_locks[i].class_idx - 1;
1888 chain_hlocks[chain->base + j] = lock_id; 2041 chain_hlocks[chain->base + j] = lock_id;
@@ -1999,6 +2152,24 @@ static void check_chain_key(struct task_struct *curr)
1999#endif 2152#endif
2000} 2153}
2001 2154
2155static void
2156print_usage_bug_scenario(struct held_lock *lock)
2157{
2158 struct lock_class *class = hlock_class(lock);
2159
2160 printk(" Possible unsafe locking scenario:\n\n");
2161 printk(" CPU0\n");
2162 printk(" ----\n");
2163 printk(" lock(");
2164 __print_lock_name(class);
2165 printk(");\n");
2166 printk(" <Interrupt>\n");
2167 printk(" lock(");
2168 __print_lock_name(class);
2169 printk(");\n");
2170 printk("\n *** DEADLOCK ***\n\n");
2171}
2172
2002static int 2173static int
2003print_usage_bug(struct task_struct *curr, struct held_lock *this, 2174print_usage_bug(struct task_struct *curr, struct held_lock *this,
2004 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) 2175 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
@@ -2027,6 +2198,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2027 2198
2028 print_irqtrace_events(curr); 2199 print_irqtrace_events(curr);
2029 printk("\nother info that might help us debug this:\n"); 2200 printk("\nother info that might help us debug this:\n");
2201 print_usage_bug_scenario(this);
2202
2030 lockdep_print_held_locks(curr); 2203 lockdep_print_held_locks(curr);
2031 2204
2032 printk("\nstack backtrace:\n"); 2205 printk("\nstack backtrace:\n");
@@ -2061,6 +2234,10 @@ print_irq_inversion_bug(struct task_struct *curr,
2061 struct held_lock *this, int forwards, 2234 struct held_lock *this, int forwards,
2062 const char *irqclass) 2235 const char *irqclass)
2063{ 2236{
2237 struct lock_list *entry = other;
2238 struct lock_list *middle = NULL;
2239 int depth;
2240
2064 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2241 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2065 return 0; 2242 return 0;
2066 2243
@@ -2079,6 +2256,25 @@ print_irq_inversion_bug(struct task_struct *curr,
2079 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 2256 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
2080 2257
2081 printk("\nother info that might help us debug this:\n"); 2258 printk("\nother info that might help us debug this:\n");
2259
2260 /* Find a middle lock (if one exists) */
2261 depth = get_lock_depth(other);
2262 do {
2263 if (depth == 0 && (entry != root)) {
2264 printk("lockdep:%s bad path found in chain graph\n", __func__);
2265 break;
2266 }
2267 middle = entry;
2268 entry = get_lock_parent(entry);
2269 depth--;
2270 } while (entry && entry != root && (depth >= 0));
2271 if (forwards)
2272 print_irq_lock_scenario(root, other,
2273 middle ? middle->class : root->class, other->class);
2274 else
2275 print_irq_lock_scenario(other, root,
2276 middle ? middle->class : other->class, root->class);
2277
2082 lockdep_print_held_locks(curr); 2278 lockdep_print_held_locks(curr);
2083 2279
2084 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); 2280 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
@@ -2280,22 +2476,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2280} 2476}
2281 2477
2282/* 2478/*
2283 * Debugging helper: via this flag we know that we are in
2284 * 'early bootup code', and will warn about any invalid irqs-on event:
2285 */
2286static int early_boot_irqs_enabled;
2287
2288void early_boot_irqs_off(void)
2289{
2290 early_boot_irqs_enabled = 0;
2291}
2292
2293void early_boot_irqs_on(void)
2294{
2295 early_boot_irqs_enabled = 1;
2296}
2297
2298/*
2299 * Hardirqs will be enabled: 2479 * Hardirqs will be enabled:
2300 */ 2480 */
2301void trace_hardirqs_on_caller(unsigned long ip) 2481void trace_hardirqs_on_caller(unsigned long ip)
@@ -2307,13 +2487,13 @@ void trace_hardirqs_on_caller(unsigned long ip)
2307 if (unlikely(!debug_locks || current->lockdep_recursion)) 2487 if (unlikely(!debug_locks || current->lockdep_recursion))
2308 return; 2488 return;
2309 2489
2310 if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) 2490 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2311 return; 2491 return;
2312 2492
2313 if (unlikely(curr->hardirqs_enabled)) { 2493 if (unlikely(curr->hardirqs_enabled)) {
2314 /* 2494 /*
2315 * Neither irq nor preemption are disabled here 2495 * Neither irq nor preemption are disabled here
2316 * so this is racy by nature but loosing one hit 2496 * so this is racy by nature but losing one hit
2317 * in a stat is not a big deal. 2497 * in a stat is not a big deal.
2318 */ 2498 */
2319 __debug_atomic_inc(redundant_hardirqs_on); 2499 __debug_atomic_inc(redundant_hardirqs_on);
@@ -2624,7 +2804,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2624 if (!graph_lock()) 2804 if (!graph_lock())
2625 return 0; 2805 return 0;
2626 /* 2806 /*
2627 * Make sure we didnt race: 2807 * Make sure we didn't race:
2628 */ 2808 */
2629 if (unlikely(hlock_class(this)->usage_mask & new_mask)) { 2809 if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
2630 graph_unlock(); 2810 graph_unlock();
@@ -2679,7 +2859,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2679void lockdep_init_map(struct lockdep_map *lock, const char *name, 2859void lockdep_init_map(struct lockdep_map *lock, const char *name,
2680 struct lock_class_key *key, int subclass) 2860 struct lock_class_key *key, int subclass)
2681{ 2861{
2682 lock->class_cache = NULL; 2862 int i;
2863
2864 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
2865 lock->class_cache[i] = NULL;
2866
2683#ifdef CONFIG_LOCK_STAT 2867#ifdef CONFIG_LOCK_STAT
2684 lock->cpu = raw_smp_processor_id(); 2868 lock->cpu = raw_smp_processor_id();
2685#endif 2869#endif
@@ -2739,21 +2923,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2739 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2923 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2740 return 0; 2924 return 0;
2741 2925
2742 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
2743 debug_locks_off();
2744 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
2745 printk("turning off the locking correctness validator.\n");
2746 dump_stack();
2747 return 0;
2748 }
2749
2750 if (lock->key == &__lockdep_no_validate__) 2926 if (lock->key == &__lockdep_no_validate__)
2751 check = 1; 2927 check = 1;
2752 2928
2753 if (!subclass) 2929 if (subclass < NR_LOCKDEP_CACHING_CLASSES)
2754 class = lock->class_cache; 2930 class = lock->class_cache[subclass];
2755 /* 2931 /*
2756 * Not cached yet or subclass? 2932 * Not cached?
2757 */ 2933 */
2758 if (unlikely(!class)) { 2934 if (unlikely(!class)) {
2759 class = register_lock_class(lock, subclass, 0); 2935 class = register_lock_class(lock, subclass, 0);
@@ -2918,7 +3094,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
2918 return 1; 3094 return 1;
2919 3095
2920 if (hlock->references) { 3096 if (hlock->references) {
2921 struct lock_class *class = lock->class_cache; 3097 struct lock_class *class = lock->class_cache[0];
2922 3098
2923 if (!class) 3099 if (!class)
2924 class = look_up_lock_class(lock, 0); 3100 class = look_up_lock_class(lock, 0);
@@ -3250,7 +3426,7 @@ int lock_is_held(struct lockdep_map *lock)
3250 int ret = 0; 3426 int ret = 0;
3251 3427
3252 if (unlikely(current->lockdep_recursion)) 3428 if (unlikely(current->lockdep_recursion))
3253 return ret; 3429 return 1; /* avoid false negative lockdep_assert_held() */
3254 3430
3255 raw_local_irq_save(flags); 3431 raw_local_irq_save(flags);
3256 check_flags(flags); 3432 check_flags(flags);
@@ -3559,7 +3735,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
3559 if (list_empty(head)) 3735 if (list_empty(head))
3560 continue; 3736 continue;
3561 list_for_each_entry_safe(class, next, head, hash_entry) { 3737 list_for_each_entry_safe(class, next, head, hash_entry) {
3562 if (unlikely(class == lock->class_cache)) { 3738 int match = 0;
3739
3740 for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
3741 match |= class == lock->class_cache[j];
3742
3743 if (unlikely(match)) {
3563 if (debug_locks_off_graph_unlock()) 3744 if (debug_locks_off_graph_unlock())
3564 WARN_ON(1); 3745 WARN_ON(1);
3565 goto out_restore; 3746 goto out_restore;
@@ -3775,7 +3956,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks);
3775 * Careful: only use this function if you are sure that 3956 * Careful: only use this function if you are sure that
3776 * the task cannot run in parallel! 3957 * the task cannot run in parallel!
3777 */ 3958 */
3778void __debug_show_held_locks(struct task_struct *task) 3959void debug_show_held_locks(struct task_struct *task)
3779{ 3960{
3780 if (unlikely(!debug_locks)) { 3961 if (unlikely(!debug_locks)) {
3781 printk("INFO: lockdep is turned off.\n"); 3962 printk("INFO: lockdep is turned off.\n");
@@ -3783,12 +3964,6 @@ void __debug_show_held_locks(struct task_struct *task)
3783 } 3964 }
3784 lockdep_print_held_locks(task); 3965 lockdep_print_held_locks(task);
3785} 3966}
3786EXPORT_SYMBOL_GPL(__debug_show_held_locks);
3787
3788void debug_show_held_locks(struct task_struct *task)
3789{
3790 __debug_show_held_locks(task);
3791}
3792EXPORT_SYMBOL_GPL(debug_show_held_locks); 3967EXPORT_SYMBOL_GPL(debug_show_held_locks);
3793 3968
3794void lockdep_sys_exit(void) 3969void lockdep_sys_exit(void)
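The lock_is_held() change above returns true while lockdep is recursing, precisely so that lockdep_assert_held() cannot fire spuriously. A hedged sketch of the assertion pattern that relies on it; the lock, state variable and functions are illustrative.

#include <linux/lockdep.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(state_lock);	/* hypothetical lock */
static int shared_state;

/* Callers must hold state_lock; document and enforce that with lockdep. */
static void update_state_locked(int v)
{
	lockdep_assert_held(&state_lock);	/* WARNs if the lock is not held */
	shared_state = v;
}

static void update_state(int v)
{
	mutex_lock(&state_lock);
	update_state_locked(v);
	mutex_unlock(&state_lock);
}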
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 59b76c8ce9d7..71edd2f60c02 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
225 nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, 225 nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
226 nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, 226 nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
227 nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, 227 nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
228 sum_forward_deps = 0, factor = 0; 228 sum_forward_deps = 0;
229 229
230 list_for_each_entry(class, &all_lock_classes, lock_entry) { 230 list_for_each_entry(class, &all_lock_classes, lock_entry) {
231 231
@@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
283 nr_hardirq_unsafe * nr_hardirq_safe + 283 nr_hardirq_unsafe * nr_hardirq_safe +
284 nr_list_entries); 284 nr_list_entries);
285 285
286 /*
287 * Estimated factor between direct and indirect
288 * dependencies:
289 */
290 if (nr_list_entries)
291 factor = sum_forward_deps / nr_list_entries;
292
293#ifdef CONFIG_PROVE_LOCKING 286#ifdef CONFIG_PROVE_LOCKING
294 seq_printf(m, " dependency chains: %11lu [max: %lu]\n", 287 seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
295 nr_lock_chains, MAX_LOCKDEP_CHAINS); 288 nr_lock_chains, MAX_LOCKDEP_CHAINS);
@@ -494,7 +487,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
494 namelen += 2; 487 namelen += 2;
495 488
496 for (i = 0; i < LOCKSTAT_POINTS; i++) { 489 for (i = 0; i < LOCKSTAT_POINTS; i++) {
497 char sym[KSYM_SYMBOL_LEN];
498 char ip[32]; 490 char ip[32];
499 491
500 if (class->contention_point[i] == 0) 492 if (class->contention_point[i] == 0)
@@ -503,15 +495,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
503 if (!i) 495 if (!i)
504 seq_line(m, '-', 40-namelen, namelen); 496 seq_line(m, '-', 40-namelen, namelen);
505 497
506 sprint_symbol(sym, class->contention_point[i]);
507 snprintf(ip, sizeof(ip), "[<%p>]", 498 snprintf(ip, sizeof(ip), "[<%p>]",
508 (void *)class->contention_point[i]); 499 (void *)class->contention_point[i]);
509 seq_printf(m, "%40s %14lu %29s %s\n", name, 500 seq_printf(m, "%40s %14lu %29s %pS\n",
510 stats->contention_point[i], 501 name, stats->contention_point[i],
511 ip, sym); 502 ip, (void *)class->contention_point[i]);
512 } 503 }
513 for (i = 0; i < LOCKSTAT_POINTS; i++) { 504 for (i = 0; i < LOCKSTAT_POINTS; i++) {
514 char sym[KSYM_SYMBOL_LEN];
515 char ip[32]; 505 char ip[32];
516 506
517 if (class->contending_point[i] == 0) 507 if (class->contending_point[i] == 0)
@@ -520,12 +510,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
520 if (!i) 510 if (!i)
521 seq_line(m, '-', 40-namelen, namelen); 511 seq_line(m, '-', 40-namelen, namelen);
522 512
523 sprint_symbol(sym, class->contending_point[i]);
524 snprintf(ip, sizeof(ip), "[<%p>]", 513 snprintf(ip, sizeof(ip), "[<%p>]",
525 (void *)class->contending_point[i]); 514 (void *)class->contending_point[i]);
526 seq_printf(m, "%40s %14lu %29s %s\n", name, 515 seq_printf(m, "%40s %14lu %29s %pS\n",
527 stats->contending_point[i], 516 name, stats->contending_point[i],
528 ip, sym); 517 ip, (void *)class->contending_point[i]);
529 } 518 }
530 if (i) { 519 if (i) {
531 seq_puts(m, "\n"); 520 seq_puts(m, "\n");
diff --git a/kernel/module.c b/kernel/module.c
index ccd641991842..795bdc7f5c3f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -55,6 +55,9 @@
55#include <linux/async.h> 55#include <linux/async.h>
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h>
59#include <linux/pfn.h>
60#include <linux/bsearch.h>
58 61
59#define CREATE_TRACE_POINTS 62#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 63#include <trace/events/module.h>
@@ -69,6 +72,26 @@
69#define ARCH_SHF_SMALL 0 72#define ARCH_SHF_SMALL 0
70#endif 73#endif
71 74
75/*
76 * Modules' sections will be aligned on page boundaries
77 * to ensure complete separation of code and data, but
78 * only when CONFIG_DEBUG_SET_MODULE_RONX=y
79 */
80#ifdef CONFIG_DEBUG_SET_MODULE_RONX
81# define debug_align(X) ALIGN(X, PAGE_SIZE)
82#else
83# define debug_align(X) (X)
84#endif
85
86/*
87 * Given BASE and SIZE this macro calculates the number of pages the
88 * memory regions occupies
89 */
90#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
91 (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
92 PFN_DOWN((unsigned long)BASE) + 1) \
93 : (0UL))
94
72/* If this is set, the section belongs in the init part of the module */ 95/* If this is set, the section belongs in the init part of the module */
73#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 96#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
74 97
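A small worked example of the two helpers introduced above, under the assumption of 4 KiB pages (PAGE_SHIFT == 12); the addresses and sizes are only illustrative.

/* debug_align(0x1234) == ALIGN(0x1234, 4096) == 0x2000
 *   (only when CONFIG_DEBUG_SET_MODULE_RONX=y; otherwise it stays 0x1234)
 *
 * MOD_NUMBER_OF_PAGES(0x1f00, 0x300)
 *   == PFN_DOWN(0x1f00 + 0x300 - 1) - PFN_DOWN(0x1f00) + 1
 *   == PFN_DOWN(0x21ff) - PFN_DOWN(0x1f00) + 1
 *   ==        2        -        1         + 1   == 2 pages,
 * i.e. a 0x300-byte region that straddles a page boundary occupies two pages.
 */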
@@ -218,23 +241,24 @@ static bool each_symbol_in_section(const struct symsearch *arr,
218 struct module *owner, 241 struct module *owner,
219 bool (*fn)(const struct symsearch *syms, 242 bool (*fn)(const struct symsearch *syms,
220 struct module *owner, 243 struct module *owner,
221 unsigned int symnum, void *data), 244 void *data),
222 void *data) 245 void *data)
223{ 246{
224 unsigned int i, j; 247 unsigned int j;
225 248
226 for (j = 0; j < arrsize; j++) { 249 for (j = 0; j < arrsize; j++) {
227 for (i = 0; i < arr[j].stop - arr[j].start; i++) 250 if (fn(&arr[j], owner, data))
228 if (fn(&arr[j], owner, i, data)) 251 return true;
229 return true;
230 } 252 }
231 253
232 return false; 254 return false;
233} 255}
234 256
235/* Returns true as soon as fn returns true, otherwise false. */ 257/* Returns true as soon as fn returns true, otherwise false. */
236bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, 258bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
237 unsigned int symnum, void *data), void *data) 259 struct module *owner,
260 void *data),
261 void *data)
238{ 262{
239 struct module *mod; 263 struct module *mod;
240 static const struct symsearch arr[] = { 264 static const struct symsearch arr[] = {
@@ -287,7 +311,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
287 } 311 }
288 return false; 312 return false;
289} 313}
290EXPORT_SYMBOL_GPL(each_symbol); 314EXPORT_SYMBOL_GPL(each_symbol_section);
291 315
292struct find_symbol_arg { 316struct find_symbol_arg {
293 /* Input */ 317 /* Input */
@@ -301,15 +325,12 @@ struct find_symbol_arg {
301 const struct kernel_symbol *sym; 325 const struct kernel_symbol *sym;
302}; 326};
303 327
304static bool find_symbol_in_section(const struct symsearch *syms, 328static bool check_symbol(const struct symsearch *syms,
305 struct module *owner, 329 struct module *owner,
306 unsigned int symnum, void *data) 330 unsigned int symnum, void *data)
307{ 331{
308 struct find_symbol_arg *fsa = data; 332 struct find_symbol_arg *fsa = data;
309 333
310 if (strcmp(syms->start[symnum].name, fsa->name) != 0)
311 return false;
312
313 if (!fsa->gplok) { 334 if (!fsa->gplok) {
314 if (syms->licence == GPL_ONLY) 335 if (syms->licence == GPL_ONLY)
315 return false; 336 return false;
@@ -343,6 +364,30 @@ static bool find_symbol_in_section(const struct symsearch *syms,
343 return true; 364 return true;
344} 365}
345 366
367static int cmp_name(const void *va, const void *vb)
368{
369 const char *a;
370 const struct kernel_symbol *b;
371 a = va; b = vb;
372 return strcmp(a, b->name);
373}
374
375static bool find_symbol_in_section(const struct symsearch *syms,
376 struct module *owner,
377 void *data)
378{
379 struct find_symbol_arg *fsa = data;
380 struct kernel_symbol *sym;
381
382 sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
383 sizeof(struct kernel_symbol), cmp_name);
384
385 if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data))
386 return true;
387
388 return false;
389}
390
346/* Find a symbol and return it, along with, (optional) crc and 391/* Find a symbol and return it, along with, (optional) crc and
347 * (optional) module which owns it. Needs preempt disabled or module_mutex. */ 392 * (optional) module which owns it. Needs preempt disabled or module_mutex. */
348const struct kernel_symbol *find_symbol(const char *name, 393const struct kernel_symbol *find_symbol(const char *name,
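find_symbol_in_section() now binary-searches each export table with bsearch() and cmp_name(), which presumes the per-section symbol arrays are sorted by name at build time. A hedged, self-contained illustration of the same key-vs-element comparator pattern, using the userspace bsearch() and a simplified stand-in for struct kernel_symbol; the table contents are made up.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct kernel_symbol {			/* simplified stand-in */
	unsigned long value;
	const char *name;
};

/* Key is a bare string, elements are struct kernel_symbol, as in cmp_name(). */
static int cmp_name(const void *va, const void *vb)
{
	const char *a = va;
	const struct kernel_symbol *b = vb;

	return strcmp(a, b->name);
}

int main(void)
{
	/* must already be sorted by name for bsearch() to be valid */
	static const struct kernel_symbol tab[] = {
		{ 0x1000, "alpha" },
		{ 0x2000, "bravo" },
		{ 0x3000, "charlie" },
	};
	const struct kernel_symbol *sym;

	sym = bsearch("bravo", tab, sizeof(tab) / sizeof(tab[0]),
		      sizeof(tab[0]), cmp_name);
	if (sym)
		printf("%s = 0x%lx\n", sym->name, sym->value);
	return 0;
}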
@@ -357,7 +402,7 @@ const struct kernel_symbol *find_symbol(const char *name,
357 fsa.gplok = gplok; 402 fsa.gplok = gplok;
358 fsa.warn = warn; 403 fsa.warn = warn;
359 404
360 if (each_symbol(find_symbol_in_section, &fsa)) { 405 if (each_symbol_section(find_symbol_in_section, &fsa)) {
361 if (owner) 406 if (owner)
362 *owner = fsa.owner; 407 *owner = fsa.owner;
363 if (crc) 408 if (crc)
@@ -787,7 +832,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
787 wait_for_zero_refcount(mod); 832 wait_for_zero_refcount(mod);
788 833
789 mutex_unlock(&module_mutex); 834 mutex_unlock(&module_mutex);
790 /* Final destruction now noone is using it. */ 835 /* Final destruction now no one is using it. */
791 if (mod->exit != NULL) 836 if (mod->exit != NULL)
792 mod->exit(); 837 mod->exit();
793 blocking_notifier_call_chain(&module_notify_list, 838 blocking_notifier_call_chain(&module_notify_list,
@@ -1146,7 +1191,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
1146{ 1191{
1147 struct module_sect_attr *sattr = 1192 struct module_sect_attr *sattr =
1148 container_of(mattr, struct module_sect_attr, mattr); 1193 container_of(mattr, struct module_sect_attr, mattr);
1149 return sprintf(buf, "0x%lx\n", sattr->address); 1194 return sprintf(buf, "0x%pK\n", (void *)sattr->address);
1150} 1195}
1151 1196
1152static void free_sect_attrs(struct module_sect_attrs *sect_attrs) 1197static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -1541,6 +1586,117 @@ static int __unlink_module(void *_mod)
1541 return 0; 1586 return 0;
1542} 1587}
1543 1588
1589#ifdef CONFIG_DEBUG_SET_MODULE_RONX
1590/*
1591 * LKM RO/NX protection: protect module's text/ro-data
1592 * from modification and any data from execution.
1593 */
1594void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
1595{
1596 unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
1597 unsigned long end_pfn = PFN_DOWN((unsigned long)end);
1598
1599 if (end_pfn > begin_pfn)
1600 set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1601}
1602
1603static void set_section_ro_nx(void *base,
1604 unsigned long text_size,
1605 unsigned long ro_size,
1606 unsigned long total_size)
1607{
1608 /* begin and end PFNs of the current subsection */
1609 unsigned long begin_pfn;
1610 unsigned long end_pfn;
1611
1612 /*
1613 * Set RO for module text and RO-data:
1614 * - Always protect first page.
1615 * - Do not protect last partial page.
1616 */
1617 if (ro_size > 0)
1618 set_page_attributes(base, base + ro_size, set_memory_ro);
1619
1620 /*
1621 * Set NX permissions for module data:
1622 * - Do not protect first partial page.
1623 * - Always protect last page.
1624 */
1625 if (total_size > text_size) {
1626 begin_pfn = PFN_UP((unsigned long)base + text_size);
1627 end_pfn = PFN_UP((unsigned long)base + total_size);
1628 if (end_pfn > begin_pfn)
1629 set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1630 }
1631}
1632
1633static void unset_module_core_ro_nx(struct module *mod)
1634{
1635 set_page_attributes(mod->module_core + mod->core_text_size,
1636 mod->module_core + mod->core_size,
1637 set_memory_x);
1638 set_page_attributes(mod->module_core,
1639 mod->module_core + mod->core_ro_size,
1640 set_memory_rw);
1641}
1642
1643static void unset_module_init_ro_nx(struct module *mod)
1644{
1645 set_page_attributes(mod->module_init + mod->init_text_size,
1646 mod->module_init + mod->init_size,
1647 set_memory_x);
1648 set_page_attributes(mod->module_init,
1649 mod->module_init + mod->init_ro_size,
1650 set_memory_rw);
1651}
1652
1653/* Iterate through all modules and set each module's text as RW */
1654void set_all_modules_text_rw(void)
1655{
1656 struct module *mod;
1657
1658 mutex_lock(&module_mutex);
1659 list_for_each_entry_rcu(mod, &modules, list) {
1660 if ((mod->module_core) && (mod->core_text_size)) {
1661 set_page_attributes(mod->module_core,
1662 mod->module_core + mod->core_text_size,
1663 set_memory_rw);
1664 }
1665 if ((mod->module_init) && (mod->init_text_size)) {
1666 set_page_attributes(mod->module_init,
1667 mod->module_init + mod->init_text_size,
1668 set_memory_rw);
1669 }
1670 }
1671 mutex_unlock(&module_mutex);
1672}
1673
1674/* Iterate through all modules and set each module's text as RO */
1675void set_all_modules_text_ro(void)
1676{
1677 struct module *mod;
1678
1679 mutex_lock(&module_mutex);
1680 list_for_each_entry_rcu(mod, &modules, list) {
1681 if ((mod->module_core) && (mod->core_text_size)) {
1682 set_page_attributes(mod->module_core,
1683 mod->module_core + mod->core_text_size,
1684 set_memory_ro);
1685 }
1686 if ((mod->module_init) && (mod->init_text_size)) {
1687 set_page_attributes(mod->module_init,
1688 mod->module_init + mod->init_text_size,
1689 set_memory_ro);
1690 }
1691 }
1692 mutex_unlock(&module_mutex);
1693}
1694#else
1695static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
1696static void unset_module_core_ro_nx(struct module *mod) { }
1697static void unset_module_init_ro_nx(struct module *mod) { }
1698#endif
1699
1544/* Free a module, remove from lists, etc. */ 1700/* Free a module, remove from lists, etc. */
1545static void free_module(struct module *mod) 1701static void free_module(struct module *mod)
1546{ 1702{
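The RO/NX helpers above operate on whole pages, which is why the layout hunks further down pad each region with debug_align(). A short worked sketch of the page arithmetic, assuming 4 KiB pages and made-up section sizes:

/* Assume PAGE_SIZE == 4096 and a hypothetical module core at 0x10000 with
 * core_text_size = 0x1800, core_ro_size = 0x2800, core_size = 0x3400.
 *
 * set_page_attributes(core, core + 0x2800, set_memory_ro)
 *   begin_pfn = PFN_DOWN(0x10000) = 0x10
 *   end_pfn   = PFN_DOWN(0x12800) = 0x12
 *   -> pages 0x10..0x11 become read-only; the trailing partial page is
 *      skipped, which is why layout_sections() rounds core_ro_size up to
 *      a page boundary with debug_align().
 *
 * The NX path in set_section_ro_nx() uses PFN_UP() instead, so the first
 * partial page after the text stays executable and only whole data pages
 * are marked no-execute.
 */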
@@ -1565,6 +1721,7 @@ static void free_module(struct module *mod)
1565 destroy_params(mod->kp, mod->num_kp); 1721 destroy_params(mod->kp, mod->num_kp);
1566 1722
1567 /* This may be NULL, but that's OK */ 1723 /* This may be NULL, but that's OK */
1724 unset_module_init_ro_nx(mod);
1568 module_free(mod, mod->module_init); 1725 module_free(mod, mod->module_init);
1569 kfree(mod->args); 1726 kfree(mod->args);
1570 percpu_modfree(mod); 1727 percpu_modfree(mod);
@@ -1573,6 +1730,7 @@ static void free_module(struct module *mod)
1573 lockdep_free_key_range(mod->module_core, mod->core_size); 1730 lockdep_free_key_range(mod->module_core, mod->core_size);
1574 1731
1575 /* Finally, free the core (containing the module structure) */ 1732 /* Finally, free the core (containing the module structure) */
1733 unset_module_core_ro_nx(mod);
1576 module_free(mod, mod->module_core); 1734 module_free(mod, mod->module_core);
1577 1735
1578#ifdef CONFIG_MPU 1736#ifdef CONFIG_MPU
@@ -1776,8 +1934,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1776 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1934 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1777 DEBUGP("\t%s\n", name); 1935 DEBUGP("\t%s\n", name);
1778 } 1936 }
1779 if (m == 0) 1937 switch (m) {
1938 case 0: /* executable */
1939 mod->core_size = debug_align(mod->core_size);
1780 mod->core_text_size = mod->core_size; 1940 mod->core_text_size = mod->core_size;
1941 break;
1942 case 1: /* RO: text and ro-data */
1943 mod->core_size = debug_align(mod->core_size);
1944 mod->core_ro_size = mod->core_size;
1945 break;
1946 case 3: /* whole core */
1947 mod->core_size = debug_align(mod->core_size);
1948 break;
1949 }
1781 } 1950 }
1782 1951
1783 DEBUGP("Init section allocation order:\n"); 1952 DEBUGP("Init section allocation order:\n");
@@ -1795,8 +1964,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1795 | INIT_OFFSET_MASK); 1964 | INIT_OFFSET_MASK);
1796 DEBUGP("\t%s\n", sname); 1965 DEBUGP("\t%s\n", sname);
1797 } 1966 }
1798 if (m == 0) 1967 switch (m) {
1968 case 0: /* executable */
1969 mod->init_size = debug_align(mod->init_size);
1799 mod->init_text_size = mod->init_size; 1970 mod->init_text_size = mod->init_size;
1971 break;
1972 case 1: /* RO: text and ro-data */
1973 mod->init_size = debug_align(mod->init_size);
1974 mod->init_ro_size = mod->init_size;
1975 break;
1976 case 3: /* whole init */
1977 mod->init_size = debug_align(mod->init_size);
1978 break;
1979 }
1800 } 1980 }
1801} 1981}
1802 1982
@@ -1875,11 +2055,8 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
1875 const struct kernel_symbol *start, 2055 const struct kernel_symbol *start,
1876 const struct kernel_symbol *stop) 2056 const struct kernel_symbol *stop)
1877{ 2057{
1878 const struct kernel_symbol *ks = start; 2058 return bsearch(name, start, stop - start,
1879 for (; ks < stop; ks++) 2059 sizeof(struct kernel_symbol), cmp_name);
1880 if (strcmp(ks->name, name) == 0)
1881 return ks;
1882 return NULL;
1883} 2060}
1884 2061
1885static int is_exported(const char *name, unsigned long value, 2062static int is_exported(const char *name, unsigned long value,
@@ -2036,7 +2213,7 @@ static inline void layout_symtab(struct module *mod, struct load_info *info)
2036{ 2213{
2037} 2214}
2038 2215
2039static void add_kallsyms(struct module *mod, struct load_info *info) 2216static void add_kallsyms(struct module *mod, const struct load_info *info)
2040{ 2217{
2041} 2218}
2042#endif /* CONFIG_KALLSYMS */ 2219#endif /* CONFIG_KALLSYMS */
@@ -2305,9 +2482,14 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2305#endif 2482#endif
2306 2483
2307#ifdef CONFIG_TRACEPOINTS 2484#ifdef CONFIG_TRACEPOINTS
2308 mod->tracepoints = section_objs(info, "__tracepoints", 2485 mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
2309 sizeof(*mod->tracepoints), 2486 sizeof(*mod->tracepoints_ptrs),
2310 &mod->num_tracepoints); 2487 &mod->num_tracepoints);
2488#endif
2489#ifdef HAVE_JUMP_LABEL
2490 mod->jump_entries = section_objs(info, "__jump_table",
2491 sizeof(*mod->jump_entries),
2492 &mod->num_jump_entries);
2311#endif 2493#endif
2312#ifdef CONFIG_EVENT_TRACING 2494#ifdef CONFIG_EVENT_TRACING
2313 mod->trace_events = section_objs(info, "_ftrace_events", 2495 mod->trace_events = section_objs(info, "_ftrace_events",
@@ -2320,6 +2502,18 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2320 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * 2502 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2321 mod->num_trace_events, GFP_KERNEL); 2503 mod->num_trace_events, GFP_KERNEL);
2322#endif 2504#endif
2505#ifdef CONFIG_TRACING
2506 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
2507 sizeof(*mod->trace_bprintk_fmt_start),
2508 &mod->num_trace_bprintk_fmt);
2509 /*
2510 * This section contains pointers to allocated objects in the trace
2511 * code and not scanning it leads to false positives.
2512 */
2513 kmemleak_scan_area(mod->trace_bprintk_fmt_start,
2514 sizeof(*mod->trace_bprintk_fmt_start) *
2515 mod->num_trace_bprintk_fmt, GFP_KERNEL);
2516#endif
2323#ifdef CONFIG_FTRACE_MCOUNT_RECORD 2517#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2324 /* sechdrs[0].sh_size is always zero */ 2518 /* sechdrs[0].sh_size is always zero */
2325 mod->ftrace_callsites = section_objs(info, "__mcount_loc", 2519 mod->ftrace_callsites = section_objs(info, "__mcount_loc",
@@ -2605,7 +2799,7 @@ static struct module *load_module(void __user *umod,
2605 mod->state = MODULE_STATE_COMING; 2799 mod->state = MODULE_STATE_COMING;
2606 2800
2607 /* Now sew it into the lists so we can get lockdep and oops 2801 /* Now sew it into the lists so we can get lockdep and oops
2608 * info during argument parsing. Noone should access us, since 2802 * info during argument parsing. No one should access us, since
2609 * strong_try_module_get() will fail. 2803 * strong_try_module_get() will fail.
2610 * lockdep/oops can run asynchronous, so use the RCU list insertion 2804 * lockdep/oops can run asynchronous, so use the RCU list insertion
2611 * function to insert in a way safe to concurrent readers. 2805 * function to insert in a way safe to concurrent readers.
@@ -2618,7 +2812,7 @@ static struct module *load_module(void __user *umod,
2618 } 2812 }
2619 2813
2620 /* This has to be done once we're sure module name is unique. */ 2814 /* This has to be done once we're sure module name is unique. */
2621 if (!mod->taints) 2815 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
2622 dynamic_debug_setup(info.debug, info.num_debug); 2816 dynamic_debug_setup(info.debug, info.num_debug);
2623 2817
2624 /* Find duplicate symbols */ 2818 /* Find duplicate symbols */
@@ -2655,7 +2849,7 @@ static struct module *load_module(void __user *umod,
2655 module_bug_cleanup(mod); 2849 module_bug_cleanup(mod);
2656 2850
2657 ddebug: 2851 ddebug:
2658 if (!mod->taints) 2852 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
2659 dynamic_debug_remove(info.debug); 2853 dynamic_debug_remove(info.debug);
2660 unlock: 2854 unlock:
2661 mutex_unlock(&module_mutex); 2855 mutex_unlock(&module_mutex);
@@ -2704,6 +2898,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2704 blocking_notifier_call_chain(&module_notify_list, 2898 blocking_notifier_call_chain(&module_notify_list,
2705 MODULE_STATE_COMING, mod); 2899 MODULE_STATE_COMING, mod);
2706 2900
2901 /* Set RO and NX regions for core */
2902 set_section_ro_nx(mod->module_core,
2903 mod->core_text_size,
2904 mod->core_ro_size,
2905 mod->core_size);
2906
2907 /* Set RO and NX regions for init */
2908 set_section_ro_nx(mod->module_init,
2909 mod->init_text_size,
2910 mod->init_ro_size,
2911 mod->init_size);
2912
2707 do_mod_ctors(mod); 2913 do_mod_ctors(mod);
2708 /* Start the module */ 2914 /* Start the module */
2709 if (mod->init != NULL) 2915 if (mod->init != NULL)
@@ -2747,9 +2953,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2747 mod->symtab = mod->core_symtab; 2953 mod->symtab = mod->core_symtab;
2748 mod->strtab = mod->core_strtab; 2954 mod->strtab = mod->core_strtab;
2749#endif 2955#endif
2956 unset_module_init_ro_nx(mod);
2750 module_free(mod, mod->module_init); 2957 module_free(mod, mod->module_init);
2751 mod->module_init = NULL; 2958 mod->module_init = NULL;
2752 mod->init_size = 0; 2959 mod->init_size = 0;
2960 mod->init_ro_size = 0;
2753 mod->init_text_size = 0; 2961 mod->init_text_size = 0;
2754 mutex_unlock(&module_mutex); 2962 mutex_unlock(&module_mutex);
2755 2963
@@ -2786,7 +2994,7 @@ static const char *get_ksymbol(struct module *mod,
2786 else 2994 else
2787 nextval = (unsigned long)mod->module_core+mod->core_text_size; 2995 nextval = (unsigned long)mod->module_core+mod->core_text_size;
2788 2996
2789 /* Scan for closest preceeding symbol, and next symbol. (ELF 2997 /* Scan for closest preceding symbol, and next symbol. (ELF
2790 starts real symbols at 1). */ 2998 starts real symbols at 1). */
2791 for (i = 1; i < mod->num_symtab; i++) { 2999 for (i = 1; i < mod->num_symtab; i++) {
2792 if (mod->symtab[i].st_shndx == SHN_UNDEF) 3000 if (mod->symtab[i].st_shndx == SHN_UNDEF)
@@ -3039,7 +3247,7 @@ static int m_show(struct seq_file *m, void *p)
3039 mod->state == MODULE_STATE_COMING ? "Loading": 3247 mod->state == MODULE_STATE_COMING ? "Loading":
3040 "Live"); 3248 "Live");
3041 /* Used by oprofile and other similar tools. */ 3249 /* Used by oprofile and other similar tools. */
3042 seq_printf(m, " 0x%p", mod->module_core); 3250 seq_printf(m, " 0x%pK", mod->module_core);
3043 3251
3044 /* Taints info */ 3252 /* Taints info */
3045 if (mod->taints) 3253 if (mod->taints)
@@ -3208,7 +3416,7 @@ void module_layout(struct module *mod,
3208 struct modversion_info *ver, 3416 struct modversion_info *ver,
3209 struct kernel_param *kp, 3417 struct kernel_param *kp,
3210 struct kernel_symbol *ks, 3418 struct kernel_symbol *ks,
3211 struct tracepoint *tp) 3419 struct tracepoint * const *tp)
3212{ 3420{
3213} 3421}
3214EXPORT_SYMBOL(module_layout); 3422EXPORT_SYMBOL(module_layout);
@@ -3222,8 +3430,8 @@ void module_update_tracepoints(void)
3222 mutex_lock(&module_mutex); 3430 mutex_lock(&module_mutex);
3223 list_for_each_entry(mod, &modules, list) 3431 list_for_each_entry(mod, &modules, list)
3224 if (!mod->taints) 3432 if (!mod->taints)
3225 tracepoint_update_probe_range(mod->tracepoints, 3433 tracepoint_update_probe_range(mod->tracepoints_ptrs,
3226 mod->tracepoints + mod->num_tracepoints); 3434 mod->tracepoints_ptrs + mod->num_tracepoints);
3227 mutex_unlock(&module_mutex); 3435 mutex_unlock(&module_mutex);
3228} 3436}
3229 3437
@@ -3247,8 +3455,8 @@ int module_get_iter_tracepoints(struct tracepoint_iter *iter)
3247 else if (iter_mod > iter->module) 3455 else if (iter_mod > iter->module)
3248 iter->tracepoint = NULL; 3456 iter->tracepoint = NULL;
3249 found = tracepoint_get_iter_range(&iter->tracepoint, 3457 found = tracepoint_get_iter_range(&iter->tracepoint,
3250 iter_mod->tracepoints, 3458 iter_mod->tracepoints_ptrs,
3251 iter_mod->tracepoints 3459 iter_mod->tracepoints_ptrs
3252 + iter_mod->num_tracepoints); 3460 + iter_mod->num_tracepoints);
3253 if (found) { 3461 if (found) {
3254 iter->module = iter_mod; 3462 iter->module = iter_mod;
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index ec815a960b5d..73da83aff418 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock)
75 return; 75 return;
76 76
77 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 77 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
78 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 78 DEBUG_LOCKS_WARN_ON(lock->owner != current);
79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
80 mutex_clear_owner(lock); 80 mutex_clear_owner(lock);
81} 81}
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 57d527a16f9d..0799fd3e4cfa 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
29 29
30static inline void mutex_set_owner(struct mutex *lock) 30static inline void mutex_set_owner(struct mutex *lock)
31{ 31{
32 lock->owner = current_thread_info(); 32 lock->owner = current;
33} 33}
34 34
35static inline void mutex_clear_owner(struct mutex *lock) 35static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 200407c1502f..d607ed5dd441 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -131,14 +131,14 @@ EXPORT_SYMBOL(mutex_unlock);
131 */ 131 */
132static inline int __sched 132static inline int __sched
133__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, 133__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
134 unsigned long ip) 134 struct lockdep_map *nest_lock, unsigned long ip)
135{ 135{
136 struct task_struct *task = current; 136 struct task_struct *task = current;
137 struct mutex_waiter waiter; 137 struct mutex_waiter waiter;
138 unsigned long flags; 138 unsigned long flags;
139 139
140 preempt_disable(); 140 preempt_disable();
141 mutex_acquire(&lock->dep_map, subclass, 0, ip); 141 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
142 142
143#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 143#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
144 /* 144 /*
@@ -160,14 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
160 */ 160 */
161 161
162 for (;;) { 162 for (;;) {
163 struct thread_info *owner; 163 struct task_struct *owner;
164
165 /*
166 * If we own the BKL, then don't spin. The owner of
167 * the mutex might be waiting on us to release the BKL.
168 */
169 if (unlikely(current->lock_depth >= 0))
170 break;
171 164
172 /* 165 /*
173 * If there's an owner, wait for it to either 166 * If there's an owner, wait for it to either
@@ -199,7 +192,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
199 * memory barriers as we'll eventually observe the right 192 * memory barriers as we'll eventually observe the right
200 * values at the cost of a few extra spins. 193 * values at the cost of a few extra spins.
201 */ 194 */
202 cpu_relax(); 195 arch_mutex_cpu_relax();
203 } 196 }
204#endif 197#endif
205 spin_lock_mutex(&lock->wait_lock, flags); 198 spin_lock_mutex(&lock->wait_lock, flags);
@@ -245,7 +238,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
245 } 238 }
246 __set_task_state(task, state); 239 __set_task_state(task, state);
247 240
248 /* didnt get the lock, go to sleep: */ 241 /* didn't get the lock, go to sleep: */
249 spin_unlock_mutex(&lock->wait_lock, flags); 242 spin_unlock_mutex(&lock->wait_lock, flags);
250 preempt_enable_no_resched(); 243 preempt_enable_no_resched();
251 schedule(); 244 schedule();
@@ -276,16 +269,25 @@ void __sched
276mutex_lock_nested(struct mutex *lock, unsigned int subclass) 269mutex_lock_nested(struct mutex *lock, unsigned int subclass)
277{ 270{
278 might_sleep(); 271 might_sleep();
279 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_); 272 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
280} 273}
281 274
282EXPORT_SYMBOL_GPL(mutex_lock_nested); 275EXPORT_SYMBOL_GPL(mutex_lock_nested);
283 276
277void __sched
278_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
279{
280 might_sleep();
281 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
282}
283
284EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
285
284int __sched 286int __sched
285mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) 287mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
286{ 288{
287 might_sleep(); 289 might_sleep();
288 return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); 290 return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
289} 291}
290EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); 292EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
291 293
@@ -294,7 +296,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
294{ 296{
295 might_sleep(); 297 might_sleep();
296 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 298 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
297 subclass, _RET_IP_); 299 subclass, NULL, _RET_IP_);
298} 300}
299 301
300EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 302EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -400,7 +402,7 @@ __mutex_lock_slowpath(atomic_t *lock_count)
400{ 402{
401 struct mutex *lock = container_of(lock_count, struct mutex, count); 403 struct mutex *lock = container_of(lock_count, struct mutex, count);
402 404
403 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); 405 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
404} 406}
405 407
406static noinline int __sched 408static noinline int __sched
@@ -408,7 +410,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count)
408{ 410{
409 struct mutex *lock = container_of(lock_count, struct mutex, count); 411 struct mutex *lock = container_of(lock_count, struct mutex, count);
410 412
411 return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); 413 return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
412} 414}
413 415
414static noinline int __sched 416static noinline int __sched
@@ -416,7 +418,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count)
416{ 418{
417 struct mutex *lock = container_of(lock_count, struct mutex, count); 419 struct mutex *lock = container_of(lock_count, struct mutex, count);
418 420
419 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_); 421 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
420} 422}
421#endif 423#endif
422 424
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 67578ca48f94..4115fbf83b12 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -19,7 +19,7 @@
19#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
20static inline void mutex_set_owner(struct mutex *lock) 20static inline void mutex_set_owner(struct mutex *lock)
21{ 21{
22 lock->owner = current_thread_info(); 22 lock->owner = current;
23} 23}
24 24
25static inline void mutex_clear_owner(struct mutex *lock) 25static inline void mutex_clear_owner(struct mutex *lock)
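Note: the owner field and the spin-wait loop above now track a struct task_struct * rather than a thread_info, and __mutex_lock_common() gained a nest_lock argument surfaced through _mutex_lock_nest_lock(). A minimal caller-side sketch, assuming the mutex_lock_nest_lock() wrapper that the matching include/linux/mutex.h change provides; struct obj, the list and the helper name are illustrative only:

#include <linux/list.h>
#include <linux/mutex.h>

struct obj {
    struct list_head node;
    struct mutex lock;
};

/*
 * Take every per-object mutex while holding one outer mutex.  The
 * nest_lock annotation tells lockdep that the whole batch nests under
 * "outer", so large lists do not exhaust lockdep's subclass limit.
 * (The matching unlock walk is omitted from this sketch.)
 */
static void lock_all_objs(struct mutex *outer, struct list_head *objs)
{
    struct obj *o;

    mutex_lock(outer);
    list_for_each_entry(o, objs, node)
        mutex_lock_nest_lock(&o->lock, outer);
}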
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
deleted file mode 100644
index 2a5dfec8efe0..000000000000
--- a/kernel/ns_cgroup.c
+++ /dev/null
@@ -1,110 +0,0 @@
1/*
2 * ns_cgroup.c - namespace cgroup subsystem
3 *
4 * Copyright 2006, 2007 IBM Corp
5 */
6
7#include <linux/module.h>
8#include <linux/cgroup.h>
9#include <linux/fs.h>
10#include <linux/proc_fs.h>
11#include <linux/slab.h>
12#include <linux/nsproxy.h>
13
14struct ns_cgroup {
15 struct cgroup_subsys_state css;
16};
17
18struct cgroup_subsys ns_subsys;
19
20static inline struct ns_cgroup *cgroup_to_ns(
21 struct cgroup *cgroup)
22{
23 return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),
24 struct ns_cgroup, css);
25}
26
27int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
28{
29 char name[PROC_NUMBUF];
30
31 snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
32 return cgroup_clone(task, &ns_subsys, name);
33}
34
35/*
36 * Rules:
37 * 1. you can only enter a cgroup which is a descendant of your current
38 * cgroup
39 * 2. you can only place another process into a cgroup if
40 * a. you have CAP_SYS_ADMIN
41 * b. your cgroup is an ancestor of task's destination cgroup
42 * (hence either you are in the same cgroup as task, or in an
43 * ancestor cgroup thereof)
44 */
45static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
46 struct task_struct *task, bool threadgroup)
47{
48 if (current != task) {
49 if (!capable(CAP_SYS_ADMIN))
50 return -EPERM;
51
52 if (!cgroup_is_descendant(new_cgroup, current))
53 return -EPERM;
54 }
55
56 if (!cgroup_is_descendant(new_cgroup, task))
57 return -EPERM;
58
59 if (threadgroup) {
60 struct task_struct *c;
61 rcu_read_lock();
62 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
63 if (!cgroup_is_descendant(new_cgroup, c)) {
64 rcu_read_unlock();
65 return -EPERM;
66 }
67 }
68 rcu_read_unlock();
69 }
70
71 return 0;
72}
73
74/*
75 * Rules: you can only create a cgroup if
76 * 1. you are capable(CAP_SYS_ADMIN)
77 * 2. the target cgroup is a descendant of your own cgroup
78 */
79static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
80 struct cgroup *cgroup)
81{
82 struct ns_cgroup *ns_cgroup;
83
84 if (!capable(CAP_SYS_ADMIN))
85 return ERR_PTR(-EPERM);
86 if (!cgroup_is_descendant(cgroup, current))
87 return ERR_PTR(-EPERM);
88
89 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
90 if (!ns_cgroup)
91 return ERR_PTR(-ENOMEM);
92 return &ns_cgroup->css;
93}
94
95static void ns_destroy(struct cgroup_subsys *ss,
96 struct cgroup *cgroup)
97{
98 struct ns_cgroup *ns_cgroup;
99
100 ns_cgroup = cgroup_to_ns(cgroup);
101 kfree(ns_cgroup);
102}
103
104struct cgroup_subsys ns_subsys = {
105 .name = "ns",
106 .can_attach = ns_can_attach,
107 .create = ns_create,
108 .destroy = ns_destroy,
109 .subsys_id = ns_subsys_id,
110};
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f74e6c00e26d..d6a00f3de15d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,6 +22,9 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h> 23#include <net/net_namespace.h>
24#include <linux/ipc_namespace.h> 24#include <linux/ipc_namespace.h>
25#include <linux/proc_fs.h>
26#include <linux/file.h>
27#include <linux/syscalls.h>
25 28
26static struct kmem_cache *nsproxy_cachep; 29static struct kmem_cache *nsproxy_cachep;
27 30
@@ -69,13 +72,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
69 goto out_ns; 72 goto out_ns;
70 } 73 }
71 74
72 new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); 75 new_nsp->uts_ns = copy_utsname(flags, tsk);
73 if (IS_ERR(new_nsp->uts_ns)) { 76 if (IS_ERR(new_nsp->uts_ns)) {
74 err = PTR_ERR(new_nsp->uts_ns); 77 err = PTR_ERR(new_nsp->uts_ns);
75 goto out_uts; 78 goto out_uts;
76 } 79 }
77 80
78 new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); 81 new_nsp->ipc_ns = copy_ipcs(flags, tsk);
79 if (IS_ERR(new_nsp->ipc_ns)) { 82 if (IS_ERR(new_nsp->ipc_ns)) {
80 err = PTR_ERR(new_nsp->ipc_ns); 83 err = PTR_ERR(new_nsp->ipc_ns);
81 goto out_ipc; 84 goto out_ipc;
@@ -198,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
198 goto out; 201 goto out;
199 } 202 }
200 203
201 err = ns_cgroup_clone(current, task_pid(current));
202 if (err)
203 put_nsproxy(*new_nsp);
204
205out: 204out:
206 return err; 205 return err;
207} 206}
@@ -233,6 +232,45 @@ void exit_task_namespaces(struct task_struct *p)
233 switch_task_namespaces(p, NULL); 232 switch_task_namespaces(p, NULL);
234} 233}
235 234
235SYSCALL_DEFINE2(setns, int, fd, int, nstype)
236{
237 const struct proc_ns_operations *ops;
238 struct task_struct *tsk = current;
239 struct nsproxy *new_nsproxy;
240 struct proc_inode *ei;
241 struct file *file;
242 int err;
243
244 if (!capable(CAP_SYS_ADMIN))
245 return -EPERM;
246
247 file = proc_ns_fget(fd);
248 if (IS_ERR(file))
249 return PTR_ERR(file);
250
251 err = -EINVAL;
252 ei = PROC_I(file->f_dentry->d_inode);
253 ops = ei->ns_ops;
254 if (nstype && (ops->type != nstype))
255 goto out;
256
257 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
258 if (IS_ERR(new_nsproxy)) {
259 err = PTR_ERR(new_nsproxy);
260 goto out;
261 }
262
263 err = ops->install(new_nsproxy, ei->ns);
264 if (err) {
265 free_nsproxy(new_nsproxy);
266 goto out;
267 }
268 switch_task_namespaces(tsk, new_nsproxy);
269out:
270 fput(file);
271 return err;
272}
273
236static int __init nsproxy_cache_init(void) 274static int __init nsproxy_cache_init(void)
237{ 275{
238 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); 276 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
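Note: the new setns() system call above is driven from userspace through the /proc/<pid>/ns/* files. A hedged sketch, assuming a libc that already exposes a setns() wrapper (otherwise syscall(__NR_setns, fd, nstype) is equivalent); the PID is a placeholder and CAP_SYS_ADMIN is required:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    /* 1234 is a placeholder PID; any /proc/<pid>/ns/<type> file works */
    int fd = open("/proc/1234/ns/net", O_RDONLY);

    if (fd < 0 || setns(fd, CLONE_NEWNET) < 0) {
        perror("setns");
        return 1;
    }
    close(fd);
    /* from here on this task runs in the target network namespace */
    return 0;
}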
diff --git a/kernel/padata.c b/kernel/padata.c
index 751019415d23..b91941df5e63 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -262,7 +262,7 @@ static void padata_reorder(struct parallel_data *pd)
262 /* 262 /*
263 * This cpu has to do the parallel processing of the next 263 * This cpu has to do the parallel processing of the next
264 * object. It's waiting in the cpu's parallelization queue, 264 * object. It's waiting in the cpu's parallelization queue,
265 * so exit imediately. 265 * so exit immediately.
266 */ 266 */
267 if (PTR_ERR(padata) == -ENODATA) { 267 if (PTR_ERR(padata) == -ENODATA) {
268 del_timer(&pd->timer); 268 del_timer(&pd->timer);
@@ -284,7 +284,7 @@ static void padata_reorder(struct parallel_data *pd)
284 /* 284 /*
285 * The next object that needs serialization might have arrived to 285 * The next object that needs serialization might have arrived to
286 * the reorder queues in the meantime, we will be called again 286 * the reorder queues in the meantime, we will be called again
287 * from the timer function if noone else cares for it. 287 * from the timer function if no one else cares for it.
288 */ 288 */
289 if (atomic_read(&pd->reorder_objects) 289 if (atomic_read(&pd->reorder_objects)
290 && !(pinst->flags & PADATA_RESET)) 290 && !(pinst->flags & PADATA_RESET))
@@ -515,7 +515,7 @@ static void __padata_stop(struct padata_instance *pinst)
515 put_online_cpus(); 515 put_online_cpus();
516} 516}
517 517
518/* Replace the internal control stucture with a new one. */ 518/* Replace the internal control structure with a new one. */
519static void padata_replace(struct padata_instance *pinst, 519static void padata_replace(struct padata_instance *pinst,
520 struct parallel_data *pd_new) 520 struct parallel_data *pd_new)
521{ 521{
@@ -768,7 +768,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
768} 768}
769 769
770 /** 770 /**
771 * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) 771 * padata_remove_cpu - remove a cpu from the one or both(serial and parallel)
772 * padata cpumasks. 772 * padata cpumasks.
773 * 773 *
774 * @pinst: padata instance 774 * @pinst: padata instance
diff --git a/kernel/panic.c b/kernel/panic.c
index 4c13b1a88ebb..69231670eb95 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@ static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35 35
36int panic_timeout; 36int panic_timeout;
37EXPORT_SYMBOL_GPL(panic_timeout);
37 38
38ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 39ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
39 40
@@ -432,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
432 433
433core_param(panic, panic_timeout, int, 0644); 434core_param(panic, panic_timeout, int, 0644);
434core_param(pause_on_oops, pause_on_oops, int, 0644); 435core_param(pause_on_oops, pause_on_oops, int, 0644);
436
437static int __init oops_setup(char *s)
438{
439 if (!s)
440 return -EINVAL;
441 if (!strcmp(s, "panic"))
442 panic_on_oops = 1;
443 return 0;
444}
445early_param("oops", oops_setup);
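Note: the oops= early parameter above is the boot-time counterpart of the long-standing panic_on_oops sysctl. A hedged sketch of the equivalent runtime toggle (the sysctl itself is pre-existing, not part of this hunk):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    /* same effect at runtime as booting with "oops=panic" */
    int fd = open("/proc/sys/kernel/panic_on_oops", O_WRONLY);

    if (fd < 0 || write(fd, "1\n", 2) != 2) {
        perror("panic_on_oops");
        return 1;
    }
    close(fd);
    return 0;
}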
diff --git a/kernel/params.c b/kernel/params.c
index 08107d181758..ed72e1330862 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -95,7 +95,7 @@ static int parse_one(char *param,
95 /* Find parameter */ 95 /* Find parameter */
96 for (i = 0; i < num_params; i++) { 96 for (i = 0; i < num_params; i++) {
97 if (parameq(param, params[i].name)) { 97 if (parameq(param, params[i].name)) {
98 /* Noone handled NULL, so do it here. */ 98 /* No one handled NULL, so do it here. */
99 if (!val && params[i].ops->set != param_set_bool) 99 if (!val && params[i].ops->set != param_set_bool)
100 return -EINVAL; 100 return -EINVAL;
101 DEBUGP("They are equal! Calling %p\n", 101 DEBUGP("They are equal! Calling %p\n",
@@ -297,21 +297,15 @@ EXPORT_SYMBOL(param_ops_charp);
297int param_set_bool(const char *val, const struct kernel_param *kp) 297int param_set_bool(const char *val, const struct kernel_param *kp)
298{ 298{
299 bool v; 299 bool v;
300 int ret;
300 301
301 /* No equals means "set"... */ 302 /* No equals means "set"... */
302 if (!val) val = "1"; 303 if (!val) val = "1";
303 304
304 /* One of =[yYnN01] */ 305 /* One of =[yYnN01] */
305 switch (val[0]) { 306 ret = strtobool(val, &v);
306 case 'y': case 'Y': case '1': 307 if (ret)
307 v = true; 308 return ret;
308 break;
309 case 'n': case 'N': case '0':
310 v = false;
311 break;
312 default:
313 return -EINVAL;
314 }
315 309
316 if (kp->flags & KPARAM_ISBOOL) 310 if (kp->flags & KPARAM_ISBOOL)
317 *(bool *)kp->arg = v; 311 *(bool *)kp->arg = v;
@@ -719,9 +713,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
719 params[i].ops->free(params[i].arg); 713 params[i].ops->free(params[i].arg);
720} 714}
721 715
722static void __init kernel_add_sysfs_param(const char *name, 716static struct module_kobject * __init locate_module_kobject(const char *name)
723 struct kernel_param *kparam,
724 unsigned int name_skip)
725{ 717{
726 struct module_kobject *mk; 718 struct module_kobject *mk;
727 struct kobject *kobj; 719 struct kobject *kobj;
@@ -729,10 +721,7 @@ static void __init kernel_add_sysfs_param(const char *name,
729 721
730 kobj = kset_find_obj(module_kset, name); 722 kobj = kset_find_obj(module_kset, name);
731 if (kobj) { 723 if (kobj) {
732 /* We already have one. Remove params so we can add more. */
733 mk = to_module_kobject(kobj); 724 mk = to_module_kobject(kobj);
734 /* We need to remove it before adding parameters. */
735 sysfs_remove_group(&mk->kobj, &mk->mp->grp);
736 } else { 725 } else {
737 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); 726 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
738 BUG_ON(!mk); 727 BUG_ON(!mk);
@@ -743,15 +732,36 @@ static void __init kernel_add_sysfs_param(const char *name,
743 "%s", name); 732 "%s", name);
744 if (err) { 733 if (err) {
745 kobject_put(&mk->kobj); 734 kobject_put(&mk->kobj);
746 printk(KERN_ERR "Module '%s' failed add to sysfs, " 735 printk(KERN_ERR
747 "error number %d\n", name, err); 736 "Module '%s' failed add to sysfs, error number %d\n",
748 printk(KERN_ERR "The system will be unstable now.\n"); 737 name, err);
749 return; 738 printk(KERN_ERR
739 "The system will be unstable now.\n");
740 return NULL;
750 } 741 }
751 /* So that exit path is even. */ 742
743 /* So that we hold reference in both cases. */
752 kobject_get(&mk->kobj); 744 kobject_get(&mk->kobj);
753 } 745 }
754 746
747 return mk;
748}
749
750static void __init kernel_add_sysfs_param(const char *name,
751 struct kernel_param *kparam,
752 unsigned int name_skip)
753{
754 struct module_kobject *mk;
755 int err;
756
757 mk = locate_module_kobject(name);
758 if (!mk)
759 return;
760
761 /* We need to remove old parameters before adding more. */
762 if (mk->mp)
763 sysfs_remove_group(&mk->kobj, &mk->mp->grp);
764
755 /* These should not fail at boot. */ 765 /* These should not fail at boot. */
756 err = add_sysfs_param(mk, kparam, kparam->name + name_skip); 766 err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
757 BUG_ON(err); 767 BUG_ON(err);
@@ -796,6 +806,35 @@ static void __init param_sysfs_builtin(void)
796 } 806 }
797} 807}
798 808
809ssize_t __modver_version_show(struct module_attribute *mattr,
810 struct module *mod, char *buf)
811{
812 struct module_version_attribute *vattr =
813 container_of(mattr, struct module_version_attribute, mattr);
814
815 return sprintf(buf, "%s\n", vattr->version);
816}
817
818extern const struct module_version_attribute *__start___modver[];
819extern const struct module_version_attribute *__stop___modver[];
820
821static void __init version_sysfs_builtin(void)
822{
823 const struct module_version_attribute **p;
824 struct module_kobject *mk;
825 int err;
826
827 for (p = __start___modver; p < __stop___modver; p++) {
828 const struct module_version_attribute *vattr = *p;
829
830 mk = locate_module_kobject(vattr->module_name);
831 if (mk) {
832 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
833 kobject_uevent(&mk->kobj, KOBJ_ADD);
834 kobject_put(&mk->kobj);
835 }
836 }
837}
799 838
800/* module-related sysfs stuff */ 839/* module-related sysfs stuff */
801 840
@@ -875,6 +914,7 @@ static int __init param_sysfs_init(void)
875 } 914 }
876 module_sysfs_initialized = 1; 915 module_sysfs_initialized = 1;
877 916
917 version_sysfs_builtin();
878 param_sysfs_builtin(); 918 param_sysfs_builtin();
879 919
880 return 0; 920 return 0;
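Note: two user-visible effects of the params.c changes above are that bool parameters are parsed with strtobool() (accepting y/Y/n/N/1/0) and that MODULE_VERSION() strings appear under /sys/module/<name>/version even for built-in code, via the new __modver section. A minimal hedged sketch; the "demo" name and its parameter are made up:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

static bool demo_enabled = true;
module_param_named(enabled, demo_enabled, bool, 0644);  /* parsed via strtobool() */

MODULE_VERSION("1.0");          /* exposed as /sys/module/demo/version */
MODULE_LICENSE("GPL");

static int __init demo_init(void)
{
    return 0;
}
module_init(demo_init);

static void __exit demo_exit(void)
{
}
module_exit(demo_exit);

When built in, the parameter is still set on the kernel command line as demo.enabled=y, and the version attribute is registered at boot by version_sysfs_builtin().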
diff --git a/kernel/pid.c b/kernel/pid.c
index d55c6fb8d087..57a8346a270e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
217 return -1; 217 return -1;
218} 218}
219 219
220int next_pidmap(struct pid_namespace *pid_ns, int last) 220int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
221{ 221{
222 int offset; 222 int offset;
223 struct pidmap *map, *end; 223 struct pidmap *map, *end;
224 224
225 if (last >= PID_MAX_LIMIT)
226 return -1;
227
225 offset = (last + 1) & BITS_PER_PAGE_MASK; 228 offset = (last + 1) & BITS_PER_PAGE_MASK;
226 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; 229 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
227 end = &pid_ns->pidmap[PIDMAP_ENTRIES]; 230 end = &pid_ns->pidmap[PIDMAP_ENTRIES];
@@ -401,7 +404,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
401 struct task_struct *result = NULL; 404 struct task_struct *result = NULL;
402 if (pid) { 405 if (pid) {
403 struct hlist_node *first; 406 struct hlist_node *first;
404 first = rcu_dereference_check(pid->tasks[type].first, 407 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
405 rcu_read_lock_held() || 408 rcu_read_lock_held() ||
406 lockdep_tasklist_lock_is_held()); 409 lockdep_tasklist_lock_is_held());
407 if (first) 410 if (first)
@@ -416,6 +419,7 @@ EXPORT_SYMBOL(pid_task);
416 */ 419 */
417struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 420struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
418{ 421{
422 rcu_lockdep_assert(rcu_read_lock_held());
419 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 423 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
420} 424}
421 425
@@ -434,6 +438,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
434 rcu_read_unlock(); 438 rcu_read_unlock();
435 return pid; 439 return pid;
436} 440}
441EXPORT_SYMBOL_GPL(get_task_pid);
437 442
438struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) 443struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
439{ 444{
@@ -445,6 +450,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
445 rcu_read_unlock(); 450 rcu_read_unlock();
446 return result; 451 return result;
447} 452}
453EXPORT_SYMBOL_GPL(get_pid_task);
448 454
449struct pid *find_get_pid(pid_t nr) 455struct pid *find_get_pid(pid_t nr)
450{ 456{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94e1f0b..e9c9adc84ca6 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -14,6 +14,7 @@
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h>
17 18
18#define BITS_PER_PAGE (PAGE_SIZE*8) 19#define BITS_PER_PAGE (PAGE_SIZE*8)
19 20
@@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
72{ 73{
73 struct pid_namespace *ns; 74 struct pid_namespace *ns;
74 unsigned int level = parent_pid_ns->level + 1; 75 unsigned int level = parent_pid_ns->level + 1;
75 int i; 76 int i, err = -ENOMEM;
76 77
77 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 78 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
78 if (ns == NULL) 79 if (ns == NULL)
@@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
96 for (i = 1; i < PIDMAP_ENTRIES; i++) 97 for (i = 1; i < PIDMAP_ENTRIES; i++)
97 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 98 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
98 99
100 err = pid_ns_prepare_proc(ns);
101 if (err)
102 goto out_put_parent_pid_ns;
103
99 return ns; 104 return ns;
100 105
106out_put_parent_pid_ns:
107 put_pid_ns(parent_pid_ns);
101out_free_map: 108out_free_map:
102 kfree(ns->pidmap[0].page); 109 kfree(ns->pidmap[0].page);
103out_free: 110out_free:
104 kmem_cache_free(pid_ns_cachep, ns); 111 kmem_cache_free(pid_ns_cachep, ns);
105out: 112out:
106 return ERR_PTR(-ENOMEM); 113 return ERR_PTR(err);
107} 114}
108 115
109static void destroy_pid_namespace(struct pid_namespace *ns) 116static void destroy_pid_namespace(struct pid_namespace *ns)
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 645e541a45f6..6824ca7d4d0c 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -40,6 +40,7 @@
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/platform_device.h> 41#include <linux/platform_device.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/kernel.h>
43 44
44#include <linux/uaccess.h> 45#include <linux/uaccess.h>
45 46
@@ -53,11 +54,17 @@ enum pm_qos_type {
53 PM_QOS_MIN /* return the smallest value */ 54 PM_QOS_MIN /* return the smallest value */
54}; 55};
55 56
57/*
58 * Note: The lockless read path depends on the CPU accessing
59 * target_value atomically. Atomic access is only guaranteed on all CPU
60 * types linux supports for 32 bit quantites
61 */
56struct pm_qos_object { 62struct pm_qos_object {
57 struct plist_head requests; 63 struct plist_head requests;
58 struct blocking_notifier_head *notifiers; 64 struct blocking_notifier_head *notifiers;
59 struct miscdevice pm_qos_power_miscdev; 65 struct miscdevice pm_qos_power_miscdev;
60 char *name; 66 char *name;
67 s32 target_value; /* Do not change to 64 bit */
61 s32 default_value; 68 s32 default_value;
62 enum pm_qos_type type; 69 enum pm_qos_type type;
63}; 70};
@@ -70,7 +77,8 @@ static struct pm_qos_object cpu_dma_pm_qos = {
70 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), 77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
71 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
72 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
73 .default_value = 2000 * USEC_PER_SEC, 80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
81 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
74 .type = PM_QOS_MIN, 82 .type = PM_QOS_MIN,
75}; 83};
76 84
@@ -79,7 +87,8 @@ static struct pm_qos_object network_lat_pm_qos = {
79 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), 87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
80 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
81 .name = "network_latency", 89 .name = "network_latency",
82 .default_value = 2000 * USEC_PER_SEC, 90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
91 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
83 .type = PM_QOS_MIN 92 .type = PM_QOS_MIN
84}; 93};
85 94
@@ -89,7 +98,8 @@ static struct pm_qos_object network_throughput_pm_qos = {
89 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), 98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
90 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
91 .name = "network_throughput", 100 .name = "network_throughput",
92 .default_value = 0, 101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
102 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
93 .type = PM_QOS_MAX, 103 .type = PM_QOS_MAX,
94}; 104};
95 105
@@ -103,13 +113,17 @@ static struct pm_qos_object *pm_qos_array[] = {
103 113
104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 114static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
105 size_t count, loff_t *f_pos); 115 size_t count, loff_t *f_pos);
116static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
117 size_t count, loff_t *f_pos);
106static int pm_qos_power_open(struct inode *inode, struct file *filp); 118static int pm_qos_power_open(struct inode *inode, struct file *filp);
107static int pm_qos_power_release(struct inode *inode, struct file *filp); 119static int pm_qos_power_release(struct inode *inode, struct file *filp);
108 120
109static const struct file_operations pm_qos_power_fops = { 121static const struct file_operations pm_qos_power_fops = {
110 .write = pm_qos_power_write, 122 .write = pm_qos_power_write,
123 .read = pm_qos_power_read,
111 .open = pm_qos_power_open, 124 .open = pm_qos_power_open,
112 .release = pm_qos_power_release, 125 .release = pm_qos_power_release,
126 .llseek = noop_llseek,
113}; 127};
114 128
115/* unlocked internal variant */ 129/* unlocked internal variant */
@@ -120,10 +134,10 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
120 134
121 switch (o->type) { 135 switch (o->type) {
122 case PM_QOS_MIN: 136 case PM_QOS_MIN:
123 return plist_last(&o->requests)->prio; 137 return plist_first(&o->requests)->prio;
124 138
125 case PM_QOS_MAX: 139 case PM_QOS_MAX:
126 return plist_first(&o->requests)->prio; 140 return plist_last(&o->requests)->prio;
127 141
128 default: 142 default:
129 /* runtime check for not using enum */ 143 /* runtime check for not using enum */
@@ -131,6 +145,16 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
131 } 145 }
132} 146}
133 147
148static inline s32 pm_qos_read_value(struct pm_qos_object *o)
149{
150 return o->target_value;
151}
152
153static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value)
154{
155 o->target_value = value;
156}
157
134static void update_target(struct pm_qos_object *o, struct plist_node *node, 158static void update_target(struct pm_qos_object *o, struct plist_node *node,
135 int del, int value) 159 int del, int value)
136{ 160{
@@ -155,6 +179,7 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node,
155 plist_add(node, &o->requests); 179 plist_add(node, &o->requests);
156 } 180 }
157 curr_value = pm_qos_get_value(o); 181 curr_value = pm_qos_get_value(o);
182 pm_qos_set_value(o, curr_value);
158 spin_unlock_irqrestore(&pm_qos_lock, flags); 183 spin_unlock_irqrestore(&pm_qos_lock, flags);
159 184
160 if (prev_value != curr_value) 185 if (prev_value != curr_value)
@@ -189,18 +214,11 @@ static int find_pm_qos_object_by_minor(int minor)
189 * pm_qos_request - returns current system wide qos expectation 214 * pm_qos_request - returns current system wide qos expectation
190 * @pm_qos_class: identification of which qos value is requested 215 * @pm_qos_class: identification of which qos value is requested
191 * 216 *
192 * This function returns the current target value in an atomic manner. 217 * This function returns the current target value.
193 */ 218 */
194int pm_qos_request(int pm_qos_class) 219int pm_qos_request(int pm_qos_class)
195{ 220{
196 unsigned long flags; 221 return pm_qos_read_value(pm_qos_array[pm_qos_class]);
197 int value;
198
199 spin_lock_irqsave(&pm_qos_lock, flags);
200 value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
201 spin_unlock_irqrestore(&pm_qos_lock, flags);
202
203 return value;
204} 222}
205EXPORT_SYMBOL_GPL(pm_qos_request); 223EXPORT_SYMBOL_GPL(pm_qos_request);
206 224
@@ -375,30 +393,63 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
375} 393}
376 394
377 395
396static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
397 size_t count, loff_t *f_pos)
398{
399 s32 value;
400 unsigned long flags;
401 struct pm_qos_object *o;
402 struct pm_qos_request_list *pm_qos_req = filp->private_data;
403
404 if (!pm_qos_req)
405 return -EINVAL;
406 if (!pm_qos_request_active(pm_qos_req))
407 return -EINVAL;
408
409 o = pm_qos_array[pm_qos_req->pm_qos_class];
410 spin_lock_irqsave(&pm_qos_lock, flags);
411 value = pm_qos_get_value(o);
412 spin_unlock_irqrestore(&pm_qos_lock, flags);
413
414 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
415}
416
378static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 417static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
379 size_t count, loff_t *f_pos) 418 size_t count, loff_t *f_pos)
380{ 419{
381 s32 value; 420 s32 value;
382 int x;
383 char ascii_value[11];
384 struct pm_qos_request_list *pm_qos_req; 421 struct pm_qos_request_list *pm_qos_req;
385 422
386 if (count == sizeof(s32)) { 423 if (count == sizeof(s32)) {
387 if (copy_from_user(&value, buf, sizeof(s32))) 424 if (copy_from_user(&value, buf, sizeof(s32)))
388 return -EFAULT; 425 return -EFAULT;
389 } else if (count == 11) { /* len('0x12345678/0') */ 426 } else if (count <= 11) { /* ASCII perhaps? */
390 if (copy_from_user(ascii_value, buf, 11)) 427 char ascii_value[11];
428 unsigned long int ulval;
429 int ret;
430
431 if (copy_from_user(ascii_value, buf, count))
391 return -EFAULT; 432 return -EFAULT;
392 if (strlen(ascii_value) != 10) 433
393 return -EINVAL; 434 if (count > 10) {
394 x = sscanf(ascii_value, "%x", &value); 435 if (ascii_value[10] == '\n')
395 if (x != 1) 436 ascii_value[10] = '\0';
437 else
438 return -EINVAL;
439 } else {
440 ascii_value[count] = '\0';
441 }
442 ret = strict_strtoul(ascii_value, 16, &ulval);
443 if (ret) {
444 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
396 return -EINVAL; 445 return -EINVAL;
397 pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); 446 }
398 } else 447 value = (s32)lower_32_bits(ulval);
448 } else {
399 return -EINVAL; 449 return -EINVAL;
450 }
400 451
401 pm_qos_req = (struct pm_qos_request_list *)filp->private_data; 452 pm_qos_req = filp->private_data;
402 pm_qos_update_request(pm_qos_req, value); 453 pm_qos_update_request(pm_qos_req, value);
403 454
404 return count; 455 return count;
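Note: with the read() handler and the more forgiving ASCII write parsing added above, /dev/cpu_dma_latency can be driven from a few lines of userspace. A hedged sketch; the 20 microsecond figure is arbitrary, and the request only holds while the fd stays open:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int32_t req = 20, cur = 0;  /* request <= 20 us wakeup latency */
    int fd = open("/dev/cpu_dma_latency", O_RDWR);

    if (fd < 0 || write(fd, &req, sizeof(req)) != sizeof(req)) {
        perror("cpu_dma_latency");
        return 1;
    }
    if (read(fd, &cur, sizeof(cur)) == sizeof(cur))
        printf("current aggregate target: %d us\n", cur);

    pause();    /* the request is dropped when the fd is closed */
    return 0;
}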
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 6842eeba5879..58f405b581e7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -37,13 +37,13 @@ static int check_clock(const clockid_t which_clock)
37 if (pid == 0) 37 if (pid == 0)
38 return 0; 38 return 0;
39 39
40 read_lock(&tasklist_lock); 40 rcu_read_lock();
41 p = find_task_by_vpid(pid); 41 p = find_task_by_vpid(pid);
42 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? 42 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
43 same_thread_group(p, current) : thread_group_leader(p))) { 43 same_thread_group(p, current) : has_group_leader_pid(p))) {
44 error = -EINVAL; 44 error = -EINVAL;
45 } 45 }
46 read_unlock(&tasklist_lock); 46 rcu_read_unlock();
47 47
48 return error; 48 return error;
49} 49}
@@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p)
176 return p->utime; 176 return p->utime;
177} 177}
178 178
179int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 179static int
180posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
180{ 181{
181 int error = check_clock(which_clock); 182 int error = check_clock(which_clock);
182 if (!error) { 183 if (!error) {
@@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
194 return error; 195 return error;
195} 196}
196 197
197int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) 198static int
199posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
198{ 200{
199 /* 201 /*
200 * You can never reset a CPU clock, but we check for other errors 202 * You can never reset a CPU clock, but we check for other errors
@@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
317} 319}
318 320
319 321
320int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 322static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
321{ 323{
322 const pid_t pid = CPUCLOCK_PID(which_clock); 324 const pid_t pid = CPUCLOCK_PID(which_clock);
323 int error = -EINVAL; 325 int error = -EINVAL;
@@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
379 * This is called from sys_timer_create() and do_cpu_nanosleep() with the 381 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
380 * new timer already all-zeros initialized. 382 * new timer already all-zeros initialized.
381 */ 383 */
382int posix_cpu_timer_create(struct k_itimer *new_timer) 384static int posix_cpu_timer_create(struct k_itimer *new_timer)
383{ 385{
384 int ret = 0; 386 int ret = 0;
385 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); 387 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
@@ -390,7 +392,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
390 392
391 INIT_LIST_HEAD(&new_timer->it.cpu.entry); 393 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
392 394
393 read_lock(&tasklist_lock); 395 rcu_read_lock();
394 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { 396 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
395 if (pid == 0) { 397 if (pid == 0) {
396 p = current; 398 p = current;
@@ -404,7 +406,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
404 p = current->group_leader; 406 p = current->group_leader;
405 } else { 407 } else {
406 p = find_task_by_vpid(pid); 408 p = find_task_by_vpid(pid);
407 if (p && !thread_group_leader(p)) 409 if (p && !has_group_leader_pid(p))
408 p = NULL; 410 p = NULL;
409 } 411 }
410 } 412 }
@@ -414,7 +416,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
414 } else { 416 } else {
415 ret = -EINVAL; 417 ret = -EINVAL;
416 } 418 }
417 read_unlock(&tasklist_lock); 419 rcu_read_unlock();
418 420
419 return ret; 421 return ret;
420} 422}
@@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
425 * If we return TIMER_RETRY, it's necessary to release the timer's lock 427 * If we return TIMER_RETRY, it's necessary to release the timer's lock
426 * and try again. (This happens when the timer is in the middle of firing.) 428 * and try again. (This happens when the timer is in the middle of firing.)
427 */ 429 */
428int posix_cpu_timer_del(struct k_itimer *timer) 430static int posix_cpu_timer_del(struct k_itimer *timer)
429{ 431{
430 struct task_struct *p = timer->it.cpu.task; 432 struct task_struct *p = timer->it.cpu.task;
431 int ret = 0; 433 int ret = 0;
@@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
665 * If we return TIMER_RETRY, it's necessary to release the timer's lock 667 * If we return TIMER_RETRY, it's necessary to release the timer's lock
666 * and try again. (This happens when the timer is in the middle of firing.) 668 * and try again. (This happens when the timer is in the middle of firing.)
667 */ 669 */
668int posix_cpu_timer_set(struct k_itimer *timer, int flags, 670static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
669 struct itimerspec *new, struct itimerspec *old) 671 struct itimerspec *new, struct itimerspec *old)
670{ 672{
671 struct task_struct *p = timer->it.cpu.task; 673 struct task_struct *p = timer->it.cpu.task;
672 union cpu_time_count old_expires, new_expires, old_incr, val; 674 union cpu_time_count old_expires, new_expires, old_incr, val;
@@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
820 return ret; 822 return ret;
821} 823}
822 824
823void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) 825static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
824{ 826{
825 union cpu_time_count now; 827 union cpu_time_count now;
826 struct task_struct *p = timer->it.cpu.task; 828 struct task_struct *p = timer->it.cpu.task;
@@ -1345,7 +1347,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1345 1347
1346 /* 1348 /*
1347 * Now that all the timers on our list have the firing flag, 1349 * Now that all the timers on our list have the firing flag,
1348 * noone will touch their list entries but us. We'll take 1350 * no one will touch their list entries but us. We'll take
1349 * each timer's lock before clearing its firing flag, so no 1351 * each timer's lock before clearing its firing flag, so no
1350 * timer call will interfere. 1352 * timer call will interfere.
1351 */ 1353 */
@@ -1481,11 +1483,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1481 return error; 1483 return error;
1482} 1484}
1483 1485
1484int posix_cpu_nsleep(const clockid_t which_clock, int flags, 1486static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1485 struct timespec *rqtp, struct timespec __user *rmtp) 1487
1488static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1489 struct timespec *rqtp, struct timespec __user *rmtp)
1486{ 1490{
1487 struct restart_block *restart_block = 1491 struct restart_block *restart_block =
1488 &current_thread_info()->restart_block; 1492 &current_thread_info()->restart_block;
1489 struct itimerspec it; 1493 struct itimerspec it;
1490 int error; 1494 int error;
1491 1495
@@ -1501,56 +1505,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1501 1505
1502 if (error == -ERESTART_RESTARTBLOCK) { 1506 if (error == -ERESTART_RESTARTBLOCK) {
1503 1507
1504 if (flags & TIMER_ABSTIME) 1508 if (flags & TIMER_ABSTIME)
1505 return -ERESTARTNOHAND; 1509 return -ERESTARTNOHAND;
1506 /* 1510 /*
1507 * Report back to the user the time still remaining. 1511 * Report back to the user the time still remaining.
1508 */ 1512 */
1509 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1513 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1510 return -EFAULT; 1514 return -EFAULT;
1511 1515
1512 restart_block->fn = posix_cpu_nsleep_restart; 1516 restart_block->fn = posix_cpu_nsleep_restart;
1513 restart_block->arg0 = which_clock; 1517 restart_block->nanosleep.clockid = which_clock;
1514 restart_block->arg1 = (unsigned long) rmtp; 1518 restart_block->nanosleep.rmtp = rmtp;
1515 restart_block->arg2 = rqtp->tv_sec; 1519 restart_block->nanosleep.expires = timespec_to_ns(rqtp);
1516 restart_block->arg3 = rqtp->tv_nsec;
1517 } 1520 }
1518 return error; 1521 return error;
1519} 1522}
1520 1523
1521long posix_cpu_nsleep_restart(struct restart_block *restart_block) 1524static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1522{ 1525{
1523 clockid_t which_clock = restart_block->arg0; 1526 clockid_t which_clock = restart_block->nanosleep.clockid;
1524 struct timespec __user *rmtp;
1525 struct timespec t; 1527 struct timespec t;
1526 struct itimerspec it; 1528 struct itimerspec it;
1527 int error; 1529 int error;
1528 1530
1529 rmtp = (struct timespec __user *) restart_block->arg1; 1531 t = ns_to_timespec(restart_block->nanosleep.expires);
1530 t.tv_sec = restart_block->arg2;
1531 t.tv_nsec = restart_block->arg3;
1532 1532
1533 restart_block->fn = do_no_restart_syscall;
1534 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); 1533 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1535 1534
1536 if (error == -ERESTART_RESTARTBLOCK) { 1535 if (error == -ERESTART_RESTARTBLOCK) {
1536 struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
1537 /* 1537 /*
1538 * Report back to the user the time still remaining. 1538 * Report back to the user the time still remaining.
1539 */ 1539 */
1540 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1540 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1541 return -EFAULT; 1541 return -EFAULT;
1542 1542
1543 restart_block->fn = posix_cpu_nsleep_restart; 1543 restart_block->nanosleep.expires = timespec_to_ns(&t);
1544 restart_block->arg0 = which_clock;
1545 restart_block->arg1 = (unsigned long) rmtp;
1546 restart_block->arg2 = t.tv_sec;
1547 restart_block->arg3 = t.tv_nsec;
1548 } 1544 }
1549 return error; 1545 return error;
1550 1546
1551} 1547}
1552 1548
1553
1554#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) 1549#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
1555#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) 1550#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1556 1551
@@ -1594,38 +1589,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
1594 timer->it_clock = THREAD_CLOCK; 1589 timer->it_clock = THREAD_CLOCK;
1595 return posix_cpu_timer_create(timer); 1590 return posix_cpu_timer_create(timer);
1596} 1591}
1597static int thread_cpu_nsleep(const clockid_t which_clock, int flags, 1592
1598 struct timespec *rqtp, struct timespec __user *rmtp) 1593struct k_clock clock_posix_cpu = {
1599{ 1594 .clock_getres = posix_cpu_clock_getres,
1600 return -EINVAL; 1595 .clock_set = posix_cpu_clock_set,
1601} 1596 .clock_get = posix_cpu_clock_get,
1602static long thread_cpu_nsleep_restart(struct restart_block *restart_block) 1597 .timer_create = posix_cpu_timer_create,
1603{ 1598 .nsleep = posix_cpu_nsleep,
1604 return -EINVAL; 1599 .nsleep_restart = posix_cpu_nsleep_restart,
1605} 1600 .timer_set = posix_cpu_timer_set,
1601 .timer_del = posix_cpu_timer_del,
1602 .timer_get = posix_cpu_timer_get,
1603};
1606 1604
1607static __init int init_posix_cpu_timers(void) 1605static __init int init_posix_cpu_timers(void)
1608{ 1606{
1609 struct k_clock process = { 1607 struct k_clock process = {
1610 .clock_getres = process_cpu_clock_getres, 1608 .clock_getres = process_cpu_clock_getres,
1611 .clock_get = process_cpu_clock_get, 1609 .clock_get = process_cpu_clock_get,
1612 .clock_set = do_posix_clock_nosettime, 1610 .timer_create = process_cpu_timer_create,
1613 .timer_create = process_cpu_timer_create, 1611 .nsleep = process_cpu_nsleep,
1614 .nsleep = process_cpu_nsleep, 1612 .nsleep_restart = process_cpu_nsleep_restart,
1615 .nsleep_restart = process_cpu_nsleep_restart,
1616 }; 1613 };
1617 struct k_clock thread = { 1614 struct k_clock thread = {
1618 .clock_getres = thread_cpu_clock_getres, 1615 .clock_getres = thread_cpu_clock_getres,
1619 .clock_get = thread_cpu_clock_get, 1616 .clock_get = thread_cpu_clock_get,
1620 .clock_set = do_posix_clock_nosettime, 1617 .timer_create = thread_cpu_timer_create,
1621 .timer_create = thread_cpu_timer_create,
1622 .nsleep = thread_cpu_nsleep,
1623 .nsleep_restart = thread_cpu_nsleep_restart,
1624 }; 1618 };
1625 struct timespec ts; 1619 struct timespec ts;
1626 1620
1627 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); 1621 posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1628 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); 1622 posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1629 1623
1630 cputime_to_timespec(cputime_one_jiffy, &ts); 1624 cputime_to_timespec(cputime_one_jiffy, &ts);
1631 onecputick = ts.tv_nsec; 1625 onecputick = ts.tv_nsec;
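Note: the CPU-clock operations made static above are now reached only through the clock_posix_cpu k_clock, i.e. via the ordinary posix-timer syscalls. A hedged userspace sketch of a process CPU-time watchdog that ends up in posix_cpu_timer_create()/posix_cpu_timer_set(); the signal choice and the 2 second budget are arbitrary, and older glibc needs -lrt:

#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
    struct sigevent sev = {
        .sigev_notify = SIGEV_SIGNAL,
        .sigev_signo  = SIGXCPU,
    };
    struct itimerspec its = {
        .it_value = { .tv_sec = 2, .tv_nsec = 0 },  /* one-shot */
    };
    timer_t id;

    if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &id) ||
        timer_settime(id, 0, &its, NULL)) {
        perror("cpu timer");
        return 1;
    }
    for (;;)
        ;   /* burn CPU; SIGXCPU fires after ~2 s of it and terminates us */
}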
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9ca4973f736d..4556182527f3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -41,6 +41,7 @@
41#include <linux/init.h> 41#include <linux/init.h>
42#include <linux/compiler.h> 42#include <linux/compiler.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/posix-clock.h>
44#include <linux/posix-timers.h> 45#include <linux/posix-timers.h>
45#include <linux/syscalls.h> 46#include <linux/syscalls.h>
46#include <linux/wait.h> 47#include <linux/wait.h>
@@ -81,6 +82,14 @@ static DEFINE_SPINLOCK(idr_lock);
81#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" 82#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
82#endif 83#endif
83 84
85/*
86 * parisc wants ENOTSUP instead of EOPNOTSUPP
87 */
88#ifndef ENOTSUP
89# define ENANOSLEEP_NOTSUP EOPNOTSUPP
90#else
91# define ENANOSLEEP_NOTSUP ENOTSUP
92#endif
84 93
85/* 94/*
86 * The timer ID is turned into a timer address by idr_find(). 95 * The timer ID is turned into a timer address by idr_find().
@@ -94,11 +103,7 @@ static DEFINE_SPINLOCK(idr_lock);
94/* 103/*
95 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us 104 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
96 * to implement others. This structure defines the various 105 * to implement others. This structure defines the various
97 * clocks and allows the possibility of adding others. We 106 * clocks.
98 * provide an interface to add clocks to the table and expect
99 * the "arch" code to add at least one clock that is high
100 * resolution. Here we define the standard CLOCK_REALTIME as a
101 * 1/HZ resolution clock.
102 * 107 *
103 * RESOLUTION: Clock resolution is used to round up timer and interval 108 * RESOLUTION: Clock resolution is used to round up timer and interval
104 * times, NOT to report clock times, which are reported with as 109 * times, NOT to report clock times, which are reported with as
@@ -108,20 +113,13 @@ static DEFINE_SPINLOCK(idr_lock);
108 * necessary code is written. The standard says we should say 113 * necessary code is written. The standard says we should say
109 * something about this issue in the documentation... 114 * something about this issue in the documentation...
110 * 115 *
111 * FUNCTIONS: The CLOCKs structure defines possible functions to handle 116 * FUNCTIONS: The CLOCKs structure defines possible functions to
112 * various clock functions. For clocks that use the standard 117 * handle various clock functions.
113 * system timer code these entries should be NULL. This will
114 * allow dispatch without the overhead of indirect function
115 * calls. CLOCKS that depend on other sources (e.g. WWV or GPS)
116 * must supply functions here, even if the function just returns
117 * ENOSYS. The standard POSIX timer management code assumes the
118 * following: 1.) The k_itimer struct (sched.h) is used for the
119 * timer. 2.) The list, it_lock, it_clock, it_id and it_pid
120 * fields are not modified by timer code.
121 * 118 *
122 * At this time all functions EXCEPT clock_nanosleep can be 119 * The standard POSIX timer management code assumes the
123 * redirected by the CLOCKS structure. Clock_nanosleep is in 120 * following: 1.) The k_itimer struct (sched.h) is used for
124 * there, but the code ignores it. 121 * the timer. 2.) The list, it_lock, it_clock, it_id and
122 * it_pid fields are not modified by timer code.
125 * 123 *
126 * Permissions: It is assumed that the clock_settime() function defined 124 * Permissions: It is assumed that the clock_settime() function defined
127 * for each clock will take care of permission checks. Some 125 * for each clock will take care of permission checks. Some
@@ -138,6 +136,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
138 */ 136 */
139static int common_nsleep(const clockid_t, int flags, struct timespec *t, 137static int common_nsleep(const clockid_t, int flags, struct timespec *t,
140 struct timespec __user *rmtp); 138 struct timespec __user *rmtp);
139static int common_timer_create(struct k_itimer *new_timer);
141static void common_timer_get(struct k_itimer *, struct itimerspec *); 140static void common_timer_get(struct k_itimer *, struct itimerspec *);
142static int common_timer_set(struct k_itimer *, int, 141static int common_timer_set(struct k_itimer *, int,
143 struct itimerspec *, struct itimerspec *); 142 struct itimerspec *, struct itimerspec *);
@@ -145,83 +144,37 @@ static int common_timer_del(struct k_itimer *timer);
145 144
146static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); 145static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
147 146
148static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 147static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
148
149#define lock_timer(tid, flags) \
150({ struct k_itimer *__timr; \
151 __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
152 __timr; \
153})
149 154
150static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) 155static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
151{ 156{
152 spin_unlock_irqrestore(&timr->it_lock, flags); 157 spin_unlock_irqrestore(&timr->it_lock, flags);
153} 158}
154 159
155/* 160/* Get clock_realtime */
156 * Call the k_clock hook function if non-null, or the default function. 161static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
157 */
158#define CLOCK_DISPATCH(clock, call, arglist) \
159 ((clock) < 0 ? posix_cpu_##call arglist : \
160 (posix_clocks[clock].call != NULL \
161 ? (*posix_clocks[clock].call) arglist : common_##call arglist))
162
163/*
164 * Default clock hook functions when the struct k_clock passed
165 * to register_posix_clock leaves a function pointer null.
166 *
167 * The function common_CALL is the default implementation for
168 * the function pointer CALL in struct k_clock.
169 */
170
171static inline int common_clock_getres(const clockid_t which_clock,
172 struct timespec *tp)
173{
174 tp->tv_sec = 0;
175 tp->tv_nsec = posix_clocks[which_clock].res;
176 return 0;
177}
178
179/*
180 * Get real time for posix timers
181 */
182static int common_clock_get(clockid_t which_clock, struct timespec *tp)
183{ 162{
184 ktime_get_real_ts(tp); 163 ktime_get_real_ts(tp);
185 return 0; 164 return 0;
186} 165}
187 166
188static inline int common_clock_set(const clockid_t which_clock, 167/* Set clock_realtime */
189 struct timespec *tp) 168static int posix_clock_realtime_set(const clockid_t which_clock,
169 const struct timespec *tp)
190{ 170{
191 return do_sys_settimeofday(tp, NULL); 171 return do_sys_settimeofday(tp, NULL);
192} 172}
193 173
194static int common_timer_create(struct k_itimer *new_timer) 174static int posix_clock_realtime_adj(const clockid_t which_clock,
195{ 175 struct timex *t)
196 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
197 return 0;
198}
199
200static int no_timer_create(struct k_itimer *new_timer)
201{
202 return -EOPNOTSUPP;
203}
204
205static int no_nsleep(const clockid_t which_clock, int flags,
206 struct timespec *tsave, struct timespec __user *rmtp)
207{ 176{
208 return -EOPNOTSUPP; 177 return do_adjtimex(t);
209}
210
211/*
212 * Return nonzero if we know a priori this clockid_t value is bogus.
213 */
214static inline int invalid_clockid(const clockid_t which_clock)
215{
216 if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */
217 return 0;
218 if ((unsigned) which_clock >= MAX_CLOCKS)
219 return 1;
220 if (posix_clocks[which_clock].clock_getres != NULL)
221 return 0;
222 if (posix_clocks[which_clock].res != 0)
223 return 0;
224 return 1;
225} 178}
226 179
227/* 180/*
@@ -234,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
234} 187}
235 188
236/* 189/*
237 * Get monotonic time for posix timers 190 * Get monotonic-raw time for posix timers
238 */ 191 */
239static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) 192static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
240{ 193{
@@ -261,46 +214,70 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp
261 *tp = ktime_to_timespec(KTIME_LOW_RES); 214 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0; 215 return 0;
263} 216}
217
218static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
219{
220 get_monotonic_boottime(tp);
221 return 0;
222}
223
224
264/* 225/*
265 * Initialize everything, well, just everything in Posix clocks/timers ;) 226 * Initialize everything, well, just everything in Posix clocks/timers ;)
266 */ 227 */
267static __init int init_posix_timers(void) 228static __init int init_posix_timers(void)
268{ 229{
269 struct k_clock clock_realtime = { 230 struct k_clock clock_realtime = {
270 .clock_getres = hrtimer_get_res, 231 .clock_getres = hrtimer_get_res,
232 .clock_get = posix_clock_realtime_get,
233 .clock_set = posix_clock_realtime_set,
234 .clock_adj = posix_clock_realtime_adj,
235 .nsleep = common_nsleep,
236 .nsleep_restart = hrtimer_nanosleep_restart,
237 .timer_create = common_timer_create,
238 .timer_set = common_timer_set,
239 .timer_get = common_timer_get,
240 .timer_del = common_timer_del,
271 }; 241 };
272 struct k_clock clock_monotonic = { 242 struct k_clock clock_monotonic = {
273 .clock_getres = hrtimer_get_res, 243 .clock_getres = hrtimer_get_res,
274 .clock_get = posix_ktime_get_ts, 244 .clock_get = posix_ktime_get_ts,
275 .clock_set = do_posix_clock_nosettime, 245 .nsleep = common_nsleep,
246 .nsleep_restart = hrtimer_nanosleep_restart,
247 .timer_create = common_timer_create,
248 .timer_set = common_timer_set,
249 .timer_get = common_timer_get,
250 .timer_del = common_timer_del,
276 }; 251 };
277 struct k_clock clock_monotonic_raw = { 252 struct k_clock clock_monotonic_raw = {
278 .clock_getres = hrtimer_get_res, 253 .clock_getres = hrtimer_get_res,
279 .clock_get = posix_get_monotonic_raw, 254 .clock_get = posix_get_monotonic_raw,
280 .clock_set = do_posix_clock_nosettime,
281 .timer_create = no_timer_create,
282 .nsleep = no_nsleep,
283 }; 255 };
284 struct k_clock clock_realtime_coarse = { 256 struct k_clock clock_realtime_coarse = {
285 .clock_getres = posix_get_coarse_res, 257 .clock_getres = posix_get_coarse_res,
286 .clock_get = posix_get_realtime_coarse, 258 .clock_get = posix_get_realtime_coarse,
287 .clock_set = do_posix_clock_nosettime,
288 .timer_create = no_timer_create,
289 .nsleep = no_nsleep,
290 }; 259 };
291 struct k_clock clock_monotonic_coarse = { 260 struct k_clock clock_monotonic_coarse = {
292 .clock_getres = posix_get_coarse_res, 261 .clock_getres = posix_get_coarse_res,
293 .clock_get = posix_get_monotonic_coarse, 262 .clock_get = posix_get_monotonic_coarse,
294 .clock_set = do_posix_clock_nosettime, 263 };
295 .timer_create = no_timer_create, 264 struct k_clock clock_boottime = {
296 .nsleep = no_nsleep, 265 .clock_getres = hrtimer_get_res,
266 .clock_get = posix_get_boottime,
267 .nsleep = common_nsleep,
268 .nsleep_restart = hrtimer_nanosleep_restart,
269 .timer_create = common_timer_create,
270 .timer_set = common_timer_set,
271 .timer_get = common_timer_get,
272 .timer_del = common_timer_del,
297 }; 273 };
298 274
299 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 275 posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
300 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 276 posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
301 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); 277 posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
302 register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); 278 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
303 register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); 279 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
280 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
304 281
305 posix_timers_cache = kmem_cache_create("posix_timers_cache", 282 posix_timers_cache = kmem_cache_create("posix_timers_cache",
306 sizeof (struct k_itimer), 0, SLAB_PANIC, 283 sizeof (struct k_itimer), 0, SLAB_PANIC,
@@ -336,7 +313,7 @@ static void schedule_next_timer(struct k_itimer *timr)
336 * restarted (i.e. we have flagged this in the sys_private entry of the 313 * restarted (i.e. we have flagged this in the sys_private entry of the
337 * info block). 314 * info block).
338 * 315 *
339 * To protect aginst the timer going away while the interrupt is queued, 316 * To protect against the timer going away while the interrupt is queued,
340 * we require that the it_requeue_pending flag be set. 317 * we require that the it_requeue_pending flag be set.
341 */ 318 */
342void do_schedule_next_timer(struct siginfo *info) 319void do_schedule_next_timer(struct siginfo *info)
@@ -476,17 +453,29 @@ static struct pid *good_sigevent(sigevent_t * event)
476 return task_pid(rtn); 453 return task_pid(rtn);
477} 454}
478 455
479void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) 456void posix_timers_register_clock(const clockid_t clock_id,
457 struct k_clock *new_clock)
480{ 458{
481 if ((unsigned) clock_id >= MAX_CLOCKS) { 459 if ((unsigned) clock_id >= MAX_CLOCKS) {
482 printk("POSIX clock register failed for clock_id %d\n", 460 printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
461 clock_id);
462 return;
463 }
464
465 if (!new_clock->clock_get) {
466 printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
467 clock_id);
468 return;
469 }
470 if (!new_clock->clock_getres) {
471 printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
483 clock_id); 472 clock_id);
484 return; 473 return;
485 } 474 }
486 475
487 posix_clocks[clock_id] = *new_clock; 476 posix_clocks[clock_id] = *new_clock;
488} 477}
489EXPORT_SYMBOL_GPL(register_posix_clock); 478EXPORT_SYMBOL_GPL(posix_timers_register_clock);
490 479
491static struct k_itimer * alloc_posix_timer(void) 480static struct k_itimer * alloc_posix_timer(void)
492{ 481{
@@ -502,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void)
502 return tmr; 491 return tmr;
503} 492}
504 493
494static void k_itimer_rcu_free(struct rcu_head *head)
495{
496 struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
497
498 kmem_cache_free(posix_timers_cache, tmr);
499}
500
505#define IT_ID_SET 1 501#define IT_ID_SET 1
506#define IT_ID_NOT_SET 0 502#define IT_ID_NOT_SET 0
507static void release_posix_timer(struct k_itimer *tmr, int it_id_set) 503static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
@@ -514,7 +510,24 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
514 } 510 }
515 put_pid(tmr->it_pid); 511 put_pid(tmr->it_pid);
516 sigqueue_free(tmr->sigq); 512 sigqueue_free(tmr->sigq);
517 kmem_cache_free(posix_timers_cache, tmr); 513 call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
514}
515
516static struct k_clock *clockid_to_kclock(const clockid_t id)
517{
518 if (id < 0)
519 return (id & CLOCKFD_MASK) == CLOCKFD ?
520 &clock_posix_dynamic : &clock_posix_cpu;
521
522 if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
523 return NULL;
524 return &posix_clocks[id];
525}
526
527static int common_timer_create(struct k_itimer *new_timer)
528{
529 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
530 return 0;
518} 531}
519 532
520/* Create a POSIX.1b interval timer. */ 533/* Create a POSIX.1b interval timer. */
@@ -523,13 +536,16 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
523 struct sigevent __user *, timer_event_spec, 536 struct sigevent __user *, timer_event_spec,
524 timer_t __user *, created_timer_id) 537 timer_t __user *, created_timer_id)
525{ 538{
539 struct k_clock *kc = clockid_to_kclock(which_clock);
526 struct k_itimer *new_timer; 540 struct k_itimer *new_timer;
527 int error, new_timer_id; 541 int error, new_timer_id;
528 sigevent_t event; 542 sigevent_t event;
529 int it_id_set = IT_ID_NOT_SET; 543 int it_id_set = IT_ID_NOT_SET;
530 544
531 if (invalid_clockid(which_clock)) 545 if (!kc)
532 return -EINVAL; 546 return -EINVAL;
547 if (!kc->timer_create)
548 return -EOPNOTSUPP;
533 549
534 new_timer = alloc_posix_timer(); 550 new_timer = alloc_posix_timer();
535 if (unlikely(!new_timer)) 551 if (unlikely(!new_timer))
@@ -591,7 +607,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
591 goto out; 607 goto out;
592 } 608 }
593 609
594 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); 610 error = kc->timer_create(new_timer);
595 if (error) 611 if (error)
596 goto out; 612 goto out;
597 613
@@ -601,7 +617,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
601 spin_unlock_irq(&current->sighand->siglock); 617 spin_unlock_irq(&current->sighand->siglock);
602 618
603 return 0; 619 return 0;
604 /* 620 /*
605 * In the case of the timer belonging to another task, after 621 * In the case of the timer belonging to another task, after
606 * the task is unlocked, the timer is owned by the other task 622 * the task is unlocked, the timer is owned by the other task
607 * and may cease to exist at any time. Don't use or modify 623 * and may cease to exist at any time. Don't use or modify
@@ -619,25 +635,21 @@ out:
619 * the find to the timer lock. To avoid a dead lock, the timer id MUST 635 * the find to the timer lock. To avoid a dead lock, the timer id MUST
620 * be release with out holding the timer lock. 636 * be release with out holding the timer lock.
621 */ 637 */
622static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) 638static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
623{ 639{
624 struct k_itimer *timr; 640 struct k_itimer *timr;
625 /* 641
626 * Watch out here. We do a irqsave on the idr_lock and pass the 642 rcu_read_lock();
627 * flags part over to the timer lock. Must not let interrupts in
628 * while we are moving the lock.
629 */
630 spin_lock_irqsave(&idr_lock, *flags);
631 timr = idr_find(&posix_timers_id, (int)timer_id); 643 timr = idr_find(&posix_timers_id, (int)timer_id);
632 if (timr) { 644 if (timr) {
633 spin_lock(&timr->it_lock); 645 spin_lock_irqsave(&timr->it_lock, *flags);
634 if (timr->it_signal == current->signal) { 646 if (timr->it_signal == current->signal) {
635 spin_unlock(&idr_lock); 647 rcu_read_unlock();
636 return timr; 648 return timr;
637 } 649 }
638 spin_unlock(&timr->it_lock); 650 spin_unlock_irqrestore(&timr->it_lock, *flags);
639 } 651 }
640 spin_unlock_irqrestore(&idr_lock, *flags); 652 rcu_read_unlock();
641 653
642 return NULL; 654 return NULL;
643} 655}
@@ -703,22 +715,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
703SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, 715SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
704 struct itimerspec __user *, setting) 716 struct itimerspec __user *, setting)
705{ 717{
706 struct k_itimer *timr;
707 struct itimerspec cur_setting; 718 struct itimerspec cur_setting;
719 struct k_itimer *timr;
720 struct k_clock *kc;
708 unsigned long flags; 721 unsigned long flags;
722 int ret = 0;
709 723
710 timr = lock_timer(timer_id, &flags); 724 timr = lock_timer(timer_id, &flags);
711 if (!timr) 725 if (!timr)
712 return -EINVAL; 726 return -EINVAL;
713 727
714 CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); 728 kc = clockid_to_kclock(timr->it_clock);
729 if (WARN_ON_ONCE(!kc || !kc->timer_get))
730 ret = -EINVAL;
731 else
732 kc->timer_get(timr, &cur_setting);
715 733
716 unlock_timer(timr, flags); 734 unlock_timer(timr, flags);
717 735
718 if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) 736 if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
719 return -EFAULT; 737 return -EFAULT;
720 738
721 return 0; 739 return ret;
722} 740}
723 741
724/* 742/*
@@ -807,6 +825,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
807 int error = 0; 825 int error = 0;
808 unsigned long flag; 826 unsigned long flag;
809 struct itimerspec *rtn = old_setting ? &old_spec : NULL; 827 struct itimerspec *rtn = old_setting ? &old_spec : NULL;
828 struct k_clock *kc;
810 829
811 if (!new_setting) 830 if (!new_setting)
812 return -EINVAL; 831 return -EINVAL;
@@ -822,8 +841,11 @@ retry:
822 if (!timr) 841 if (!timr)
823 return -EINVAL; 842 return -EINVAL;
824 843
825 error = CLOCK_DISPATCH(timr->it_clock, timer_set, 844 kc = clockid_to_kclock(timr->it_clock);
826 (timr, flags, &new_spec, rtn)); 845 if (WARN_ON_ONCE(!kc || !kc->timer_set))
846 error = -EINVAL;
847 else
848 error = kc->timer_set(timr, flags, &new_spec, rtn);
827 849
828 unlock_timer(timr, flag); 850 unlock_timer(timr, flag);
829 if (error == TIMER_RETRY) { 851 if (error == TIMER_RETRY) {
@@ -838,7 +860,7 @@ retry:
838 return error; 860 return error;
839} 861}
840 862
841static inline int common_timer_del(struct k_itimer *timer) 863static int common_timer_del(struct k_itimer *timer)
842{ 864{
843 timer->it.real.interval.tv64 = 0; 865 timer->it.real.interval.tv64 = 0;
844 866
@@ -849,7 +871,11 @@ static inline int common_timer_del(struct k_itimer *timer)
849 871
850static inline int timer_delete_hook(struct k_itimer *timer) 872static inline int timer_delete_hook(struct k_itimer *timer)
851{ 873{
852 return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); 874 struct k_clock *kc = clockid_to_kclock(timer->it_clock);
875
876 if (WARN_ON_ONCE(!kc || !kc->timer_del))
877 return -EINVAL;
878 return kc->timer_del(timer);
853} 879}
854 880
855/* Delete a POSIX.1b interval timer. */ 881/* Delete a POSIX.1b interval timer. */
@@ -921,69 +947,76 @@ void exit_itimers(struct signal_struct *sig)
921 } 947 }
922} 948}
923 949
924/* Not available / possible... functions */
925int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
926{
927 return -EINVAL;
928}
929EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
930
931int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
932 struct timespec *t, struct timespec __user *r)
933{
934#ifndef ENOTSUP
935 return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */
936#else /* parisc does define it separately. */
937 return -ENOTSUP;
938#endif
939}
940EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
941
942SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, 950SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
943 const struct timespec __user *, tp) 951 const struct timespec __user *, tp)
944{ 952{
953 struct k_clock *kc = clockid_to_kclock(which_clock);
945 struct timespec new_tp; 954 struct timespec new_tp;
946 955
947 if (invalid_clockid(which_clock)) 956 if (!kc || !kc->clock_set)
948 return -EINVAL; 957 return -EINVAL;
958
949 if (copy_from_user(&new_tp, tp, sizeof (*tp))) 959 if (copy_from_user(&new_tp, tp, sizeof (*tp)))
950 return -EFAULT; 960 return -EFAULT;
951 961
952 return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); 962 return kc->clock_set(which_clock, &new_tp);
953} 963}
954 964
955SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 965SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
956 struct timespec __user *,tp) 966 struct timespec __user *,tp)
957{ 967{
968 struct k_clock *kc = clockid_to_kclock(which_clock);
958 struct timespec kernel_tp; 969 struct timespec kernel_tp;
959 int error; 970 int error;
960 971
961 if (invalid_clockid(which_clock)) 972 if (!kc)
962 return -EINVAL; 973 return -EINVAL;
963 error = CLOCK_DISPATCH(which_clock, clock_get, 974
964 (which_clock, &kernel_tp)); 975 error = kc->clock_get(which_clock, &kernel_tp);
976
965 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) 977 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
966 error = -EFAULT; 978 error = -EFAULT;
967 979
968 return error; 980 return error;
981}
982
983SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
984 struct timex __user *, utx)
985{
986 struct k_clock *kc = clockid_to_kclock(which_clock);
987 struct timex ktx;
988 int err;
989
990 if (!kc)
991 return -EINVAL;
992 if (!kc->clock_adj)
993 return -EOPNOTSUPP;
969 994
995 if (copy_from_user(&ktx, utx, sizeof(ktx)))
996 return -EFAULT;
997
998 err = kc->clock_adj(which_clock, &ktx);
999
1000 if (!err && copy_to_user(utx, &ktx, sizeof(ktx)))
1001 return -EFAULT;
1002
1003 return err;
970} 1004}
971 1005
972SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, 1006SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
973 struct timespec __user *, tp) 1007 struct timespec __user *, tp)
974{ 1008{
1009 struct k_clock *kc = clockid_to_kclock(which_clock);
975 struct timespec rtn_tp; 1010 struct timespec rtn_tp;
976 int error; 1011 int error;
977 1012
978 if (invalid_clockid(which_clock)) 1013 if (!kc)
979 return -EINVAL; 1014 return -EINVAL;
980 1015
981 error = CLOCK_DISPATCH(which_clock, clock_getres, 1016 error = kc->clock_getres(which_clock, &rtn_tp);
982 (which_clock, &rtn_tp));
983 1017
984 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { 1018 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
985 error = -EFAULT; 1019 error = -EFAULT;
986 }
987 1020
988 return error; 1021 return error;
989} 1022}
@@ -1003,10 +1036,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1003 const struct timespec __user *, rqtp, 1036 const struct timespec __user *, rqtp,
1004 struct timespec __user *, rmtp) 1037 struct timespec __user *, rmtp)
1005{ 1038{
1039 struct k_clock *kc = clockid_to_kclock(which_clock);
1006 struct timespec t; 1040 struct timespec t;
1007 1041
1008 if (invalid_clockid(which_clock)) 1042 if (!kc)
1009 return -EINVAL; 1043 return -EINVAL;
1044 if (!kc->nsleep)
1045 return -ENANOSLEEP_NOTSUP;
1010 1046
1011 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 1047 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
1012 return -EFAULT; 1048 return -EFAULT;
@@ -1014,27 +1050,20 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1014 if (!timespec_valid(&t)) 1050 if (!timespec_valid(&t))
1015 return -EINVAL; 1051 return -EINVAL;
1016 1052
1017 return CLOCK_DISPATCH(which_clock, nsleep, 1053 return kc->nsleep(which_clock, flags, &t, rmtp);
1018 (which_clock, flags, &t, rmtp));
1019}
1020
1021/*
1022 * nanosleep_restart for monotonic and realtime clocks
1023 */
1024static int common_nsleep_restart(struct restart_block *restart_block)
1025{
1026 return hrtimer_nanosleep_restart(restart_block);
1027} 1054}
1028 1055
1029/* 1056/*
1030 * This will restart clock_nanosleep. This is required only by 1057 * This will restart clock_nanosleep. This is required only by
1031 * compat_clock_nanosleep_restart for now. 1058 * compat_clock_nanosleep_restart for now.
1032 */ 1059 */
1033long 1060long clock_nanosleep_restart(struct restart_block *restart_block)
1034clock_nanosleep_restart(struct restart_block *restart_block)
1035{ 1061{
1036 clockid_t which_clock = restart_block->arg0; 1062 clockid_t which_clock = restart_block->nanosleep.clockid;
1063 struct k_clock *kc = clockid_to_kclock(which_clock);
1064
1065 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
1066 return -EINVAL;
1037 1067
1038 return CLOCK_DISPATCH(which_clock, nsleep_restart, 1068 return kc->nsleep_restart(restart_block);
1039 (restart_block));
1040} 1069}
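
Note on the posix-timers hunks above: the CLOCK_DISPATCH() macro is replaced by per-clock method tables. Each clock supplies a struct k_clock, posix_timers_register_clock() refuses entries that lack clock_get() or clock_getres(), and the syscalls (including the new clock_adjtime()) resolve the table via clockid_to_kclock(), returning -EINVAL for unknown clocks, -EOPNOTSUPP when timer_create is absent and -ENANOSLEEP_NOTSUP when nsleep is absent. k_itimer objects are now freed through call_rcu(), which is what lets lock_timer() walk the idr under rcu_read_lock() instead of idr_lock. A minimal sketch of what a new clock would register under this scheme (CLOCK_EXAMPLE and posix_get_example() are placeholders, not part of the patch):

#include <linux/hrtimer.h>
#include <linux/posix-timers.h>
#include <linux/time.h>

/*
 * Sketch only, not part of the patch: CLOCK_EXAMPLE and posix_get_example()
 * are placeholders.  The k_clock fields, posix_timers_register_clock() and
 * the error codes are the ones introduced in the diff above.
 */
static int posix_get_example(const clockid_t which_clock, struct timespec *tp)
{
	ktime_get_ts(tp);	/* assumption: reuse the monotonic time source */
	return 0;
}

static __init int register_example_clock(void)
{
	struct k_clock clock_example = {
		.clock_getres	= hrtimer_get_res,	/* mandatory, or registration is refused */
		.clock_get	= posix_get_example,	/* mandatory as well */
		/* no .clock_set:    clock_settime()   returns -EINVAL            */
		/* no .timer_create: timer_create()    returns -EOPNOTSUPP        */
		/* no .nsleep:       clock_nanosleep() returns -ENANOSLEEP_NOTSUP */
	};

	/* would have to be called from init_posix_timers() or an initcall */
	posix_timers_register_clock(CLOCK_EXAMPLE, &clock_example);
	return 0;
}
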
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ca6066a6952e..87f4d24b55b0 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,128 +1,12 @@
1config PM
2 bool "Power Management support"
3 depends on !IA64_HP_SIM
4 ---help---
5 "Power Management" means that parts of your computer are shut
6 off or put into a power conserving "sleep" mode if they are not
7 being used. There are two competing standards for doing this: APM
8 and ACPI. If you want to use either one, say Y here and then also
9 to the requisite support below.
10
11 Power Management is most important for battery powered laptop
12 computers; if you have a laptop, check out the Linux Laptop home
13 page on the WWW at <http://www.linux-on-laptops.com/> or
14 Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
15 and the Battery Powered Linux mini-HOWTO, available from
16 <http://www.tldp.org/docs.html#howto>.
17
18 Note that, even if you say N here, Linux on the x86 architecture
19 will issue the hlt instruction if nothing is to be done, thereby
20 sending the processor to sleep and saving power.
21
22config PM_DEBUG
23 bool "Power Management Debug Support"
24 depends on PM
25 ---help---
26 This option enables various debugging support in the Power Management
27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support.
29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
39config PM_VERBOSE
40 bool "Verbose Power Management debugging"
41 depends on PM_DEBUG
42 default n
43 ---help---
44 This option enables verbose messages from the Power Management code.
45
46config CAN_PM_TRACE
47 def_bool y
48 depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL
49
50config PM_TRACE
51 bool
52 help
53 This enables code to save the last PM event point across
54 reboot. The architecture needs to support this, x86 for
55 example does by saving things in the RTC, see below.
56
57 The architecture specific code must provide the extern
58 functions from <linux/resume-trace.h> as well as the
59 <asm/resume-trace.h> header with a TRACE_RESUME() macro.
60
61 The way the information is presented is architecture-
62 dependent, x86 will print the information during a
63 late_initcall.
64
65config PM_TRACE_RTC
66 bool "Suspend/resume event tracing"
67 depends on CAN_PM_TRACE
68 depends on X86
69 select PM_TRACE
70 default n
71 ---help---
72 This enables some cheesy code to save the last PM event point in the
73 RTC across reboots, so that you can debug a machine that just hangs
74 during suspend (or more commonly, during resume).
75
76 To use this debugging feature you should attempt to suspend the
77 machine, reboot it and then run
78
79 dmesg -s 1000000 | grep 'hash matches'
80
81 CAUTION: this option will cause your machine's real-time clock to be
82 set to an invalid time after a resume.
83
84config PM_SLEEP_SMP
85 bool
86 depends on SMP
87 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
88 depends on PM_SLEEP
89 select HOTPLUG_CPU
90 default y
91
92config PM_SLEEP
93 bool
94 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
95 default y
96
97config PM_SLEEP_ADVANCED_DEBUG
98 bool
99 depends on PM_ADVANCED_DEBUG
100 default n
101
102config SUSPEND_NVS
103 bool
104
105config SUSPEND 1config SUSPEND
106 bool "Suspend to RAM and standby" 2 bool "Suspend to RAM and standby"
107 depends on PM && ARCH_SUSPEND_POSSIBLE 3 depends on ARCH_SUSPEND_POSSIBLE
108 select SUSPEND_NVS if HAS_IOMEM
109 default y 4 default y
110 ---help--- 5 ---help---
111 Allow the system to enter sleep states in which main memory is 6 Allow the system to enter sleep states in which main memory is
112 powered and thus its contents are preserved, such as the 7 powered and thus its contents are preserved, such as the
113 suspend-to-RAM state (e.g. the ACPI S3 state). 8 suspend-to-RAM state (e.g. the ACPI S3 state).
114 9
115config PM_TEST_SUSPEND
116 bool "Test suspend/resume and wakealarm during bootup"
117 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
118 ---help---
119 This option will let you suspend your machine during bootup, and
120 make it wake up a few seconds later using an RTC wakeup alarm.
121 Enable this with a kernel parameter like "test_suspend=mem".
122
123 You probably want to have your system's RTC driver statically
124 linked, ensuring that it's available when this test runs.
125
126config SUSPEND_FREEZER 10config SUSPEND_FREEZER
127 bool "Enable freezer for suspend to RAM/standby" \ 11 bool "Enable freezer for suspend to RAM/standby" \
128 if ARCH_WANTS_FREEZER_CONTROL || BROKEN 12 if ARCH_WANTS_FREEZER_CONTROL || BROKEN
@@ -134,10 +18,15 @@ config SUSPEND_FREEZER
134 18
135 Turning OFF this setting is NOT recommended! If in doubt, say Y. 19 Turning OFF this setting is NOT recommended! If in doubt, say Y.
136 20
21config HIBERNATE_CALLBACKS
22 bool
23
137config HIBERNATION 24config HIBERNATION
138 bool "Hibernation (aka 'suspend to disk')" 25 bool "Hibernation (aka 'suspend to disk')"
139 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 26 depends on SWAP && ARCH_HIBERNATION_POSSIBLE
140 select SUSPEND_NVS if HAS_IOMEM 27 select HIBERNATE_CALLBACKS
28 select LZO_COMPRESS
29 select LZO_DECOMPRESS
141 ---help--- 30 ---help---
142 Enable the suspend to disk (STD) functionality, which is usually 31 Enable the suspend to disk (STD) functionality, which is usually
143 called "hibernation" in user interfaces. STD checkpoints the 32 called "hibernation" in user interfaces. STD checkpoints the
@@ -198,6 +87,100 @@ config PM_STD_PARTITION
198 suspended image to. It will simply pick the first available swap 87 suspended image to. It will simply pick the first available swap
199 device. 88 device.
200 89
90config PM_SLEEP
91 def_bool y
92 depends on SUSPEND || HIBERNATE_CALLBACKS
93
94config PM_SLEEP_SMP
95 def_bool y
96 depends on SMP
97 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
98 depends on PM_SLEEP
99 select HOTPLUG
100 select HOTPLUG_CPU
101
102config PM_RUNTIME
103 bool "Run-time PM core functionality"
104 depends on !IA64_HP_SIM
105 ---help---
106 Enable functionality allowing I/O devices to be put into energy-saving
107 (low power) states at run time (or autosuspended) after a specified
108 period of inactivity and woken up in response to a hardware-generated
109 wake-up event or a driver's request.
110
111 Hardware support is generally required for this functionality to work
112 and the bus type drivers of the buses the devices are on are
113 responsible for the actual handling of the autosuspend requests and
114 wake-up events.
115
116config PM
117 def_bool y
118 depends on PM_SLEEP || PM_RUNTIME
119
120config PM_DEBUG
121 bool "Power Management Debug Support"
122 depends on PM
123 ---help---
124 This option enables various debugging support in the Power Management
125 code. This is helpful when debugging and reporting PM bugs, like
126 suspend support.
127
128config PM_ADVANCED_DEBUG
129 bool "Extra PM attributes in sysfs for low-level debugging/testing"
130 depends on PM_DEBUG
131 ---help---
132 Add extra sysfs attributes allowing one to access some Power Management
133 fields of device objects from user space. If you are not a kernel
134 developer interested in debugging/testing Power Management, say "no".
135
136config PM_TEST_SUSPEND
137 bool "Test suspend/resume and wakealarm during bootup"
138 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
139 ---help---
140 This option will let you suspend your machine during bootup, and
141 make it wake up a few seconds later using an RTC wakeup alarm.
142 Enable this with a kernel parameter like "test_suspend=mem".
143
144 You probably want to have your system's RTC driver statically
145 linked, ensuring that it's available when this test runs.
146
147config CAN_PM_TRACE
148 def_bool y
149 depends on PM_DEBUG && PM_SLEEP
150
151config PM_TRACE
152 bool
153 help
154 This enables code to save the last PM event point across
155 reboot. The architecture needs to support this, x86 for
156 example does by saving things in the RTC, see below.
157
158 The architecture specific code must provide the extern
159 functions from <linux/resume-trace.h> as well as the
160 <asm/resume-trace.h> header with a TRACE_RESUME() macro.
161
162 The way the information is presented is architecture-
163 dependent, x86 will print the information during a
164 late_initcall.
165
166config PM_TRACE_RTC
167 bool "Suspend/resume event tracing"
168 depends on CAN_PM_TRACE
169 depends on X86
170 select PM_TRACE
171 ---help---
172 This enables some cheesy code to save the last PM event point in the
173 RTC across reboots, so that you can debug a machine that just hangs
174 during suspend (or more commonly, during resume).
175
176 To use this debugging feature you should attempt to suspend the
177 machine, reboot it and then run
178
179 dmesg -s 1000000 | grep 'hash matches'
180
181 CAUTION: this option will cause your machine's real-time clock to be
182 set to an invalid time after a resume.
183
201config APM_EMULATION 184config APM_EMULATION
202 tristate "Advanced Power Management Emulation" 185 tristate "Advanced Power Management Emulation"
203 depends on PM && SYS_SUPPORTS_APM_EMULATION 186 depends on PM && SYS_SUPPORTS_APM_EMULATION
@@ -224,21 +207,23 @@ config APM_EMULATION
224 anything, try disabling/enabling this option (or disabling/enabling 207 anything, try disabling/enabling this option (or disabling/enabling
225 APM in your BIOS). 208 APM in your BIOS).
226 209
227config PM_RUNTIME 210config ARCH_HAS_OPP
228 bool "Run-time PM core functionality" 211 bool
229 depends on PM 212
213config PM_OPP
214 bool "Operating Performance Point (OPP) Layer library"
215 depends on ARCH_HAS_OPP
230 ---help--- 216 ---help---
231 Enable functionality allowing I/O devices to be put into energy-saving 217 SOCs have a standard set of tuples consisting of frequency and
232 (low power) states at run time (or autosuspended) after a specified 218 voltage pairs that the device will support per voltage domain. This
233 period of inactivity and woken up in response to a hardware-generated 219 is called Operating Performance Point or OPP. The actual definitions
234 wake-up event or a driver's request. 220 of OPP varies over silicon within the same family of devices.
235 221
236 Hardware support is generally required for this functionality to work 222 OPP layer organizes the data internally using device pointers
237 and the bus type drivers of the buses the devices are on are 223 representing individual voltage domains and provides SOC
238 responsible for the actual handling of the autosuspend requests and 224 implementations a ready to use framework to manage OPPs.
239 wake-up events. 225 For more information, read <file:Documentation/power/opp.txt>
240 226
241config PM_OPS 227config PM_RUNTIME_CLK
242 bool 228 def_bool y
243 depends on PM_SLEEP || PM_RUNTIME 229 depends on PM_RUNTIME && HAVE_CLK
244 default y
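
Note on the kernel/power/Kconfig rework above: the option hierarchy is inverted. SUSPEND and HIBERNATION no longer depend on a user-visible PM prompt; instead PM_SLEEP follows from SUSPEND || HIBERNATE_CALLBACKS, and PM itself becomes a hidden def_bool that is enabled whenever PM_SLEEP or PM_RUNTIME is set (PM_OPS disappears). Driver code keyed on these symbols needs no change; a minimal sketch, with a hypothetical foo device, of the usual guards that keep working under the new layout:

#include <linux/device.h>
#include <linux/pm.h>

#ifdef CONFIG_PM_SLEEP
static int foo_suspend(struct device *dev)
{
	/* quiesce the (hypothetical) foo device for system sleep */
	return 0;
}
#endif

#ifdef CONFIG_PM_RUNTIME
static int foo_runtime_suspend(struct device *dev)
{
	/* autosuspend path, only compiled when run-time PM is enabled */
	return 0;
}
#endif

/*
 * Code guarded by plain CONFIG_PM still builds whenever either block above
 * is enabled, because PM is now selected implicitly through
 * PM_SLEEP || PM_RUNTIME rather than being a separate user prompt.
 */
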
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f9063c6b185d..c5ebc6a90643 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,5 @@
1 1
2ifeq ($(CONFIG_PM_DEBUG),y) 2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
3EXTRA_CFLAGS += -DDEBUG
4endif
5 3
6obj-$(CONFIG_PM) += main.o 4obj-$(CONFIG_PM) += main.o
7obj-$(CONFIG_PM_SLEEP) += console.o 5obj-$(CONFIG_PM_SLEEP) += console.o
@@ -10,6 +8,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ 9obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12 block_io.o 10 block_io.o
13obj-$(CONFIG_SUSPEND_NVS) += nvs.o
14 11
15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 12obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 83bbc7c02df9..d09dd10c5a5e 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
28static int submit(int rw, struct block_device *bdev, sector_t sector, 28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain) 29 struct page *page, struct bio **bio_chain)
30{ 30{
31 const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; 31 const int bio_rw = rw | REQ_SYNC;
32 struct bio *bio; 32 struct bio *bio;
33 33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8dc31e02ae12..8f7b1db1ece1 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -23,12 +23,13 @@
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <linux/syscore_ops.h>
26#include <scsi/scsi_scan.h> 27#include <scsi/scsi_scan.h>
27#include <asm/suspend.h>
28 28
29#include "power.h" 29#include "power.h"
30 30
31 31
32static int nocompress = 0;
32static int noresume = 0; 33static int noresume = 0;
33static char resume_file[256] = CONFIG_PM_STD_PARTITION; 34static char resume_file[256] = CONFIG_PM_STD_PARTITION;
34dev_t swsusp_resume_device; 35dev_t swsusp_resume_device;
@@ -50,18 +51,17 @@ enum {
50 51
51static int hibernation_mode = HIBERNATION_SHUTDOWN; 52static int hibernation_mode = HIBERNATION_SHUTDOWN;
52 53
53static struct platform_hibernation_ops *hibernation_ops; 54static const struct platform_hibernation_ops *hibernation_ops;
54 55
55/** 56/**
56 * hibernation_set_ops - set the global hibernate operations 57 * hibernation_set_ops - Set the global hibernate operations.
57 * @ops: the hibernation operations to use in subsequent hibernation transitions 58 * @ops: Hibernation operations to use in subsequent hibernation transitions.
58 */ 59 */
59 60void hibernation_set_ops(const struct platform_hibernation_ops *ops)
60void hibernation_set_ops(struct platform_hibernation_ops *ops)
61{ 61{
62 if (ops && !(ops->begin && ops->end && ops->pre_snapshot 62 if (ops && !(ops->begin && ops->end && ops->pre_snapshot
63 && ops->prepare && ops->finish && ops->enter && ops->pre_restore 63 && ops->prepare && ops->finish && ops->enter && ops->pre_restore
64 && ops->restore_cleanup)) { 64 && ops->restore_cleanup && ops->leave)) {
65 WARN_ON(1); 65 WARN_ON(1);
66 return; 66 return;
67 } 67 }
@@ -113,10 +113,9 @@ static int hibernation_test(int level) { return 0; }
113#endif /* !CONFIG_PM_DEBUG */ 113#endif /* !CONFIG_PM_DEBUG */
114 114
115/** 115/**
116 * platform_begin - tell the platform driver that we're starting 116 * platform_begin - Call platform to start hibernation.
117 * hibernation 117 * @platform_mode: Whether or not to use the platform driver.
118 */ 118 */
119
120static int platform_begin(int platform_mode) 119static int platform_begin(int platform_mode)
121{ 120{
122 return (platform_mode && hibernation_ops) ? 121 return (platform_mode && hibernation_ops) ?
@@ -124,10 +123,9 @@ static int platform_begin(int platform_mode)
124} 123}
125 124
126/** 125/**
127 * platform_end - tell the platform driver that we've entered the 126 * platform_end - Call platform to finish transition to the working state.
128 * working state 127 * @platform_mode: Whether or not to use the platform driver.
129 */ 128 */
130
131static void platform_end(int platform_mode) 129static void platform_end(int platform_mode)
132{ 130{
133 if (platform_mode && hibernation_ops) 131 if (platform_mode && hibernation_ops)
@@ -135,8 +133,11 @@ static void platform_end(int platform_mode)
135} 133}
136 134
137/** 135/**
138 * platform_pre_snapshot - prepare the machine for hibernation using the 136 * platform_pre_snapshot - Call platform to prepare the machine for hibernation.
139 * platform driver if so configured and return an error code if it fails 137 * @platform_mode: Whether or not to use the platform driver.
138 *
139 * Use the platform driver to prepare the system for creating a hibernate image,
140 * if so configured, and return an error code if that fails.
140 */ 141 */
141 142
142static int platform_pre_snapshot(int platform_mode) 143static int platform_pre_snapshot(int platform_mode)
@@ -146,10 +147,14 @@ static int platform_pre_snapshot(int platform_mode)
146} 147}
147 148
148/** 149/**
149 * platform_leave - prepare the machine for switching to the normal mode 150 * platform_leave - Call platform to prepare a transition to the working state.
150 * of operation using the platform driver (called with interrupts disabled) 151 * @platform_mode: Whether or not to use the platform driver.
152 *
153 * Use the platform driver prepare to prepare the machine for switching to the
154 * normal mode of operation.
155 *
156 * This routine is called on one CPU with interrupts disabled.
151 */ 157 */
152
153static void platform_leave(int platform_mode) 158static void platform_leave(int platform_mode)
154{ 159{
155 if (platform_mode && hibernation_ops) 160 if (platform_mode && hibernation_ops)
@@ -157,10 +162,14 @@ static void platform_leave(int platform_mode)
157} 162}
158 163
159/** 164/**
160 * platform_finish - switch the machine to the normal mode of operation 165 * platform_finish - Call platform to switch the system to the working state.
161 * using the platform driver (must be called after platform_prepare()) 166 * @platform_mode: Whether or not to use the platform driver.
167 *
168 * Use the platform driver to switch the machine to the normal mode of
169 * operation.
170 *
171 * This routine must be called after platform_prepare().
162 */ 172 */
163
164static void platform_finish(int platform_mode) 173static void platform_finish(int platform_mode)
165{ 174{
166 if (platform_mode && hibernation_ops) 175 if (platform_mode && hibernation_ops)
@@ -168,11 +177,15 @@ static void platform_finish(int platform_mode)
168} 177}
169 178
170/** 179/**
171 * platform_pre_restore - prepare the platform for the restoration from a 180 * platform_pre_restore - Prepare for hibernate image restoration.
172 * hibernation image. If the restore fails after this function has been 181 * @platform_mode: Whether or not to use the platform driver.
173 * called, platform_restore_cleanup() must be called. 182 *
183 * Use the platform driver to prepare the system for resume from a hibernation
184 * image.
185 *
186 * If the restore fails after this function has been called,
187 * platform_restore_cleanup() must be called.
174 */ 188 */
175
176static int platform_pre_restore(int platform_mode) 189static int platform_pre_restore(int platform_mode)
177{ 190{
178 return (platform_mode && hibernation_ops) ? 191 return (platform_mode && hibernation_ops) ?
@@ -180,12 +193,16 @@ static int platform_pre_restore(int platform_mode)
180} 193}
181 194
182/** 195/**
183 * platform_restore_cleanup - switch the platform to the normal mode of 196 * platform_restore_cleanup - Switch to the working state after failing restore.
184 * operation after a failing restore. If platform_pre_restore() has been 197 * @platform_mode: Whether or not to use the platform driver.
185 * called before the failing restore, this function must be called too, 198 *
186 * regardless of the result of platform_pre_restore(). 199 * Use the platform driver to switch the system to the normal mode of operation
200 * after a failing restore.
201 *
202 * If platform_pre_restore() has been called before the failing restore, this
203 * function must be called too, regardless of the result of
204 * platform_pre_restore().
187 */ 205 */
188
189static void platform_restore_cleanup(int platform_mode) 206static void platform_restore_cleanup(int platform_mode)
190{ 207{
191 if (platform_mode && hibernation_ops) 208 if (platform_mode && hibernation_ops)
@@ -193,10 +210,9 @@ static void platform_restore_cleanup(int platform_mode)
193} 210}
194 211
195/** 212/**
196 * platform_recover - recover the platform from a failure to suspend 213 * platform_recover - Recover from a failure to suspend devices.
197 * devices. 214 * @platform_mode: Whether or not to use the platform driver.
198 */ 215 */
199
200static void platform_recover(int platform_mode) 216static void platform_recover(int platform_mode)
201{ 217{
202 if (platform_mode && hibernation_ops && hibernation_ops->recover) 218 if (platform_mode && hibernation_ops && hibernation_ops->recover)
@@ -204,13 +220,12 @@ static void platform_recover(int platform_mode)
204} 220}
205 221
206/** 222/**
207 * swsusp_show_speed - print the time elapsed between two events. 223 * swsusp_show_speed - Print time elapsed between two events during hibernation.
208 * @start: Starting event. 224 * @start: Starting event.
209 * @stop: Final event. 225 * @stop: Final event.
210 * @nr_pages - number of pages processed between @start and @stop 226 * @nr_pages: Number of memory pages processed between @start and @stop.
211 * @msg - introductory message to print 227 * @msg: Additional diagnostic message to print.
212 */ 228 */
213
214void swsusp_show_speed(struct timeval *start, struct timeval *stop, 229void swsusp_show_speed(struct timeval *start, struct timeval *stop,
215 unsigned nr_pages, char *msg) 230 unsigned nr_pages, char *msg)
216{ 231{
@@ -233,25 +248,18 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
233} 248}
234 249
235/** 250/**
236 * create_image - freeze devices that need to be frozen with interrupts 251 * create_image - Create a hibernation image.
237 * off, create the hibernation image and thaw those devices. Control 252 * @platform_mode: Whether or not to use the platform driver.
238 * reappears in this routine after a restore. 253 *
254 * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image
255 * and execute the drivers' .thaw_noirq() callbacks.
256 *
257 * Control reappears in this routine after the subsequent restore.
239 */ 258 */
240
241static int create_image(int platform_mode) 259static int create_image(int platform_mode)
242{ 260{
243 int error; 261 int error;
244 262
245 error = arch_prepare_suspend();
246 if (error)
247 return error;
248
249 /* At this point, dpm_suspend_start() has been called, but *not*
250 * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
251 * Otherwise, drivers for some devices (e.g. interrupt controllers)
252 * become desynchronized with the actual state of the hardware
253 * at resume time, and evil weirdness ensues.
254 */
255 error = dpm_suspend_noirq(PMSG_FREEZE); 263 error = dpm_suspend_noirq(PMSG_FREEZE);
256 if (error) { 264 if (error) {
257 printk(KERN_ERR "PM: Some devices failed to power down, " 265 printk(KERN_ERR "PM: Some devices failed to power down, "
@@ -270,14 +278,14 @@ static int create_image(int platform_mode)
270 278
271 local_irq_disable(); 279 local_irq_disable();
272 280
273 error = sysdev_suspend(PMSG_FREEZE); 281 error = syscore_suspend();
274 if (error) { 282 if (error) {
275 printk(KERN_ERR "PM: Some system devices failed to power down, " 283 printk(KERN_ERR "PM: Some system devices failed to power down, "
276 "aborting hibernation\n"); 284 "aborting hibernation\n");
277 goto Enable_irqs; 285 goto Enable_irqs;
278 } 286 }
279 287
280 if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) 288 if (hibernation_test(TEST_CORE) || pm_wakeup_pending())
281 goto Power_up; 289 goto Power_up;
282 290
283 in_suspend = 1; 291 in_suspend = 1;
@@ -294,10 +302,7 @@ static int create_image(int platform_mode)
294 } 302 }
295 303
296 Power_up: 304 Power_up:
297 sysdev_resume(); 305 syscore_resume();
298 /* NOTE: dpm_resume_noirq() is just a resume() for devices
299 * that suspended with irqs off ... no overall powerup.
300 */
301 306
302 Enable_irqs: 307 Enable_irqs:
303 local_irq_enable(); 308 local_irq_enable();
@@ -315,31 +320,32 @@ static int create_image(int platform_mode)
315} 320}
316 321
317/** 322/**
318 * hibernation_snapshot - quiesce devices and create the hibernation 323 * hibernation_snapshot - Quiesce devices and create a hibernation image.
319 * snapshot image. 324 * @platform_mode: If set, use platform driver to prepare for the transition.
320 * @platform_mode - if set, use the platform driver, if available, to
321 * prepare the platform firmware for the power transition.
322 * 325 *
323 * Must be called with pm_mutex held 326 * This routine must be called with pm_mutex held.
324 */ 327 */
325
326int hibernation_snapshot(int platform_mode) 328int hibernation_snapshot(int platform_mode)
327{ 329{
330 pm_message_t msg = PMSG_RECOVER;
328 int error; 331 int error;
329 gfp_t saved_mask;
330 332
331 error = platform_begin(platform_mode); 333 error = platform_begin(platform_mode);
332 if (error) 334 if (error)
333 goto Close; 335 goto Close;
334 336
337 error = dpm_prepare(PMSG_FREEZE);
338 if (error)
339 goto Complete_devices;
340
335 /* Preallocate image memory before shutting down devices. */ 341 /* Preallocate image memory before shutting down devices. */
336 error = hibernate_preallocate_memory(); 342 error = hibernate_preallocate_memory();
337 if (error) 343 if (error)
338 goto Close; 344 goto Complete_devices;
339 345
340 suspend_console(); 346 suspend_console();
341 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 347 pm_restrict_gfp_mask();
342 error = dpm_suspend_start(PMSG_FREEZE); 348 error = dpm_suspend(PMSG_FREEZE);
343 if (error) 349 if (error)
344 goto Recover_platform; 350 goto Recover_platform;
345 351
@@ -347,17 +353,27 @@ int hibernation_snapshot(int platform_mode)
347 goto Recover_platform; 353 goto Recover_platform;
348 354
349 error = create_image(platform_mode); 355 error = create_image(platform_mode);
350 /* Control returns here after successful restore */ 356 /*
357 * Control returns here (1) after the image has been created or the
358 * image creation has failed and (2) after a successful restore.
359 */
351 360
352 Resume_devices: 361 Resume_devices:
353 /* We may need to release the preallocated image pages here. */ 362 /* We may need to release the preallocated image pages here. */
354 if (error || !in_suspend) 363 if (error || !in_suspend)
355 swsusp_free(); 364 swsusp_free();
356 365
357 dpm_resume_end(in_suspend ? 366 msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE;
358 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 367 dpm_resume(msg);
359 set_gfp_allowed_mask(saved_mask); 368
369 if (error || !in_suspend)
370 pm_restore_gfp_mask();
371
360 resume_console(); 372 resume_console();
373
374 Complete_devices:
375 dpm_complete(msg);
376
361 Close: 377 Close:
362 platform_end(platform_mode); 378 platform_end(platform_mode);
363 return error; 379 return error;
@@ -368,13 +384,14 @@ int hibernation_snapshot(int platform_mode)
368} 384}
369 385
370/** 386/**
371 * resume_target_kernel - prepare devices that need to be suspended with 387 * resume_target_kernel - Restore system state from a hibernation image.
372 * interrupts off, restore the contents of highmem that have not been 388 * @platform_mode: Whether or not to use the platform driver.
373 * restored yet from the image and run the low level code that will restore 389 *
374 * the remaining contents of memory and switch to the just restored target 390 * Execute device drivers' .freeze_noirq() callbacks, restore the contents of
375 * kernel. 391 * highmem that have not been restored yet from the image and run the low-level
392 * code that will restore the remaining contents of memory and switch to the
393 * just restored target kernel.
376 */ 394 */
377
378static int resume_target_kernel(bool platform_mode) 395static int resume_target_kernel(bool platform_mode)
379{ 396{
380 int error; 397 int error;
@@ -396,34 +413,36 @@ static int resume_target_kernel(bool platform_mode)
396 413
397 local_irq_disable(); 414 local_irq_disable();
398 415
399 error = sysdev_suspend(PMSG_QUIESCE); 416 error = syscore_suspend();
400 if (error) 417 if (error)
401 goto Enable_irqs; 418 goto Enable_irqs;
402 419
403 /* We'll ignore saved state, but this gets preempt count (etc) right */
404 save_processor_state(); 420 save_processor_state();
405 error = restore_highmem(); 421 error = restore_highmem();
406 if (!error) { 422 if (!error) {
407 error = swsusp_arch_resume(); 423 error = swsusp_arch_resume();
408 /* 424 /*
409 * The code below is only ever reached in case of a failure. 425 * The code below is only ever reached in case of a failure.
410 * Otherwise execution continues at place where 426 * Otherwise, execution continues at the place where
411 * swsusp_arch_suspend() was called 427 * swsusp_arch_suspend() was called.
412 */ 428 */
413 BUG_ON(!error); 429 BUG_ON(!error);
414 /* This call to restore_highmem() undos the previous one */ 430 /*
431 * This call to restore_highmem() reverts the changes made by
432 * the previous one.
433 */
415 restore_highmem(); 434 restore_highmem();
416 } 435 }
417 /* 436 /*
418 * The only reason why swsusp_arch_resume() can fail is memory being 437 * The only reason why swsusp_arch_resume() can fail is memory being
419 * very tight, so we have to free it as soon as we can to avoid 438 * very tight, so we have to free it as soon as we can to avoid
420 * subsequent failures 439 * subsequent failures.
421 */ 440 */
422 swsusp_free(); 441 swsusp_free();
423 restore_processor_state(); 442 restore_processor_state();
424 touch_softlockup_watchdog(); 443 touch_softlockup_watchdog();
425 444
426 sysdev_resume(); 445 syscore_resume();
427 446
428 Enable_irqs: 447 Enable_irqs:
429 local_irq_enable(); 448 local_irq_enable();
@@ -440,42 +459,36 @@ static int resume_target_kernel(bool platform_mode)
440} 459}
441 460
442/** 461/**
443 * hibernation_restore - quiesce devices and restore the hibernation 462 * hibernation_restore - Quiesce devices and restore from a hibernation image.
444 * snapshot image. If successful, control returns in hibernation_snaphot() 463 * @platform_mode: If set, use platform driver to prepare for the transition.
445 * @platform_mode - if set, use the platform driver, if available, to
446 * prepare the platform firmware for the transition.
447 * 464 *
448 * Must be called with pm_mutex held 465 * This routine must be called with pm_mutex held. If it is successful, control
466 * reappears in the restored target kernel in hibernation_snaphot().
449 */ 467 */
450
451int hibernation_restore(int platform_mode) 468int hibernation_restore(int platform_mode)
452{ 469{
453 int error; 470 int error;
454 gfp_t saved_mask;
455 471
456 pm_prepare_console(); 472 pm_prepare_console();
457 suspend_console(); 473 suspend_console();
458 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 474 pm_restrict_gfp_mask();
459 error = dpm_suspend_start(PMSG_QUIESCE); 475 error = dpm_suspend_start(PMSG_QUIESCE);
460 if (!error) { 476 if (!error) {
461 error = resume_target_kernel(platform_mode); 477 error = resume_target_kernel(platform_mode);
462 dpm_resume_end(PMSG_RECOVER); 478 dpm_resume_end(PMSG_RECOVER);
463 } 479 }
464 set_gfp_allowed_mask(saved_mask); 480 pm_restore_gfp_mask();
465 resume_console(); 481 resume_console();
466 pm_restore_console(); 482 pm_restore_console();
467 return error; 483 return error;
468} 484}
469 485
470/** 486/**
471 * hibernation_platform_enter - enter the hibernation state using the 487 * hibernation_platform_enter - Power off the system using the platform driver.
472 * platform driver (if available)
473 */ 488 */
474
475int hibernation_platform_enter(void) 489int hibernation_platform_enter(void)
476{ 490{
477 int error; 491 int error;
478 gfp_t saved_mask;
479 492
480 if (!hibernation_ops) 493 if (!hibernation_ops)
481 return -ENOSYS; 494 return -ENOSYS;
@@ -491,7 +504,6 @@ int hibernation_platform_enter(void)
491 504
492 entering_platform_hibernation = true; 505 entering_platform_hibernation = true;
493 suspend_console(); 506 suspend_console();
494 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
495 error = dpm_suspend_start(PMSG_HIBERNATE); 507 error = dpm_suspend_start(PMSG_HIBERNATE);
496 if (error) { 508 if (error) {
497 if (hibernation_ops->recover) 509 if (hibernation_ops->recover)
@@ -512,8 +524,8 @@ int hibernation_platform_enter(void)
512 goto Platform_finish; 524 goto Platform_finish;
513 525
514 local_irq_disable(); 526 local_irq_disable();
515 sysdev_suspend(PMSG_HIBERNATE); 527 syscore_suspend();
516 if (!pm_check_wakeup_events()) { 528 if (pm_wakeup_pending()) {
517 error = -EAGAIN; 529 error = -EAGAIN;
518 goto Power_up; 530 goto Power_up;
519 } 531 }
@@ -523,7 +535,7 @@ int hibernation_platform_enter(void)
523 while (1); 535 while (1);
524 536
525 Power_up: 537 Power_up:
526 sysdev_resume(); 538 syscore_resume();
527 local_irq_enable(); 539 local_irq_enable();
528 enable_nonboot_cpus(); 540 enable_nonboot_cpus();
529 541
@@ -535,7 +547,6 @@ int hibernation_platform_enter(void)
535 Resume_devices: 547 Resume_devices:
536 entering_platform_hibernation = false; 548 entering_platform_hibernation = false;
537 dpm_resume_end(PMSG_RESTORE); 549 dpm_resume_end(PMSG_RESTORE);
538 set_gfp_allowed_mask(saved_mask);
539 resume_console(); 550 resume_console();
540 551
541 Close: 552 Close:
@@ -545,12 +556,12 @@ int hibernation_platform_enter(void)
545} 556}
546 557
547/** 558/**
548 * power_down - Shut the machine down for hibernation. 559 * power_down - Shut the machine down for hibernation.
549 * 560 *
550 * Use the platform driver, if configured so; otherwise try 561 * Use the platform driver, if configured, to put the system into the sleep
551 * to power off or reboot. 562 * state corresponding to hibernation, or try to power it off or reboot,
563 * depending on the value of hibernation_mode.
552 */ 564 */
553
554static void power_down(void) 565static void power_down(void)
555{ 566{
556 switch (hibernation_mode) { 567 switch (hibernation_mode) {
@@ -587,9 +598,8 @@ static int prepare_processes(void)
587} 598}
588 599
589/** 600/**
590 * hibernate - The granpappy of the built-in hibernation management 601 * hibernate - Carry out system hibernation, including saving the image.
591 */ 602 */
592
593int hibernate(void) 603int hibernate(void)
594{ 604{
595 int error; 605 int error;
@@ -638,11 +648,15 @@ int hibernate(void)
638 648
639 if (hibernation_mode == HIBERNATION_PLATFORM) 649 if (hibernation_mode == HIBERNATION_PLATFORM)
640 flags |= SF_PLATFORM_MODE; 650 flags |= SF_PLATFORM_MODE;
651 if (nocompress)
652 flags |= SF_NOCOMPRESS_MODE;
641 pr_debug("PM: writing image.\n"); 653 pr_debug("PM: writing image.\n");
642 error = swsusp_write(flags); 654 error = swsusp_write(flags);
643 swsusp_free(); 655 swsusp_free();
644 if (!error) 656 if (!error)
645 power_down(); 657 power_down();
658 in_suspend = 0;
659 pm_restore_gfp_mask();
646 } else { 660 } else {
647 pr_debug("PM: Image restored successfully.\n"); 661 pr_debug("PM: Image restored successfully.\n");
648 } 662 }
@@ -663,17 +677,20 @@ int hibernate(void)
663 677
664 678
665/** 679/**
666 * software_resume - Resume from a saved image. 680 * software_resume - Resume from a saved hibernation image.
681 *
682 * This routine is called as a late initcall, when all devices have been
683 * discovered and initialized already.
667 * 684 *
668 * Called as a late_initcall (so all devices are discovered and 685 * The image reading code is called to see if there is a hibernation image
669 * initialized), we call swsusp to see if we have a saved image or not. 686 * available for reading. If that is the case, devices are quiesced and the
670 * If so, we quiesce devices, the restore the saved image. We will 687 * contents of memory is restored from the saved image.
671 * return above (in hibernate() ) if everything goes well.
672 * Otherwise, we fail gracefully and return to the normally
673 * scheduled program.
674 * 688 *
689 * If this is successful, control reappears in the restored target kernel in
690 * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine
691 * attempts to recover gracefully and make the kernel return to the normal mode
692 * of operation.
675 */ 693 */
676
677static int software_resume(void) 694static int software_resume(void)
678{ 695{
679 int error; 696 int error;
@@ -705,7 +722,7 @@ static int software_resume(void)
705 goto Unlock; 722 goto Unlock;
706 } 723 }
707 724
708 pr_debug("PM: Checking image partition %s\n", resume_file); 725 pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
709 726
710 /* Check if the device is there */ 727 /* Check if the device is there */
711 swsusp_resume_device = name_to_dev_t(resume_file); 728 swsusp_resume_device = name_to_dev_t(resume_file);
@@ -730,10 +747,10 @@ static int software_resume(void)
730 } 747 }
731 748
732 Check_image: 749 Check_image:
733 pr_debug("PM: Resume from partition %d:%d\n", 750 pr_debug("PM: Hibernation image partition %d:%d present\n",
734 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); 751 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
735 752
736 pr_debug("PM: Checking hibernation image.\n"); 753 pr_debug("PM: Looking for hibernation image.\n");
737 error = swsusp_check(); 754 error = swsusp_check();
738 if (error) 755 if (error)
739 goto Unlock; 756 goto Unlock;
@@ -765,14 +782,14 @@ static int software_resume(void)
765 goto Done; 782 goto Done;
766 } 783 }
767 784
768 pr_debug("PM: Reading hibernation image.\n"); 785 pr_debug("PM: Loading hibernation image.\n");
769 786
770 error = swsusp_read(&flags); 787 error = swsusp_read(&flags);
771 swsusp_close(FMODE_READ); 788 swsusp_close(FMODE_READ);
772 if (!error) 789 if (!error)
773 hibernation_restore(flags & SF_PLATFORM_MODE); 790 hibernation_restore(flags & SF_PLATFORM_MODE);
774 791
775 printk(KERN_ERR "PM: Restore failed, recovering.\n"); 792 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
776 swsusp_free(); 793 swsusp_free();
777 thaw_processes(); 794 thaw_processes();
778 Done: 795 Done:
@@ -785,7 +802,7 @@ static int software_resume(void)
785 /* For success case, the suspend path will release the lock */ 802 /* For success case, the suspend path will release the lock */
786 Unlock: 803 Unlock:
787 mutex_unlock(&pm_mutex); 804 mutex_unlock(&pm_mutex);
788 pr_debug("PM: Resume from disk failed.\n"); 805 pr_debug("PM: Hibernation image not present or could not be loaded.\n");
789 return error; 806 return error;
790close_finish: 807close_finish:
791 swsusp_close(FMODE_READ); 808 swsusp_close(FMODE_READ);
@@ -803,21 +820,17 @@ static const char * const hibernation_modes[] = {
803 [HIBERNATION_TESTPROC] = "testproc", 820 [HIBERNATION_TESTPROC] = "testproc",
804}; 821};
805 822
806/** 823/*
807 * disk - Control hibernation mode 824 * /sys/power/disk - Control hibernation mode.
808 *
809 * Suspend-to-disk can be handled in several ways. We have a few options
810 * for putting the system to sleep - using the platform driver (e.g. ACPI
811 * or other hibernation_ops), powering off the system or rebooting the
812 * system (for testing) as well as the two test modes.
813 * 825 *
814 * The system can support 'platform', and that is known a priori (and 826 * Hibernation can be handled in several ways. There are a few different ways
815 * encoded by the presence of hibernation_ops). However, the user may 827 * to put the system into the sleep state: using the platform driver (e.g. ACPI
816 * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the 828 * or other hibernation_ops), powering it off or rebooting it (for testing
817 * test modes, 'test' or 'testproc'. 829 * mostly), or using one of the two available test modes.
818 * 830 *
819 * show() will display what the mode is currently set to. 831 * The sysfs file /sys/power/disk provides an interface for selecting the
820 * store() will accept one of 832 * hibernation mode to use. Reading from this file causes the available modes
833 * to be printed. There are 5 modes that can be supported:
821 * 834 *
822 * 'platform' 835 * 'platform'
823 * 'shutdown' 836 * 'shutdown'
@@ -825,8 +838,14 @@ static const char * const hibernation_modes[] = {
825 * 'test' 838 * 'test'
826 * 'testproc' 839 * 'testproc'
827 * 840 *
828 * It will only change to 'platform' if the system 841 * If a platform hibernation driver is in use, 'platform' will be supported
829 * supports it (as determined by having hibernation_ops). 842 * and will be used by default. Otherwise, 'shutdown' will be used by default.
843 * The selected option (i.e. the one corresponding to the current value of
844 * hibernation_mode) is enclosed by a square bracket.
845 *
846 * To select a given hibernation mode it is necessary to write the mode's
847 * string representation (as returned by reading from /sys/power/disk) back
848 * into /sys/power/disk.
830 */ 849 */
831 850
832static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, 851static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -859,7 +878,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
859 return buf-start; 878 return buf-start;
860} 879}
861 880
862
863static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, 881static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
864 const char *buf, size_t n) 882 const char *buf, size_t n)
865{ 883{
@@ -961,10 +979,33 @@ static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *att
961 979
962power_attr(image_size); 980power_attr(image_size);
963 981
982static ssize_t reserved_size_show(struct kobject *kobj,
983 struct kobj_attribute *attr, char *buf)
984{
985 return sprintf(buf, "%lu\n", reserved_size);
986}
987
988static ssize_t reserved_size_store(struct kobject *kobj,
989 struct kobj_attribute *attr,
990 const char *buf, size_t n)
991{
992 unsigned long size;
993
994 if (sscanf(buf, "%lu", &size) == 1) {
995 reserved_size = size;
996 return n;
997 }
998
999 return -EINVAL;
1000}
1001
1002power_attr(reserved_size);
1003
964static struct attribute * g[] = { 1004static struct attribute * g[] = {
965 &disk_attr.attr, 1005 &disk_attr.attr,
966 &resume_attr.attr, 1006 &resume_attr.attr,
967 &image_size_attr.attr, 1007 &image_size_attr.attr,
1008 &reserved_size_attr.attr,
968 NULL, 1009 NULL,
969}; 1010};
970 1011
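reserved_size_store() above accepts a plain decimal byte count (parsed with sscanf("%lu")) and rejects anything else with -EINVAL. A minimal userspace sketch of driving the new /sys/power/reserved_size knob follows; the 32 MB figure is an arbitrary example, not a recommendation.

/* Userspace sketch: reserve extra memory for driver ->freeze() allocations. */
#include <stdio.h>

int main(void)
{
	unsigned long bytes = 32UL * 1024 * 1024;	/* example value only */
	FILE *f = fopen("/sys/power/reserved_size", "w");

	if (!f)
		return 1;
	/* The kernel later rounds this up to whole pages when sizing the image. */
	fprintf(f, "%lu\n", bytes);
	return fclose(f) ? 1 : 0;
}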
@@ -1004,6 +1045,15 @@ static int __init resume_offset_setup(char *str)
1004 return 1; 1045 return 1;
1005} 1046}
1006 1047
1048static int __init hibernate_setup(char *str)
1049{
1050 if (!strncmp(str, "noresume", 8))
1051 noresume = 1;
1052 else if (!strncmp(str, "nocompress", 10))
1053 nocompress = 1;
1054 return 1;
1055}
1056
1007static int __init noresume_setup(char *str) 1057static int __init noresume_setup(char *str)
1008{ 1058{
1009 noresume = 1; 1059 noresume = 1;
@@ -1013,3 +1063,4 @@ static int __init noresume_setup(char *str)
1013__setup("noresume", noresume_setup); 1063__setup("noresume", noresume_setup);
1014__setup("resume_offset=", resume_offset_setup); 1064__setup("resume_offset=", resume_offset_setup);
1015__setup("resume=", resume_setup); 1065__setup("resume=", resume_setup);
1066__setup("hibernate=", hibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 62b0bc6e4983..2981af4ce7cb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -17,9 +17,6 @@
17 17
18DEFINE_MUTEX(pm_mutex); 18DEFINE_MUTEX(pm_mutex);
19 19
20unsigned int pm_flags;
21EXPORT_SYMBOL(pm_flags);
22
23#ifdef CONFIG_PM_SLEEP 20#ifdef CONFIG_PM_SLEEP
24 21
25/* Routines for PM-transition notifications */ 22/* Routines for PM-transition notifications */
@@ -227,7 +224,7 @@ power_attr(state);
227 * writing to 'state'. It first should read from 'wakeup_count' and store 224 * writing to 'state'. It first should read from 'wakeup_count' and store
228 * the read value. Then, after carrying out its own preparations for the system 225 * the read value. Then, after carrying out its own preparations for the system
229 * transition to a sleep state, it should write the stored value to 226 * transition to a sleep state, it should write the stored value to
230 * 'wakeup_count'. If that fails, at least one wakeup event has occured since 227 * 'wakeup_count'. If that fails, at least one wakeup event has occurred since
231 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it 228 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
232 * is allowed to write to 'state', but the transition will be aborted if there 229 * is allowed to write to 'state', but the transition will be aborted if there
233 * are any wakeup events detected after 'wakeup_count' was written to. 230 * are any wakeup events detected after 'wakeup_count' was written to.
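The comment above specifies the handshake only in prose. A minimal userspace sketch of it, assuming the caller goes on to write "mem" to /sys/power/state (file names are taken from the comment; error handling is trimmed):

/* Userspace sketch of the wakeup_count handshake before suspending. */
#include <stdio.h>

int main(void)
{
	unsigned int count;
	FILE *f = fopen("/sys/power/wakeup_count", "r");

	if (!f || fscanf(f, "%u", &count) != 1)
		return 1;
	fclose(f);

	/* ... finish this process's own suspend preparations here ... */

	/* Writing the stored value back fails if a wakeup event happened
	 * in the meantime; in that case the suspend must not be started. */
	f = fopen("/sys/power/wakeup_count", "w");
	if (!f || fprintf(f, "%u\n", count) < 0 || fclose(f))
		return 1;

	f = fopen("/sys/power/state", "w");
	if (!f)
		return 1;
	fprintf(f, "mem\n");
	return fclose(f) ? 1 : 0;
}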
@@ -237,18 +234,18 @@ static ssize_t wakeup_count_show(struct kobject *kobj,
237 struct kobj_attribute *attr, 234 struct kobj_attribute *attr,
238 char *buf) 235 char *buf)
239{ 236{
240 unsigned long val; 237 unsigned int val;
241 238
242 return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR; 239 return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR;
243} 240}
244 241
245static ssize_t wakeup_count_store(struct kobject *kobj, 242static ssize_t wakeup_count_store(struct kobject *kobj,
246 struct kobj_attribute *attr, 243 struct kobj_attribute *attr,
247 const char *buf, size_t n) 244 const char *buf, size_t n)
248{ 245{
249 unsigned long val; 246 unsigned int val;
250 247
251 if (sscanf(buf, "%lu", &val) == 1) { 248 if (sscanf(buf, "%u", &val) == 1) {
252 if (pm_save_wakeup_count(val)) 249 if (pm_save_wakeup_count(val))
253 return n; 250 return n;
254 } 251 }
@@ -281,12 +278,30 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
281} 278}
282 279
283power_attr(pm_trace); 280power_attr(pm_trace);
281
282static ssize_t pm_trace_dev_match_show(struct kobject *kobj,
283 struct kobj_attribute *attr,
284 char *buf)
285{
286 return show_trace_dev_match(buf, PAGE_SIZE);
287}
288
289static ssize_t
290pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr,
291 const char *buf, size_t n)
292{
293 return -EINVAL;
294}
295
296power_attr(pm_trace_dev_match);
297
284#endif /* CONFIG_PM_TRACE */ 298#endif /* CONFIG_PM_TRACE */
285 299
286static struct attribute * g[] = { 300static struct attribute * g[] = {
287 &state_attr.attr, 301 &state_attr.attr,
288#ifdef CONFIG_PM_TRACE 302#ifdef CONFIG_PM_TRACE
289 &pm_trace_attr.attr, 303 &pm_trace_attr.attr,
304 &pm_trace_dev_match_attr.attr,
290#endif 305#endif
291#ifdef CONFIG_PM_SLEEP 306#ifdef CONFIG_PM_SLEEP
292 &pm_async_attr.attr, 307 &pm_async_attr.attr,
@@ -308,7 +323,7 @@ EXPORT_SYMBOL_GPL(pm_wq);
308 323
309static int __init pm_start_workqueue(void) 324static int __init pm_start_workqueue(void)
310{ 325{
311 pm_wq = create_freezeable_workqueue("pm"); 326 pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0);
312 327
313 return pm_wq ? 0 : -ENOMEM; 328 return pm_wq ? 0 : -ENOMEM;
314} 329}
@@ -321,6 +336,8 @@ static int __init pm_init(void)
321 int error = pm_start_workqueue(); 336 int error = pm_start_workqueue();
322 if (error) 337 if (error)
323 return error; 338 return error;
339 hibernate_image_size_init();
340 hibernate_reserved_size_init();
324 power_kobj = kobject_create_and_add("power", NULL); 341 power_kobj = kobject_create_and_add("power", NULL);
325 if (!power_kobj) 342 if (!power_kobj)
326 return -ENOMEM; 343 return -ENOMEM;
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c
deleted file mode 100644
index 1836db60bbb6..000000000000
--- a/kernel/power/nvs.c
+++ /dev/null
@@ -1,136 +0,0 @@
1/*
2 * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
3 *
4 * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
5 *
6 * This file is released under the GPLv2.
7 */
8
9#include <linux/io.h>
10#include <linux/kernel.h>
11#include <linux/list.h>
12#include <linux/mm.h>
13#include <linux/slab.h>
14#include <linux/suspend.h>
15
16/*
17 * Platforms, like ACPI, may want us to save some memory used by them during
18 * suspend and to restore the contents of this memory during the subsequent
19 * resume. The code below implements a mechanism allowing us to do that.
20 */
21
22struct nvs_page {
23 unsigned long phys_start;
24 unsigned int size;
25 void *kaddr;
26 void *data;
27 struct list_head node;
28};
29
30static LIST_HEAD(nvs_list);
31
32/**
33 * suspend_nvs_register - register platform NVS memory region to save
34 * @start - physical address of the region
35 * @size - size of the region
36 *
37 * The NVS region need not be page-aligned (both ends) and we arrange
38 * things so that the data from page-aligned addresses in this region will
39 * be copied into separate RAM pages.
40 */
41int suspend_nvs_register(unsigned long start, unsigned long size)
42{
43 struct nvs_page *entry, *next;
44
45 while (size > 0) {
46 unsigned int nr_bytes;
47
48 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
49 if (!entry)
50 goto Error;
51
52 list_add_tail(&entry->node, &nvs_list);
53 entry->phys_start = start;
54 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
55 entry->size = (size < nr_bytes) ? size : nr_bytes;
56
57 start += entry->size;
58 size -= entry->size;
59 }
60 return 0;
61
62 Error:
63 list_for_each_entry_safe(entry, next, &nvs_list, node) {
64 list_del(&entry->node);
65 kfree(entry);
66 }
67 return -ENOMEM;
68}
69
70/**
71 * suspend_nvs_free - free data pages allocated for saving NVS regions
72 */
73void suspend_nvs_free(void)
74{
75 struct nvs_page *entry;
76
77 list_for_each_entry(entry, &nvs_list, node)
78 if (entry->data) {
79 free_page((unsigned long)entry->data);
80 entry->data = NULL;
81 if (entry->kaddr) {
82 iounmap(entry->kaddr);
83 entry->kaddr = NULL;
84 }
85 }
86}
87
88/**
89 * suspend_nvs_alloc - allocate memory necessary for saving NVS regions
90 */
91int suspend_nvs_alloc(void)
92{
93 struct nvs_page *entry;
94
95 list_for_each_entry(entry, &nvs_list, node) {
96 entry->data = (void *)__get_free_page(GFP_KERNEL);
97 if (!entry->data) {
98 suspend_nvs_free();
99 return -ENOMEM;
100 }
101 }
102 return 0;
103}
104
105/**
106 * suspend_nvs_save - save NVS memory regions
107 */
108void suspend_nvs_save(void)
109{
110 struct nvs_page *entry;
111
112 printk(KERN_INFO "PM: Saving platform NVS memory\n");
113
114 list_for_each_entry(entry, &nvs_list, node)
115 if (entry->data) {
116 entry->kaddr = ioremap(entry->phys_start, entry->size);
117 memcpy(entry->data, entry->kaddr, entry->size);
118 }
119}
120
121/**
122 * suspend_nvs_restore - restore NVS memory regions
123 *
124 * This function is going to be called with interrupts disabled, so it
125 * cannot iounmap the virtual addresses used to access the NVS region.
126 */
127void suspend_nvs_restore(void)
128{
129 struct nvs_page *entry;
130
131 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
132
133 list_for_each_entry(entry, &nvs_list, node)
134 if (entry->data)
135 memcpy(entry->kaddr, entry->data, entry->size);
136}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 006270fe382d..9a00a0a26280 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -14,6 +14,10 @@ struct swsusp_info {
14} __attribute__((aligned(PAGE_SIZE))); 14} __attribute__((aligned(PAGE_SIZE)));
15 15
16#ifdef CONFIG_HIBERNATION 16#ifdef CONFIG_HIBERNATION
17/* kernel/power/snapshot.c */
18extern void __init hibernate_reserved_size_init(void);
19extern void __init hibernate_image_size_init(void);
20
17#ifdef CONFIG_ARCH_HIBERNATION_HEADER 21#ifdef CONFIG_ARCH_HIBERNATION_HEADER
18/* Maximum size of architecture specific data in a hibernation header */ 22/* Maximum size of architecture specific data in a hibernation header */
19#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) 23#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
@@ -49,7 +53,12 @@ static inline char *check_image_kernel(struct swsusp_info *info)
49extern int hibernation_snapshot(int platform_mode); 53extern int hibernation_snapshot(int platform_mode);
50extern int hibernation_restore(int platform_mode); 54extern int hibernation_restore(int platform_mode);
51extern int hibernation_platform_enter(void); 55extern int hibernation_platform_enter(void);
52#endif 56
57#else /* !CONFIG_HIBERNATION */
58
59static inline void hibernate_reserved_size_init(void) {}
60static inline void hibernate_image_size_init(void) {}
61#endif /* !CONFIG_HIBERNATION */
53 62
54extern int pfn_is_nosave(unsigned long); 63extern int pfn_is_nosave(unsigned long);
55 64
@@ -65,6 +74,8 @@ static struct kobj_attribute _name##_attr = { \
65 74
66/* Preferred image size in bytes (default 500 MB) */ 75/* Preferred image size in bytes (default 500 MB) */
67extern unsigned long image_size; 76extern unsigned long image_size;
77/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
78extern unsigned long reserved_size;
68extern int in_suspend; 79extern int in_suspend;
69extern dev_t swsusp_resume_device; 80extern dev_t swsusp_resume_device;
70extern sector_t swsusp_resume_block; 81extern sector_t swsusp_resume_block;
@@ -134,6 +145,7 @@ extern int swsusp_swap_in_use(void);
134 * the image header. 145 * the image header.
135 */ 146 */
136#define SF_PLATFORM_MODE 1 147#define SF_PLATFORM_MODE 1
148#define SF_NOCOMPRESS_MODE 2
137 149
138/* kernel/power/hibernate.c */ 150/* kernel/power/hibernate.c */
139extern int swsusp_check(void); 151extern int swsusp_check(void);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 028a99598f49..0cf3a27a6c9d 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,7 +22,7 @@
22 */ 22 */
23#define TIMEOUT (20 * HZ) 23#define TIMEOUT (20 * HZ)
24 24
25static inline int freezeable(struct task_struct * p) 25static inline int freezable(struct task_struct * p)
26{ 26{
27 if ((p == current) || 27 if ((p == current) ||
28 (p->flags & PF_NOFREEZE) || 28 (p->flags & PF_NOFREEZE) ||
@@ -40,6 +40,7 @@ static int try_to_freeze_tasks(bool sig_only)
40 struct timeval start, end; 40 struct timeval start, end;
41 u64 elapsed_csecs64; 41 u64 elapsed_csecs64;
42 unsigned int elapsed_csecs; 42 unsigned int elapsed_csecs;
43 bool wakeup = false;
43 44
44 do_gettimeofday(&start); 45 do_gettimeofday(&start);
45 46
@@ -52,7 +53,7 @@ static int try_to_freeze_tasks(bool sig_only)
52 todo = 0; 53 todo = 0;
53 read_lock(&tasklist_lock); 54 read_lock(&tasklist_lock);
54 do_each_thread(g, p) { 55 do_each_thread(g, p) {
55 if (frozen(p) || !freezeable(p)) 56 if (frozen(p) || !freezable(p))
56 continue; 57 continue;
57 58
58 if (!freeze_task(p, sig_only)) 59 if (!freeze_task(p, sig_only))
@@ -63,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only)
63 * perturb a task in TASK_STOPPED or TASK_TRACED. 64 * perturb a task in TASK_STOPPED or TASK_TRACED.
64 * It is "frozen enough". If the task does wake 65 * It is "frozen enough". If the task does wake
65 * up, it will immediately call try_to_freeze. 66 * up, it will immediately call try_to_freeze.
67 *
68 * Because freeze_task() goes through p's
69 * scheduler lock after setting TIF_FREEZE, it's
70 * guaranteed that either we see TASK_RUNNING or
71 * try_to_stop() after schedule() in ptrace/signal
72 * stop sees TIF_FREEZE.
66 */ 73 */
67 if (!task_is_stopped_or_traced(p) && 74 if (!task_is_stopped_or_traced(p) &&
68 !freezer_should_skip(p)) 75 !freezer_should_skip(p))
@@ -78,6 +85,11 @@ static int try_to_freeze_tasks(bool sig_only)
78 if (!todo || time_after(jiffies, end_time)) 85 if (!todo || time_after(jiffies, end_time))
79 break; 86 break;
80 87
88 if (pm_wakeup_pending()) {
89 wakeup = true;
90 break;
91 }
92
81 /* 93 /*
82 * We need to retry, but first give the freezing tasks some 94 * We need to retry, but first give the freezing tasks some
83 * time to enter the regrigerator. 95 * time to enter the regrigerator.
@@ -97,8 +109,9 @@ static int try_to_freeze_tasks(bool sig_only)
97 * but it cleans up leftover PF_FREEZE requests. 109 * but it cleans up leftover PF_FREEZE requests.
98 */ 110 */
99 printk("\n"); 111 printk("\n");
100 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " 112 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds "
101 "(%d tasks refusing to freeze, wq_busy=%d):\n", 113 "(%d tasks refusing to freeze, wq_busy=%d):\n",
114 wakeup ? "aborted" : "failed",
102 elapsed_csecs / 100, elapsed_csecs % 100, 115 elapsed_csecs / 100, elapsed_csecs % 100,
103 todo - wq_busy, wq_busy); 116 todo - wq_busy, wq_busy);
104 117
@@ -107,7 +120,7 @@ static int try_to_freeze_tasks(bool sig_only)
107 read_lock(&tasklist_lock); 120 read_lock(&tasklist_lock);
108 do_each_thread(g, p) { 121 do_each_thread(g, p) {
109 task_lock(p); 122 task_lock(p);
110 if (freezing(p) && !freezer_should_skip(p)) 123 if (!wakeup && freezing(p) && !freezer_should_skip(p))
111 sched_show_task(p); 124 sched_show_task(p);
112 cancel_freezing(p); 125 cancel_freezing(p);
113 task_unlock(p); 126 task_unlock(p);
@@ -154,7 +167,7 @@ static void thaw_tasks(bool nosig_only)
154 167
155 read_lock(&tasklist_lock); 168 read_lock(&tasklist_lock);
156 do_each_thread(g, p) { 169 do_each_thread(g, p) {
157 if (!freezeable(p)) 170 if (!freezable(p))
158 continue; 171 continue;
159 172
160 if (nosig_only && should_send_signal(p)) 173 if (nosig_only && should_send_signal(p))
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d3f795f01bbc..06efa54f93d6 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -41,12 +41,29 @@ static void swsusp_set_page_forbidden(struct page *);
41static void swsusp_unset_page_forbidden(struct page *); 41static void swsusp_unset_page_forbidden(struct page *);
42 42
43/* 43/*
44 * Number of bytes to reserve for memory allocations made by device drivers
45 * from their ->freeze() and ->freeze_noirq() callbacks so that they don't
46 * cause image creation to fail (tunable via /sys/power/reserved_size).
47 */
48unsigned long reserved_size;
49
50void __init hibernate_reserved_size_init(void)
51{
52 reserved_size = SPARE_PAGES * PAGE_SIZE;
53}
54
55/*
44 * Preferred image size in bytes (tunable via /sys/power/image_size). 56 * Preferred image size in bytes (tunable via /sys/power/image_size).
45 * When it is set to N, swsusp will do its best to ensure the image 57 * When it is set to N, swsusp will do its best to ensure the image
46 * size will not exceed N bytes, but if that is impossible, it will 58 * size will not exceed N bytes, but if that is impossible, it will
47 * try to create the smallest image possible. 59 * try to create the smallest image possible.
48 */ 60 */
49unsigned long image_size = 500 * 1024 * 1024; 61unsigned long image_size;
62
63void __init hibernate_image_size_init(void)
64{
65 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
66}
50 67
51/* List of PBEs needed for restoring the pages that were allocated before 68/* List of PBEs needed for restoring the pages that were allocated before
52 * the suspend and included in the suspend image, but have also been 69 * the suspend and included in the suspend image, but have also been
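With this hunk the image_size default changes from a fixed 500 MB to two fifths of total RAM, computed once at boot by hibernate_image_size_init(). A small standalone calculation of the new default, assuming 4 KiB pages and 1 GiB of RAM (both example values):

/* Userspace sketch mirroring hibernate_image_size_init() with example numbers. */
#include <stdio.h>

int main(void)
{
	unsigned long totalram_pages = 262144;	/* 1 GiB of 4 KiB pages */
	unsigned long page_size = 4096;
	unsigned long image_size = ((totalram_pages * 2) / 5) * page_size;

	/* Prints 429494272, i.e. roughly 410 MiB instead of the old 500 MB. */
	printf("default image_size = %lu bytes\n", image_size);
	return 0;
}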
@@ -979,8 +996,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
979 src = kmap_atomic(s_page, KM_USER0); 996 src = kmap_atomic(s_page, KM_USER0);
980 dst = kmap_atomic(d_page, KM_USER1); 997 dst = kmap_atomic(d_page, KM_USER1);
981 do_copy_page(dst, src); 998 do_copy_page(dst, src);
982 kunmap_atomic(src, KM_USER0);
983 kunmap_atomic(dst, KM_USER1); 999 kunmap_atomic(dst, KM_USER1);
1000 kunmap_atomic(src, KM_USER0);
984 } else { 1001 } else {
985 if (PageHighMem(d_page)) { 1002 if (PageHighMem(d_page)) {
986 /* Page pointed to by src may contain some kernel 1003 /* Page pointed to by src may contain some kernel
@@ -988,7 +1005,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
988 */ 1005 */
989 safe_copy_page(buffer, s_page); 1006 safe_copy_page(buffer, s_page);
990 dst = kmap_atomic(d_page, KM_USER0); 1007 dst = kmap_atomic(d_page, KM_USER0);
991 memcpy(dst, buffer, PAGE_SIZE); 1008 copy_page(dst, buffer);
992 kunmap_atomic(dst, KM_USER0); 1009 kunmap_atomic(dst, KM_USER0);
993 } else { 1010 } else {
994 safe_copy_page(page_address(d_page), s_page); 1011 safe_copy_page(page_address(d_page), s_page);
@@ -1194,7 +1211,11 @@ static void free_unnecessary_pages(void)
1194 to_free_highmem = alloc_highmem - save; 1211 to_free_highmem = alloc_highmem - save;
1195 } else { 1212 } else {
1196 to_free_highmem = 0; 1213 to_free_highmem = 0;
1197 to_free_normal -= save - alloc_highmem; 1214 save -= alloc_highmem;
1215 if (to_free_normal > save)
1216 to_free_normal -= save;
1217 else
1218 to_free_normal = 0;
1198 } 1219 }
1199 1220
1200 memory_bm_position_reset(&copy_bm); 1221 memory_bm_position_reset(&copy_bm);
@@ -1258,11 +1279,13 @@ static unsigned long minimum_image_size(unsigned long saveable)
1258 * frame in use. We also need a number of page frames to be free during 1279 * frame in use. We also need a number of page frames to be free during
1259 * hibernation for allocations made while saving the image and for device 1280 * hibernation for allocations made while saving the image and for device
1260 * drivers, in case they need to allocate memory from their hibernation 1281 * drivers, in case they need to allocate memory from their hibernation
1261 * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES, 1282 * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough
1262 * respectively, both of which are rough estimates). To make this happen, we 1283 * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through
1263 * compute the total number of available page frames and allocate at least 1284 * /sys/power/reserved_size, respectively). To make this happen, we compute the
1285 * total number of available page frames and allocate at least
1264 * 1286 *
1265 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES 1287 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
1288 * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
1266 * 1289 *
1267 * of them, which corresponds to the maximum size of a hibernation image. 1290 * of them, which corresponds to the maximum size of a hibernation image.
1268 * 1291 *
@@ -1317,13 +1340,16 @@ int hibernate_preallocate_memory(void)
1317 count -= totalreserve_pages; 1340 count -= totalreserve_pages;
1318 1341
1319 /* Compute the maximum number of saveable pages to leave in memory. */ 1342 /* Compute the maximum number of saveable pages to leave in memory. */
1320 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; 1343 max_size = (count - (size + PAGES_FOR_IO)) / 2
1344 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
1345 /* Compute the desired number of image pages specified by image_size. */
1321 size = DIV_ROUND_UP(image_size, PAGE_SIZE); 1346 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1322 if (size > max_size) 1347 if (size > max_size)
1323 size = max_size; 1348 size = max_size;
1324 /* 1349 /*
1325 * If the maximum is not less than the current number of saveable pages 1350 * If the desired number of image pages is at least as large as the
1326 * in memory, allocate page frames for the image and we're done. 1351 * current number of saveable pages in memory, allocate page frames for
1352 * the image and we're done.
1327 */ 1353 */
1328 if (size >= saveable) { 1354 if (size >= saveable) {
1329 pages = preallocate_image_highmem(save_highmem); 1355 pages = preallocate_image_highmem(save_highmem);
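The hunk above swaps the fixed SPARE_PAGES term in the max_size computation for one derived from reserved_size. A standalone sketch of the arithmetic with made-up numbers; the PAGES_FOR_IO value here is only a placeholder, the real constant lives in the kernel headers:

/* Userspace sketch of the max_size computation in hibernate_preallocate_memory(). */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long page_size = 4096;
	unsigned long count = 250000;		/* page frames usable for the image */
	unsigned long size = 100;		/* metadata pages */
	unsigned long pages_for_io = 1024;	/* placeholder for PAGES_FOR_IO */
	unsigned long reserved_size = 2UL * 1024 * 1024; /* /sys/power/reserved_size */

	unsigned long max_size = (count - (size + pages_for_io)) / 2
				 - 2 * DIV_ROUND_UP(reserved_size, page_size);

	/* (250000 - 1124) / 2 - 2 * 512 = 124438 - 1024 = 123414 pages */
	printf("max image size: %lu pages\n", max_size);
	return 0;
}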
@@ -1512,11 +1538,8 @@ static int
1512swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 1538swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1513 unsigned int nr_pages, unsigned int nr_highmem) 1539 unsigned int nr_pages, unsigned int nr_highmem)
1514{ 1540{
1515 int error = 0;
1516
1517 if (nr_highmem > 0) { 1541 if (nr_highmem > 0) {
1518 error = get_highmem_buffer(PG_ANY); 1542 if (get_highmem_buffer(PG_ANY))
1519 if (error)
1520 goto err_out; 1543 goto err_out;
1521 if (nr_highmem > alloc_highmem) { 1544 if (nr_highmem > alloc_highmem) {
1522 nr_highmem -= alloc_highmem; 1545 nr_highmem -= alloc_highmem;
@@ -1539,7 +1562,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1539 1562
1540 err_out: 1563 err_out:
1541 swsusp_free(); 1564 swsusp_free();
1542 return error; 1565 return -ENOMEM;
1543} 1566}
1544 1567
1545asmlinkage int swsusp_save(void) 1568asmlinkage int swsusp_save(void)
@@ -1680,7 +1703,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1680 memory_bm_position_reset(&orig_bm); 1703 memory_bm_position_reset(&orig_bm);
1681 memory_bm_position_reset(&copy_bm); 1704 memory_bm_position_reset(&copy_bm);
1682 } else if (handle->cur <= nr_meta_pages) { 1705 } else if (handle->cur <= nr_meta_pages) {
1683 memset(buffer, 0, PAGE_SIZE); 1706 clear_page(buffer);
1684 pack_pfns(buffer, &orig_bm); 1707 pack_pfns(buffer, &orig_bm);
1685 } else { 1708 } else {
1686 struct page *page; 1709 struct page *page;
@@ -1694,7 +1717,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1694 void *kaddr; 1717 void *kaddr;
1695 1718
1696 kaddr = kmap_atomic(page, KM_USER0); 1719 kaddr = kmap_atomic(page, KM_USER0);
1697 memcpy(buffer, kaddr, PAGE_SIZE); 1720 copy_page(buffer, kaddr);
1698 kunmap_atomic(kaddr, KM_USER0); 1721 kunmap_atomic(kaddr, KM_USER0);
1699 handle->buffer = buffer; 1722 handle->buffer = buffer;
1700 } else { 1723 } else {
@@ -1977,7 +2000,7 @@ static void copy_last_highmem_page(void)
1977 void *dst; 2000 void *dst;
1978 2001
1979 dst = kmap_atomic(last_highmem_page, KM_USER0); 2002 dst = kmap_atomic(last_highmem_page, KM_USER0);
1980 memcpy(dst, buffer, PAGE_SIZE); 2003 copy_page(dst, buffer);
1981 kunmap_atomic(dst, KM_USER0); 2004 kunmap_atomic(dst, KM_USER0);
1982 last_highmem_page = NULL; 2005 last_highmem_page = NULL;
1983 } 2006 }
@@ -2263,11 +2286,11 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2263 2286
2264 kaddr1 = kmap_atomic(p1, KM_USER0); 2287 kaddr1 = kmap_atomic(p1, KM_USER0);
2265 kaddr2 = kmap_atomic(p2, KM_USER1); 2288 kaddr2 = kmap_atomic(p2, KM_USER1);
2266 memcpy(buf, kaddr1, PAGE_SIZE); 2289 copy_page(buf, kaddr1);
2267 memcpy(kaddr1, kaddr2, PAGE_SIZE); 2290 copy_page(kaddr1, kaddr2);
2268 memcpy(kaddr2, buf, PAGE_SIZE); 2291 copy_page(kaddr2, buf);
2269 kunmap_atomic(kaddr1, KM_USER0);
2270 kunmap_atomic(kaddr2, KM_USER1); 2292 kunmap_atomic(kaddr2, KM_USER1);
2293 kunmap_atomic(kaddr1, KM_USER0);
2271} 2294}
2272 2295
2273/** 2296/**
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 7335952ee473..1c41ba215419 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,8 @@
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/suspend.h> 24#include <linux/suspend.h>
25#include <linux/syscore_ops.h>
26#include <trace/events/power.h>
25 27
26#include "power.h" 28#include "power.h"
27 29
@@ -30,13 +32,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
30 [PM_SUSPEND_MEM] = "mem", 32 [PM_SUSPEND_MEM] = "mem",
31}; 33};
32 34
33static struct platform_suspend_ops *suspend_ops; 35static const struct platform_suspend_ops *suspend_ops;
34 36
35/** 37/**
36 * suspend_set_ops - Set the global suspend method table. 38 * suspend_set_ops - Set the global suspend method table.
37 * @ops: Pointer to ops structure. 39 * @ops: Pointer to ops structure.
38 */ 40 */
39void suspend_set_ops(struct platform_suspend_ops *ops) 41void suspend_set_ops(const struct platform_suspend_ops *ops)
40{ 42{
41 mutex_lock(&pm_mutex); 43 mutex_lock(&pm_mutex);
42 suspend_ops = ops; 44 suspend_ops = ops;
@@ -161,13 +163,13 @@ static int suspend_enter(suspend_state_t state)
161 arch_suspend_disable_irqs(); 163 arch_suspend_disable_irqs();
162 BUG_ON(!irqs_disabled()); 164 BUG_ON(!irqs_disabled());
163 165
164 error = sysdev_suspend(PMSG_SUSPEND); 166 error = syscore_suspend();
165 if (!error) { 167 if (!error) {
166 if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { 168 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
167 error = suspend_ops->enter(state); 169 error = suspend_ops->enter(state);
168 events_check_enabled = false; 170 events_check_enabled = false;
169 } 171 }
170 sysdev_resume(); 172 syscore_resume();
171 } 173 }
172 174
173 arch_suspend_enable_irqs(); 175 arch_suspend_enable_irqs();
@@ -197,18 +199,17 @@ static int suspend_enter(suspend_state_t state)
197int suspend_devices_and_enter(suspend_state_t state) 199int suspend_devices_and_enter(suspend_state_t state)
198{ 200{
199 int error; 201 int error;
200 gfp_t saved_mask;
201 202
202 if (!suspend_ops) 203 if (!suspend_ops)
203 return -ENOSYS; 204 return -ENOSYS;
204 205
206 trace_machine_suspend(state);
205 if (suspend_ops->begin) { 207 if (suspend_ops->begin) {
206 error = suspend_ops->begin(state); 208 error = suspend_ops->begin(state);
207 if (error) 209 if (error)
208 goto Close; 210 goto Close;
209 } 211 }
210 suspend_console(); 212 suspend_console();
211 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
212 suspend_test_start(); 213 suspend_test_start();
213 error = dpm_suspend_start(PMSG_SUSPEND); 214 error = dpm_suspend_start(PMSG_SUSPEND);
214 if (error) { 215 if (error) {
@@ -219,17 +220,17 @@ int suspend_devices_and_enter(suspend_state_t state)
219 if (suspend_test(TEST_DEVICES)) 220 if (suspend_test(TEST_DEVICES))
220 goto Recover_platform; 221 goto Recover_platform;
221 222
222 suspend_enter(state); 223 error = suspend_enter(state);
223 224
224 Resume_devices: 225 Resume_devices:
225 suspend_test_start(); 226 suspend_test_start();
226 dpm_resume_end(PMSG_RESUME); 227 dpm_resume_end(PMSG_RESUME);
227 suspend_test_finish("resume devices"); 228 suspend_test_finish("resume devices");
228 set_gfp_allowed_mask(saved_mask);
229 resume_console(); 229 resume_console();
230 Close: 230 Close:
231 if (suspend_ops->end) 231 if (suspend_ops->end)
232 suspend_ops->end(); 232 suspend_ops->end();
233 trace_machine_suspend(PWR_EVENT_EXIT);
233 return error; 234 return error;
234 235
235 Recover_platform: 236 Recover_platform:
@@ -285,7 +286,9 @@ int enter_state(suspend_state_t state)
285 goto Finish; 286 goto Finish;
286 287
287 pr_debug("PM: Entering %s sleep\n", pm_states[state]); 288 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
289 pm_restrict_gfp_mask();
288 error = suspend_devices_and_enter(state); 290 error = suspend_devices_and_enter(state);
291 pm_restore_gfp_mask();
289 292
290 Finish: 293 Finish:
291 pr_debug("PM: Finishing wakeup.\n"); 294 pr_debug("PM: Finishing wakeup.\n");
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index e6a5bdf61a37..7c97c3a0eee3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -6,6 +6,7 @@
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com>
9 * 10 *
10 * This file is released under the GPLv2. 11 * This file is released under the GPLv2.
11 * 12 *
@@ -24,10 +25,12 @@
24#include <linux/swapops.h> 25#include <linux/swapops.h>
25#include <linux/pm.h> 26#include <linux/pm.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/lzo.h>
29#include <linux/vmalloc.h>
27 30
28#include "power.h" 31#include "power.h"
29 32
30#define SWSUSP_SIG "S1SUSPEND" 33#define HIBERNATE_SIG "S1SUSPEND"
31 34
32/* 35/*
33 * The swap map is a data structure used for keeping track of each page 36 * The swap map is a data structure used for keeping track of each page
@@ -193,7 +196,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
193 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || 196 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
194 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { 197 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
195 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 198 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
196 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 199 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
197 swsusp_header->image = handle->first_sector; 200 swsusp_header->image = handle->first_sector;
198 swsusp_header->flags = flags; 201 swsusp_header->flags = flags;
199 error = hib_bio_write_page(swsusp_resume_block, 202 error = hib_bio_write_page(swsusp_resume_block,
@@ -221,7 +224,7 @@ static int swsusp_swap_check(void)
221 return res; 224 return res;
222 225
223 root_swap = res; 226 root_swap = res;
224 res = blkdev_get(hib_resume_bdev, FMODE_WRITE); 227 res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
225 if (res) 228 if (res)
226 return res; 229 return res;
227 230
@@ -249,7 +252,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
249 if (bio_chain) { 252 if (bio_chain) {
250 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 253 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
251 if (src) { 254 if (src) {
252 memcpy(src, buf, PAGE_SIZE); 255 copy_page(src, buf);
253 } else { 256 } else {
254 WARN_ON_ONCE(1); 257 WARN_ON_ONCE(1);
255 bio_chain = NULL; /* Go synchronous */ 258 bio_chain = NULL; /* Go synchronous */
@@ -323,7 +326,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
323 error = write_page(handle->cur, handle->cur_swap, NULL); 326 error = write_page(handle->cur, handle->cur_swap, NULL);
324 if (error) 327 if (error)
325 goto out; 328 goto out;
326 memset(handle->cur, 0, PAGE_SIZE); 329 clear_page(handle->cur);
327 handle->cur_swap = offset; 330 handle->cur_swap = offset;
328 handle->k = 0; 331 handle->k = 0;
329 } 332 }
@@ -357,6 +360,18 @@ static int swap_writer_finish(struct swap_map_handle *handle,
357 return error; 360 return error;
358} 361}
359 362
363/* We need to remember how much compressed data we need to read. */
364#define LZO_HEADER sizeof(size_t)
365
366/* Number of pages/bytes we'll compress at one time. */
367#define LZO_UNC_PAGES 32
368#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE)
369
370/* Number of pages/bytes we need for compressed data (worst case). */
371#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \
372 LZO_HEADER, PAGE_SIZE)
373#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
374
360/** 375/**
361 * save_image - save the suspend image data 376 * save_image - save the suspend image data
362 */ 377 */
@@ -404,6 +419,137 @@ static int save_image(struct swap_map_handle *handle,
404 return ret; 419 return ret;
405} 420}
406 421
422
423/**
424 * save_image_lzo - Save the suspend image data compressed with LZO.
425 * @handle: Swap map handle to use for saving the image.
426 * @snapshot: Image to read data from.
427 * @nr_to_write: Number of pages to save.
428 */
429static int save_image_lzo(struct swap_map_handle *handle,
430 struct snapshot_handle *snapshot,
431 unsigned int nr_to_write)
432{
433 unsigned int m;
434 int ret = 0;
435 int nr_pages;
436 int err2;
437 struct bio *bio;
438 struct timeval start;
439 struct timeval stop;
440 size_t off, unc_len, cmp_len;
441 unsigned char *unc, *cmp, *wrk, *page;
442
443 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
444 if (!page) {
445 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
446 return -ENOMEM;
447 }
448
449 wrk = vmalloc(LZO1X_1_MEM_COMPRESS);
450 if (!wrk) {
451 printk(KERN_ERR "PM: Failed to allocate LZO workspace\n");
452 free_page((unsigned long)page);
453 return -ENOMEM;
454 }
455
456 unc = vmalloc(LZO_UNC_SIZE);
457 if (!unc) {
458 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
459 vfree(wrk);
460 free_page((unsigned long)page);
461 return -ENOMEM;
462 }
463
464 cmp = vmalloc(LZO_CMP_SIZE);
465 if (!cmp) {
466 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
467 vfree(unc);
468 vfree(wrk);
469 free_page((unsigned long)page);
470 return -ENOMEM;
471 }
472
473 printk(KERN_INFO
474 "PM: Compressing and saving image data (%u pages) ... ",
475 nr_to_write);
476 m = nr_to_write / 100;
477 if (!m)
478 m = 1;
479 nr_pages = 0;
480 bio = NULL;
481 do_gettimeofday(&start);
482 for (;;) {
483 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
484 ret = snapshot_read_next(snapshot);
485 if (ret < 0)
486 goto out_finish;
487
488 if (!ret)
489 break;
490
491 memcpy(unc + off, data_of(*snapshot), PAGE_SIZE);
492
493 if (!(nr_pages % m))
494 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
495 nr_pages++;
496 }
497
498 if (!off)
499 break;
500
501 unc_len = off;
502 ret = lzo1x_1_compress(unc, unc_len,
503 cmp + LZO_HEADER, &cmp_len, wrk);
504 if (ret < 0) {
505 printk(KERN_ERR "PM: LZO compression failed\n");
506 break;
507 }
508
509 if (unlikely(!cmp_len ||
510 cmp_len > lzo1x_worst_compress(unc_len))) {
511 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
512 ret = -1;
513 break;
514 }
515
516 *(size_t *)cmp = cmp_len;
517
518 /*
519 * Given we are writing one page at a time to disk, we copy
520 * that much from the buffer, although the last bit will likely
521 * be smaller than full page. This is OK - we saved the length
522 * of the compressed data, so any garbage at the end will be
523 * discarded when we read it.
524 */
525 for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
526 memcpy(page, cmp + off, PAGE_SIZE);
527
528 ret = swap_write_page(handle, page, &bio);
529 if (ret)
530 goto out_finish;
531 }
532 }
533
534out_finish:
535 err2 = hib_wait_on_bio_chain(&bio);
536 do_gettimeofday(&stop);
537 if (!ret)
538 ret = err2;
539 if (!ret)
540 printk(KERN_CONT "\b\b\b\bdone\n");
541 else
542 printk(KERN_CONT "\n");
543 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
544
545 vfree(cmp);
546 vfree(unc);
547 vfree(wrk);
548 free_page((unsigned long)page);
549
550 return ret;
551}
552
407/** 553/**
408 * enough_swap - Make sure we have enough swap to save the image. 554 * enough_swap - Make sure we have enough swap to save the image.
409 * 555 *
@@ -411,12 +557,16 @@ static int save_image(struct swap_map_handle *handle,
411 * space avaiable from the resume partition. 557 * space avaiable from the resume partition.
412 */ 558 */
413 559
414static int enough_swap(unsigned int nr_pages) 560static int enough_swap(unsigned int nr_pages, unsigned int flags)
415{ 561{
416 unsigned int free_swap = count_swap_pages(root_swap, 1); 562 unsigned int free_swap = count_swap_pages(root_swap, 1);
563 unsigned int required;
417 564
418 pr_debug("PM: Free swap pages: %u\n", free_swap); 565 pr_debug("PM: Free swap pages: %u\n", free_swap);
419 return free_swap > nr_pages + PAGES_FOR_IO; 566
567 required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ?
568 nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1);
569 return free_swap > required;
420} 570}
421 571
422/** 572/**
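The worst-case sizing behind the new LZO path is easier to see with numbers. Assuming 4 KiB pages and an 8-byte size_t, and taking lzo1x_worst_compress(x) as x + x/16 + 64 + 3 (its definition in <linux/lzo.h> at the time of this merge), LZO_UNC_SIZE is 131072 bytes, the worst-case compressed block is 139339 bytes including the header, and LZO_CMP_PAGES therefore comes out to 35. enough_swap() then budgets roughly 35/32 of the image page count, plus one page and the PAGES_FOR_IO estimate, when compression is in use. A small sketch of the same arithmetic (the 1024 stand-in for PAGES_FOR_IO is illustrative):

/* Userspace recomputation of the LZO sizing macros and the swap budget. */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define lzo1x_worst_compress(x)	((x) + ((x) / 16) + 64 + 3)

int main(void)
{
	unsigned long page_size = 4096;
	unsigned long lzo_header = 8;			/* sizeof(size_t) */
	unsigned long lzo_unc_pages = 32;
	unsigned long lzo_unc_size = lzo_unc_pages * page_size;	/* 131072 */
	unsigned long lzo_cmp_pages =
		DIV_ROUND_UP(lzo1x_worst_compress(lzo_unc_size) + lzo_header,
			     page_size);
	unsigned long nr_pages = 100000;		/* image pages to write */
	unsigned long required = 1024			/* stand-in for PAGES_FOR_IO */
		+ (nr_pages * lzo_cmp_pages) / lzo_unc_pages + 1;

	/* Prints: LZO_CMP_PAGES = 35, swap required for 100000 pages: 110400 */
	printf("LZO_CMP_PAGES = %lu, swap required for %lu pages: %lu\n",
	       lzo_cmp_pages, nr_pages, required);
	return 0;
}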
@@ -443,7 +593,7 @@ int swsusp_write(unsigned int flags)
443 printk(KERN_ERR "PM: Cannot get swap writer\n"); 593 printk(KERN_ERR "PM: Cannot get swap writer\n");
444 return error; 594 return error;
445 } 595 }
446 if (!enough_swap(pages)) { 596 if (!enough_swap(pages, flags)) {
447 printk(KERN_ERR "PM: Not enough free swap\n"); 597 printk(KERN_ERR "PM: Not enough free swap\n");
448 error = -ENOSPC; 598 error = -ENOSPC;
449 goto out_finish; 599 goto out_finish;
@@ -458,8 +608,11 @@ int swsusp_write(unsigned int flags)
458 } 608 }
459 header = (struct swsusp_info *)data_of(snapshot); 609 header = (struct swsusp_info *)data_of(snapshot);
460 error = swap_write_page(&handle, header, NULL); 610 error = swap_write_page(&handle, header, NULL);
461 if (!error) 611 if (!error) {
462 error = save_image(&handle, &snapshot, pages - 1); 612 error = (flags & SF_NOCOMPRESS_MODE) ?
613 save_image(&handle, &snapshot, pages - 1) :
614 save_image_lzo(&handle, &snapshot, pages - 1);
615 }
463out_finish: 616out_finish:
464 error = swap_writer_finish(&handle, flags, error); 617 error = swap_writer_finish(&handle, flags, error);
465 return error; 618 return error;
@@ -590,9 +743,152 @@ static int load_image(struct swap_map_handle *handle,
590} 743}
591 744
592/** 745/**
746 * load_image_lzo - Load compressed image data and decompress them with LZO.
747 * @handle: Swap map handle to use for loading data.
748 * @snapshot: Image to copy uncompressed data into.
749 * @nr_to_read: Number of pages to load.
750 */
751static int load_image_lzo(struct swap_map_handle *handle,
752 struct snapshot_handle *snapshot,
753 unsigned int nr_to_read)
754{
755 unsigned int m;
756 int error = 0;
757 struct bio *bio;
758 struct timeval start;
759 struct timeval stop;
760 unsigned nr_pages;
761 size_t i, off, unc_len, cmp_len;
762 unsigned char *unc, *cmp, *page[LZO_CMP_PAGES];
763
764 for (i = 0; i < LZO_CMP_PAGES; i++) {
765 page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
766 if (!page[i]) {
767 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
768
769 while (i)
770 free_page((unsigned long)page[--i]);
771
772 return -ENOMEM;
773 }
774 }
775
776 unc = vmalloc(LZO_UNC_SIZE);
777 if (!unc) {
778 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
779
780 for (i = 0; i < LZO_CMP_PAGES; i++)
781 free_page((unsigned long)page[i]);
782
783 return -ENOMEM;
784 }
785
786 cmp = vmalloc(LZO_CMP_SIZE);
787 if (!cmp) {
788 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
789
790 vfree(unc);
791 for (i = 0; i < LZO_CMP_PAGES; i++)
792 free_page((unsigned long)page[i]);
793
794 return -ENOMEM;
795 }
796
797 printk(KERN_INFO
798 "PM: Loading and decompressing image data (%u pages) ... ",
799 nr_to_read);
800 m = nr_to_read / 100;
801 if (!m)
802 m = 1;
803 nr_pages = 0;
804 bio = NULL;
805 do_gettimeofday(&start);
806
807 error = snapshot_write_next(snapshot);
808 if (error <= 0)
809 goto out_finish;
810
811 for (;;) {
812 error = swap_read_page(handle, page[0], NULL); /* sync */
813 if (error)
814 break;
815
816 cmp_len = *(size_t *)page[0];
817 if (unlikely(!cmp_len ||
818 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) {
819 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
820 error = -1;
821 break;
822 }
823
824 for (off = PAGE_SIZE, i = 1;
825 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
826 error = swap_read_page(handle, page[i], &bio);
827 if (error)
828 goto out_finish;
829 }
830
831 error = hib_wait_on_bio_chain(&bio); /* need all data now */
832 if (error)
833 goto out_finish;
834
835 for (off = 0, i = 0;
836 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
837 memcpy(cmp + off, page[i], PAGE_SIZE);
838 }
839
840 unc_len = LZO_UNC_SIZE;
841 error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len,
842 unc, &unc_len);
843 if (error < 0) {
844 printk(KERN_ERR "PM: LZO decompression failed\n");
845 break;
846 }
847
848 if (unlikely(!unc_len ||
849 unc_len > LZO_UNC_SIZE ||
850 unc_len & (PAGE_SIZE - 1))) {
851 printk(KERN_ERR "PM: Invalid LZO uncompressed length\n");
852 error = -1;
853 break;
854 }
855
856 for (off = 0; off < unc_len; off += PAGE_SIZE) {
857 memcpy(data_of(*snapshot), unc + off, PAGE_SIZE);
858
859 if (!(nr_pages % m))
860 printk("\b\b\b\b%3d%%", nr_pages / m);
861 nr_pages++;
862
863 error = snapshot_write_next(snapshot);
864 if (error <= 0)
865 goto out_finish;
866 }
867 }
868
869out_finish:
870 do_gettimeofday(&stop);
871 if (!error) {
872 printk("\b\b\b\bdone\n");
873 snapshot_write_finalize(snapshot);
874 if (!snapshot_image_loaded(snapshot))
875 error = -ENODATA;
876 } else
877 printk("\n");
878 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
879
880 vfree(cmp);
881 vfree(unc);
882 for (i = 0; i < LZO_CMP_PAGES; i++)
883 free_page((unsigned long)page[i]);
884
885 return error;
886}
887
888/**
593 * swsusp_read - read the hibernation image. 889 * swsusp_read - read the hibernation image.
594 * @flags_p: flags passed by the "frozen" kernel in the image header should 890 * @flags_p: flags passed by the "frozen" kernel in the image header should
595 * be written into this memeory location 891 * be written into this memory location
596 */ 892 */
597 893
598int swsusp_read(unsigned int *flags_p) 894int swsusp_read(unsigned int *flags_p)
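load_image_lzo() above recovers each compressed block by reading one page synchronously, taking its leading size_t as the block length, and then reading the rest of the block before decompressing; any padding past the stored length is ignored. A small sketch of that record layout as both sides understand it (illustrative only; the helper is invented here):

/* Userspace sketch of one compressed record as framed by save_image_lzo(). */
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096u

/* Each record: a size_t length header (LZO_HEADER), then that many
 * compressed bytes, padded with don't-care bytes to a page boundary. */
static size_t record_pages(const unsigned char *first_page)
{
	size_t cmp_len;

	memcpy(&cmp_len, first_page, sizeof(cmp_len));
	return (sizeof(cmp_len) + cmp_len + PAGE_SIZE - 1) / PAGE_SIZE;
}

int main(void)
{
	unsigned char page[PAGE_SIZE] = { 0 };
	size_t cmp_len = 10000;		/* pretend 10000 compressed bytes follow */

	memcpy(page, &cmp_len, sizeof(cmp_len));
	/* A 10000-byte block plus its 8-byte header spans 3 pages on disk. */
	printf("record spans %zu pages\n", record_pages(page));
	return 0;
}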
@@ -612,8 +908,11 @@ int swsusp_read(unsigned int *flags_p)
612 goto end; 908 goto end;
613 if (!error) 909 if (!error)
614 error = swap_read_page(&handle, header, NULL); 910 error = swap_read_page(&handle, header, NULL);
615 if (!error) 911 if (!error) {
616 error = load_image(&handle, &snapshot, header->pages - 1); 912 error = (*flags_p & SF_NOCOMPRESS_MODE) ?
913 load_image(&handle, &snapshot, header->pages - 1) :
914 load_image_lzo(&handle, &snapshot, header->pages - 1);
915 }
617 swap_reader_finish(&handle); 916 swap_reader_finish(&handle);
618end: 917end:
619 if (!error) 918 if (!error)
@@ -631,16 +930,17 @@ int swsusp_check(void)
631{ 930{
632 int error; 931 int error;
633 932
634 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 933 hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
934 FMODE_READ, NULL);
635 if (!IS_ERR(hib_resume_bdev)) { 935 if (!IS_ERR(hib_resume_bdev)) {
636 set_blocksize(hib_resume_bdev, PAGE_SIZE); 936 set_blocksize(hib_resume_bdev, PAGE_SIZE);
637 memset(swsusp_header, 0, PAGE_SIZE); 937 clear_page(swsusp_header);
638 error = hib_bio_read_page(swsusp_resume_block, 938 error = hib_bio_read_page(swsusp_resume_block,
639 swsusp_header, NULL); 939 swsusp_header, NULL);
640 if (error) 940 if (error)
641 goto put; 941 goto put;
642 942
643 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 943 if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
644 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 944 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
645 /* Reset swap signature now */ 945 /* Reset swap signature now */
646 error = hib_bio_write_page(swsusp_resume_block, 946 error = hib_bio_write_page(swsusp_resume_block,
@@ -653,13 +953,13 @@ put:
653 if (error) 953 if (error)
654 blkdev_put(hib_resume_bdev, FMODE_READ); 954 blkdev_put(hib_resume_bdev, FMODE_READ);
655 else 955 else
656 pr_debug("PM: Signature found, resuming\n"); 956 pr_debug("PM: Image signature found, resuming\n");
657 } else { 957 } else {
658 error = PTR_ERR(hib_resume_bdev); 958 error = PTR_ERR(hib_resume_bdev);
659 } 959 }
660 960
661 if (error) 961 if (error)
662 pr_debug("PM: Error %d checking image file\n", error); 962 pr_debug("PM: Image not found (code %d)\n", error);
663 963
664 return error; 964 return error;
665} 965}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index e819e17877ca..42ddbc6f0de6 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -113,8 +113,10 @@ static int snapshot_open(struct inode *inode, struct file *filp)
113 if (error) 113 if (error)
114 pm_notifier_call_chain(PM_POST_RESTORE); 114 pm_notifier_call_chain(PM_POST_RESTORE);
115 } 115 }
116 if (error) 116 if (error) {
117 free_basic_memory_bitmaps();
117 atomic_inc(&snapshot_device_available); 118 atomic_inc(&snapshot_device_available);
119 }
118 data->frozen = 0; 120 data->frozen = 0;
119 data->ready = 0; 121 data->ready = 0;
120 data->platform_support = 0; 122 data->platform_support = 0;
@@ -135,9 +137,11 @@ static int snapshot_release(struct inode *inode, struct file *filp)
135 free_basic_memory_bitmaps(); 137 free_basic_memory_bitmaps();
136 data = filp->private_data; 138 data = filp->private_data;
137 free_all_swap_pages(data->swap); 139 free_all_swap_pages(data->swap);
138 if (data->frozen) 140 if (data->frozen) {
141 pm_restore_gfp_mask();
139 thaw_processes(); 142 thaw_processes();
140 pm_notifier_call_chain(data->mode == O_WRONLY ? 143 }
144 pm_notifier_call_chain(data->mode == O_RDONLY ?
141 PM_POST_HIBERNATION : PM_POST_RESTORE); 145 PM_POST_HIBERNATION : PM_POST_RESTORE);
142 atomic_inc(&snapshot_device_available); 146 atomic_inc(&snapshot_device_available);
143 147
@@ -263,6 +267,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
263 case SNAPSHOT_UNFREEZE: 267 case SNAPSHOT_UNFREEZE:
264 if (!data->frozen || data->ready) 268 if (!data->frozen || data->ready)
265 break; 269 break;
270 pm_restore_gfp_mask();
266 thaw_processes(); 271 thaw_processes();
267 usermodehelper_enable(); 272 usermodehelper_enable();
268 data->frozen = 0; 273 data->frozen = 0;
@@ -275,6 +280,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 error = -EPERM; 280 error = -EPERM;
276 break; 281 break;
277 } 282 }
283 pm_restore_gfp_mask();
278 error = hibernation_snapshot(data->platform_support); 284 error = hibernation_snapshot(data->platform_support);
279 if (!error) 285 if (!error)
280 error = put_user(in_suspend, (int __user *)arg); 286 error = put_user(in_suspend, (int __user *)arg);
@@ -377,6 +383,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
377 * PM_HIBERNATION_PREPARE 383 * PM_HIBERNATION_PREPARE
378 */ 384 */
379 error = suspend_devices_and_enter(PM_SUSPEND_MEM); 385 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
386 data->ready = 0;
380 break; 387 break;
381 388
382 case SNAPSHOT_PLATFORM_SUPPORT: 389 case SNAPSHOT_PLATFORM_SUPPORT:
diff --git a/kernel/printk.c b/kernel/printk.c
index 9dc8ea140426..b799a2ee96e5 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -31,6 +31,7 @@
31#include <linux/smp.h> 31#include <linux/smp.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/memblock.h>
34#include <linux/syscalls.h> 35#include <linux/syscalls.h>
35#include <linux/kexec.h> 36#include <linux/kexec.h>
36#include <linux/kdb.h> 37#include <linux/kdb.h>
@@ -39,16 +40,11 @@
39#include <linux/syslog.h> 40#include <linux/syslog.h>
40#include <linux/cpu.h> 41#include <linux/cpu.h>
41#include <linux/notifier.h> 42#include <linux/notifier.h>
43#include <linux/rculist.h>
42 44
43#include <asm/uaccess.h> 45#include <asm/uaccess.h>
44 46
45/* 47/*
46 * for_each_console() allows you to iterate on each console
47 */
48#define for_each_console(con) \
49 for (con = console_drivers; con != NULL; con = con->next)
50
51/*
52 * Architectures can override it: 48 * Architectures can override it:
53 */ 49 */
54void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 50void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
@@ -58,7 +54,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
58#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 54#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
59 55
60/* printk's without a loglevel use this.. */ 56/* printk's without a loglevel use this.. */
61#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ 57#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
62 58
63/* We show everything that is MORE important than this.. */ 59/* We show everything that is MORE important than this.. */
64#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ 60#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
@@ -92,7 +88,7 @@ EXPORT_SYMBOL(oops_in_progress);
92 * provides serialisation for access to the entire console 88 * provides serialisation for access to the entire console
93 * driver system. 89 * driver system.
94 */ 90 */
95static DECLARE_MUTEX(console_sem); 91static DEFINE_SEMAPHORE(console_sem);
96struct console *console_drivers; 92struct console *console_drivers;
97EXPORT_SYMBOL_GPL(console_drivers); 93EXPORT_SYMBOL_GPL(console_drivers);
98 94
@@ -109,7 +105,7 @@ static int console_locked, console_suspended;
109/* 105/*
110 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars 106 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
111 * It is also used in interesting ways to provide interlocking in 107 * It is also used in interesting ways to provide interlocking in
112 * release_console_sem(). 108 * console_unlock();.
113 */ 109 */
114static DEFINE_SPINLOCK(logbuf_lock); 110static DEFINE_SPINLOCK(logbuf_lock);
115 111
@@ -125,6 +121,11 @@ static unsigned con_start; /* Index into log_buf: next char to be sent to consol
125static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ 121static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
126 122
127/* 123/*
124 * If exclusive_console is non-NULL then only this console is to be printed to.
125 */
126static struct console *exclusive_console;
127
128/*
128 * Array of consoles built from command line options (console=) 129 * Array of consoles built from command line options (console=)
129 */ 130 */
130struct console_cmdline 131struct console_cmdline
@@ -174,50 +175,78 @@ void log_buf_kexec_setup(void)
174} 175}
175#endif 176#endif
176 177
178/* requested log_buf_len from kernel cmdline */
179static unsigned long __initdata new_log_buf_len;
180
181/* save requested log_buf_len since it's too early to process it */
177static int __init log_buf_len_setup(char *str) 182static int __init log_buf_len_setup(char *str)
178{ 183{
179 unsigned size = memparse(str, &str); 184 unsigned size = memparse(str, &str);
180 unsigned long flags;
181 185
182 if (size) 186 if (size)
183 size = roundup_pow_of_two(size); 187 size = roundup_pow_of_two(size);
184 if (size > log_buf_len) { 188 if (size > log_buf_len)
185 unsigned start, dest_idx, offset; 189 new_log_buf_len = size;
186 char *new_log_buf;
187 190
188 new_log_buf = alloc_bootmem(size); 191 return 0;
189 if (!new_log_buf) { 192}
190 printk(KERN_WARNING "log_buf_len: allocation failed\n"); 193early_param("log_buf_len", log_buf_len_setup);
191 goto out;
192 }
193 194
194 spin_lock_irqsave(&logbuf_lock, flags); 195void __init setup_log_buf(int early)
195 log_buf_len = size; 196{
196 log_buf = new_log_buf; 197 unsigned long flags;
197 198 unsigned start, dest_idx, offset;
198 offset = start = min(con_start, log_start); 199 char *new_log_buf;
199 dest_idx = 0; 200 int free;
200 while (start != log_end) { 201
201 log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; 202 if (!new_log_buf_len)
202 start++; 203 return;
203 dest_idx++; 204
204 } 205 if (early) {
205 log_start -= offset; 206 unsigned long mem;
206 con_start -= offset;
207 log_end -= offset;
208 spin_unlock_irqrestore(&logbuf_lock, flags);
209 207
210 printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); 208 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
209 if (mem == MEMBLOCK_ERROR)
210 return;
211 new_log_buf = __va(mem);
212 } else {
213 new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);
211 } 214 }
212out:
213 return 1;
214}
215 215
216__setup("log_buf_len=", log_buf_len_setup); 216 if (unlikely(!new_log_buf)) {
217 pr_err("log_buf_len: %ld bytes not available\n",
218 new_log_buf_len);
219 return;
220 }
221
222 spin_lock_irqsave(&logbuf_lock, flags);
223 log_buf_len = new_log_buf_len;
224 log_buf = new_log_buf;
225 new_log_buf_len = 0;
226 free = __LOG_BUF_LEN - log_end;
227
228 offset = start = min(con_start, log_start);
229 dest_idx = 0;
230 while (start != log_end) {
231 unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
232
233 log_buf[dest_idx] = __log_buf[log_idx_mask];
234 start++;
235 dest_idx++;
236 }
237 log_start -= offset;
238 con_start -= offset;
239 log_end -= offset;
240 spin_unlock_irqrestore(&logbuf_lock, flags);
241
242 pr_info("log_buf_len: %d\n", log_buf_len);
243 pr_info("early log buf free: %d(%d%%)\n",
244 free, (free * 100) / __LOG_BUF_LEN);
245}
217 246
218#ifdef CONFIG_BOOT_PRINTK_DELAY 247#ifdef CONFIG_BOOT_PRINTK_DELAY
219 248
220static unsigned int boot_delay; /* msecs delay after each printk during bootup */ 249static int boot_delay; /* msecs delay after each printk during bootup */
221static unsigned long long loops_per_msec; /* based on boot_delay */ 250static unsigned long long loops_per_msec; /* based on boot_delay */
222 251
223static int __init boot_delay_setup(char *str) 252static int __init boot_delay_setup(char *str)
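The reworked log_buf_len handling above is split in two: the early_param handler only records the rounded-up request in new_log_buf_len, and setup_log_buf() performs the allocation later, via memblock when called early or bootmem otherwise, then copies the existing buffer contents across under logbuf_lock. In practice this is driven by booting with something like log_buf_len=1M. A minimal sketch of the same save-now/allocate-later pattern; this is a generic illustration, not the kernel's code path:

/* Userspace sketch of deferring a buffer resize until allocation is possible. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char static_buf[64];		/* stands in for __log_buf */
static char *buf = static_buf;
static size_t buf_len = sizeof(static_buf);
static size_t requested_len;		/* stands in for new_log_buf_len */

/* Parse step: only remember the request (cf. log_buf_len_setup()). */
static void parse_bootarg(size_t len)
{
	if (len > buf_len)
		requested_len = len;
}

/* Later step: allocate, copy what was logged so far, switch over
 * (cf. setup_log_buf()). */
static void setup_buf(void)
{
	char *new_buf;

	if (!requested_len)
		return;
	new_buf = malloc(requested_len);
	if (!new_buf)
		return;
	memcpy(new_buf, buf, buf_len);
	buf = new_buf;
	buf_len = requested_len;
	requested_len = 0;
}

int main(void)
{
	parse_bootarg(1 << 20);
	setup_buf();
	printf("log buffer is now %zu bytes\n", buf_len);
	return 0;
}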
@@ -268,14 +297,55 @@ static inline void boot_delay_msec(void)
268} 297}
269#endif 298#endif
270 299
300#ifdef CONFIG_SECURITY_DMESG_RESTRICT
301int dmesg_restrict = 1;
302#else
303int dmesg_restrict;
304#endif
305
306static int syslog_action_restricted(int type)
307{
308 if (dmesg_restrict)
309 return 1;
310 /* Unless restricted, we allow "read all" and "get buffer size" for everybody */
311 return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
312}
313
314static int check_syslog_permissions(int type, bool from_file)
315{
316 /*
317 * If this is from /proc/kmsg and we've already opened it, then we've
318 * already done the capabilities checks at open time.
319 */
320 if (from_file && type != SYSLOG_ACTION_OPEN)
321 return 0;
322
323 if (syslog_action_restricted(type)) {
324 if (capable(CAP_SYSLOG))
325 return 0;
326 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
327 if (capable(CAP_SYS_ADMIN)) {
328 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
329 "but no CAP_SYSLOG (deprecated).\n");
330 return 0;
331 }
332 return -EPERM;
333 }
334 return 0;
335}
336
271int do_syslog(int type, char __user *buf, int len, bool from_file) 337int do_syslog(int type, char __user *buf, int len, bool from_file)
272{ 338{
273 unsigned i, j, limit, count; 339 unsigned i, j, limit, count;
274 int do_clear = 0; 340 int do_clear = 0;
275 char c; 341 char c;
276 int error = 0; 342 int error;
343
344 error = check_syslog_permissions(type, from_file);
345 if (error)
346 goto out;
277 347
278 error = security_syslog(type, from_file); 348 error = security_syslog(type);
279 if (error) 349 if (error)
280 return error; 350 return error;
281 351
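check_syslog_permissions() above gates the privileged syslog actions on CAP_SYSLOG, still accepting CAP_SYS_ADMIN with a deprecation warning, and once dmesg_restrict is set it also covers the otherwise unprivileged 'read all' and 'buffer size' actions. From userspace the refusal shows up as EPERM from syslog(2); a minimal probe using glibc's klogctl(), with action 3 (SYSLOG_ACTION_READ_ALL) taken from the syslog(2) man page:

/* Userspace probe: does the kernel let us read the log buffer? */
#include <errno.h>
#include <stdio.h>
#include <sys/klog.h>

int main(void)
{
	char buf[4096];
	int n = klogctl(3 /* SYSLOG_ACTION_READ_ALL */, buf, sizeof(buf));

	if (n < 0 && errno == EPERM)
		printf("dmesg restricted: need CAP_SYSLOG (or legacy CAP_SYS_ADMIN)\n");
	else if (n >= 0)
		printf("read %d bytes of kernel log\n", n);
	else
		perror("klogctl");
	return 0;
}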
@@ -447,6 +517,8 @@ static void __call_console_drivers(unsigned start, unsigned end)
447 struct console *con; 517 struct console *con;
448 518
449 for_each_console(con) { 519 for_each_console(con) {
520 if (exclusive_console && con != exclusive_console)
521 continue;
450 if ((con->flags & CON_ENABLED) && con->write && 522 if ((con->flags & CON_ENABLED) && con->write &&
451 (cpu_online(smp_processor_id()) || 523 (cpu_online(smp_processor_id()) ||
452 (con->flags & CON_ANYTIME))) 524 (con->flags & CON_ANYTIME)))
@@ -486,9 +558,74 @@ static void _call_console_drivers(unsigned start,
486} 558}
487 559
488/* 560/*
561 * Parse the syslog header <[0-9]*>. The decimal value is a 32-bit quantity:
562 * the lower 3 bits are the log level, the remaining bits are the log facility.
563 * In case userspace passes ordinary syslog messages to /dev/kmsg or
564 * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
565 * to extract the correct log level for in-kernel processing, and must not
566 * mangle the original value.
567 *
568 * If a prefix is found, the length of the prefix is returned. If 'level' is
569 * passed, it will be filled in with the log level without a possible facility
570 * value. If 'special' is passed, the special printk prefix chars are accepted
571 * and returned. If no valid header is found, 0 is returned and the passed
572 * variables are not touched.
573 */
574static size_t log_prefix(const char *p, unsigned int *level, char *special)
575{
576 unsigned int lev = 0;
577 char sp = '\0';
578 size_t len;
579
580 if (p[0] != '<' || !p[1])
581 return 0;
582 if (p[2] == '>') {
583 /* usual single digit level number or special char */
584 switch (p[1]) {
585 case '0' ... '7':
586 lev = p[1] - '0';
587 break;
588 case 'c': /* KERN_CONT */
589 case 'd': /* KERN_DEFAULT */
590 sp = p[1];
591 break;
592 default:
593 return 0;
594 }
595 len = 3;
596 } else {
597 /* multi digit including the level and facility number */
598 char *endp = NULL;
599
600 if (p[1] < '0' || p[1] > '9')
601 return 0;
602
603 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
604 if (endp == NULL || endp[0] != '>')
605 return 0;
606 len = (endp + 1) - p;
607 }
608
609 /* do not accept special char if not asked for */
610 if (sp && !special)
611 return 0;
612
613 if (special) {
614 *special = sp;
615 /* return special char, do not touch level */
616 if (sp)
617 return len;
618 }
619
620 if (level)
621 *level = lev;
622 return len;
623}
624
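For comparison, the same parsing can be sketched in plain userspace C, with strtoul() standing in for simple_strtoul() and the KERN_CONT/KERN_DEFAULT special characters left out for brevity:

    /* Sketch only: parse a "<N>" syslog prefix as described for log_prefix() above. */
    #include <stdio.h>
    #include <stdlib.h>

    /* Returns the prefix length, or 0 if no valid "<...>" header was found. */
    static size_t parse_prefix(const char *p, unsigned int *level)
    {
        char *endp;
        unsigned long val;

        if (p[0] != '<' || p[1] == '\0')
            return 0;
        if (p[1] < '0' || p[1] > '9')
            return 0;

        val = strtoul(p + 1, &endp, 10);
        if (*endp != '>')
            return 0;
        if (level)
            *level = val & 7;        /* low 3 bits: level; the rest: facility */
        return (size_t)(endp + 1 - p);
    }

    int main(void)
    {
        const char *msgs[] = { "<6>kernel line", "<14>daemon line", "no prefix" };

        for (size_t i = 0; i < sizeof(msgs) / sizeof(msgs[0]); i++) {
            unsigned int level = 8;  /* 8 = "not set" sentinel */
            size_t len = parse_prefix(msgs[i], &level);
            printf("len=%zu level=%u body=\"%s\"\n", len, level, msgs[i] + len);
        }
        return 0;
    }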
625/*
489 * Call the console drivers, asking them to write out 626 * Call the console drivers, asking them to write out
490 * log_buf[start] to log_buf[end - 1]. 627 * log_buf[start] to log_buf[end - 1].
491 * The console_sem must be held. 628 * The console_lock must be held.
492 */ 629 */
493static void call_console_drivers(unsigned start, unsigned end) 630static void call_console_drivers(unsigned start, unsigned end)
494{ 631{
@@ -500,13 +637,9 @@ static void call_console_drivers(unsigned start, unsigned end)
500 cur_index = start; 637 cur_index = start;
501 start_print = start; 638 start_print = start;
502 while (cur_index != end) { 639 while (cur_index != end) {
503 if (msg_level < 0 && ((end - cur_index) > 2) && 640 if (msg_level < 0 && ((end - cur_index) > 2)) {
504 LOG_BUF(cur_index + 0) == '<' && 641 /* strip log prefix */
505 LOG_BUF(cur_index + 1) >= '0' && 642 cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
506 LOG_BUF(cur_index + 1) <= '7' &&
507 LOG_BUF(cur_index + 2) == '>') {
508 msg_level = LOG_BUF(cur_index + 1) - '0';
509 cur_index += 3;
510 start_print = cur_index; 643 start_print = cur_index;
511 } 644 }
512 while (cur_index != end) { 645 while (cur_index != end) {
@@ -563,7 +696,7 @@ static void zap_locks(void)
563 /* If a crash is occurring, make sure we can't deadlock */ 696 /* If a crash is occurring, make sure we can't deadlock */
564 spin_lock_init(&logbuf_lock); 697 spin_lock_init(&logbuf_lock);
565 /* And make sure that we print immediately */ 698 /* And make sure that we print immediately */
566 init_MUTEX(&console_sem); 699 sema_init(&console_sem, 1);
567} 700}
568 701
569#if defined(CONFIG_PRINTK_TIME) 702#if defined(CONFIG_PRINTK_TIME)
@@ -591,11 +724,11 @@ static int have_callable_console(void)
591 * 724 *
592 * This is printk(). It can be called from any context. We want it to work. 725 * This is printk(). It can be called from any context. We want it to work.
593 * 726 *
594 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 727 * We try to grab the console_lock. If we succeed, it's easy - we log the output and
595 * call the console drivers. If we fail to get the semaphore we place the output 728 * call the console drivers. If we fail to get the semaphore we place the output
596 * into the log buffer and return. The current holder of the console_sem will 729 * into the log buffer and return. The current holder of the console_sem will
597 * notice the new output in release_console_sem() and will send it to the 730 * notice the new output in console_unlock() and will send it to the
598 * consoles before releasing the semaphore. 731 * consoles before releasing the lock.
599 * 732 *
600 * One effect of this deferred printing is that code which calls printk() and 733 * One effect of this deferred printing is that code which calls printk() and
601 * then changes console_loglevel may break. This is because console_loglevel 734 * then changes console_loglevel may break. This is because console_loglevel
@@ -646,18 +779,19 @@ static inline int can_use_console(unsigned int cpu)
646/* 779/*
647 * Try to get console ownership to actually show the kernel 780 * Try to get console ownership to actually show the kernel
648 * messages from a 'printk'. Return true (and with the 781 * messages from a 'printk'. Return true (and with the
649 * console_semaphore held, and 'console_locked' set) if it 782 * console_lock held, and 'console_locked' set) if it
650 * is successful, false otherwise. 783 * is successful, false otherwise.
651 * 784 *
652 * This gets called with the 'logbuf_lock' spinlock held and 785 * This gets called with the 'logbuf_lock' spinlock held and
653 * interrupts disabled. It should return with 'lockbuf_lock' 786 * interrupts disabled. It should return with 'lockbuf_lock'
654 * released but interrupts still disabled. 787 * released but interrupts still disabled.
655 */ 788 */
656static int acquire_console_semaphore_for_printk(unsigned int cpu) 789static int console_trylock_for_printk(unsigned int cpu)
790 __releases(&logbuf_lock)
657{ 791{
658 int retval = 0; 792 int retval = 0;
659 793
660 if (!try_acquire_console_sem()) { 794 if (console_trylock()) {
661 retval = 1; 795 retval = 1;
662 796
663 /* 797 /*
@@ -703,6 +837,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
703 unsigned long flags; 837 unsigned long flags;
704 int this_cpu; 838 int this_cpu;
705 char *p; 839 char *p;
840 size_t plen;
841 char special;
706 842
707 boot_delay_msec(); 843 boot_delay_msec();
708 printk_delay(); 844 printk_delay();
@@ -746,45 +882,52 @@ asmlinkage int vprintk(const char *fmt, va_list args)
746 if (trace_override && !trace_recurse) 882 if (trace_override && !trace_recurse)
747 TRACE("%s", printk_buf); 883 TRACE("%s", printk_buf);
748 884
749
750 p = printk_buf; 885 p = printk_buf;
751 886
752 /* Do we have a loglevel in the string? */ 887 /* Read log level and handle special printk prefix */
753 if (p[0] == '<') { 888 plen = log_prefix(p, &current_log_level, &special);
754 unsigned char c = p[1]; 889 if (plen) {
755 if (c && p[2] == '>') { 890 p += plen;
756 switch (c) { 891
757 case '0' ... '7': /* loglevel */ 892 switch (special) {
758 current_log_level = c - '0'; 893 case 'c': /* Strip <c> KERN_CONT, continue line */
759 /* Fallthrough - make sure we're on a new line */ 894 plen = 0;
760 case 'd': /* KERN_DEFAULT */ 895 break;
761 if (!new_text_line) { 896 case 'd': /* Strip <d> KERN_DEFAULT, start new line */
762 emit_log_char('\n'); 897 plen = 0;
763 new_text_line = 1; 898 default:
764 } 899 if (!new_text_line) {
765 /* Fallthrough - skip the loglevel */ 900 emit_log_char('\n');
766 case 'c': /* KERN_CONT */ 901 new_text_line = 1;
767 p += 3;
768 break;
769 } 902 }
770 } 903 }
771 } 904 }
772 905
773 /* 906 /*
774 * Copy the output into log_buf. If the caller didn't provide 907 * Copy the output into log_buf. If the caller didn't provide
775 * appropriate log level tags, we insert them here 908 * the appropriate log prefix, we insert them here
776 */ 909 */
777 for ( ; *p; p++) { 910 for (; *p; p++) {
778 if (new_text_line) { 911 if (new_text_line) {
779 /* Always output the token */
780 emit_log_char('<');
781 emit_log_char(current_log_level + '0');
782 emit_log_char('>');
783 printed_len += 3;
784 new_text_line = 0; 912 new_text_line = 0;
785 913
914 if (plen) {
915 /* Copy original log prefix */
916 int i;
917
918 for (i = 0; i < plen; i++)
919 emit_log_char(printk_buf[i]);
920 printed_len += plen;
921 } else {
922 /* Add log prefix */
923 emit_log_char('<');
924 emit_log_char(current_log_level + '0');
925 emit_log_char('>');
926 printed_len += 3;
927 }
928
786 if (printk_time) { 929 if (printk_time) {
787 /* Follow the token with the time */ 930 /* Add the current time stamp */
788 char tbuf[50], *tp; 931 char tbuf[50], *tp;
789 unsigned tlen; 932 unsigned tlen;
790 unsigned long long t; 933 unsigned long long t;
@@ -816,12 +959,12 @@ asmlinkage int vprintk(const char *fmt, va_list args)
816 * actual magic (print out buffers, wake up klogd, 959 * actual magic (print out buffers, wake up klogd,
817 * etc). 960 * etc).
818 * 961 *
819 * The acquire_console_semaphore_for_printk() function 962 * The console_trylock_for_printk() function
820 * will release 'logbuf_lock' regardless of whether it 963 * will release 'logbuf_lock' regardless of whether it
821 * actually gets the semaphore or not. 964 * actually gets the semaphore or not.
822 */ 965 */
823 if (acquire_console_semaphore_for_printk(this_cpu)) 966 if (console_trylock_for_printk(this_cpu))
824 release_console_sem(); 967 console_unlock();
825 968
826 lockdep_on(); 969 lockdep_on();
827out_restore_irqs: 970out_restore_irqs:
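The hunk above changes how each stored line begins: a caller-supplied prefix is now copied through verbatim, otherwise a default "<level>" is synthesized, and the optional timestamp follows either way. A standalone sketch of that assembly step, with the formatting details invented for illustration (the kernel emits characters one at a time into log_buf rather than using snprintf):

    /* Sketch only: prefix-then-timestamp-then-body assembly of a log record. */
    #include <stdio.h>
    #include <string.h>

    static void emit_record(char *out, size_t outsz, const char *msg,
                            int default_level, double now, int add_time)
    {
        const char *body = msg;
        char prefix[16];

        if (msg[0] == '<' && msg[1] >= '0' && msg[1] <= '9' && msg[2] == '>') {
            /* Caller supplied a prefix: copy it through untouched. */
            memcpy(prefix, msg, 3);
            prefix[3] = '\0';
            body = msg + 3;
        } else {
            /* No prefix: synthesize one from the default log level. */
            snprintf(prefix, sizeof(prefix), "<%d>", default_level);
        }

        if (add_time)
            snprintf(out, outsz, "%s[%12.6f] %s", prefix, now, body);
        else
            snprintf(out, outsz, "%s%s", prefix, body);
    }

    int main(void)
    {
        char line[256];

        emit_record(line, sizeof(line), "<3>disk error", 4, 12.345678, 1);
        puts(line);
        emit_record(line, sizeof(line), "plain message", 4, 12.345700, 1);
        puts(line);
        return 0;
    }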
@@ -982,7 +1125,7 @@ void suspend_console(void)
982 if (!console_suspend_enabled) 1125 if (!console_suspend_enabled)
983 return; 1126 return;
984 printk("Suspending console(s) (use no_console_suspend to debug)\n"); 1127 printk("Suspending console(s) (use no_console_suspend to debug)\n");
985 acquire_console_sem(); 1128 console_lock();
986 console_suspended = 1; 1129 console_suspended = 1;
987 up(&console_sem); 1130 up(&console_sem);
988} 1131}
@@ -993,7 +1136,7 @@ void resume_console(void)
993 return; 1136 return;
994 down(&console_sem); 1137 down(&console_sem);
995 console_suspended = 0; 1138 console_suspended = 0;
996 release_console_sem(); 1139 console_unlock();
997} 1140}
998 1141
999/** 1142/**
@@ -1016,21 +1159,21 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
1016 case CPU_DYING: 1159 case CPU_DYING:
1017 case CPU_DOWN_FAILED: 1160 case CPU_DOWN_FAILED:
1018 case CPU_UP_CANCELED: 1161 case CPU_UP_CANCELED:
1019 acquire_console_sem(); 1162 console_lock();
1020 release_console_sem(); 1163 console_unlock();
1021 } 1164 }
1022 return NOTIFY_OK; 1165 return NOTIFY_OK;
1023} 1166}
1024 1167
1025/** 1168/**
1026 * acquire_console_sem - lock the console system for exclusive use. 1169 * console_lock - lock the console system for exclusive use.
1027 * 1170 *
1028 * Acquires a semaphore which guarantees that the caller has 1171 * Acquires a lock which guarantees that the caller has
1029 * exclusive access to the console system and the console_drivers list. 1172 * exclusive access to the console system and the console_drivers list.
1030 * 1173 *
1031 * Can sleep, returns nothing. 1174 * Can sleep, returns nothing.
1032 */ 1175 */
1033void acquire_console_sem(void) 1176void console_lock(void)
1034{ 1177{
1035 BUG_ON(in_interrupt()); 1178 BUG_ON(in_interrupt());
1036 down(&console_sem); 1179 down(&console_sem);
@@ -1039,21 +1182,29 @@ void acquire_console_sem(void)
1039 console_locked = 1; 1182 console_locked = 1;
1040 console_may_schedule = 1; 1183 console_may_schedule = 1;
1041} 1184}
1042EXPORT_SYMBOL(acquire_console_sem); 1185EXPORT_SYMBOL(console_lock);
1043 1186
1044int try_acquire_console_sem(void) 1187/**
1188 * console_trylock - try to lock the console system for exclusive use.
1189 *
1190 * Tries to acquire a lock which guarantees that the caller has
1191 * exclusive access to the console system and the console_drivers list.
1192 *
1193 * returns 1 on success, and 0 on failure to acquire the lock.
1194 */
1195int console_trylock(void)
1045{ 1196{
1046 if (down_trylock(&console_sem)) 1197 if (down_trylock(&console_sem))
1047 return -1; 1198 return 0;
1048 if (console_suspended) { 1199 if (console_suspended) {
1049 up(&console_sem); 1200 up(&console_sem);
1050 return -1; 1201 return 0;
1051 } 1202 }
1052 console_locked = 1; 1203 console_locked = 1;
1053 console_may_schedule = 0; 1204 console_may_schedule = 0;
1054 return 0; 1205 return 1;
1055} 1206}
1056EXPORT_SYMBOL(try_acquire_console_sem); 1207EXPORT_SYMBOL(console_trylock);
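Note that the return convention flips with the rename: console_trylock() reports 1 on success and 0 on failure, where try_acquire_console_sem() reported 0 and -1. A standalone sketch of the new convention wrapped around pthread_mutex_trylock(), which itself keeps the opposite "0 means success" style:

    /* Sketch only: a trylock wrapper using the 1-on-success convention. */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* Returns 1 on success, 0 on failure to acquire the lock. */
    static int my_trylock(void)
    {
        return pthread_mutex_trylock(&lock) == 0;
    }

    int main(void)
    {
        if (my_trylock()) {
            printf("first trylock: got the lock\n");
            printf("second trylock while held: %s\n",
                   my_trylock() ? "got it (unexpected)" : "busy, as expected");
            pthread_mutex_unlock(&lock);
        }
        return 0;
    }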
1057 1208
1058int is_console_locked(void) 1209int is_console_locked(void)
1059{ 1210{
@@ -1064,38 +1215,40 @@ static DEFINE_PER_CPU(int, printk_pending);
1064 1215
1065void printk_tick(void) 1216void printk_tick(void)
1066{ 1217{
1067 if (__get_cpu_var(printk_pending)) { 1218 if (__this_cpu_read(printk_pending)) {
1068 __get_cpu_var(printk_pending) = 0; 1219 __this_cpu_write(printk_pending, 0);
1069 wake_up_interruptible(&log_wait); 1220 wake_up_interruptible(&log_wait);
1070 } 1221 }
1071} 1222}
1072 1223
1073int printk_needs_cpu(int cpu) 1224int printk_needs_cpu(int cpu)
1074{ 1225{
1075 return per_cpu(printk_pending, cpu); 1226 if (cpu_is_offline(cpu))
1227 printk_tick();
1228 return __this_cpu_read(printk_pending);
1076} 1229}
1077 1230
1078void wake_up_klogd(void) 1231void wake_up_klogd(void)
1079{ 1232{
1080 if (!trace_override && waitqueue_active(&log_wait)) 1233 if (!trace_override && waitqueue_active(&log_wait))
1081 __raw_get_cpu_var(printk_pending) = 1; 1234 this_cpu_write(printk_pending, 1);
1082} 1235}
1083 1236
1084/** 1237/**
1085 * release_console_sem - unlock the console system 1238 * console_unlock - unlock the console system
1086 * 1239 *
1087 * Releases the semaphore which the caller holds on the console system 1240 * Releases the console_lock which the caller holds on the console system
1088 * and the console driver list. 1241 * and the console driver list.
1089 * 1242 *
1090 * While the semaphore was held, console output may have been buffered 1243 * While the console_lock was held, console output may have been buffered
1091 * by printk(). If this is the case, release_console_sem() emits 1244 * by printk(). If this is the case, console_unlock() emits
1092 * the output prior to releasing the semaphore. 1245 * the output prior to releasing the lock.
1093 * 1246 *
1094 * If there is output waiting for klogd, we wake it up. 1247 * If there is output waiting for klogd, we wake it up.
1095 * 1248 *
1096 * release_console_sem() may be called from any context. 1249 * console_unlock() may be called from any context.
1097 */ 1250 */
1098void release_console_sem(void) 1251void console_unlock(void)
1099{ 1252{
1100 unsigned long flags; 1253 unsigned long flags;
1101 unsigned _con_start, _log_end; 1254 unsigned _con_start, _log_end;
@@ -1123,12 +1276,17 @@ void release_console_sem(void)
1123 local_irq_restore(flags); 1276 local_irq_restore(flags);
1124 } 1277 }
1125 console_locked = 0; 1278 console_locked = 0;
1279
1280 /* Release the exclusive_console once it is used */
1281 if (unlikely(exclusive_console))
1282 exclusive_console = NULL;
1283
1126 up(&console_sem); 1284 up(&console_sem);
1127 spin_unlock_irqrestore(&logbuf_lock, flags); 1285 spin_unlock_irqrestore(&logbuf_lock, flags);
1128 if (wake_klogd) 1286 if (wake_klogd)
1129 wake_up_klogd(); 1287 wake_up_klogd();
1130} 1288}
1131EXPORT_SYMBOL(release_console_sem); 1289EXPORT_SYMBOL(console_unlock);
1132 1290
1133/** 1291/**
1134 * console_conditional_schedule - yield the CPU if required 1292 * console_conditional_schedule - yield the CPU if required
@@ -1137,7 +1295,7 @@ EXPORT_SYMBOL(release_console_sem);
1137 * if this CPU should yield the CPU to another task, do 1295 * if this CPU should yield the CPU to another task, do
1138 * so here. 1296 * so here.
1139 * 1297 *
1140 * Must be called within acquire_console_sem(). 1298 * Must be called with the console lock held.
1141 */ 1299 */
1142void __sched console_conditional_schedule(void) 1300void __sched console_conditional_schedule(void)
1143{ 1301{
@@ -1158,14 +1316,14 @@ void console_unblank(void)
1158 if (down_trylock(&console_sem) != 0) 1316 if (down_trylock(&console_sem) != 0)
1159 return; 1317 return;
1160 } else 1318 } else
1161 acquire_console_sem(); 1319 console_lock();
1162 1320
1163 console_locked = 1; 1321 console_locked = 1;
1164 console_may_schedule = 0; 1322 console_may_schedule = 0;
1165 for_each_console(c) 1323 for_each_console(c)
1166 if ((c->flags & CON_ENABLED) && c->unblank) 1324 if ((c->flags & CON_ENABLED) && c->unblank)
1167 c->unblank(); 1325 c->unblank();
1168 release_console_sem(); 1326 console_unlock();
1169} 1327}
1170 1328
1171/* 1329/*
@@ -1176,7 +1334,7 @@ struct tty_driver *console_device(int *index)
1176 struct console *c; 1334 struct console *c;
1177 struct tty_driver *driver = NULL; 1335 struct tty_driver *driver = NULL;
1178 1336
1179 acquire_console_sem(); 1337 console_lock();
1180 for_each_console(c) { 1338 for_each_console(c) {
1181 if (!c->device) 1339 if (!c->device)
1182 continue; 1340 continue;
@@ -1184,7 +1342,7 @@ struct tty_driver *console_device(int *index)
1184 if (driver) 1342 if (driver)
1185 break; 1343 break;
1186 } 1344 }
1187 release_console_sem(); 1345 console_unlock();
1188 return driver; 1346 return driver;
1189} 1347}
1190 1348
@@ -1195,20 +1353,32 @@ struct tty_driver *console_device(int *index)
1195 */ 1353 */
1196void console_stop(struct console *console) 1354void console_stop(struct console *console)
1197{ 1355{
1198 acquire_console_sem(); 1356 console_lock();
1199 console->flags &= ~CON_ENABLED; 1357 console->flags &= ~CON_ENABLED;
1200 release_console_sem(); 1358 console_unlock();
1201} 1359}
1202EXPORT_SYMBOL(console_stop); 1360EXPORT_SYMBOL(console_stop);
1203 1361
1204void console_start(struct console *console) 1362void console_start(struct console *console)
1205{ 1363{
1206 acquire_console_sem(); 1364 console_lock();
1207 console->flags |= CON_ENABLED; 1365 console->flags |= CON_ENABLED;
1208 release_console_sem(); 1366 console_unlock();
1209} 1367}
1210EXPORT_SYMBOL(console_start); 1368EXPORT_SYMBOL(console_start);
1211 1369
1370static int __read_mostly keep_bootcon;
1371
1372static int __init keep_bootcon_setup(char *str)
1373{
1374 keep_bootcon = 1;
1375 printk(KERN_INFO "debug: skip boot console de-registration.\n");
1376
1377 return 0;
1378}
1379
1380early_param("keep_bootcon", keep_bootcon_setup);
1381
1212/* 1382/*
1213 * The console driver calls this routine during kernel initialization 1383 * The console driver calls this routine during kernel initialization
1214 * to register the console printing procedure with printk() and to 1384 * to register the console printing procedure with printk() and to
@@ -1327,7 +1497,7 @@ void register_console(struct console *newcon)
1327 * Put this console in the list - keep the 1497 * Put this console in the list - keep the
1328 * preferred driver at the head of the list. 1498 * preferred driver at the head of the list.
1329 */ 1499 */
1330 acquire_console_sem(); 1500 console_lock();
1331 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { 1501 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
1332 newcon->next = console_drivers; 1502 newcon->next = console_drivers;
1333 console_drivers = newcon; 1503 console_drivers = newcon;
@@ -1339,14 +1509,21 @@ void register_console(struct console *newcon)
1339 } 1509 }
1340 if (newcon->flags & CON_PRINTBUFFER) { 1510 if (newcon->flags & CON_PRINTBUFFER) {
1341 /* 1511 /*
1342 * release_console_sem() will print out the buffered messages 1512 * console_unlock() will print out the buffered messages
1343 * for us. 1513 * for us.
1344 */ 1514 */
1345 spin_lock_irqsave(&logbuf_lock, flags); 1515 spin_lock_irqsave(&logbuf_lock, flags);
1346 con_start = log_start; 1516 con_start = log_start;
1347 spin_unlock_irqrestore(&logbuf_lock, flags); 1517 spin_unlock_irqrestore(&logbuf_lock, flags);
1518 /*
1519 * We're about to replay the log buffer. Only do this to the
1520 * just-registered console to avoid excessive message spam to
1521 * the already-registered consoles.
1522 */
1523 exclusive_console = newcon;
1348 } 1524 }
1349 release_console_sem(); 1525 console_unlock();
1526 console_sysfs_notify();
1350 1527
1351 /* 1528 /*
1352 * By unregistering the bootconsoles after we enable the real console 1529 * By unregistering the bootconsoles after we enable the real console
@@ -1355,7 +1532,9 @@ void register_console(struct console *newcon)
1355 * users know there might be something in the kernel's log buffer that 1532 * users know there might be something in the kernel's log buffer that
1356 * went to the bootconsole (that they do not see on the real console) 1533 * went to the bootconsole (that they do not see on the real console)
1357 */ 1534 */
1358 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { 1535 if (bcon &&
1536 ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
1537 !keep_bootcon) {
1359 /* we need to iterate through twice, to make sure we print 1538 /* we need to iterate through twice, to make sure we print
1360 * everything out, before we unregister the console(s) 1539 * everything out, before we unregister the console(s)
1361 */ 1540 */
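The exclusive_console pointer set in the register_console() hunk above limits the replay of already-buffered messages to the console that just registered with CON_PRINTBUFFER, so the consoles that have already printed them are not spammed again. A standalone sketch of that idea, with the sink bookkeeping invented for illustration:

    /* Sketch only: replaying a backlog to one newly added sink. */
    #include <stdio.h>

    struct sink { const char *name; };

    static struct sink *sinks[4];
    static int nr_sinks;
    static struct sink *exclusive_sink;   /* non-NULL only while replaying */

    static void deliver(const char *msg)
    {
        for (int i = 0; i < nr_sinks; i++) {
            if (exclusive_sink && sinks[i] != exclusive_sink)
                continue;                 /* replay goes to the new sink only */
            printf("[%s] %s\n", sinks[i]->name, msg);
        }
    }

    int main(void)
    {
        struct sink serial = { "serial" }, vt = { "vt" };

        sinks[nr_sinks++] = &serial;
        deliver("early message");         /* only serial exists yet */

        sinks[nr_sinks++] = &vt;          /* new console wants the backlog */
        exclusive_sink = &vt;
        deliver("early message");         /* replay: vt only */
        exclusive_sink = NULL;

        deliver("new message");           /* normal delivery: everyone */
        return 0;
    }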
@@ -1382,7 +1561,7 @@ int unregister_console(struct console *console)
1382 return braille_unregister_console(console); 1561 return braille_unregister_console(console);
1383#endif 1562#endif
1384 1563
1385 acquire_console_sem(); 1564 console_lock();
1386 if (console_drivers == console) { 1565 if (console_drivers == console) {
1387 console_drivers=console->next; 1566 console_drivers=console->next;
1388 res = 0; 1567 res = 0;
@@ -1404,7 +1583,8 @@ int unregister_console(struct console *console)
1404 if (console_drivers != NULL && console->flags & CON_CONSDEV) 1583 if (console_drivers != NULL && console->flags & CON_CONSDEV)
1405 console_drivers->flags |= CON_CONSDEV; 1584 console_drivers->flags |= CON_CONSDEV;
1406 1585
1407 release_console_sem(); 1586 console_unlock();
1587 console_sysfs_notify();
1408 return res; 1588 return res;
1409} 1589}
1410EXPORT_SYMBOL(unregister_console); 1590EXPORT_SYMBOL(unregister_console);
@@ -1488,7 +1668,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper)
1488 /* Don't allow registering multiple times */ 1668 /* Don't allow registering multiple times */
1489 if (!dumper->registered) { 1669 if (!dumper->registered) {
1490 dumper->registered = 1; 1670 dumper->registered = 1;
1491 list_add_tail(&dumper->list, &dump_list); 1671 list_add_tail_rcu(&dumper->list, &dump_list);
1492 err = 0; 1672 err = 0;
1493 } 1673 }
1494 spin_unlock_irqrestore(&dump_list_lock, flags); 1674 spin_unlock_irqrestore(&dump_list_lock, flags);
@@ -1512,29 +1692,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1512 spin_lock_irqsave(&dump_list_lock, flags); 1692 spin_lock_irqsave(&dump_list_lock, flags);
1513 if (dumper->registered) { 1693 if (dumper->registered) {
1514 dumper->registered = 0; 1694 dumper->registered = 0;
1515 list_del(&dumper->list); 1695 list_del_rcu(&dumper->list);
1516 err = 0; 1696 err = 0;
1517 } 1697 }
1518 spin_unlock_irqrestore(&dump_list_lock, flags); 1698 spin_unlock_irqrestore(&dump_list_lock, flags);
1699 synchronize_rcu();
1519 1700
1520 return err; 1701 return err;
1521} 1702}
1522EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 1703EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1523 1704
1524static const char const *kmsg_reasons[] = {
1525 [KMSG_DUMP_OOPS] = "oops",
1526 [KMSG_DUMP_PANIC] = "panic",
1527 [KMSG_DUMP_KEXEC] = "kexec",
1528};
1529
1530static const char *kmsg_to_str(enum kmsg_dump_reason reason)
1531{
1532 if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
1533 return "unknown";
1534
1535 return kmsg_reasons[reason];
1536}
1537
1538/** 1705/**
1539 * kmsg_dump - dump kernel log to kernel message dumpers. 1706 * kmsg_dump - dump kernel log to kernel message dumpers.
1540 * @reason: the reason (oops, panic etc) for dumping 1707 * @reason: the reason (oops, panic etc) for dumping
@@ -1573,13 +1740,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1573 l2 = chars; 1740 l2 = chars;
1574 } 1741 }
1575 1742
1576 if (!spin_trylock_irqsave(&dump_list_lock, flags)) { 1743 rcu_read_lock();
1577 printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", 1744 list_for_each_entry_rcu(dumper, &dump_list, list)
1578 kmsg_to_str(reason));
1579 return;
1580 }
1581 list_for_each_entry(dumper, &dump_list, list)
1582 dumper->dump(dumper, reason, s1, l1, s2, l2); 1745 dumper->dump(dumper, reason, s1, l1, s2, l2);
1583 spin_unlock_irqrestore(&dump_list_lock, flags); 1746 rcu_read_unlock();
1584} 1747}
1585#endif 1748#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index b22a899934cc..961b389fe52f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -126,11 +126,9 @@ int __ref profile_init(void)
126 if (prof_buffer) 126 if (prof_buffer)
127 return 0; 127 return 0;
128 128
129 prof_buffer = vmalloc(buffer_bytes); 129 prof_buffer = vzalloc(buffer_bytes);
130 if (prof_buffer) { 130 if (prof_buffer)
131 memset(prof_buffer, 0, buffer_bytes);
132 return 0; 131 return 0;
133 }
134 132
135 free_cpumask_var(prof_cpu_mask); 133 free_cpumask_var(prof_cpu_mask);
136 return -ENOMEM; 134 return -ENOMEM;
@@ -305,14 +303,12 @@ static void profile_discard_flip_buffers(void)
305 mutex_unlock(&profile_flip_mutex); 303 mutex_unlock(&profile_flip_mutex);
306} 304}
307 305
308void profile_hits(int type, void *__pc, unsigned int nr_hits) 306static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
309{ 307{
310 unsigned long primary, secondary, flags, pc = (unsigned long)__pc; 308 unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
311 int i, j, cpu; 309 int i, j, cpu;
312 struct profile_hit *hits; 310 struct profile_hit *hits;
313 311
314 if (prof_on != type || !prof_buffer)
315 return;
316 pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); 312 pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
317 i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; 313 i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
318 secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; 314 secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
@@ -419,16 +415,20 @@ out_free:
419#define profile_discard_flip_buffers() do { } while (0) 415#define profile_discard_flip_buffers() do { } while (0)
420#define profile_cpu_callback NULL 416#define profile_cpu_callback NULL
421 417
422void profile_hits(int type, void *__pc, unsigned int nr_hits) 418static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
423{ 419{
424 unsigned long pc; 420 unsigned long pc;
425
426 if (prof_on != type || !prof_buffer)
427 return;
428 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; 421 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
429 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); 422 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
430} 423}
431#endif /* !CONFIG_SMP */ 424#endif /* !CONFIG_SMP */
425
426void profile_hits(int type, void *__pc, unsigned int nr_hits)
427{
428 if (prof_on != type || !prof_buffer)
429 return;
430 do_profile_hits(type, __pc, nr_hits);
431}
432EXPORT_SYMBOL_GPL(profile_hits); 432EXPORT_SYMBOL_GPL(profile_hits);
433 433
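The refactor above hoists the "is this profiler active at all?" guard out of the two build-specific implementations into one public wrapper. A standalone sketch of the same shape, with the profiler types and state invented for illustration:

    /* Sketch only: a common guard in a wrapper around build-specific workers. */
    #include <stdio.h>

    enum prof_type { PROF_NONE = 0, PROF_CPU = 1, PROF_SLEEP = 2 };

    static enum prof_type prof_on = PROF_CPU;   /* which profiler is active */
    static unsigned long hit_count;             /* stand-in for prof_buffer */

    /* Build-specific worker: relies on the wrapper having applied the guard. */
    static void do_profile_hits(enum prof_type type, void *pc, unsigned int nr_hits)
    {
        (void)type;
        (void)pc;
        hit_count += nr_hits;
    }

    /* Public entry point: the early-out lives in exactly one place. */
    static void profile_hits(enum prof_type type, void *pc, unsigned int nr_hits)
    {
        if (prof_on != type)
            return;
        do_profile_hits(type, pc, nr_hits);
    }

    int main(void)
    {
        profile_hits(PROF_SLEEP, (void *)0x1000, 5);  /* ignored: wrong type */
        profile_hits(PROF_CPU,   (void *)0x1000, 5);  /* counted */
        printf("hits recorded: %lu\n", hit_count);
        return 0;
    }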
434void profile_tick(int type) 434void profile_tick(int type)
@@ -555,6 +555,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
555static const struct file_operations proc_profile_operations = { 555static const struct file_operations proc_profile_operations = {
556 .read = read_profile, 556 .read = read_profile,
557 .write = write_profile, 557 .write = write_profile,
558 .llseek = default_llseek,
558}; 559};
559 560
560#ifdef CONFIG_SMP 561#ifdef CONFIG_SMP
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f34d798ef4a2..2df115790cd9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
22#include <linux/syscalls.h> 22#include <linux/syscalls.h>
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/regset.h> 24#include <linux/regset.h>
25#include <linux/hw_breakpoint.h>
25 26
26 27
27/* 28/*
@@ -37,35 +38,33 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
37 child->parent = new_parent; 38 child->parent = new_parent;
38} 39}
39 40
40/* 41/**
41 * Turn a tracing stop into a normal stop now, since with no tracer there 42 * __ptrace_unlink - unlink ptracee and restore its execution state
42 * would be no way to wake it up with SIGCONT or SIGKILL. If there was a 43 * @child: ptracee to be unlinked
43 * signal sent that would resume the child, but didn't because it was in
44 * TASK_TRACED, resume it now.
45 * Requires that irqs be disabled.
46 */
47static void ptrace_untrace(struct task_struct *child)
48{
49 spin_lock(&child->sighand->siglock);
50 if (task_is_traced(child)) {
51 /*
52 * If the group stop is completed or in progress,
53 * this thread was already counted as stopped.
54 */
55 if (child->signal->flags & SIGNAL_STOP_STOPPED ||
56 child->signal->group_stop_count)
57 __set_task_state(child, TASK_STOPPED);
58 else
59 signal_wake_up(child, 1);
60 }
61 spin_unlock(&child->sighand->siglock);
62}
63
64/*
65 * unptrace a task: move it back to its original parent and
66 * remove it from the ptrace list.
67 * 44 *
68 * Must be called with the tasklist lock write-held. 45 * Remove @child from the ptrace list, move it back to the original parent,
46 * and restore the execution state so that it conforms to the group stop
47 * state.
48 *
49 * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer
50 * exiting. For PTRACE_DETACH, unless the ptracee has been killed between
51 * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED.
52 * If the ptracer is exiting, the ptracee can be in any state.
53 *
54 * After detach, the ptracee should be in a state which conforms to the
55 * group stop. If the group is stopped or in the process of stopping, the
56 * ptracee should be put into TASK_STOPPED; otherwise, it should be woken
57 * up from TASK_TRACED.
58 *
59 * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED,
60 * it goes through TRACED -> RUNNING -> STOPPED transition which is similar
61 * to but in the opposite direction of what happens while attaching to a
62 * stopped task. However, in this direction, the intermediate RUNNING
63 * state is not hidden even from the current ptracer and if it immediately
64 * re-attaches and performs a WNOHANG wait(2), it may fail.
65 *
66 * CONTEXT:
67 * write_lock_irq(tasklist_lock)
69 */ 68 */
70void __ptrace_unlink(struct task_struct *child) 69void __ptrace_unlink(struct task_struct *child)
71{ 70{
@@ -75,8 +74,27 @@ void __ptrace_unlink(struct task_struct *child)
75 child->parent = child->real_parent; 74 child->parent = child->real_parent;
76 list_del_init(&child->ptrace_entry); 75 list_del_init(&child->ptrace_entry);
77 76
78 if (task_is_traced(child)) 77 spin_lock(&child->sighand->siglock);
79 ptrace_untrace(child); 78
79 /*
80 * Reinstate GROUP_STOP_PENDING if group stop is in effect and
81 * @child isn't dead.
82 */
83 if (!(child->flags & PF_EXITING) &&
84 (child->signal->flags & SIGNAL_STOP_STOPPED ||
85 child->signal->group_stop_count))
86 child->group_stop |= GROUP_STOP_PENDING;
87
88 /*
89 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
90 * @child in the butt. Note that @resume should be used iff @child
91 * is in TASK_TRACED; otherwise, we might unduly disrupt
92 * TASK_KILLABLE sleeps.
93 */
94 if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child))
95 signal_wake_up(child, task_is_traced(child));
96
97 spin_unlock(&child->sighand->siglock);
80} 98}
81 99
82/* 100/*
@@ -95,16 +113,14 @@ int ptrace_check_attach(struct task_struct *child, int kill)
95 */ 113 */
96 read_lock(&tasklist_lock); 114 read_lock(&tasklist_lock);
97 if ((child->ptrace & PT_PTRACED) && child->parent == current) { 115 if ((child->ptrace & PT_PTRACED) && child->parent == current) {
98 ret = 0;
99 /* 116 /*
100 * child->sighand can't be NULL, release_task() 117 * child->sighand can't be NULL, release_task()
101 * does ptrace_unlink() before __exit_signal(). 118 * does ptrace_unlink() before __exit_signal().
102 */ 119 */
103 spin_lock_irq(&child->sighand->siglock); 120 spin_lock_irq(&child->sighand->siglock);
104 if (task_is_stopped(child)) 121 WARN_ON_ONCE(task_is_stopped(child));
105 child->state = TASK_TRACED; 122 if (task_is_traced(child) || kill)
106 else if (!task_is_traced(child) && !kill) 123 ret = 0;
107 ret = -ESRCH;
108 spin_unlock_irq(&child->sighand->siglock); 124 spin_unlock_irq(&child->sighand->siglock);
109 } 125 }
110 read_unlock(&tasklist_lock); 126 read_unlock(&tasklist_lock);
@@ -134,21 +150,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
134 return 0; 150 return 0;
135 rcu_read_lock(); 151 rcu_read_lock();
136 tcred = __task_cred(task); 152 tcred = __task_cred(task);
137 if ((cred->uid != tcred->euid || 153 if (cred->user->user_ns == tcred->user->user_ns &&
138 cred->uid != tcred->suid || 154 (cred->uid == tcred->euid &&
139 cred->uid != tcred->uid || 155 cred->uid == tcred->suid &&
140 cred->gid != tcred->egid || 156 cred->uid == tcred->uid &&
141 cred->gid != tcred->sgid || 157 cred->gid == tcred->egid &&
142 cred->gid != tcred->gid) && 158 cred->gid == tcred->sgid &&
143 !capable(CAP_SYS_PTRACE)) { 159 cred->gid == tcred->gid))
144 rcu_read_unlock(); 160 goto ok;
145 return -EPERM; 161 if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
146 } 162 goto ok;
163 rcu_read_unlock();
164 return -EPERM;
165ok:
147 rcu_read_unlock(); 166 rcu_read_unlock();
148 smp_rmb(); 167 smp_rmb();
149 if (task->mm) 168 if (task->mm)
150 dumpable = get_dumpable(task->mm); 169 dumpable = get_dumpable(task->mm);
151 if (!dumpable && !capable(CAP_SYS_PTRACE)) 170 if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
152 return -EPERM; 171 return -EPERM;
153 172
154 return security_ptrace_access_check(task, mode); 173 return security_ptrace_access_check(task, mode);
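The new check reads: allow if tracer and tracee share a user namespace and every uid/gid field matches, otherwise fall back to CAP_SYS_PTRACE in the tracee's namespace. A standalone sketch of that comparison, where the cred structure and the capability test are invented stand-ins rather than the kernel's definitions:

    /* Sketch only: the uid/gid/namespace comparison behind the ptrace check. */
    #include <stdbool.h>
    #include <stdio.h>

    struct cred {
        int user_ns;                 /* stand-in for a user-namespace pointer */
        unsigned int uid, euid, suid;
        unsigned int gid, egid, sgid;
    };

    static bool ns_capable_ptrace(const struct cred *tracer, int target_ns)
    {
        /* Pretend only uid 0 in the target's namespace has CAP_SYS_PTRACE. */
        return tracer->euid == 0 && tracer->user_ns == target_ns;
    }

    static bool may_ptrace(const struct cred *cred, const struct cred *tcred)
    {
        if (cred->user_ns == tcred->user_ns &&
            cred->uid == tcred->euid && cred->uid == tcred->suid &&
            cred->uid == tcred->uid &&
            cred->gid == tcred->egid && cred->gid == tcred->sgid &&
            cred->gid == tcred->gid)
            return true;
        return ns_capable_ptrace(cred, tcred->user_ns);
    }

    int main(void)
    {
        struct cred alice  = { .user_ns = 1, .uid = 1000, .euid = 1000, .suid = 1000,
                               .gid = 100, .egid = 100, .sgid = 100 };
        struct cred same   = alice;
        struct cred setuid = { .user_ns = 1, .uid = 1000, .euid = 0, .suid = 0,
                               .gid = 100, .egid = 100, .sgid = 100 };

        printf("same creds:    %s\n", may_ptrace(&alice, &same)   ? "allow" : "deny");
        printf("setuid target: %s\n", may_ptrace(&alice, &setuid) ? "allow" : "deny");
        return 0;
    }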
@@ -163,8 +182,9 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
163 return !err; 182 return !err;
164} 183}
165 184
166int ptrace_attach(struct task_struct *task) 185static int ptrace_attach(struct task_struct *task)
167{ 186{
187 bool wait_trap = false;
168 int retval; 188 int retval;
169 189
170 audit_ptrace(task); 190 audit_ptrace(task);
@@ -181,7 +201,7 @@ int ptrace_attach(struct task_struct *task)
181 * under ptrace. 201 * under ptrace.
182 */ 202 */
183 retval = -ERESTARTNOINTR; 203 retval = -ERESTARTNOINTR;
184 if (mutex_lock_interruptible(&task->cred_guard_mutex)) 204 if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
185 goto out; 205 goto out;
186 206
187 task_lock(task); 207 task_lock(task);
@@ -198,18 +218,48 @@ int ptrace_attach(struct task_struct *task)
198 goto unlock_tasklist; 218 goto unlock_tasklist;
199 219
200 task->ptrace = PT_PTRACED; 220 task->ptrace = PT_PTRACED;
201 if (capable(CAP_SYS_PTRACE)) 221 if (task_ns_capable(task, CAP_SYS_PTRACE))
202 task->ptrace |= PT_PTRACE_CAP; 222 task->ptrace |= PT_PTRACE_CAP;
203 223
204 __ptrace_link(task, current); 224 __ptrace_link(task, current);
205 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); 225 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
206 226
227 spin_lock(&task->sighand->siglock);
228
229 /*
230 * If the task is already STOPPED, set GROUP_STOP_PENDING and
231 * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
232 * will be cleared if the child completes the transition or any
233 * event which clears the group stop states happens. We'll wait
234 * for the transition to complete before returning from this
235 * function.
236 *
237 * This hides STOPPED -> RUNNING -> TRACED transition from the
238 * attaching thread but a different thread in the same group can
239 * still observe the transient RUNNING state. IOW, if another
240 * thread's WNOHANG wait(2) on the stopped tracee races against
241 * ATTACH, the wait(2) may fail due to the transient RUNNING.
242 *
243 * The following task_is_stopped() test is safe as both transitions
244 * in and out of STOPPED are protected by siglock.
245 */
246 if (task_is_stopped(task)) {
247 task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING;
248 signal_wake_up(task, 1);
249 wait_trap = true;
250 }
251
252 spin_unlock(&task->sighand->siglock);
253
207 retval = 0; 254 retval = 0;
208unlock_tasklist: 255unlock_tasklist:
209 write_unlock_irq(&tasklist_lock); 256 write_unlock_irq(&tasklist_lock);
210unlock_creds: 257unlock_creds:
211 mutex_unlock(&task->cred_guard_mutex); 258 mutex_unlock(&task->signal->cred_guard_mutex);
212out: 259out:
260 if (wait_trap)
261 wait_event(current->signal->wait_chldexit,
262 !(task->group_stop & GROUP_STOP_TRAPPING));
213 return retval; 263 return retval;
214} 264}
215 265
@@ -219,7 +269,7 @@ out:
219 * Performs checks and sets PT_PTRACED. 269 * Performs checks and sets PT_PTRACED.
220 * Should be used by all ptrace implementations for PTRACE_TRACEME. 270 * Should be used by all ptrace implementations for PTRACE_TRACEME.
221 */ 271 */
222int ptrace_traceme(void) 272static int ptrace_traceme(void)
223{ 273{
224 int ret = -EPERM; 274 int ret = -EPERM;
225 275
@@ -293,7 +343,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
293 return false; 343 return false;
294} 344}
295 345
296int ptrace_detach(struct task_struct *child, unsigned int data) 346static int ptrace_detach(struct task_struct *child, unsigned int data)
297{ 347{
298 bool dead = false; 348 bool dead = false;
299 349
@@ -312,8 +362,6 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
312 if (child->ptrace) { 362 if (child->ptrace) {
313 child->exit_code = data; 363 child->exit_code = data;
314 dead = __ptrace_detach(current, child); 364 dead = __ptrace_detach(current, child);
315 if (!child->exit_state)
316 wake_up_process(child);
317 } 365 }
318 write_unlock_irq(&tasklist_lock); 366 write_unlock_irq(&tasklist_lock);
319 367
@@ -329,6 +377,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
329 * and reacquire the lock. 377 * and reacquire the lock.
330 */ 378 */
331void exit_ptrace(struct task_struct *tracer) 379void exit_ptrace(struct task_struct *tracer)
380 __releases(&tasklist_lock)
381 __acquires(&tasklist_lock)
332{ 382{
333 struct task_struct *p, *n; 383 struct task_struct *p, *n;
334 LIST_HEAD(ptrace_dead); 384 LIST_HEAD(ptrace_dead);
@@ -402,7 +452,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
402 return copied; 452 return copied;
403} 453}
404 454
405static int ptrace_setoptions(struct task_struct *child, long data) 455static int ptrace_setoptions(struct task_struct *child, unsigned long data)
406{ 456{
407 child->ptrace &= ~PT_TRACE_MASK; 457 child->ptrace &= ~PT_TRACE_MASK;
408 458
@@ -481,7 +531,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
481#define is_sysemu_singlestep(request) 0 531#define is_sysemu_singlestep(request) 0
482#endif 532#endif
483 533
484static int ptrace_resume(struct task_struct *child, long request, long data) 534static int ptrace_resume(struct task_struct *child, long request,
535 unsigned long data)
485{ 536{
486 if (!valid_signal(data)) 537 if (!valid_signal(data))
487 return -EIO; 538 return -EIO;
@@ -511,7 +562,7 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
511 } 562 }
512 563
513 child->exit_code = data; 564 child->exit_code = data;
514 wake_up_process(child); 565 wake_up_state(child, __TASK_TRACED);
515 566
516 return 0; 567 return 0;
517} 568}
@@ -558,10 +609,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
558#endif 609#endif
559 610
560int ptrace_request(struct task_struct *child, long request, 611int ptrace_request(struct task_struct *child, long request,
561 long addr, long data) 612 unsigned long addr, unsigned long data)
562{ 613{
563 int ret = -EIO; 614 int ret = -EIO;
564 siginfo_t siginfo; 615 siginfo_t siginfo;
616 void __user *datavp = (void __user *) data;
617 unsigned long __user *datalp = datavp;
565 618
566 switch (request) { 619 switch (request) {
567 case PTRACE_PEEKTEXT: 620 case PTRACE_PEEKTEXT:
@@ -578,19 +631,17 @@ int ptrace_request(struct task_struct *child, long request,
578 ret = ptrace_setoptions(child, data); 631 ret = ptrace_setoptions(child, data);
579 break; 632 break;
580 case PTRACE_GETEVENTMSG: 633 case PTRACE_GETEVENTMSG:
581 ret = put_user(child->ptrace_message, (unsigned long __user *) data); 634 ret = put_user(child->ptrace_message, datalp);
582 break; 635 break;
583 636
584 case PTRACE_GETSIGINFO: 637 case PTRACE_GETSIGINFO:
585 ret = ptrace_getsiginfo(child, &siginfo); 638 ret = ptrace_getsiginfo(child, &siginfo);
586 if (!ret) 639 if (!ret)
587 ret = copy_siginfo_to_user((siginfo_t __user *) data, 640 ret = copy_siginfo_to_user(datavp, &siginfo);
588 &siginfo);
589 break; 641 break;
590 642
591 case PTRACE_SETSIGINFO: 643 case PTRACE_SETSIGINFO:
592 if (copy_from_user(&siginfo, (siginfo_t __user *) data, 644 if (copy_from_user(&siginfo, datavp, sizeof siginfo))
593 sizeof siginfo))
594 ret = -EFAULT; 645 ret = -EFAULT;
595 else 646 else
596 ret = ptrace_setsiginfo(child, &siginfo); 647 ret = ptrace_setsiginfo(child, &siginfo);
@@ -621,7 +672,7 @@ int ptrace_request(struct task_struct *child, long request,
621 } 672 }
622 mmput(mm); 673 mmput(mm);
623 674
624 ret = put_user(tmp, (unsigned long __user *) data); 675 ret = put_user(tmp, datalp);
625 break; 676 break;
626 } 677 }
627#endif 678#endif
@@ -650,7 +701,7 @@ int ptrace_request(struct task_struct *child, long request,
650 case PTRACE_SETREGSET: 701 case PTRACE_SETREGSET:
651 { 702 {
652 struct iovec kiov; 703 struct iovec kiov;
653 struct iovec __user *uiov = (struct iovec __user *) data; 704 struct iovec __user *uiov = datavp;
654 705
655 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) 706 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
656 return -EFAULT; 707 return -EFAULT;
@@ -691,7 +742,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid)
691#define arch_ptrace_attach(child) do { } while (0) 742#define arch_ptrace_attach(child) do { } while (0)
692#endif 743#endif
693 744
694SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) 745SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
746 unsigned long, data)
695{ 747{
696 struct task_struct *child; 748 struct task_struct *child;
697 long ret; 749 long ret;
@@ -732,7 +784,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
732 return ret; 784 return ret;
733} 785}
734 786
735int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) 787int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
788 unsigned long data)
736{ 789{
737 unsigned long tmp; 790 unsigned long tmp;
738 int copied; 791 int copied;
@@ -743,7 +796,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
743 return put_user(tmp, (unsigned long __user *)data); 796 return put_user(tmp, (unsigned long __user *)data);
744} 797}
745 798
746int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) 799int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
800 unsigned long data)
747{ 801{
748 int copied; 802 int copied;
749 803
@@ -870,3 +924,19 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
870 return ret; 924 return ret;
871} 925}
872#endif /* CONFIG_COMPAT */ 926#endif /* CONFIG_COMPAT */
927
928#ifdef CONFIG_HAVE_HW_BREAKPOINT
929int ptrace_get_breakpoints(struct task_struct *tsk)
930{
931 if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt))
932 return 0;
933
934 return -1;
935}
936
937void ptrace_put_breakpoints(struct task_struct *tsk)
938{
939 if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt))
940 flush_ptrace_hw_breakpoint(tsk);
941}
942#endif /* CONFIG_HAVE_HW_BREAKPOINT */
diff --git a/kernel/range.c b/kernel/range.c
index 471b66acabb5..37fa9b99ad58 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2)
119 119
120int clean_sort_range(struct range *range, int az) 120int clean_sort_range(struct range *range, int az)
121{ 121{
122 int i, j, k = az - 1, nr_range = 0; 122 int i, j, k = az - 1, nr_range = az;
123 123
124 for (i = 0; i < k; i++) { 124 for (i = 0; i < k; i++) {
125 if (range[i].end) 125 if (range[i].end)
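The one-character fix above matters when the array has no empty slots: the final count is found by scanning for the first hole after compaction, so the starting value has to be "all entries", not zero. A standalone sketch of that counting step, with the range data invented for illustration:

    /* Sketch only: count used entries by finding the first hole, default to all. */
    #include <stdio.h>

    struct range { unsigned long start, end; };   /* end == 0 means "empty" */

    static int count_used(struct range *range, int az)
    {
        int i, nr_range = az;                     /* the fix: assume all used */

        for (i = 0; i < az; i++) {
            if (!range[i].end) {                  /* first hole marks the count */
                nr_range = i;
                break;
            }
        }
        return nr_range;
    }

    int main(void)
    {
        struct range full[3]  = { {0, 10}, {20, 30}, {40, 50} };
        struct range holes[3] = { {0, 10}, {0, 0}, {0, 0} };

        printf("full array: %d used\n", count_used(full, 3));    /* 3, not 0 */
        printf("with holes: %d used\n", count_used(holes, 3));   /* 1 */
        return 0;
    }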
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 4d169835fb36..7784bd216b6a 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void)
73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); 73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
74 74
75/** 75/**
76 * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? 76 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
77 * 77 *
78 * Check for bottom half being disabled, which covers both the 78 * Check for bottom half being disabled, which covers both the
79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses 79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses
80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) 80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
81 * will show the situation. 81 * will show the situation. This is useful for debug checks in functions
82 * that require that they be called within an RCU read-side critical
83 * section.
82 * 84 *
83 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. 85 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
84 */ 86 */
@@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void)
86{ 88{
87 if (!debug_lockdep_rcu_enabled()) 89 if (!debug_lockdep_rcu_enabled())
88 return 1; 90 return 1;
89 return in_softirq(); 91 return in_softirq() || irqs_disabled();
90} 92}
91EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 93EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
92 94
@@ -140,10 +142,17 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
140 * Ensure that queued callbacks are all executed. 142 * Ensure that queued callbacks are all executed.
141 * If we detect that we are nested in a RCU read-side critical 143 * If we detect that we are nested in a RCU read-side critical
142 * section, we should simply fail, otherwise we would deadlock. 144 * section, we should simply fail, otherwise we would deadlock.
145 * In !PREEMPT configurations, there is no way to tell if we are
146 * in a RCU read-side critical section or not, so we never
147 * attempt any fixup and just print a warning.
143 */ 148 */
149#ifndef CONFIG_PREEMPT
150 WARN_ON_ONCE(1);
151 return 0;
152#endif
144 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 153 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
145 irqs_disabled()) { 154 irqs_disabled()) {
146 WARN_ON(1); 155 WARN_ON_ONCE(1);
147 return 0; 156 return 0;
148 } 157 }
149 rcu_barrier(); 158 rcu_barrier();
@@ -182,10 +191,17 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
182 * Ensure that queued callbacks are all executed. 191 * Ensure that queued callbacks are all executed.
183 * If we detect that we are nested in a RCU read-side critical 192 * If we detect that we are nested in a RCU read-side critical
184 * section, we should simply fail, otherwise we would deadlock. 193 * section, we should simply fail, otherwise we would deadlock.
194 * In !PREEMPT configurations, there is no way to tell if we are
195 * in a RCU read-side critical section or not, so we never
196 * attempt any fixup and just print a warning.
185 */ 197 */
198#ifndef CONFIG_PREEMPT
199 WARN_ON_ONCE(1);
200 return 0;
201#endif
186 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 202 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
187 irqs_disabled()) { 203 irqs_disabled()) {
188 WARN_ON(1); 204 WARN_ON_ONCE(1);
189 return 0; 205 return 0;
190 } 206 }
191 rcu_barrier(); 207 rcu_barrier();
@@ -212,14 +228,17 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
212 * Ensure that queued callbacks are all executed. 228 * Ensure that queued callbacks are all executed.
213 * If we detect that we are nested in a RCU read-side critical 229 * If we detect that we are nested in a RCU read-side critical
214 * section, we should simply fail, otherwise we would deadlock. 230 * section, we should simply fail, otherwise we would deadlock.
231 * In !PREEMPT configurations, there is no way to tell if we are
232 * in a RCU read-side critical section or not, so we never
233 * attempt any fixup and just print a warning.
215 */ 234 */
216#ifndef CONFIG_PREEMPT 235#ifndef CONFIG_PREEMPT
217 WARN_ON(1); 236 WARN_ON_ONCE(1);
218 return 0; 237 return 0;
219#else 238#endif
220 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 239 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
221 irqs_disabled()) { 240 irqs_disabled()) {
222 WARN_ON(1); 241 WARN_ON_ONCE(1);
223 return 0; 242 return 0;
224 } 243 }
225 rcu_barrier(); 244 rcu_barrier();
@@ -227,7 +246,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
227 rcu_barrier_bh(); 246 rcu_barrier_bh();
228 debug_object_free(head, &rcuhead_debug_descr); 247 debug_object_free(head, &rcuhead_debug_descr);
229 return 1; 248 return 1;
230#endif
231 default: 249 default:
232 return 0; 250 return 0;
233 } 251 }
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 196ec02f8be0..7bbac7d0f5ab 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -35,29 +35,23 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/time.h> 36#include <linux/time.h>
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38#include <linux/prefetch.h>
39
40/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
41static struct task_struct *rcu_kthread_task;
42static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
43static unsigned long have_rcu_kthread_work;
44
45/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk;
47static void invoke_rcu_kthread(void);
48static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
49static int rcu_kthread(void *arg);
50static void __call_rcu(struct rcu_head *head,
51 void (*func)(struct rcu_head *rcu),
52 struct rcu_ctrlblk *rcp);
38 53
39/* Global control variables for rcupdate callback mechanism. */ 54#include "rcutiny_plugin.h"
40struct rcu_ctrlblk {
41 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
42 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
43 struct rcu_head **curtail; /* ->next pointer of last CB. */
44};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61 55
62#ifdef CONFIG_NO_HZ 56#ifdef CONFIG_NO_HZ
63 57
@@ -86,36 +80,45 @@ void rcu_exit_nohz(void)
86#endif /* #ifdef CONFIG_NO_HZ */ 80#endif /* #ifdef CONFIG_NO_HZ */
87 81
88/* 82/*
89 * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). 83 * Helper function for rcu_sched_qs() and rcu_bh_qs().
90 * Also disable irqs to avoid confusion due to interrupt handlers 84 * Also irqs are disabled to avoid confusion due to interrupt handlers
91 * invoking call_rcu(). 85 * invoking call_rcu().
92 */ 86 */
93static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 87static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
94{ 88{
95 unsigned long flags;
96
97 local_irq_save(flags);
98 if (rcp->rcucblist != NULL && 89 if (rcp->rcucblist != NULL &&
99 rcp->donetail != rcp->curtail) { 90 rcp->donetail != rcp->curtail) {
100 rcp->donetail = rcp->curtail; 91 rcp->donetail = rcp->curtail;
101 local_irq_restore(flags);
102 return 1; 92 return 1;
103 } 93 }
104 local_irq_restore(flags);
105 94
106 return 0; 95 return 0;
107} 96}
108 97
109/* 98/*
99 * Wake up rcu_kthread() to process callbacks now eligible for invocation
100 * or to boost readers.
101 */
102static void invoke_rcu_kthread(void)
103{
104 have_rcu_kthread_work = 1;
105 wake_up(&rcu_kthread_wq);
106}
107
108/*
110 * Record an rcu quiescent state. And an rcu_bh quiescent state while we 109 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
111 * are at it, given that any rcu quiescent state is also an rcu_bh 110 * are at it, given that any rcu quiescent state is also an rcu_bh
112 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 111 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
113 */ 112 */
114void rcu_sched_qs(int cpu) 113void rcu_sched_qs(int cpu)
115{ 114{
115 unsigned long flags;
116
117 local_irq_save(flags);
116 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 118 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
117 rcu_qsctr_help(&rcu_bh_ctrlblk)) 119 rcu_qsctr_help(&rcu_bh_ctrlblk))
118 raise_softirq(RCU_SOFTIRQ); 120 invoke_rcu_kthread();
121 local_irq_restore(flags);
119} 122}
120 123
121/* 124/*
@@ -123,8 +126,12 @@ void rcu_sched_qs(int cpu)
123 */ 126 */
124void rcu_bh_qs(int cpu) 127void rcu_bh_qs(int cpu)
125{ 128{
129 unsigned long flags;
130
131 local_irq_save(flags);
126 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 132 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
127 raise_softirq(RCU_SOFTIRQ); 133 invoke_rcu_kthread();
134 local_irq_restore(flags);
128} 135}
129 136
130/* 137/*
@@ -140,16 +147,18 @@ void rcu_check_callbacks(int cpu, int user)
140 rcu_sched_qs(cpu); 147 rcu_sched_qs(cpu);
141 else if (!in_softirq()) 148 else if (!in_softirq())
142 rcu_bh_qs(cpu); 149 rcu_bh_qs(cpu);
150 rcu_preempt_check_callbacks();
143} 151}
144 152
145/* 153/*
146 * Helper function for rcu_process_callbacks() that operates on the 154 * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure
147 * specified rcu_ctrlkblk structure. 155 * whose grace period has elapsed.
148 */ 156 */
149static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 157static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
150{ 158{
151 struct rcu_head *next, *list; 159 struct rcu_head *next, *list;
152 unsigned long flags; 160 unsigned long flags;
161 RCU_TRACE(int cb_count = 0);
153 162
154 /* If no RCU callbacks ready to invoke, just return. */ 163 /* If no RCU callbacks ready to invoke, just return. */
155 if (&rcp->rcucblist == rcp->donetail) 164 if (&rcp->rcucblist == rcp->donetail)
@@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
162 *rcp->donetail = NULL; 171 *rcp->donetail = NULL;
163 if (rcp->curtail == rcp->donetail) 172 if (rcp->curtail == rcp->donetail)
164 rcp->curtail = &rcp->rcucblist; 173 rcp->curtail = &rcp->rcucblist;
174 rcu_preempt_remove_callbacks(rcp);
165 rcp->donetail = &rcp->rcucblist; 175 rcp->donetail = &rcp->rcucblist;
166 local_irq_restore(flags); 176 local_irq_restore(flags);
167 177
@@ -170,18 +180,45 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
170 next = list->next; 180 next = list->next;
171 prefetch(next); 181 prefetch(next);
172 debug_rcu_head_unqueue(list); 182 debug_rcu_head_unqueue(list);
173 list->func(list); 183 local_bh_disable();
184 __rcu_reclaim(list);
185 local_bh_enable();
174 list = next; 186 list = next;
187 RCU_TRACE(cb_count++);
175 } 188 }
189 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
176} 190}
177 191
178/* 192/*
179 * Invoke any callbacks whose grace period has completed. 193 * This kthread invokes RCU callbacks whose grace periods have
194 * elapsed. It is awakened as needed, and takes the place of the
195 * RCU_SOFTIRQ that was used previously for this purpose.
196 * This is a kthread, but it is never stopped, at least not until
197 * the system goes down.
180 */ 198 */
181static void rcu_process_callbacks(struct softirq_action *unused) 199static int rcu_kthread(void *arg)
182{ 200{
183 __rcu_process_callbacks(&rcu_sched_ctrlblk); 201 unsigned long work;
184 __rcu_process_callbacks(&rcu_bh_ctrlblk); 202 unsigned long morework;
203 unsigned long flags;
204
205 for (;;) {
206 wait_event_interruptible(rcu_kthread_wq,
207 have_rcu_kthread_work != 0);
208 morework = rcu_boost();
209 local_irq_save(flags);
210 work = have_rcu_kthread_work;
211 have_rcu_kthread_work = morework;
212 local_irq_restore(flags);
213 if (work) {
214 rcu_process_callbacks(&rcu_sched_ctrlblk);
215 rcu_process_callbacks(&rcu_bh_ctrlblk);
216 rcu_preempt_process_callbacks();
217 }
218 schedule_timeout_interruptible(1); /* Leave CPU for others. */
219 }
220
221 return 0; /* Not reached, but needed to shut gcc up. */
185} 222}
186 223
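The kthread above follows a common wake-a-worker shape: a flag records that work is pending, the waker signals, and the worker clears the flag before processing. A standalone sketch of the same pattern with POSIX threads, where the "processing" and the exit condition are invented for illustration:

    /* Sketch only: flag-plus-condvar worker, mirroring invoke_rcu_kthread()/rcu_kthread(). */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
    static int have_work;
    static int done;

    static void kick_worker(void)              /* ~ invoke_rcu_kthread() */
    {
        pthread_mutex_lock(&lock);
        have_work = 1;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
    }

    static void *worker(void *arg)             /* ~ rcu_kthread() */
    {
        (void)arg;
        for (;;) {
            pthread_mutex_lock(&lock);
            while (!have_work && !done)        /* ~ wait_event_interruptible() */
                pthread_cond_wait(&cond, &lock);
            if (done && !have_work) {
                pthread_mutex_unlock(&lock);
                break;
            }
            have_work = 0;                     /* claim the pending work */
            pthread_mutex_unlock(&lock);

            printf("worker: processing callbacks\n");
        }
        return NULL;
    }

    int main(void)
    {
        pthread_t tid;

        pthread_create(&tid, NULL, worker, NULL);
        kick_worker();

        pthread_mutex_lock(&lock);             /* tell the worker to stop */
        done = 1;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);

        pthread_join(tid, NULL);
        return 0;
    }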
187/* 224/*
@@ -219,19 +256,20 @@ static void __call_rcu(struct rcu_head *head,
219 local_irq_save(flags); 256 local_irq_save(flags);
220 *rcp->curtail = head; 257 *rcp->curtail = head;
221 rcp->curtail = &head->next; 258 rcp->curtail = &head->next;
259 RCU_TRACE(rcp->qlen++);
222 local_irq_restore(flags); 260 local_irq_restore(flags);
223} 261}
224 262
225/* 263/*
226 * Post an RCU callback to be invoked after the end of an RCU grace 264 * Post an RCU callback to be invoked after the end of an RCU-sched grace
227 * period. But since we have but one CPU, that would be after any 265 * period. But since we have but one CPU, that would be after any
228 * quiescent state. 266 * quiescent state.
229 */ 267 */
230void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 268void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
231{ 269{
232 __call_rcu(head, func, &rcu_sched_ctrlblk); 270 __call_rcu(head, func, &rcu_sched_ctrlblk);
233} 271}
234EXPORT_SYMBOL_GPL(call_rcu); 272EXPORT_SYMBOL_GPL(call_rcu_sched);
235 273
236/* 274/*
237 * Post an RCU bottom-half callback to be invoked after any subsequent 275 * Post an RCU bottom-half callback to be invoked after any subsequent
@@ -243,20 +281,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
243} 281}
244EXPORT_SYMBOL_GPL(call_rcu_bh); 282EXPORT_SYMBOL_GPL(call_rcu_bh);
245 283
246void rcu_barrier(void)
247{
248 struct rcu_synchronize rcu;
249
250 init_rcu_head_on_stack(&rcu.head);
251 init_completion(&rcu.completion);
252 /* Will wake me after RCU finished. */
253 call_rcu(&rcu.head, wakeme_after_rcu);
254 /* Wait for it. */
255 wait_for_completion(&rcu.completion);
256 destroy_rcu_head_on_stack(&rcu.head);
257}
258EXPORT_SYMBOL_GPL(rcu_barrier);
259
260void rcu_barrier_bh(void) 284void rcu_barrier_bh(void)
261{ 285{
262 struct rcu_synchronize rcu; 286 struct rcu_synchronize rcu;
@@ -285,9 +309,16 @@ void rcu_barrier_sched(void)
285} 309}
286EXPORT_SYMBOL_GPL(rcu_barrier_sched); 310EXPORT_SYMBOL_GPL(rcu_barrier_sched);
287 311
288void __init rcu_init(void) 312/*
313 * Spawn the kthread that invokes RCU callbacks.
314 */
315static int __init rcu_spawn_kthreads(void)
289{ 316{
290 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 317 struct sched_param sp;
291}
292 318
293#include "rcutiny_plugin.h" 319 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
320 sp.sched_priority = RCU_BOOST_PRIO;
321 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
322 return 0;
323}
324early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index d223a92bc742..f259c676195f 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -17,23 +17,991 @@
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 * 19 *
20 * Copyright IBM Corporation, 2009 20 * Copyright (c) 2010 Linaro
21 * 21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25#include <linux/kthread.h>
26#include <linux/debugfs.h>
27#include <linux/seq_file.h>
28
29#ifdef CONFIG_RCU_TRACE
30#define RCU_TRACE(stmt) stmt
31#else /* #ifdef CONFIG_RCU_TRACE */
32#define RCU_TRACE(stmt)
33#endif /* #else #ifdef CONFIG_RCU_TRACE */
34
35/* Global control variables for rcupdate callback mechanism. */
36struct rcu_ctrlblk {
37 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
38 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
39 struct rcu_head **curtail; /* ->next pointer of last CB. */
40 RCU_TRACE(long qlen); /* Number of pending CBs. */
41};
42
43/* Definition for rcupdate control block. */
44static struct rcu_ctrlblk rcu_sched_ctrlblk = {
45 .donetail = &rcu_sched_ctrlblk.rcucblist,
46 .curtail = &rcu_sched_ctrlblk.rcucblist,
47};
48
49static struct rcu_ctrlblk rcu_bh_ctrlblk = {
50 .donetail = &rcu_bh_ctrlblk.rcucblist,
51 .curtail = &rcu_bh_ctrlblk.rcucblist,
52};
53
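
Throughout rcutiny, a callback list is a NULL-terminated singly linked list whose ->donetail and ->curtail fields point at the ->next pointer of the last callback whose grace period has ended and of the last callback queued, so both enqueueing and marking a batch done are O(1) pointer updates with no list walking. Purely as an illustrative userspace sketch, with hypothetical names (cb, enqueue, advance_done, invoke_done) and none of the kernel's locking or tracing:

/* Userspace sketch of a tail-pointer callback list (hypothetical names). */
#include <stddef.h>
#include <stdio.h>

struct cb {
        struct cb *next;
        void (*func)(struct cb *);
};

static struct cb *list;                  /* like ->rcucblist */
static struct cb **donetail = &list;     /* ->next of last callback with GP done */
static struct cb **curtail = &list;      /* ->next of last callback queued */

static void enqueue(struct cb *c)        /* like __call_rcu() */
{
        c->next = NULL;
        *curtail = c;
        curtail = &c->next;
}

static void advance_done(void)           /* like the end of a grace period */
{
        donetail = curtail;
}

static void invoke_done(void)            /* like rcu_process_callbacks() */
{
        struct cb *done = list, *next;

        if (donetail == &list)
                return;                  /* no callbacks whose grace period ended */
        list = *donetail;                /* detach the "done" segment */
        *donetail = NULL;
        if (curtail == donetail)
                curtail = &list;
        donetail = &list;
        for (; done != NULL; done = next) {
                next = done->next;
                done->func(done);
        }
}

static void say(struct cb *c) { printf("callback %p invoked\n", (void *)c); }

int main(void)
{
        struct cb a = { .func = say }, b = { .func = say };

        enqueue(&a);
        advance_done();     /* a's grace period "ends" */
        enqueue(&b);        /* b is still waiting */
        invoke_done();      /* invokes a only */
        return 0;
}

Running the sketch invokes only the first callback, since the second was queued after the simulated grace period ended.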
25#ifdef CONFIG_DEBUG_LOCK_ALLOC 54#ifdef CONFIG_DEBUG_LOCK_ALLOC
55int rcu_scheduler_active __read_mostly;
56EXPORT_SYMBOL_GPL(rcu_scheduler_active);
57#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
58
59#ifdef CONFIG_TINY_PREEMPT_RCU
60
61#include <linux/delay.h>
62
63/* Global control variables for preemptible RCU. */
64struct rcu_preempt_ctrlblk {
65 struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
66 struct rcu_head **nexttail;
67 /* Tasks blocked in a preemptible RCU */
68 /* read-side critical section while a */
69 /* preemptible-RCU grace period is in */
70 /* progress must wait for a later grace */
71 /* period. This pointer points to the */
72 /* ->next pointer of the last task that */
73 /* must wait for a later grace period, or */
74 /* to &->rcb.rcucblist if there is no */
75 /* such task. */
76 struct list_head blkd_tasks;
77 /* Tasks blocked in RCU read-side critical */
78 /* section. Tasks are placed at the head */
79 /* of this list and age towards the tail. */
80 struct list_head *gp_tasks;
81 /* Pointer to the first task blocking the */
82 /* current grace period, or NULL if there */
83 /* is no such task. */
84 struct list_head *exp_tasks;
85 /* Pointer to first task blocking the */
86 /* current expedited grace period, or NULL */
87 /* if there is no such task. If there */
88 /* is no current expedited grace period, */
89 /* then there cannot be any such task. */
90#ifdef CONFIG_RCU_BOOST
91 struct list_head *boost_tasks;
92 /* Pointer to first task that needs to be */
93 /* priority-boosted, or NULL if no priority */
94 /* boosting is needed. If there is no */
95 /* current or expedited grace period, there */
96 /* can be no such task. */
97#endif /* #ifdef CONFIG_RCU_BOOST */
98 u8 gpnum; /* Current grace period. */
99 u8 gpcpu; /* Last grace period blocked by the CPU. */
100 u8 completed; /* Last grace period completed. */
101 /* If all three are equal, RCU is idle. */
102#ifdef CONFIG_RCU_BOOST
103 unsigned long boost_time; /* When to start boosting (jiffies) */
104#endif /* #ifdef CONFIG_RCU_BOOST */
105#ifdef CONFIG_RCU_TRACE
106 unsigned long n_grace_periods;
107#ifdef CONFIG_RCU_BOOST
108 unsigned long n_tasks_boosted;
109 /* Total number of tasks boosted. */
110 unsigned long n_exp_boosts;
111 /* Number of tasks boosted for expedited GP. */
112 unsigned long n_normal_boosts;
113 /* Number of tasks boosted for normal GP. */
114 unsigned long n_balk_blkd_tasks;
115 /* Refused to boost: no blocked tasks. */
116 unsigned long n_balk_exp_gp_tasks;
117 /* Refused to boost: nothing blocking GP. */
118 unsigned long n_balk_boost_tasks;
119 /* Refused to boost: already boosting. */
120 unsigned long n_balk_notyet;
121 /* Refused to boost: not yet time. */
122 unsigned long n_balk_nos;
123 /* Refused to boost: not sure why, though. */
124 /* This can happen due to race conditions. */
125#endif /* #ifdef CONFIG_RCU_BOOST */
126#endif /* #ifdef CONFIG_RCU_TRACE */
127};
128
129static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
130 .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
131 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
132 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
133 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
134};
135
136static int rcu_preempted_readers_exp(void);
137static void rcu_report_exp_done(void);
138
139/*
140 * Return true if the CPU has not yet responded to the current grace period.
141 */
142static int rcu_cpu_blocking_cur_gp(void)
143{
144 return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
145}
146
147/*
148 * Check for a running RCU reader. Because there is only one CPU,
149 * there can be but one running RCU reader at a time. ;-)
150 */
151static int rcu_preempt_running_reader(void)
152{
153 return current->rcu_read_lock_nesting;
154}
155
156/*
157 * Check for preempted RCU readers blocking any grace period.
158 * If the caller needs a reliable answer, it must disable hard irqs.
159 */
160static int rcu_preempt_blocked_readers_any(void)
161{
162 return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
163}
164
165/*
166 * Check for preempted RCU readers blocking the current grace period.
167 * If the caller needs a reliable answer, it must disable hard irqs.
168 */
169static int rcu_preempt_blocked_readers_cgp(void)
170{
171 return rcu_preempt_ctrlblk.gp_tasks != NULL;
172}
173
174/*
175 * Return true if another preemptible-RCU grace period is needed.
176 */
177static int rcu_preempt_needs_another_gp(void)
178{
179 return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
180}
181
182/*
183 * Return true if a preemptible-RCU grace period is in progress.
184 * The caller must disable hardirqs.
185 */
186static int rcu_preempt_gp_in_progress(void)
187{
188 return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
189}
190
191/*
192 * Advance a ->blkd_tasks-list pointer to the next entry, instead
193 * returning NULL if at the end of the list.
194 */
195static struct list_head *rcu_next_node_entry(struct task_struct *t)
196{
197 struct list_head *np;
198
199 np = t->rcu_node_entry.next;
200 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
201 np = NULL;
202 return np;
203}
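
rcu_next_node_entry() exists so that code removing a task from ->blkd_tasks (see rcu_read_unlock_special() later in this file) can advance ->gp_tasks, ->exp_tasks and ->boost_tasks past the departing entry, falling back to NULL at the end of the list. A small userspace sketch of that next-or-NULL advance, using a simplified circular doubly linked list rather than the kernel's list_head; all names here are hypothetical:

/* Userspace sketch of advancing a pointer into ->blkd_tasks on removal. */
#include <stddef.h>
#include <stdio.h>

struct node {
        struct node *next, *prev;
};

static struct node blkd = { &blkd, &blkd };      /* circular list head */
static struct node *gp_tasks;                    /* first task blocking current GP */

static void add_head(struct node *n)             /* new readers go to the head */
{
        n->next = blkd.next;
        n->prev = &blkd;
        blkd.next->prev = n;
        blkd.next = n;
}

static struct node *next_or_null(struct node *n) /* like rcu_next_node_entry() */
{
        return n->next == &blkd ? NULL : n->next;
}

static void remove_node(struct node *n)          /* like the cleanup on unlock */
{
        if (n == gp_tasks)
                gp_tasks = next_or_null(n);      /* keep gp_tasks valid */
        n->prev->next = n->next;
        n->next->prev = n->prev;
}

int main(void)
{
        struct node a, b;

        add_head(&a);          /* older reader, closer to the tail */
        add_head(&b);          /* newest reader sits at the head */
        gp_tasks = &a;         /* only 'a' blocks the current grace period */
        remove_node(&a);       /* 'a' exits its read-side critical section */
        printf("gp_tasks now %s\n", gp_tasks ? "non-NULL" : "NULL");
        return 0;
}

When the removed task was the last one blocking the grace period, the pointer becomes NULL, which is exactly the condition the callers test before reporting a quiescent state.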
204
205#ifdef CONFIG_RCU_TRACE
206
207#ifdef CONFIG_RCU_BOOST
208static void rcu_initiate_boost_trace(void);
209#endif /* #ifdef CONFIG_RCU_BOOST */
210
211/*
212 * Dump additional statistics for TINY_PREEMPT_RCU.
213 */
214static void show_tiny_preempt_stats(struct seq_file *m)
215{
216 seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
217 rcu_preempt_ctrlblk.rcb.qlen,
218 rcu_preempt_ctrlblk.n_grace_periods,
219 rcu_preempt_ctrlblk.gpnum,
220 rcu_preempt_ctrlblk.gpcpu,
221 rcu_preempt_ctrlblk.completed,
222 "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
223 "N."[!rcu_preempt_ctrlblk.gp_tasks],
224 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
225#ifdef CONFIG_RCU_BOOST
226 seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
227 " ",
228 "B."[!rcu_preempt_ctrlblk.boost_tasks],
229 rcu_preempt_ctrlblk.n_tasks_boosted,
230 rcu_preempt_ctrlblk.n_exp_boosts,
231 rcu_preempt_ctrlblk.n_normal_boosts,
232 (int)(jiffies & 0xffff),
233 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
234 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
235 " balk",
236 rcu_preempt_ctrlblk.n_balk_blkd_tasks,
237 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
238 rcu_preempt_ctrlblk.n_balk_boost_tasks,
239 rcu_preempt_ctrlblk.n_balk_notyet,
240 rcu_preempt_ctrlblk.n_balk_nos);
241#endif /* #ifdef CONFIG_RCU_BOOST */
242}
243
244#endif /* #ifdef CONFIG_RCU_TRACE */
245
246#ifdef CONFIG_RCU_BOOST
247
248#include "rtmutex_common.h"
249
250/*
251 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
252 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
253 */
254static int rcu_boost(void)
255{
256 unsigned long flags;
257 struct rt_mutex mtx;
258 struct task_struct *t;
259 struct list_head *tb;
260
261 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
262 rcu_preempt_ctrlblk.exp_tasks == NULL)
263 return 0; /* Nothing to boost. */
264
265 raw_local_irq_save(flags);
266
267 /*
268 * Recheck with irqs disabled: all tasks in need of boosting
269 * might exit their RCU read-side critical sections on their own
270 * if we are preempted just before disabling irqs.
271 */
272 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
273 rcu_preempt_ctrlblk.exp_tasks == NULL) {
274 raw_local_irq_restore(flags);
275 return 0;
276 }
277
278 /*
279 * Preferentially boost tasks blocking expedited grace periods.
280 * This cannot starve the normal grace periods because a second
281 * expedited grace period must boost all blocked tasks, including
282 * those blocking the pre-existing normal grace period.
283 */
284 if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
285 tb = rcu_preempt_ctrlblk.exp_tasks;
286 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
287 } else {
288 tb = rcu_preempt_ctrlblk.boost_tasks;
289 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
290 }
291 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
292
293 /*
294 * We boost task t by manufacturing an rt_mutex that appears to
295 * be held by task t. We leave a pointer to that rt_mutex where
296 * task t can find it, and task t will release the mutex when it
297 * exits its outermost RCU read-side critical section. Then
298 * simply acquiring this artificial rt_mutex will boost task
299 * t's priority. (Thanks to tglx for suggesting this approach!)
300 */
301 t = container_of(tb, struct task_struct, rcu_node_entry);
302 rt_mutex_init_proxy_locked(&mtx, t);
303 t->rcu_boost_mutex = &mtx;
304 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
305 raw_local_irq_restore(flags);
306 rt_mutex_lock(&mtx);
307 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
308
309 return rcu_preempt_ctrlblk.boost_tasks != NULL ||
310 rcu_preempt_ctrlblk.exp_tasks != NULL;
311}
312
313/*
314 * Check to see if it is now time to start boosting RCU readers blocking
315 * the current grace period, and, if so, tell the rcu_kthread_task to
316 * start boosting them. If there is an expedited boost in progress,
317 * we wait for it to complete.
318 *
319 * If there are no blocked readers blocking the current grace period,
320 * return 0 to let the caller know, otherwise return 1. Note that this
321 * return value is independent of whether or not boosting was done.
322 */
323static int rcu_initiate_boost(void)
324{
325 if (!rcu_preempt_blocked_readers_cgp() &&
326 rcu_preempt_ctrlblk.exp_tasks == NULL) {
327 RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
328 return 0;
329 }
330 if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
331 (rcu_preempt_ctrlblk.gp_tasks != NULL &&
332 rcu_preempt_ctrlblk.boost_tasks == NULL &&
333 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
334 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
335 rcu_preempt_ctrlblk.boost_tasks =
336 rcu_preempt_ctrlblk.gp_tasks;
337 invoke_rcu_kthread();
338 } else
339 RCU_TRACE(rcu_initiate_boost_trace());
340 return 1;
341}
342
343#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
344
345/*
346 * Do priority-boost accounting for the start of a new grace period.
347 */
348static void rcu_preempt_boost_start_gp(void)
349{
350 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
351}
352
353#else /* #ifdef CONFIG_RCU_BOOST */
354
355/*
356 * If there is no RCU priority boosting, we don't boost.
357 */
358static int rcu_boost(void)
359{
360 return 0;
361}
362
363/*
364 * If there is no RCU priority boosting, we don't initiate boosting,
365 * but we do indicate whether there are blocked readers blocking the
366 * current grace period.
367 */
368static int rcu_initiate_boost(void)
369{
370 return rcu_preempt_blocked_readers_cgp();
371}
372
373/*
374 * If there is no RCU priority boosting, nothing to do at grace-period start.
375 */
376static void rcu_preempt_boost_start_gp(void)
377{
378}
379
380#endif /* else #ifdef CONFIG_RCU_BOOST */
381
382/*
383 * Record a preemptible-RCU quiescent state for the specified CPU. Note
384 * that this just means that the task currently running on the CPU is
385 * in a quiescent state. There might be any number of tasks blocked
386 * while in an RCU read-side critical section.
387 *
388 * Unlike the other rcu_*_qs() functions, callers to this function
389 * must disable irqs in order to protect the assignment to
390 * ->rcu_read_unlock_special.
391 *
392 * Because this is a single-CPU implementation, the only way a grace
393 * period can end is if the CPU is in a quiescent state. The reason is
394 * that a blocked preemptible-RCU reader can exit its critical section
395 * only if the CPU is running it at the time. Therefore, when the
396 * last task blocking the current grace period exits its RCU read-side
397 * critical section, neither the CPU nor blocked tasks will be stopping
398 * the current grace period. (In contrast, SMP implementations
399 * might have CPUs running in RCU read-side critical sections that
400 * block later grace periods -- but this is not possible given only
401 * one CPU.)
402 */
403static void rcu_preempt_cpu_qs(void)
404{
405 /* Record both CPU and task as having responded to current GP. */
406 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
407 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
408
409 /* If there is no GP then there is nothing more to do. */
410 if (!rcu_preempt_gp_in_progress())
411 return;
412 /*
413 * Check up on boosting. If there are readers blocking the
414 * current grace period, leave.
415 */
416 if (rcu_initiate_boost())
417 return;
418
419 /* Advance callbacks. */
420 rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
421 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
422 rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
423
424 /* If there are no blocked readers, next GP is done instantly. */
425 if (!rcu_preempt_blocked_readers_any())
426 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
427
428 /* If there are done callbacks, cause them to be invoked. */
429 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
430 invoke_rcu_kthread();
431}
432
433/*
434 * Start a new RCU grace period if warranted. Hard irqs must be disabled.
435 */
436static void rcu_preempt_start_gp(void)
437{
438 if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
439
440 /* Official start of GP. */
441 rcu_preempt_ctrlblk.gpnum++;
442 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
443
444 /* Any blocked RCU readers block new GP. */
445 if (rcu_preempt_blocked_readers_any())
446 rcu_preempt_ctrlblk.gp_tasks =
447 rcu_preempt_ctrlblk.blkd_tasks.next;
448
449 /* Set up for RCU priority boosting. */
450 rcu_preempt_boost_start_gp();
451
452 /* If there is no running reader, CPU is done with GP. */
453 if (!rcu_preempt_running_reader())
454 rcu_preempt_cpu_qs();
455 }
456}
457
458/*
459 * We have entered the scheduler, and the current task might soon be
460 * context-switched away from. If this task is in an RCU read-side
461 * critical section, we will no longer be able to rely on the CPU to
462 * record that fact, so we enqueue the task on the blkd_tasks list.
463 * If the task started after the current grace period began, as recorded
464 * by ->gpcpu, we enqueue at the beginning of the list. Otherwise
465 * before the element referenced by ->gp_tasks (or at the tail if
466 * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
467 * The task will dequeue itself when it exits the outermost enclosing
468 * RCU read-side critical section. Therefore, the current grace period
469 * cannot be permitted to complete until the ->gp_tasks pointer becomes
470 * NULL.
471 *
472 * Caller must disable preemption.
473 */
474void rcu_preempt_note_context_switch(void)
475{
476 struct task_struct *t = current;
477 unsigned long flags;
478
479 local_irq_save(flags); /* must exclude scheduler_tick(). */
480 if (rcu_preempt_running_reader() &&
481 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
482
483 /* Possibly blocking in an RCU read-side critical section. */
484 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
485
486 /*
487 * If this CPU has already checked in, then this task
488 * will hold up the next grace period rather than the
489 * current grace period. Queue the task accordingly.
490 * If the task is queued for the current grace period
491 * (i.e., this CPU has not yet passed through a quiescent
492 * state for the current grace period), then as long
493 * as that task remains queued, the current grace period
494 * cannot end.
495 */
496 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
497 if (rcu_cpu_blocking_cur_gp())
498 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
499 }
500
501 /*
502 * Either we were not in an RCU read-side critical section to
503 * begin with, or we have now recorded that critical section
504 * globally. Either way, we can now note a quiescent state
505 * for this CPU. Again, if we were in an RCU read-side critical
506 * section, and if that critical section was blocking the current
507 * grace period, then the fact that the task has been enqueued
508 * means that current grace period continues to be blocked.
509 */
510 rcu_preempt_cpu_qs();
511 local_irq_restore(flags);
512}
513
514/*
515 * Tiny-preemptible RCU implementation for rcu_read_lock().
516 * Just increment ->rcu_read_lock_nesting, shared state will be updated
517 * if we block.
518 */
519void __rcu_read_lock(void)
520{
521 current->rcu_read_lock_nesting++;
522 barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
523}
524EXPORT_SYMBOL_GPL(__rcu_read_lock);
525
526/*
527 * Handle special cases during rcu_read_unlock(), such as needing to
528 * notify RCU core processing or task having blocked during the RCU
529 * read-side critical section.
530 */
531static void rcu_read_unlock_special(struct task_struct *t)
532{
533 int empty;
534 int empty_exp;
535 unsigned long flags;
536 struct list_head *np;
537 int special;
538
539 /*
540 * NMI handlers cannot block and cannot safely manipulate state.
541 * They therefore cannot possibly be special, so just leave.
542 */
543 if (in_nmi())
544 return;
545
546 local_irq_save(flags);
547
548 /*
549 * If RCU core is waiting for this CPU to exit critical section,
550 * let it know that we have done so.
551 */
552 special = t->rcu_read_unlock_special;
553 if (special & RCU_READ_UNLOCK_NEED_QS)
554 rcu_preempt_cpu_qs();
555
556 /* Hardware IRQ handlers cannot block. */
557 if (in_irq()) {
558 local_irq_restore(flags);
559 return;
560 }
561
562 /* Clean up if blocked during RCU read-side critical section. */
563 if (special & RCU_READ_UNLOCK_BLOCKED) {
564 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
565
566 /*
567 * Remove this task from the ->blkd_tasks list and adjust
568 * any pointers that might have been referencing it.
569 */
570 empty = !rcu_preempt_blocked_readers_cgp();
571 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
572 np = rcu_next_node_entry(t);
573 list_del_init(&t->rcu_node_entry);
574 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
575 rcu_preempt_ctrlblk.gp_tasks = np;
576 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
577 rcu_preempt_ctrlblk.exp_tasks = np;
578#ifdef CONFIG_RCU_BOOST
579 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
580 rcu_preempt_ctrlblk.boost_tasks = np;
581#endif /* #ifdef CONFIG_RCU_BOOST */
582
583 /*
584 * If this was the last task on the current list, and if
585 * we aren't waiting on the CPU, report the quiescent state
586 * and start a new grace period if needed.
587 */
588 if (!empty && !rcu_preempt_blocked_readers_cgp()) {
589 rcu_preempt_cpu_qs();
590 rcu_preempt_start_gp();
591 }
26 592
593 /*
594 * If this was the last task on the expedited lists,
595 * then we need to wake up the waiting task.
596 */
597 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
598 rcu_report_exp_done();
599 }
600#ifdef CONFIG_RCU_BOOST
601 /* Unboost self if was boosted. */
602 if (special & RCU_READ_UNLOCK_BOOSTED) {
603 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
604 rt_mutex_unlock(t->rcu_boost_mutex);
605 t->rcu_boost_mutex = NULL;
606 }
607#endif /* #ifdef CONFIG_RCU_BOOST */
608 local_irq_restore(flags);
609}
610
611/*
612 * Tiny-preemptible RCU implementation for rcu_read_unlock().
613 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
614 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
615 * invoke rcu_read_unlock_special() to clean up after a context switch
616 * in an RCU read-side critical section and other special cases.
617 */
618void __rcu_read_unlock(void)
619{
620 struct task_struct *t = current;
621
622 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
623 --t->rcu_read_lock_nesting;
624 barrier(); /* decrement before load of ->rcu_read_unlock_special */
625 if (t->rcu_read_lock_nesting == 0 &&
626 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
627 rcu_read_unlock_special(t);
628#ifdef CONFIG_PROVE_LOCKING
629 WARN_ON_ONCE(t->rcu_read_lock_nesting < 0);
630#endif /* #ifdef CONFIG_PROVE_LOCKING */
631}
632EXPORT_SYMBOL_GPL(__rcu_read_unlock);
633
634/*
635 * Check for a quiescent state from the current CPU. When a task blocks,
636 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
637 * checked elsewhere. This is called from the scheduling-clock interrupt.
638 *
639 * Caller must disable hard irqs.
640 */
641static void rcu_preempt_check_callbacks(void)
642{
643 struct task_struct *t = current;
644
645 if (rcu_preempt_gp_in_progress() &&
646 (!rcu_preempt_running_reader() ||
647 !rcu_cpu_blocking_cur_gp()))
648 rcu_preempt_cpu_qs();
649 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
650 rcu_preempt_ctrlblk.rcb.donetail)
651 invoke_rcu_kthread();
652 if (rcu_preempt_gp_in_progress() &&
653 rcu_cpu_blocking_cur_gp() &&
654 rcu_preempt_running_reader())
655 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
656}
657
658/*
659 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
660 * update, so this is invoked from rcu_process_callbacks() to
661 * handle that case. Of course, it is invoked for all flavors of
662 * RCU, but RCU callbacks can appear only on one of the lists, and
663 * neither ->nexttail nor ->donetail can possibly be NULL, so there
664 * is no need for an explicit check.
665 */
666static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
667{
668 if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
669 rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
670}
671
672/*
673 * Process callbacks for preemptible RCU.
674 */
675static void rcu_preempt_process_callbacks(void)
676{
677 rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
678}
679
680/*
681 * Queue a preemptible-RCU callback for invocation after a grace period.
682 */
683void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
684{
685 unsigned long flags;
686
687 debug_rcu_head_queue(head);
688 head->func = func;
689 head->next = NULL;
690
691 local_irq_save(flags);
692 *rcu_preempt_ctrlblk.nexttail = head;
693 rcu_preempt_ctrlblk.nexttail = &head->next;
694 RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
695 rcu_preempt_start_gp(); /* checks to see if GP needed. */
696 local_irq_restore(flags);
697}
698EXPORT_SYMBOL_GPL(call_rcu);
699
700void rcu_barrier(void)
701{
702 struct rcu_synchronize rcu;
703
704 init_rcu_head_on_stack(&rcu.head);
705 init_completion(&rcu.completion);
706 /* Will wake me after RCU finished. */
707 call_rcu(&rcu.head, wakeme_after_rcu);
708 /* Wait for it. */
709 wait_for_completion(&rcu.completion);
710 destroy_rcu_head_on_stack(&rcu.head);
711}
712EXPORT_SYMBOL_GPL(rcu_barrier);
713
714/*
715 * synchronize_rcu - wait until a grace period has elapsed.
716 *
717 * Control will return to the caller some time after a full grace
718 * period has elapsed, in other words after all currently executing RCU
719 * read-side critical sections have completed. RCU read-side critical
720 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
721 * and may be nested.
722 */
723void synchronize_rcu(void)
724{
725#ifdef CONFIG_DEBUG_LOCK_ALLOC
726 if (!rcu_scheduler_active)
727 return;
728#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
729
730 WARN_ON_ONCE(rcu_preempt_running_reader());
731 if (!rcu_preempt_blocked_readers_any())
732 return;
733
734 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
735 rcu_barrier();
736}
737EXPORT_SYMBOL_GPL(synchronize_rcu);
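
Both rcu_barrier() and synchronize_rcu() above wait for a grace period by posting a callback (wakeme_after_rcu) that signals an on-stack completion, then sleeping on that completion. The following is a userspace model of that completion pattern built from a mutex and a condition variable; the names mirror the kernel API for readability, but this is only a sketch, not the kernel's struct completion.

/* Userspace model of the completion-based wait used by rcu_barrier(). */
#include <pthread.h>
#include <stdio.h>

struct completion {
        pthread_mutex_t lock;
        pthread_cond_t cond;
        int done;
};

static void init_completion(struct completion *c)
{
        pthread_mutex_init(&c->lock, NULL);
        pthread_cond_init(&c->cond, NULL);
        c->done = 0;
}

static void complete(struct completion *c)            /* the callback side */
{
        pthread_mutex_lock(&c->lock);
        c->done = 1;
        pthread_cond_signal(&c->cond);
        pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c) /* the waiting side */
{
        pthread_mutex_lock(&c->lock);
        while (!c->done)
                pthread_cond_wait(&c->cond, &c->lock);
        pthread_mutex_unlock(&c->lock);
}

static void *callback_thread(void *arg)  /* stands in for the grace-period machinery */
{
        complete(arg);                   /* like wakeme_after_rcu() */
        return NULL;
}

int main(void)
{
        struct completion done;
        pthread_t tid;

        init_completion(&done);
        pthread_create(&tid, NULL, callback_thread, &done);
        wait_for_completion(&done);      /* like rcu_barrier() waiting */
        pthread_join(tid, NULL);
        printf("grace period 'elapsed'\n");
        return 0;
}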
738
739static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
740static unsigned long sync_rcu_preempt_exp_count;
741static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
742
743/*
744 * Return non-zero if there are any tasks in RCU read-side critical
745 * sections blocking the current preemptible-RCU expedited grace period.
746 * If there is no preemptible-RCU expedited grace period currently in
747 * progress, returns zero unconditionally.
748 */
749static int rcu_preempted_readers_exp(void)
750{
751 return rcu_preempt_ctrlblk.exp_tasks != NULL;
752}
753
754/*
755 * Report the exit from RCU read-side critical section for the last task
756 * that queued itself during or before the current expedited preemptible-RCU
757 * grace period.
758 */
759static void rcu_report_exp_done(void)
760{
761 wake_up(&sync_rcu_preempt_exp_wq);
762}
763
764/*
765 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
766 * is to rely on the fact that there is but one CPU, and that it is
767 * illegal for a task to invoke synchronize_rcu_expedited() while in a
768 * preemptible-RCU read-side critical section. Therefore, any such
769 * critical sections must correspond to blocked tasks, which must therefore
770 * be on the ->blkd_tasks list. So just record the current head of the
771 * list in the ->exp_tasks pointer, and wait for all tasks including and
772 * after the task pointed to by ->exp_tasks to drain.
773 */
774void synchronize_rcu_expedited(void)
775{
776 unsigned long flags;
777 struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
778 unsigned long snap;
779
780 barrier(); /* ensure prior action seen before grace period. */
781
782 WARN_ON_ONCE(rcu_preempt_running_reader());
783
784 /*
785 * Acquire lock so that there is only one preemptible RCU grace
786 * period in flight. Of course, if someone does the expedited
787 * grace period for us while we are acquiring the lock, just leave.
788 */
789 snap = sync_rcu_preempt_exp_count + 1;
790 mutex_lock(&sync_rcu_preempt_exp_mutex);
791 if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
792 goto unlock_mb_ret; /* Others did our work for us. */
793
794 local_irq_save(flags);
795
796 /*
797 * All RCU readers have to already be on blkd_tasks because
798 * we cannot legally be executing in an RCU read-side critical
799 * section.
800 */
801
802 /* Snapshot current head of ->blkd_tasks list. */
803 rpcp->exp_tasks = rpcp->blkd_tasks.next;
804 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
805 rpcp->exp_tasks = NULL;
806
807 /* Wait for tail of ->blkd_tasks list to drain. */
808 if (!rcu_preempted_readers_exp())
809 local_irq_restore(flags);
810 else {
811 rcu_initiate_boost();
812 local_irq_restore(flags);
813 wait_event(sync_rcu_preempt_exp_wq,
814 !rcu_preempted_readers_exp());
815 }
816
817 /* Clean up and exit. */
818 barrier(); /* ensure expedited GP seen before counter increment. */
819 sync_rcu_preempt_exp_count++;
820unlock_mb_ret:
821 mutex_unlock(&sync_rcu_preempt_exp_mutex);
822 barrier(); /* ensure subsequent action seen after grace period. */
823}
824EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
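
synchronize_rcu_expedited() snapshots sync_rcu_preempt_exp_count before taking the mutex and, once the mutex is held, uses a wrap-safe comparison to detect that concurrent callers have already completed an expedited grace period that covers it, so it can return without doing any work. Below is a standalone userspace sketch of that snapshot-and-recheck pattern with hypothetical names; the kernel version adds the memory barriers shown above, which the sketch omits.

/* Userspace sketch of the snapshot/recheck pattern (hypothetical names). */
#include <pthread.h>
#include <limits.h>
#include <stdio.h>

static pthread_mutex_t op_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned long op_count;    /* bumped at the end of each completed operation */

static void serialized_op(void)
{
        unsigned long snap = op_count + 1;   /* sample before taking the lock */

        pthread_mutex_lock(&op_mutex);
        /*
         * Wrap-safe test for snap < op_count: two completions since our
         * snapshot mean at least one whole operation began after it, so
         * that operation already did our work for us.
         */
        if (ULONG_MAX / 2 < snap - op_count) {
                pthread_mutex_unlock(&op_mutex);
                return;
        }
        printf("performing the operation\n");
        op_count++;
        pthread_mutex_unlock(&op_mutex);
}

int main(void)
{
        serialized_op();
        serialized_op();
        return 0;
}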
825
826/*
827 * Does preemptible RCU need the CPU to stay out of dynticks mode?
828 */
829int rcu_preempt_needs_cpu(void)
830{
831 if (!rcu_preempt_running_reader())
832 rcu_preempt_cpu_qs();
833 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
834}
835
836/*
837 * Check for a task exiting while in a preemptible-RCU read-side
838 * critical section, clean up if so. No need to issue warnings,
839 * as debug_check_no_locks_held() already does this if lockdep
840 * is enabled.
841 */
842void exit_rcu(void)
843{
844 struct task_struct *t = current;
845
846 if (t->rcu_read_lock_nesting == 0)
847 return;
848 t->rcu_read_lock_nesting = 1;
849 __rcu_read_unlock();
850}
851
852#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
853
854#ifdef CONFIG_RCU_TRACE
855
856/*
857 * Because preemptible RCU does not exist, it is not necessary to
858 * dump out its statistics.
859 */
860static void show_tiny_preempt_stats(struct seq_file *m)
861{
862}
863
864#endif /* #ifdef CONFIG_RCU_TRACE */
865
866/*
867 * Because preemptible RCU does not exist, it is never necessary to
868 * boost preempted RCU readers.
869 */
870static int rcu_boost(void)
871{
872 return 0;
873}
874
875/*
876 * Because preemptible RCU does not exist, it never has any callbacks
877 * to check.
878 */
879static void rcu_preempt_check_callbacks(void)
880{
881}
882
883/*
884 * Because preemptible RCU does not exist, it never has any callbacks
885 * to remove.
886 */
887static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
888{
889}
890
891/*
892 * Because preemptible RCU does not exist, it never has any callbacks
893 * to process.
894 */
895static void rcu_preempt_process_callbacks(void)
896{
897}
898
899#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
900
901#ifdef CONFIG_DEBUG_LOCK_ALLOC
27#include <linux/kernel_stat.h> 902#include <linux/kernel_stat.h>
28 903
29/* 904/*
30 * During boot, we forgive RCU lockdep issues. After this function is 905 * During boot, we forgive RCU lockdep issues. After this function is
31 * invoked, we start taking RCU lockdep issues seriously. 906 * invoked, we start taking RCU lockdep issues seriously.
32 */ 907 */
33void rcu_scheduler_starting(void) 908void __init rcu_scheduler_starting(void)
34{ 909{
35 WARN_ON(nr_context_switches() > 0); 910 WARN_ON(nr_context_switches() > 0);
36 rcu_scheduler_active = 1; 911 rcu_scheduler_active = 1;
37} 912}
38 913
39#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 914#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
915
916#ifdef CONFIG_RCU_BOOST
917#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
918#else /* #ifdef CONFIG_RCU_BOOST */
919#define RCU_BOOST_PRIO 1
920#endif /* #else #ifdef CONFIG_RCU_BOOST */
921
922#ifdef CONFIG_RCU_TRACE
923
924#ifdef CONFIG_RCU_BOOST
925
926static void rcu_initiate_boost_trace(void)
927{
928 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
929 rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
930 else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
931 rcu_preempt_ctrlblk.exp_tasks == NULL)
932 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
933 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
934 rcu_preempt_ctrlblk.n_balk_boost_tasks++;
935 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
936 rcu_preempt_ctrlblk.n_balk_notyet++;
937 else
938 rcu_preempt_ctrlblk.n_balk_nos++;
939}
940
941#endif /* #ifdef CONFIG_RCU_BOOST */
942
943static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
944{
945 unsigned long flags;
946
947 raw_local_irq_save(flags);
948 rcp->qlen -= n;
949 raw_local_irq_restore(flags);
950}
951
952/*
953 * Dump statistics for TINY_RCU, such as they are.
954 */
955static int show_tiny_stats(struct seq_file *m, void *unused)
956{
957 show_tiny_preempt_stats(m);
958 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
959 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
960 return 0;
961}
962
963static int show_tiny_stats_open(struct inode *inode, struct file *file)
964{
965 return single_open(file, show_tiny_stats, NULL);
966}
967
968static const struct file_operations show_tiny_stats_fops = {
969 .owner = THIS_MODULE,
970 .open = show_tiny_stats_open,
971 .read = seq_read,
972 .llseek = seq_lseek,
973 .release = single_release,
974};
975
976static struct dentry *rcudir;
977
978static int __init rcutiny_trace_init(void)
979{
980 struct dentry *retval;
981
982 rcudir = debugfs_create_dir("rcu", NULL);
983 if (!rcudir)
984 goto free_out;
985 retval = debugfs_create_file("rcudata", 0444, rcudir,
986 NULL, &show_tiny_stats_fops);
987 if (!retval)
988 goto free_out;
989 return 0;
990free_out:
991 debugfs_remove_recursive(rcudir);
992 return 1;
993}
994
995static void __exit rcutiny_trace_cleanup(void)
996{
997 debugfs_remove_recursive(rcudir);
998}
999
1000module_init(rcutiny_trace_init);
1001module_exit(rcutiny_trace_cleanup);
1002
1003MODULE_AUTHOR("Paul E. McKenney");
1004MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1005MODULE_LICENSE("GPL");
1006
1007#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 2e2726d790b9..2e138db03382 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -64,6 +64,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 65static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
68static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
69static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
67static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 70static char *torture_type = "rcu"; /* What RCU implementation to torture. */
68 71
69module_param(nreaders, int, 0444); 72module_param(nreaders, int, 0444);
@@ -88,6 +91,12 @@ module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 91MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444); 92module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 93MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
94module_param(test_boost, int, 0444);
95MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
96module_param(test_boost_interval, int, 0444);
97MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
98module_param(test_boost_duration, int, 0444);
99MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
91module_param(torture_type, charp, 0444); 100module_param(torture_type, charp, 0444);
92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 101MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
93 102
@@ -109,6 +118,7 @@ static struct task_struct *stats_task;
109static struct task_struct *shuffler_task; 118static struct task_struct *shuffler_task;
110static struct task_struct *stutter_task; 119static struct task_struct *stutter_task;
111static struct task_struct *fqs_task; 120static struct task_struct *fqs_task;
121static struct task_struct *boost_tasks[NR_CPUS];
112 122
113#define RCU_TORTURE_PIPE_LEN 10 123#define RCU_TORTURE_PIPE_LEN 10
114 124
@@ -120,8 +130,8 @@ struct rcu_torture {
120}; 130};
121 131
122static LIST_HEAD(rcu_torture_freelist); 132static LIST_HEAD(rcu_torture_freelist);
123static struct rcu_torture *rcu_torture_current; 133static struct rcu_torture __rcu *rcu_torture_current;
124static long rcu_torture_current_version; 134static unsigned long rcu_torture_current_version;
125static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 135static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
126static DEFINE_SPINLOCK(rcu_torture_lock); 136static DEFINE_SPINLOCK(rcu_torture_lock);
127static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 137static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -134,6 +144,10 @@ static atomic_t n_rcu_torture_alloc_fail;
134static atomic_t n_rcu_torture_free; 144static atomic_t n_rcu_torture_free;
135static atomic_t n_rcu_torture_mberror; 145static atomic_t n_rcu_torture_mberror;
136static atomic_t n_rcu_torture_error; 146static atomic_t n_rcu_torture_error;
147static long n_rcu_torture_boost_ktrerror;
148static long n_rcu_torture_boost_rterror;
149static long n_rcu_torture_boost_failure;
150static long n_rcu_torture_boosts;
137static long n_rcu_torture_timers; 151static long n_rcu_torture_timers;
138static struct list_head rcu_torture_removed; 152static struct list_head rcu_torture_removed;
139static cpumask_var_t shuffle_tmp_mask; 153static cpumask_var_t shuffle_tmp_mask;
@@ -147,14 +161,26 @@ static int stutter_pause_test;
147#endif 161#endif
148int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
149 163
164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
165#define rcu_can_boost() 1
166#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
167#define rcu_can_boost() 0
168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169
170static unsigned long boost_starttime; /* jiffies of next boost test start. */
171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
172 /* and boost task create/destroy. */
173
150/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 174/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
151 175
152#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ 176#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
153#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ 177#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
154#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ 178#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
155static int fullstop = FULLSTOP_RMMOD; 179static int fullstop = FULLSTOP_RMMOD;
156DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ 180/*
157 /* of kthreads. */ 181 * Protect fullstop transitions and spawning of kthreads.
182 */
183static DEFINE_MUTEX(fullstop_mutex);
158 184
159/* 185/*
160 * Detect and respond to a system shutdown. 186 * Detect and respond to a system shutdown.
@@ -275,6 +301,7 @@ struct rcu_torture_ops {
275 void (*fqs)(void); 301 void (*fqs)(void);
276 int (*stats)(char *page); 302 int (*stats)(char *page);
277 int irq_capable; 303 int irq_capable;
304 int can_boost;
278 char *name; 305 char *name;
279}; 306};
280 307
@@ -303,6 +330,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
303 mdelay(longdelay_ms); 330 mdelay(longdelay_ms);
304 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) 331 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
305 udelay(shortdelay_us); 332 udelay(shortdelay_us);
333#ifdef CONFIG_PREEMPT
334 if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
335 preempt_schedule(); /* No QS if preempt_disable() in effect */
336#endif
306} 337}
307 338
308static void rcu_torture_read_unlock(int idx) __releases(RCU) 339static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -360,6 +391,7 @@ static struct rcu_torture_ops rcu_ops = {
360 .fqs = rcu_force_quiescent_state, 391 .fqs = rcu_force_quiescent_state,
361 .stats = NULL, 392 .stats = NULL,
362 .irq_capable = 1, 393 .irq_capable = 1,
394 .can_boost = rcu_can_boost(),
363 .name = "rcu" 395 .name = "rcu"
364}; 396};
365 397
@@ -402,6 +434,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
402 .fqs = rcu_force_quiescent_state, 434 .fqs = rcu_force_quiescent_state,
403 .stats = NULL, 435 .stats = NULL,
404 .irq_capable = 1, 436 .irq_capable = 1,
437 .can_boost = rcu_can_boost(),
405 .name = "rcu_sync" 438 .name = "rcu_sync"
406}; 439};
407 440
@@ -418,6 +451,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
418 .fqs = rcu_force_quiescent_state, 451 .fqs = rcu_force_quiescent_state,
419 .stats = NULL, 452 .stats = NULL,
420 .irq_capable = 1, 453 .irq_capable = 1,
454 .can_boost = rcu_can_boost(),
421 .name = "rcu_expedited" 455 .name = "rcu_expedited"
422}; 456};
423 457
@@ -536,6 +570,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
536 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); 570 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
537 if (!delay) 571 if (!delay)
538 schedule_timeout_interruptible(longdelay); 572 schedule_timeout_interruptible(longdelay);
573 else
574 rcu_read_delay(rrsp);
539} 575}
540 576
541static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) 577static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
@@ -676,6 +712,112 @@ static struct rcu_torture_ops sched_expedited_ops = {
676}; 712};
677 713
678/* 714/*
715 * RCU torture priority-boost testing. Runs one real-time thread per
716 * CPU for moderate bursts, repeatedly registering RCU callbacks and
717 * spinning waiting for them to be invoked. If a given callback takes
718 * too long to be invoked, we assume that priority inversion has occurred.
719 */
720
721struct rcu_boost_inflight {
722 struct rcu_head rcu;
723 int inflight;
724};
725
726static void rcu_torture_boost_cb(struct rcu_head *head)
727{
728 struct rcu_boost_inflight *rbip =
729 container_of(head, struct rcu_boost_inflight, rcu);
730
731 smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
732 rbip->inflight = 0;
733}
734
735static int rcu_torture_boost(void *arg)
736{
737 unsigned long call_rcu_time;
738 unsigned long endtime;
739 unsigned long oldstarttime;
740 struct rcu_boost_inflight rbi = { .inflight = 0 };
741 struct sched_param sp;
742
743 VERBOSE_PRINTK_STRING("rcu_torture_boost started");
744
745 /* Set real-time priority. */
746 sp.sched_priority = 1;
747 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
748 VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
749 n_rcu_torture_boost_rterror++;
750 }
751
752 init_rcu_head_on_stack(&rbi.rcu);
753 /* Each pass through the following loop does one boost-test cycle. */
754 do {
755 /* Wait for the next test interval. */
756 oldstarttime = boost_starttime;
757 while (jiffies - oldstarttime > ULONG_MAX / 2) {
758 schedule_timeout_uninterruptible(1);
759 rcu_stutter_wait("rcu_torture_boost");
760 if (kthread_should_stop() ||
761 fullstop != FULLSTOP_DONTSTOP)
762 goto checkwait;
763 }
764
765 /* Do one boost-test interval. */
766 endtime = oldstarttime + test_boost_duration * HZ;
767 call_rcu_time = jiffies;
768 while (jiffies - endtime > ULONG_MAX / 2) {
769 /* If we don't have a callback in flight, post one. */
770 if (!rbi.inflight) {
771 smp_mb(); /* RCU core before ->inflight = 1. */
772 rbi.inflight = 1;
773 call_rcu(&rbi.rcu, rcu_torture_boost_cb);
774 if (jiffies - call_rcu_time >
775 test_boost_duration * HZ - HZ / 2) {
776 VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
777 n_rcu_torture_boost_failure++;
778 }
779 call_rcu_time = jiffies;
780 }
781 cond_resched();
782 rcu_stutter_wait("rcu_torture_boost");
783 if (kthread_should_stop() ||
784 fullstop != FULLSTOP_DONTSTOP)
785 goto checkwait;
786 }
787
788 /*
789 * Set the start time of the next test interval.
790 * Yes, this is vulnerable to long delays, but such
791 * delays simply cause a false negative for the next
792 * interval. Besides, we are running at RT priority,
793 * so delays should be relatively rare.
794 */
795 while (oldstarttime == boost_starttime) {
796 if (mutex_trylock(&boost_mutex)) {
797 boost_starttime = jiffies +
798 test_boost_interval * HZ;
799 n_rcu_torture_boosts++;
800 mutex_unlock(&boost_mutex);
801 break;
802 }
803 schedule_timeout_uninterruptible(1);
804 }
805
806 /* Go do the stutter. */
807checkwait: rcu_stutter_wait("rcu_torture_boost");
808 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
809
810 /* Clean up and exit. */
811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu);
813 rcutorture_shutdown_absorb("rcu_torture_boost");
814 while (!kthread_should_stop() || rbi.inflight)
815 schedule_timeout_uninterruptible(1);
816 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
817 return 0;
818}
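
The wait loops in rcu_torture_boost() use expressions such as jiffies - oldstarttime > ULONG_MAX / 2 as a wraparound-safe "time is still before the target" test: unsigned subtraction yields a huge value exactly when the counter has not yet reached the target, even if it wraps past zero on the way. A tiny standalone illustration with made-up counter values, not kernel code:

/* Demonstration of the wrap-safe "before the deadline" test used above. */
#include <limits.h>
#include <stdio.h>

static int before(unsigned long now, unsigned long deadline)
{
        /* True while 'now' has not yet reached 'deadline', even across wrap. */
        return now - deadline > ULONG_MAX / 2;
}

int main(void)
{
        unsigned long deadline = 5;                       /* counter wrapped past 0 */

        printf("%d\n", before(ULONG_MAX - 2, deadline));  /* 1: still before */
        printf("%d\n", before(7, deadline));              /* 0: deadline passed */
        return 0;
}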
819
820/*
679 * RCU torture force-quiescent-state kthread. Repeatedly induces 821 * RCU torture force-quiescent-state kthread. Repeatedly induces
680 * bursts of calls to force_quiescent_state(), increasing the probability 822 * bursts of calls to force_quiescent_state(), increasing the probability
681 * of occurrence of some important types of race conditions. 823 * of occurrence of some important types of race conditions.
@@ -731,7 +873,8 @@ rcu_torture_writer(void *arg)
731 continue; 873 continue;
732 rp->rtort_pipe_count = 0; 874 rp->rtort_pipe_count = 0;
733 udelay(rcu_random(&rand) & 0x3ff); 875 udelay(rcu_random(&rand) & 0x3ff);
734 old_rp = rcu_torture_current; 876 old_rp = rcu_dereference_check(rcu_torture_current,
877 current == writer_task);
735 rp->rtort_mbtest = 1; 878 rp->rtort_mbtest = 1;
736 rcu_assign_pointer(rcu_torture_current, rp); 879 rcu_assign_pointer(rcu_torture_current, rp);
737 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ 880 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
@@ -743,7 +886,7 @@ rcu_torture_writer(void *arg)
743 old_rp->rtort_pipe_count++; 886 old_rp->rtort_pipe_count++;
744 cur_ops->deferred_free(old_rp); 887 cur_ops->deferred_free(old_rp);
745 } 888 }
746 rcu_torture_current_version++; 889 rcutorture_record_progress(++rcu_torture_current_version);
747 oldbatch = cur_ops->completed(); 890 oldbatch = cur_ops->completed();
748 rcu_stutter_wait("rcu_torture_writer"); 891 rcu_stutter_wait("rcu_torture_writer");
749 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
@@ -923,8 +1066,9 @@ rcu_torture_printk(char *page)
923 } 1066 }
924 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1067 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
925 cnt += sprintf(&page[cnt], 1068 cnt += sprintf(&page[cnt],
926 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 1069 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
927 "rtmbe: %d nt: %ld", 1070 "rtmbe: %d rtbke: %ld rtbre: %ld "
1071 "rtbf: %ld rtb: %ld nt: %ld",
928 rcu_torture_current, 1072 rcu_torture_current,
929 rcu_torture_current_version, 1073 rcu_torture_current_version,
930 list_empty(&rcu_torture_freelist), 1074 list_empty(&rcu_torture_freelist),
@@ -932,8 +1076,15 @@ rcu_torture_printk(char *page)
932 atomic_read(&n_rcu_torture_alloc_fail), 1076 atomic_read(&n_rcu_torture_alloc_fail),
933 atomic_read(&n_rcu_torture_free), 1077 atomic_read(&n_rcu_torture_free),
934 atomic_read(&n_rcu_torture_mberror), 1078 atomic_read(&n_rcu_torture_mberror),
1079 n_rcu_torture_boost_ktrerror,
1080 n_rcu_torture_boost_rterror,
1081 n_rcu_torture_boost_failure,
1082 n_rcu_torture_boosts,
935 n_rcu_torture_timers); 1083 n_rcu_torture_timers);
936 if (atomic_read(&n_rcu_torture_mberror) != 0) 1084 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1085 n_rcu_torture_boost_ktrerror != 0 ||
1086 n_rcu_torture_boost_rterror != 0 ||
1087 n_rcu_torture_boost_failure != 0)
937 cnt += sprintf(&page[cnt], " !!!"); 1088 cnt += sprintf(&page[cnt], " !!!");
938 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1089 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
939 if (i > 1) { 1090 if (i > 1) {
@@ -1085,28 +1236,98 @@ rcu_torture_stutter(void *arg)
1085} 1236}
1086 1237
1087static inline void 1238static inline void
1088rcu_torture_print_module_parms(char *tag) 1239rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1089{ 1240{
1090 printk(KERN_ALERT "%s" TORTURE_FLAG 1241 printk(KERN_ALERT "%s" TORTURE_FLAG
1091 "--- %s: nreaders=%d nfakewriters=%d " 1242 "--- %s: nreaders=%d nfakewriters=%d "
1092 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1243 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1093 "shuffle_interval=%d stutter=%d irqreader=%d " 1244 "shuffle_interval=%d stutter=%d irqreader=%d "
1094 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", 1245 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1246 "test_boost=%d/%d test_boost_interval=%d "
1247 "test_boost_duration=%d\n",
1095 torture_type, tag, nrealreaders, nfakewriters, 1248 torture_type, tag, nrealreaders, nfakewriters,
1096 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1249 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1097 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); 1250 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1251 test_boost, cur_ops->can_boost,
1252 test_boost_interval, test_boost_duration);
1098} 1253}
1099 1254
1100static struct notifier_block rcutorture_nb = { 1255static struct notifier_block rcutorture_shutdown_nb = {
1101 .notifier_call = rcutorture_shutdown_notify, 1256 .notifier_call = rcutorture_shutdown_notify,
1102}; 1257};
1103 1258
1259static void rcutorture_booster_cleanup(int cpu)
1260{
1261 struct task_struct *t;
1262
1263 if (boost_tasks[cpu] == NULL)
1264 return;
1265 mutex_lock(&boost_mutex);
1266 VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
1267 t = boost_tasks[cpu];
1268 boost_tasks[cpu] = NULL;
1269 mutex_unlock(&boost_mutex);
1270
1271 /* This must be outside of the mutex, otherwise deadlock! */
1272 kthread_stop(t);
1273}
1274
1275static int rcutorture_booster_init(int cpu)
1276{
1277 int retval;
1278
1279 if (boost_tasks[cpu] != NULL)
1280 return 0; /* Already created, nothing more to do. */
1281
1282 /* Don't allow time recalculation while creating a new task. */
1283 mutex_lock(&boost_mutex);
1284 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1285 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
1286 "rcu_torture_boost");
1287 if (IS_ERR(boost_tasks[cpu])) {
1288 retval = PTR_ERR(boost_tasks[cpu]);
1289 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
1290 n_rcu_torture_boost_ktrerror++;
1291 boost_tasks[cpu] = NULL;
1292 mutex_unlock(&boost_mutex);
1293 return retval;
1294 }
1295 kthread_bind(boost_tasks[cpu], cpu);
1296 wake_up_process(boost_tasks[cpu]);
1297 mutex_unlock(&boost_mutex);
1298 return 0;
1299}
1300
1301static int rcutorture_cpu_notify(struct notifier_block *self,
1302 unsigned long action, void *hcpu)
1303{
1304 long cpu = (long)hcpu;
1305
1306 switch (action) {
1307 case CPU_ONLINE:
1308 case CPU_DOWN_FAILED:
1309 (void)rcutorture_booster_init(cpu);
1310 break;
1311 case CPU_DOWN_PREPARE:
1312 rcutorture_booster_cleanup(cpu);
1313 break;
1314 default:
1315 break;
1316 }
1317 return NOTIFY_OK;
1318}
1319
1320static struct notifier_block rcutorture_cpu_nb = {
1321 .notifier_call = rcutorture_cpu_notify,
1322};
1323
1104static void 1324static void
1105rcu_torture_cleanup(void) 1325rcu_torture_cleanup(void)
1106{ 1326{
1107 int i; 1327 int i;
1108 1328
1109 mutex_lock(&fullstop_mutex); 1329 mutex_lock(&fullstop_mutex);
1330 rcutorture_record_test_transition();
1110 if (fullstop == FULLSTOP_SHUTDOWN) { 1331 if (fullstop == FULLSTOP_SHUTDOWN) {
1111 printk(KERN_WARNING /* but going down anyway, so... */ 1332 printk(KERN_WARNING /* but going down anyway, so... */
1112 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 1333 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
@@ -1118,7 +1339,7 @@ rcu_torture_cleanup(void)
1118 } 1339 }
1119 fullstop = FULLSTOP_RMMOD; 1340 fullstop = FULLSTOP_RMMOD;
1120 mutex_unlock(&fullstop_mutex); 1341 mutex_unlock(&fullstop_mutex);
1121 unregister_reboot_notifier(&rcutorture_nb); 1342 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1122 if (stutter_task) { 1343 if (stutter_task) {
1123 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1344 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1124 kthread_stop(stutter_task); 1345 kthread_stop(stutter_task);
@@ -1175,6 +1396,12 @@ rcu_torture_cleanup(void)
1175 kthread_stop(fqs_task); 1396 kthread_stop(fqs_task);
1176 } 1397 }
1177 fqs_task = NULL; 1398 fqs_task = NULL;
1399 if ((test_boost == 1 && cur_ops->can_boost) ||
1400 test_boost == 2) {
1401 unregister_cpu_notifier(&rcutorture_cpu_nb);
1402 for_each_possible_cpu(i)
1403 rcutorture_booster_cleanup(i);
1404 }
1178 1405
1179 /* Wait for all RCU callbacks to fire. */ 1406 /* Wait for all RCU callbacks to fire. */
1180 1407
@@ -1186,9 +1413,9 @@ rcu_torture_cleanup(void)
1186 if (cur_ops->cleanup) 1413 if (cur_ops->cleanup)
1187 cur_ops->cleanup(); 1414 cur_ops->cleanup();
1188 if (atomic_read(&n_rcu_torture_error)) 1415 if (atomic_read(&n_rcu_torture_error))
1189 rcu_torture_print_module_parms("End of test: FAILURE"); 1416 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1190 else 1417 else
1191 rcu_torture_print_module_parms("End of test: SUCCESS"); 1418 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1192} 1419}
1193 1420
1194static int __init 1421static int __init
@@ -1233,7 +1460,7 @@ rcu_torture_init(void)
1233 nrealreaders = nreaders; 1460 nrealreaders = nreaders;
1234 else 1461 else
1235 nrealreaders = 2 * num_online_cpus(); 1462 nrealreaders = 2 * num_online_cpus();
1236 rcu_torture_print_module_parms("Start of test"); 1463 rcu_torture_print_module_parms(cur_ops, "Start of test");
1237 fullstop = FULLSTOP_DONTSTOP; 1464 fullstop = FULLSTOP_DONTSTOP;
1238 1465
1239 /* Set up the freelist. */ 1466 /* Set up the freelist. */
@@ -1254,6 +1481,10 @@ rcu_torture_init(void)
1254 atomic_set(&n_rcu_torture_free, 0); 1481 atomic_set(&n_rcu_torture_free, 0);
1255 atomic_set(&n_rcu_torture_mberror, 0); 1482 atomic_set(&n_rcu_torture_mberror, 0);
1256 atomic_set(&n_rcu_torture_error, 0); 1483 atomic_set(&n_rcu_torture_error, 0);
1484 n_rcu_torture_boost_ktrerror = 0;
1485 n_rcu_torture_boost_rterror = 0;
1486 n_rcu_torture_boost_failure = 0;
1487 n_rcu_torture_boosts = 0;
1257 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1488 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1258 atomic_set(&rcu_torture_wcount[i], 0); 1489 atomic_set(&rcu_torture_wcount[i], 0);
1259 for_each_possible_cpu(cpu) { 1490 for_each_possible_cpu(cpu) {
@@ -1367,7 +1598,28 @@ rcu_torture_init(void)
1367 goto unwind; 1598 goto unwind;
1368 } 1599 }
1369 } 1600 }
1370 register_reboot_notifier(&rcutorture_nb); 1601 if (test_boost_interval < 1)
1602 test_boost_interval = 1;
1603 if (test_boost_duration < 2)
1604 test_boost_duration = 2;
1605 if ((test_boost == 1 && cur_ops->can_boost) ||
1606 test_boost == 2) {
1607 int retval;
1608
1609 boost_starttime = jiffies + test_boost_interval * HZ;
1610 register_cpu_notifier(&rcutorture_cpu_nb);
1611 for_each_possible_cpu(i) {
1612 if (cpu_is_offline(i))
1613 continue; /* Heuristic: CPU can go offline. */
1614 retval = rcutorture_booster_init(i);
1615 if (retval < 0) {
1616 firsterr = retval;
1617 goto unwind;
1618 }
1619 }
1620 }
1621 register_reboot_notifier(&rcutorture_shutdown_nb);
1622 rcutorture_record_test_transition();
1371 mutex_unlock(&fullstop_mutex); 1623 mutex_unlock(&fullstop_mutex);
1372 return 0; 1624 return 0;
1373 1625
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d5bc43976c5a..ba06207b1dd3 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -36,7 +36,7 @@
36#include <linux/interrupt.h> 36#include <linux/interrupt.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/nmi.h> 38#include <linux/nmi.h>
39#include <asm/atomic.h> 39#include <linux/atomic.h>
40#include <linux/bitops.h> 40#include <linux/bitops.h>
41#include <linux/module.h> 41#include <linux/module.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
@@ -47,6 +47,9 @@
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h> 49#include <linux/kernel_stat.h>
50#include <linux/wait.h>
51#include <linux/kthread.h>
52#include <linux/prefetch.h>
50 53
51#include "rcutree.h" 54#include "rcutree.h"
52 55
@@ -67,9 +70,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
67 .gpnum = -300, \ 70 .gpnum = -300, \
68 .completed = -300, \ 71 .completed = -300, \
69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ 72 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
70 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
72 .orphan_qlen = 0, \
73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ 73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
74 .n_force_qs = 0, \ 74 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
@@ -82,10 +82,67 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
84 84
85static struct rcu_state *rcu_state;
86
87/*
88 * The rcu_scheduler_active variable transitions from zero to one just
89 * before the first task is spawned. So when this variable is zero, RCU
90 * can assume that there is but one task, allowing RCU to (for example)
91 * optimized synchronize_sched() to a simple barrier(). When this variable
92 * is one, RCU must actually do all the hard work required to detect real
93 * grace periods. This variable is also used to suppress boot-time false
94 * positives from lockdep-RCU error checking.
95 */
85int rcu_scheduler_active __read_mostly; 96int rcu_scheduler_active __read_mostly;
86EXPORT_SYMBOL_GPL(rcu_scheduler_active); 97EXPORT_SYMBOL_GPL(rcu_scheduler_active);
87 98
88/* 99/*
100 * The rcu_scheduler_fully_active variable transitions from zero to one
101 * during the early_initcall() processing, which is after the scheduler
102 * is capable of creating new tasks. So RCU processing (for example,
103 * creating tasks for RCU priority boosting) must be delayed until after
104 * rcu_scheduler_fully_active transitions from zero to one. We also
105 * currently delay invocation of any RCU callbacks until after this point.
106 *
107 * It might later prove better for people registering RCU callbacks during
108 * early boot to take responsibility for these callbacks, but one step at
109 * a time.
110 */
111static int rcu_scheduler_fully_active __read_mostly;
112
113#ifdef CONFIG_RCU_BOOST
114
115/*
116 * Control variables for per-CPU and per-rcu_node kthreads. These
117 * handle all flavors of RCU.
118 */
119static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
120DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
121DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
122DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
123DEFINE_PER_CPU(char, rcu_cpu_has_work);
124
125#endif /* #ifdef CONFIG_RCU_BOOST */
126
127static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
128static void invoke_rcu_core(void);
129static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
130
131#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
132
133/*
134 * Track the rcutorture test sequence number and the update version
135 * number within a given test. The rcutorture_testseq is incremented
136 * on every rcutorture module load and unload, so has an odd value
137 * when a test is running. The rcutorture_vernum is set to zero
138 * when rcutorture starts and is incremented on each rcutorture update.
139 * These variables enable correlating rcutorture output with the
140 * RCU tracing information.
141 */
142unsigned long rcutorture_testseq;
143unsigned long rcutorture_vernum;
144
145/*
89 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 146 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
90 * permit this function to be invoked without holding the root rcu_node 147 * permit this function to be invoked without holding the root rcu_node
91 * structure's ->lock, but of course results can be subject to change. 148 * structure's ->lock, but of course results can be subject to change.
@@ -127,11 +184,12 @@ void rcu_note_context_switch(int cpu)
127 rcu_sched_qs(cpu); 184 rcu_sched_qs(cpu);
128 rcu_preempt_note_context_switch(cpu); 185 rcu_preempt_note_context_switch(cpu);
129} 186}
187EXPORT_SYMBOL_GPL(rcu_note_context_switch);
130 188
131#ifdef CONFIG_NO_HZ 189#ifdef CONFIG_NO_HZ
132DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 190DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
133 .dynticks_nesting = 1, 191 .dynticks_nesting = 1,
134 .dynticks = 1, 192 .dynticks = ATOMIC_INIT(1),
135}; 193};
136#endif /* #ifdef CONFIG_NO_HZ */ 194#endif /* #ifdef CONFIG_NO_HZ */
137 195
@@ -143,6 +201,9 @@ module_param(blimit, int, 0);
143module_param(qhimark, int, 0); 201module_param(qhimark, int, 0);
144module_param(qlowmark, int, 0); 202module_param(qlowmark, int, 0);
145 203
204int rcu_cpu_stall_suppress __read_mostly;
205module_param(rcu_cpu_stall_suppress, int, 0644);
206
146static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 207static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
147static int rcu_pending(int cpu); 208static int rcu_pending(int cpu);
148 209
@@ -174,6 +235,31 @@ void rcu_bh_force_quiescent_state(void)
174EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 235EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
175 236
176/* 237/*
238 * Record the number of times rcutorture tests have been initiated and
239 * terminated. This information allows the debugfs tracing stats to be
240 * correlated to the rcutorture messages, even when the rcutorture module
241 * is being repeatedly loaded and unloaded. In other words, we cannot
242 * store this state in rcutorture itself.
243 */
244void rcutorture_record_test_transition(void)
245{
246 rcutorture_testseq++;
247 rcutorture_vernum = 0;
248}
249EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
250
251/*
252 * Record the number of writer passes through the current rcutorture test.
253 * This is also used to correlate debugfs tracing stats with the rcutorture
254 * messages.
255 */
256void rcutorture_record_progress(unsigned long vernum)
257{
258 rcutorture_vernum++;
259}
260EXPORT_SYMBOL_GPL(rcutorture_record_progress);
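
The two counters above rely on a simple invariant: the test sequence number is bumped on every rcutorture load and every unload, so it is odd exactly while a test is running, and the version number restarts at zero for each test. A tiny standalone sketch of that invariant (names are illustrative):

#include <stdio.h>

static unsigned long testseq, vernum;

static void test_transition(void) { testseq++; vernum = 0; }   /* load or unload   */
static void test_progress(void)   { vernum++; }                /* one writer pass  */

int main(void)
{
        test_transition();                       /* modprobe rcutorture */
        test_progress();
        test_progress();
        printf("seq=%lu (odd: test running), ver=%lu\n", testseq, vernum);
        test_transition();                       /* rmmod rcutorture */
        printf("seq=%lu (even: no test running)\n", testseq);
        return 0;
}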
261
262/*
177 * Force a quiescent state for RCU-sched. 263 * Force a quiescent state for RCU-sched.
178 */ 264 */
179void rcu_sched_force_quiescent_state(void) 265void rcu_sched_force_quiescent_state(void)
@@ -232,8 +318,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
232 return 1; 318 return 1;
233 } 319 }
234 320
235 /* If preemptable RCU, no point in sending reschedule IPI. */ 321 /* If preemptible RCU, no point in sending reschedule IPI. */
236 if (rdp->preemptable) 322 if (rdp->preemptible)
237 return 0; 323 return 0;
238 324
239 /* The CPU is online, so send it a reschedule IPI. */ 325 /* The CPU is online, so send it a reschedule IPI. */
@@ -262,13 +348,25 @@ void rcu_enter_nohz(void)
262 unsigned long flags; 348 unsigned long flags;
263 struct rcu_dynticks *rdtp; 349 struct rcu_dynticks *rdtp;
264 350
265 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
266 local_irq_save(flags); 351 local_irq_save(flags);
267 rdtp = &__get_cpu_var(rcu_dynticks); 352 rdtp = &__get_cpu_var(rcu_dynticks);
268 rdtp->dynticks++; 353 if (--rdtp->dynticks_nesting) {
269 rdtp->dynticks_nesting--; 354 local_irq_restore(flags);
270 WARN_ON_ONCE(rdtp->dynticks & 0x1); 355 return;
356 }
357 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
358 smp_mb__before_atomic_inc(); /* See above. */
359 atomic_inc(&rdtp->dynticks);
360 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
361 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
271 local_irq_restore(flags); 362 local_irq_restore(flags);
363
364 /* If the interrupt queued a callback, get out of dyntick mode. */
365 if (in_irq() &&
366 (__get_cpu_var(rcu_sched_data).nxtlist ||
367 __get_cpu_var(rcu_bh_data).nxtlist ||
368 rcu_preempt_needs_cpu(smp_processor_id())))
369 set_need_resched();
272} 370}
273 371
274/* 372/*
@@ -284,11 +382,16 @@ void rcu_exit_nohz(void)
284 382
285 local_irq_save(flags); 383 local_irq_save(flags);
286 rdtp = &__get_cpu_var(rcu_dynticks); 384 rdtp = &__get_cpu_var(rcu_dynticks);
287 rdtp->dynticks++; 385 if (rdtp->dynticks_nesting++) {
288 rdtp->dynticks_nesting++; 386 local_irq_restore(flags);
289 WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); 387 return;
388 }
389 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
390 atomic_inc(&rdtp->dynticks);
391 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
392 smp_mb__after_atomic_inc(); /* See above. */
393 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
290 local_irq_restore(flags); 394 local_irq_restore(flags);
291 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
292} 395}
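
The rewrite above collapses the per-CPU dyntick state to a single atomic counter that is even while the CPU is dyntick-idle and odd otherwise, with a plain nesting count so that only the outermost enter/exit flips the counter. Below is a minimal standalone model of that protocol; it is single-threaded, so the memory-ordering barriers the kernel code pairs with the increments are deliberately omitted.

/*
 * Standalone model of the even/odd dynticks protocol: one atomic
 * counter (even = idle, odd = not idle) guarded by a plain nesting
 * count so only the outermost transition touches the atomic.
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

struct dynticks {
        int nesting;            /* irq/process nesting, plain int       */
        atomic_long counter;    /* even while idle, odd otherwise       */
};

static struct dynticks dt = { .nesting = 1, .counter = 1 };  /* boots non-idle */

static void enter_idle(struct dynticks *d)
{
        if (--d->nesting)
                return;                          /* still nested: stay non-idle */
        atomic_fetch_add(&d->counter, 1);        /* odd -> even                 */
        assert((atomic_load(&d->counter) & 0x1) == 0);
}

static void exit_idle(struct dynticks *d)
{
        if (d->nesting++)
                return;                          /* was already non-idle        */
        atomic_fetch_add(&d->counter, 1);        /* even -> odd                 */
        assert(atomic_load(&d->counter) & 0x1);
}

int main(void)
{
        enter_idle(&dt);                         /* CPU goes to sleep    */
        exit_idle(&dt);                          /* interrupt arrives    */
        enter_idle(&dt);                         /* interrupt returns    */
        printf("counter=%ld nesting=%d\n",
               (long)atomic_load(&dt.counter), dt.nesting);
        return 0;
}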
293 396
294/** 397/**
@@ -302,11 +405,15 @@ void rcu_nmi_enter(void)
302{ 405{
303 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 406 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
304 407
305 if (rdtp->dynticks & 0x1) 408 if (rdtp->dynticks_nmi_nesting == 0 &&
409 (atomic_read(&rdtp->dynticks) & 0x1))
306 return; 410 return;
307 rdtp->dynticks_nmi++; 411 rdtp->dynticks_nmi_nesting++;
308 WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1)); 412 smp_mb__before_atomic_inc(); /* Force delay from prior write. */
309 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 413 atomic_inc(&rdtp->dynticks);
414 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
415 smp_mb__after_atomic_inc(); /* See above. */
416 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
310} 417}
311 418
312/** 419/**
@@ -320,11 +427,14 @@ void rcu_nmi_exit(void)
320{ 427{
321 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 428 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
322 429
323 if (rdtp->dynticks & 0x1) 430 if (rdtp->dynticks_nmi_nesting == 0 ||
431 --rdtp->dynticks_nmi_nesting != 0)
324 return; 432 return;
325 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 433 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
326 rdtp->dynticks_nmi++; 434 smp_mb__before_atomic_inc(); /* See above. */
327 WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1); 435 atomic_inc(&rdtp->dynticks);
436 smp_mb__after_atomic_inc(); /* Force delay to next write. */
437 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
328} 438}
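
The NMI handlers only have work to do when an NMI lands on a CPU that is actually idle; if the counter is already odd the NMI simply rides on the existing non-idle state, and the separate NMI nesting count ensures that only the NMI entry that flipped the counter flips it back. A standalone sketch of those two rules (ordering barriers again omitted):

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_long dynticks = 2;        /* even: this CPU is dyntick-idle  */
static int nmi_nesting;

static void nmi_enter(void)
{
        /* If the CPU is already counted as non-idle, nothing to record. */
        if (nmi_nesting == 0 && (atomic_load(&dynticks) & 0x1))
                return;
        nmi_nesting++;
        atomic_fetch_add(&dynticks, 1);          /* even -> odd             */
        assert(atomic_load(&dynticks) & 0x1);
}

static void nmi_exit(void)
{
        /* Undo only what the matching nmi_enter() actually did. */
        if (nmi_nesting == 0 || --nmi_nesting != 0)
                return;
        atomic_fetch_add(&dynticks, 1);          /* odd -> even             */
        assert((atomic_load(&dynticks) & 0x1) == 0);
}

int main(void)
{
        nmi_enter();            /* NMI interrupts an idle CPU: counter goes odd */
        nmi_exit();             /* back to idle: counter even again             */

        atomic_fetch_add(&dynticks, 1);          /* pretend CPU is now non-idle */
        nmi_enter();            /* no-op: already counted as non-idle           */
        nmi_exit();             /* no-op too, nesting never left zero           */
        printf("dynticks=%ld nmi_nesting=%d\n",
               (long)atomic_load(&dynticks), nmi_nesting);
        return 0;
}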
329 439
330/** 440/**
@@ -335,13 +445,7 @@ void rcu_nmi_exit(void)
335 */ 445 */
336void rcu_irq_enter(void) 446void rcu_irq_enter(void)
337{ 447{
338 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 448 rcu_exit_nohz();
339
340 if (rdtp->dynticks_nesting++)
341 return;
342 rdtp->dynticks++;
343 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
344 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
345} 449}
346 450
347/** 451/**
@@ -353,18 +457,7 @@ void rcu_irq_enter(void)
353 */ 457 */
354void rcu_irq_exit(void) 458void rcu_irq_exit(void)
355{ 459{
356 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 460 rcu_enter_nohz();
357
358 if (--rdtp->dynticks_nesting)
359 return;
360 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
361 rdtp->dynticks++;
362 WARN_ON_ONCE(rdtp->dynticks & 0x1);
363
364 /* If the interrupt queued a callback, get out of dyntick mode. */
365 if (__get_cpu_var(rcu_sched_data).nxtlist ||
366 __get_cpu_var(rcu_bh_data).nxtlist)
367 set_need_resched();
368} 461}
369 462
370#ifdef CONFIG_SMP 463#ifdef CONFIG_SMP
@@ -376,19 +469,8 @@ void rcu_irq_exit(void)
376 */ 469 */
377static int dyntick_save_progress_counter(struct rcu_data *rdp) 470static int dyntick_save_progress_counter(struct rcu_data *rdp)
378{ 471{
379 int ret; 472 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
380 int snap; 473 return 0;
381 int snap_nmi;
382
383 snap = rdp->dynticks->dynticks;
384 snap_nmi = rdp->dynticks->dynticks_nmi;
385 smp_mb(); /* Order sampling of snap with end of grace period. */
386 rdp->dynticks_snap = snap;
387 rdp->dynticks_nmi_snap = snap_nmi;
388 ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
389 if (ret)
390 rdp->dynticks_fqs++;
391 return ret;
392} 474}
393 475
394/* 476/*
@@ -399,16 +481,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
399 */ 481 */
400static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 482static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
401{ 483{
402 long curr; 484 unsigned long curr;
403 long curr_nmi; 485 unsigned long snap;
404 long snap;
405 long snap_nmi;
406 486
407 curr = rdp->dynticks->dynticks; 487 curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
408 snap = rdp->dynticks_snap; 488 snap = (unsigned long)rdp->dynticks_snap;
409 curr_nmi = rdp->dynticks->dynticks_nmi;
410 snap_nmi = rdp->dynticks_nmi_snap;
411 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
412 489
413 /* 490 /*
414 * If the CPU passed through or entered a dynticks idle phase with 491 * If the CPU passed through or entered a dynticks idle phase with
@@ -418,8 +495,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
418 * read-side critical section that started before the beginning 495 * read-side critical section that started before the beginning
419 * of the current RCU grace period. 496 * of the current RCU grace period.
420 */ 497 */
421 if ((curr != snap || (curr & 0x1) == 0) && 498 if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
422 (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
423 rdp->dynticks_fqs++; 499 rdp->dynticks_fqs++;
424 return 1; 500 return 1;
425 } 501 }
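
With the atomic counter, force_quiescent_state() needs only one snapshot per CPU: the CPU has passed through a quiescent state if its counter is now even (it is idle right now) or has advanced by at least two since the snapshot (it went idle at some point in between). A sketch of that check follows, assuming ULONG_CMP_GE() is the usual wraparound-tolerant comparison on unsigned longs.

#include <limits.h>
#include <stdio.h>

/* Wrap-tolerant "a >= b" on free-running counters (models ULONG_CMP_GE). */
#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))

static int dyntick_qs(unsigned long snap, unsigned long curr)
{
        /*
         * Quiescent if the CPU is idle right now (counter even) or has
         * taken at least two counter steps since the snapshot, meaning
         * it passed through idle in the meantime.
         */
        return (curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2);
}

int main(void)
{
        printf("%d\n", dyntick_qs(5, 5));  /* busy, unchanged: 0, keep waiting */
        printf("%d\n", dyntick_qs(5, 6));  /* idle now:        1               */
        printf("%d\n", dyntick_qs(5, 7));  /* idled and woke:  1               */
        return 0;
}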
@@ -448,9 +524,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
448 524
449#endif /* #else #ifdef CONFIG_NO_HZ */ 525#endif /* #else #ifdef CONFIG_NO_HZ */
450 526
451#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 527int rcu_cpu_stall_suppress __read_mostly;
452
453int rcu_cpu_stall_panicking __read_mostly;
454 528
455static void record_gp_stall_check_time(struct rcu_state *rsp) 529static void record_gp_stall_check_time(struct rcu_state *rsp)
456{ 530{
@@ -482,8 +556,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
482 rcu_print_task_stall(rnp); 556 rcu_print_task_stall(rnp);
483 raw_spin_unlock_irqrestore(&rnp->lock, flags); 557 raw_spin_unlock_irqrestore(&rnp->lock, flags);
484 558
485 /* OK, time to rat on our buddy... */ 559 /*
486 560 * OK, time to rat on our buddy...
561 * See Documentation/RCU/stallwarn.txt for info on how to debug
562 * RCU CPU stall warnings.
563 */
487 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", 564 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
488 rsp->name); 565 rsp->name);
489 rcu_for_each_leaf_node(rsp, rnp) { 566 rcu_for_each_leaf_node(rsp, rnp) {
@@ -512,6 +589,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
512 unsigned long flags; 589 unsigned long flags;
513 struct rcu_node *rnp = rcu_get_root(rsp); 590 struct rcu_node *rnp = rcu_get_root(rsp);
514 591
592 /*
593 * OK, time to rat on ourselves...
594 * See Documentation/RCU/stallwarn.txt for info on how to debug
595 * RCU CPU stall warnings.
596 */
515 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 597 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
516 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 598 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
517 trigger_all_cpu_backtrace(); 599 trigger_all_cpu_backtrace();
@@ -527,31 +609,50 @@ static void print_cpu_stall(struct rcu_state *rsp)
527 609
528static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 610static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
529{ 611{
530 long delta; 612 unsigned long j;
613 unsigned long js;
531 struct rcu_node *rnp; 614 struct rcu_node *rnp;
532 615
533 if (rcu_cpu_stall_panicking) 616 if (rcu_cpu_stall_suppress)
534 return; 617 return;
535 delta = jiffies - rsp->jiffies_stall; 618 j = ACCESS_ONCE(jiffies);
619 js = ACCESS_ONCE(rsp->jiffies_stall);
536 rnp = rdp->mynode; 620 rnp = rdp->mynode;
537 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { 621 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
538 622
539 /* We haven't checked in, so go dump stack. */ 623 /* We haven't checked in, so go dump stack. */
540 print_cpu_stall(rsp); 624 print_cpu_stall(rsp);
541 625
542 } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { 626 } else if (rcu_gp_in_progress(rsp) &&
627 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
543 628
544 /* They had two time units to dump stack, so complain. */ 629 /* They had a few time units to dump stack, so complain. */
545 print_other_cpu_stall(rsp); 630 print_other_cpu_stall(rsp);
546 } 631 }
547} 632}
548 633
549static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) 634static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
550{ 635{
551 rcu_cpu_stall_panicking = 1; 636 rcu_cpu_stall_suppress = 1;
552 return NOTIFY_DONE; 637 return NOTIFY_DONE;
553} 638}
554 639
640/**
641 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
642 *
643 * Set the stall-warning timeout way off into the future, thus preventing
644 * any RCU CPU stall-warning messages from appearing in the current set of
645 * RCU grace periods.
646 *
647 * The caller must disable hard irqs.
648 */
649void rcu_cpu_stall_reset(void)
650{
651 rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
652 rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
653 rcu_preempt_stall_reset();
654}
655
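
Both the stall check above and rcu_cpu_stall_reset() depend on comparing free-running jiffies values in a wraparound-safe way; adding ULONG_MAX / 2 pushes the stall deadline as far into the future as such a comparison can express. A small demonstration, again assuming the usual half-range definition of ULONG_CMP_GE():

#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
        unsigned long j = ULONG_MAX - 10;       /* jiffies just before wrapping      */
        unsigned long deadline = j + 30;        /* wraps around to a small value     */

        printf("expired now?   %d\n", ULONG_CMP_GE(j, deadline));       /* 0 */
        printf("expired later? %d\n", ULONG_CMP_GE(j + 40, deadline));  /* 1 */

        /* The rcu_cpu_stall_reset() trick: a maximally distant deadline. */
        deadline = j + ULONG_MAX / 2;
        printf("suppressed?    %d\n", !ULONG_CMP_GE(j, deadline));      /* 1 */
        return 0;
}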
555static struct notifier_block rcu_panic_block = { 656static struct notifier_block rcu_panic_block = {
556 .notifier_call = rcu_panic, 657 .notifier_call = rcu_panic,
557}; 658};
@@ -561,22 +662,6 @@ static void __init check_cpu_stall_init(void)
561 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); 662 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
562} 663}
563 664
564#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
565
566static void record_gp_stall_check_time(struct rcu_state *rsp)
567{
568}
569
570static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
571{
572}
573
574static void __init check_cpu_stall_init(void)
575{
576}
577
578#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
579
580/* 665/*
581 * Update CPU-local rcu_data state to record the newly noticed grace period. 666 * Update CPU-local rcu_data state to record the newly noticed grace period.
582 * This is used both when we started the grace period and when we notice 667 * This is used both when we started the grace period and when we notice
@@ -587,9 +672,17 @@ static void __init check_cpu_stall_init(void)
587static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 672static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
588{ 673{
589 if (rdp->gpnum != rnp->gpnum) { 674 if (rdp->gpnum != rnp->gpnum) {
590 rdp->qs_pending = 1; 675 /*
591 rdp->passed_quiesc = 0; 676 * If the current grace period is waiting for this CPU,
677 * set up to detect a quiescent state, otherwise don't
678 * go looking for one.
679 */
592 rdp->gpnum = rnp->gpnum; 680 rdp->gpnum = rnp->gpnum;
681 if (rnp->qsmask & rdp->grpmask) {
682 rdp->qs_pending = 1;
683 rdp->passed_quiesc = 0;
684 } else
685 rdp->qs_pending = 0;
593 } 686 }
594} 687}
595 688
@@ -648,6 +741,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
648 741
649 /* Remember that we saw this grace-period completion. */ 742 /* Remember that we saw this grace-period completion. */
650 rdp->completed = rnp->completed; 743 rdp->completed = rnp->completed;
744
745 /*
746 * If we were in an extended quiescent state, we may have
                                 747	 * missed some grace periods that other CPUs handled on
748 * our behalf. Catch up with this state to avoid noting
749 * spurious new grace periods. If another grace period
750 * has started, then rnp->gpnum will have advanced, so
751 * we will detect this later on.
752 */
753 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
754 rdp->gpnum = rdp->completed;
755
756 /*
757 * If RCU does not need a quiescent state from this CPU,
758 * then make sure that this CPU doesn't go looking for one.
759 */
760 if ((rnp->qsmask & rdp->grpmask) == 0)
761 rdp->qs_pending = 0;
651 } 762 }
652} 763}
653 764
@@ -712,7 +823,7 @@ static void
712rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 823rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
713 __releases(rcu_get_root(rsp)->lock) 824 __releases(rcu_get_root(rsp)->lock)
714{ 825{
715 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 826 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
716 struct rcu_node *rnp = rcu_get_root(rsp); 827 struct rcu_node *rnp = rcu_get_root(rsp);
717 828
718 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { 829 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
@@ -753,6 +864,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
753 rnp->completed = rsp->completed; 864 rnp->completed = rsp->completed;
754 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 865 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
755 rcu_start_gp_per_cpu(rsp, rnp, rdp); 866 rcu_start_gp_per_cpu(rsp, rnp, rdp);
867 rcu_preempt_boost_start_gp(rnp);
756 raw_spin_unlock_irqrestore(&rnp->lock, flags); 868 raw_spin_unlock_irqrestore(&rnp->lock, flags);
757 return; 869 return;
758 } 870 }
@@ -788,6 +900,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
788 rnp->completed = rsp->completed; 900 rnp->completed = rsp->completed;
789 if (rnp == rdp->mynode) 901 if (rnp == rdp->mynode)
790 rcu_start_gp_per_cpu(rsp, rnp, rdp); 902 rcu_start_gp_per_cpu(rsp, rnp, rdp);
903 rcu_preempt_boost_start_gp(rnp);
791 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 904 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
792 } 905 }
793 906
@@ -808,7 +921,18 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
808static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 921static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
809 __releases(rcu_get_root(rsp)->lock) 922 __releases(rcu_get_root(rsp)->lock)
810{ 923{
924 unsigned long gp_duration;
925
811 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 926 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
927
928 /*
929 * Ensure that all grace-period and pre-grace-period activity
930 * is seen before the assignment to rsp->completed.
931 */
932 smp_mb(); /* See above block comment. */
933 gp_duration = jiffies - rsp->gp_start;
934 if (gp_duration > rsp->gp_max)
935 rsp->gp_max = gp_duration;
812 rsp->completed = rsp->gpnum; 936 rsp->completed = rsp->gpnum;
813 rsp->signaled = RCU_GP_IDLE; 937 rsp->signaled = RCU_GP_IDLE;
814 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 938 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
@@ -838,7 +962,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
838 return; 962 return;
839 } 963 }
840 rnp->qsmask &= ~mask; 964 rnp->qsmask &= ~mask;
841 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 965 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
842 966
843 /* Other bits still set at this level, so done. */ 967 /* Other bits still set at this level, so done. */
844 raw_spin_unlock_irqrestore(&rnp->lock, flags); 968 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -951,65 +1075,49 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
951#ifdef CONFIG_HOTPLUG_CPU 1075#ifdef CONFIG_HOTPLUG_CPU
952 1076
953/* 1077/*
954 * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the 1078 * Move a dying CPU's RCU callbacks to online CPU's callback list.
955 * specified flavor of RCU. The callbacks will be adopted by the next 1079 * Synchronization is not required because this function executes
956 * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever 1080 * in stop_machine() context.
957 * comes first. Because this is invoked from the CPU_DYING notifier,
958 * irqs are already disabled.
959 */ 1081 */
960static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1082static void rcu_send_cbs_to_online(struct rcu_state *rsp)
961{ 1083{
962 int i; 1084 int i;
963 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 1085 /* current DYING CPU is cleared in the cpu_online_mask */
1086 int receive_cpu = cpumask_any(cpu_online_mask);
1087 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1088 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
964 1089
965 if (rdp->nxtlist == NULL) 1090 if (rdp->nxtlist == NULL)
966 return; /* irqs disabled, so comparison is stable. */ 1091 return; /* irqs disabled, so comparison is stable. */
967 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1092
968 *rsp->orphan_cbs_tail = rdp->nxtlist; 1093 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
969 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 1094 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1095 receive_rdp->qlen += rdp->qlen;
1096 receive_rdp->n_cbs_adopted += rdp->qlen;
1097 rdp->n_cbs_orphaned += rdp->qlen;
1098
970 rdp->nxtlist = NULL; 1099 rdp->nxtlist = NULL;
971 for (i = 0; i < RCU_NEXT_SIZE; i++) 1100 for (i = 0; i < RCU_NEXT_SIZE; i++)
972 rdp->nxttail[i] = &rdp->nxtlist; 1101 rdp->nxttail[i] = &rdp->nxtlist;
973 rsp->orphan_qlen += rdp->qlen;
974 rdp->qlen = 0; 1102 rdp->qlen = 0;
975 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
976}
977
978/*
979 * Adopt previously orphaned RCU callbacks.
980 */
981static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
982{
983 unsigned long flags;
984 struct rcu_data *rdp;
985
986 raw_spin_lock_irqsave(&rsp->onofflock, flags);
987 rdp = rsp->rda[smp_processor_id()];
988 if (rsp->orphan_cbs_list == NULL) {
989 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
990 return;
991 }
992 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
993 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
994 rdp->qlen += rsp->orphan_qlen;
995 rsp->orphan_cbs_list = NULL;
996 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
997 rsp->orphan_qlen = 0;
998 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
999} 1103}
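
rcu_send_cbs_to_online() works because each per-CPU callback queue is a singly linked list kept together with tail pointers, so one CPU's entire backlog can be appended to another CPU's list with a couple of pointer assignments and a length update. A standalone sketch of that splice, modeling only a single tail pointer rather than the kernel's nxttail[] array:

#include <stdio.h>

struct cb {
        struct cb *next;
        int id;
};

struct cblist {
        struct cb *head;
        struct cb **tail;       /* points at head, or at the last cb's ->next */
        long qlen;
};

static void cblist_init(struct cblist *l)
{
        l->head = NULL;
        l->tail = &l->head;
        l->qlen = 0;
}

static void cblist_enqueue(struct cblist *l, struct cb *cb)
{
        cb->next = NULL;
        *l->tail = cb;          /* link after the current tail */
        l->tail = &cb->next;    /* remember the new tail slot  */
        l->qlen++;
}

/* Move every callback from @from onto the end of @to, in O(1). */
static void cblist_splice(struct cblist *to, struct cblist *from)
{
        if (!from->head)
                return;
        *to->tail = from->head;
        to->tail = from->tail;
        to->qlen += from->qlen;
        cblist_init(from);      /* leave the donor empty but valid */
}

int main(void)
{
        struct cblist dying, online;
        struct cb cbs[4];
        struct cb *p;
        int i;

        cblist_init(&dying);
        cblist_init(&online);
        for (i = 0; i < 4; i++) {
                cbs[i].id = i;
                cblist_enqueue(i < 2 ? &dying : &online, &cbs[i]);
        }
        cblist_splice(&online, &dying);          /* "dying CPU" hands over its work */
        for (p = online.head; p; p = p->next)
                printf("cb %d\n", p->id);
        printf("qlen=%ld\n", online.qlen);
        return 0;
}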
1000 1104
1001/* 1105/*
1002 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 1106 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
1003 * and move all callbacks from the outgoing CPU to the current one. 1107 * and move all callbacks from the outgoing CPU to the current one.
1108 * There can only be one CPU hotplug operation at a time, so no other
1109 * CPU can be attempting to update rcu_cpu_kthread_task.
1004 */ 1110 */
1005static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 1111static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1006{ 1112{
1007 unsigned long flags; 1113 unsigned long flags;
1008 unsigned long mask; 1114 unsigned long mask;
1009 int need_report = 0; 1115 int need_report = 0;
1010 struct rcu_data *rdp = rsp->rda[cpu]; 1116 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1011 struct rcu_node *rnp; 1117 struct rcu_node *rnp;
1012 1118
1119 rcu_stop_cpu_kthread(cpu);
1120
1013 /* Exclude any attempts to start a new grace period. */ 1121 /* Exclude any attempts to start a new grace period. */
1014 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1122 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1015 1123
@@ -1046,8 +1154,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1046 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1154 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1047 if (need_report & RCU_OFL_TASKS_EXP_GP) 1155 if (need_report & RCU_OFL_TASKS_EXP_GP)
1048 rcu_report_exp_rnp(rsp, rnp); 1156 rcu_report_exp_rnp(rsp, rnp);
1049 1157 rcu_node_kthread_setaffinity(rnp, -1);
1050 rcu_adopt_orphan_cbs(rsp);
1051} 1158}
1052 1159
1053/* 1160/*
@@ -1065,11 +1172,7 @@ static void rcu_offline_cpu(int cpu)
1065 1172
1066#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1173#else /* #ifdef CONFIG_HOTPLUG_CPU */
1067 1174
1068static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1175static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1069{
1070}
1071
1072static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1073{ 1176{
1074} 1177}
1075 1178
@@ -1113,7 +1216,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1113 next = list->next; 1216 next = list->next;
1114 prefetch(next); 1217 prefetch(next);
1115 debug_rcu_head_unqueue(list); 1218 debug_rcu_head_unqueue(list);
1116 list->func(list); 1219 __rcu_reclaim(list);
1117 list = next; 1220 list = next;
1118 if (++count >= rdp->blimit) 1221 if (++count >= rdp->blimit)
1119 break; 1222 break;
@@ -1123,6 +1226,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1123 1226
1124 /* Update count, and requeue any remaining callbacks. */ 1227 /* Update count, and requeue any remaining callbacks. */
1125 rdp->qlen -= count; 1228 rdp->qlen -= count;
1229 rdp->n_cbs_invoked += count;
1126 if (list != NULL) { 1230 if (list != NULL) {
1127 *tail = rdp->nxtlist; 1231 *tail = rdp->nxtlist;
1128 rdp->nxtlist = list; 1232 rdp->nxtlist = list;
@@ -1148,7 +1252,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1148 1252
1149 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1253 /* Re-raise the RCU softirq if there are callbacks remaining. */
1150 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1254 if (cpu_has_callbacks_ready_to_invoke(rdp))
1151 raise_softirq(RCU_SOFTIRQ); 1255 invoke_rcu_core();
1152} 1256}
1153 1257
1154/* 1258/*
@@ -1194,7 +1298,7 @@ void rcu_check_callbacks(int cpu, int user)
1194 } 1298 }
1195 rcu_preempt_check_callbacks(cpu); 1299 rcu_preempt_check_callbacks(cpu);
1196 if (rcu_pending(cpu)) 1300 if (rcu_pending(cpu))
1197 raise_softirq(RCU_SOFTIRQ); 1301 invoke_rcu_core();
1198} 1302}
1199 1303
1200#ifdef CONFIG_SMP 1304#ifdef CONFIG_SMP
@@ -1202,6 +1306,8 @@ void rcu_check_callbacks(int cpu, int user)
1202/* 1306/*
1203 * Scan the leaf rcu_node structures, processing dyntick state for any that 1307 * Scan the leaf rcu_node structures, processing dyntick state for any that
1204 * have not yet encountered a quiescent state, using the function specified. 1308 * have not yet encountered a quiescent state, using the function specified.
1309 * Also initiate boosting for any threads blocked on the root rcu_node.
1310 *
1205 * The caller must have suppressed start of new grace periods. 1311 * The caller must have suppressed start of new grace periods.
1206 */ 1312 */
1207static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) 1313static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
@@ -1220,13 +1326,14 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1220 return; 1326 return;
1221 } 1327 }
1222 if (rnp->qsmask == 0) { 1328 if (rnp->qsmask == 0) {
1223 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1329 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
1224 continue; 1330 continue;
1225 } 1331 }
1226 cpu = rnp->grplo; 1332 cpu = rnp->grplo;
1227 bit = 1; 1333 bit = 1;
1228 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 1334 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
1229 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1335 if ((rnp->qsmask & bit) != 0 &&
1336 f(per_cpu_ptr(rsp->rda, cpu)))
1230 mask |= bit; 1337 mask |= bit;
1231 } 1338 }
1232 if (mask != 0) { 1339 if (mask != 0) {
@@ -1237,6 +1344,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1237 } 1344 }
1238 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1345 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1239 } 1346 }
1347 rnp = rcu_get_root(rsp);
1348 if (rnp->qsmask == 0) {
1349 raw_spin_lock_irqsave(&rnp->lock, flags);
1350 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1351 }
1240} 1352}
1241 1353
1242/* 1354/*
@@ -1351,7 +1463,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1351 } 1463 }
1352 1464
1353 /* If there are callbacks ready, invoke them. */ 1465 /* If there are callbacks ready, invoke them. */
1354 rcu_do_batch(rsp, rdp); 1466 if (cpu_has_callbacks_ready_to_invoke(rdp))
1467 invoke_rcu_callbacks(rsp, rdp);
1355} 1468}
1356 1469
1357/* 1470/*
@@ -1359,29 +1472,37 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1359 */ 1472 */
1360static void rcu_process_callbacks(struct softirq_action *unused) 1473static void rcu_process_callbacks(struct softirq_action *unused)
1361{ 1474{
1362 /*
1363 * Memory references from any prior RCU read-side critical sections
1364 * executed by the interrupted code must be seen before any RCU
1365 * grace-period manipulations below.
1366 */
1367 smp_mb(); /* See above block comment. */
1368
1369 __rcu_process_callbacks(&rcu_sched_state, 1475 __rcu_process_callbacks(&rcu_sched_state,
1370 &__get_cpu_var(rcu_sched_data)); 1476 &__get_cpu_var(rcu_sched_data));
1371 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1477 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1372 rcu_preempt_process_callbacks(); 1478 rcu_preempt_process_callbacks();
1373 1479
1374 /*
1375 * Memory references from any later RCU read-side critical sections
1376 * executed by the interrupted code must be seen after any RCU
1377 * grace-period manipulations above.
1378 */
1379 smp_mb(); /* See above block comment. */
1380
1381 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ 1480 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1382 rcu_needs_cpu_flush(); 1481 rcu_needs_cpu_flush();
1383} 1482}
1384 1483
1484/*
1485 * Wake up the current CPU's kthread. This replaces raise_softirq()
1486 * in earlier versions of RCU. Note that because we are running on
1487 * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
1488 * cannot disappear out from under us.
1489 */
1490static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1491{
1492 if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
1493 return;
1494 if (likely(!rsp->boost)) {
1495 rcu_do_batch(rsp, rdp);
1496 return;
1497 }
1498 invoke_rcu_callbacks_kthread();
1499}
1500
1501static void invoke_rcu_core(void)
1502{
1503 raise_softirq(RCU_SOFTIRQ);
1504}
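
invoke_rcu_callbacks() above separates "callbacks are ready" from "who runs them": without boosting they still run from softirq context, with boosting they are handed to a per-CPU kthread that can be priority-boosted. A toy model of that dispatch split, with plain functions standing in for the softirq and the kthread wakeup:

#include <stdbool.h>
#include <stdio.h>

static bool boost_enabled;

static void do_batch(void)        { printf("running callbacks inline\n"); }
static void wake_cb_kthread(void) { printf("waking per-CPU callback kthread\n"); }

static void invoke_callbacks(void)
{
        if (!boost_enabled) {
                do_batch();             /* cheap path: run right here        */
                return;
        }
        wake_cb_kthread();              /* boosted path: defer to a kthread  */
}

int main(void)
{
        invoke_callbacks();
        boost_enabled = true;
        invoke_callbacks();
        return 0;
}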
1505
1385static void 1506static void
1386__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1507__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1387 struct rcu_state *rsp) 1508 struct rcu_state *rsp)
@@ -1402,21 +1523,17 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1402 * a quiescent state betweentimes. 1523 * a quiescent state betweentimes.
1403 */ 1524 */
1404 local_irq_save(flags); 1525 local_irq_save(flags);
1405 rdp = rsp->rda[smp_processor_id()]; 1526 rdp = this_cpu_ptr(rsp->rda);
1406 rcu_process_gp_end(rsp, rdp);
1407 check_for_new_grace_period(rsp, rdp);
1408 1527
1409 /* Add the callback to our list. */ 1528 /* Add the callback to our list. */
1410 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1529 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1411 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1530 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1531 rdp->qlen++;
1412 1532
1413 /* Start a new grace period if one not already started. */ 1533 /* If interrupts were disabled, don't dive into RCU core. */
1414 if (!rcu_gp_in_progress(rsp)) { 1534 if (irqs_disabled_flags(flags)) {
1415 unsigned long nestflag; 1535 local_irq_restore(flags);
1416 struct rcu_node *rnp_root = rcu_get_root(rsp); 1536 return;
1417
1418 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1419 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1420 } 1537 }
1421 1538
1422 /* 1539 /*
@@ -1426,13 +1543,28 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1426 * invoking force_quiescent_state() if the newly enqueued callback 1543 * invoking force_quiescent_state() if the newly enqueued callback
1427 * is the only one waiting for a grace period to complete. 1544 * is the only one waiting for a grace period to complete.
1428 */ 1545 */
1429 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1546 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1430 rdp->blimit = LONG_MAX; 1547
1431 if (rsp->n_force_qs == rdp->n_force_qs_snap && 1548 /* Are we ignoring a completed grace period? */
1432 *rdp->nxttail[RCU_DONE_TAIL] != head) 1549 rcu_process_gp_end(rsp, rdp);
1433 force_quiescent_state(rsp, 0); 1550 check_for_new_grace_period(rsp, rdp);
1434 rdp->n_force_qs_snap = rsp->n_force_qs; 1551
1435 rdp->qlen_last_fqs_check = rdp->qlen; 1552 /* Start a new grace period if one not already started. */
1553 if (!rcu_gp_in_progress(rsp)) {
1554 unsigned long nestflag;
1555 struct rcu_node *rnp_root = rcu_get_root(rsp);
1556
1557 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1558 rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
1559 } else {
1560 /* Give the grace period a kick. */
1561 rdp->blimit = LONG_MAX;
1562 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1563 *rdp->nxttail[RCU_DONE_TAIL] != head)
1564 force_quiescent_state(rsp, 0);
1565 rdp->n_force_qs_snap = rsp->n_force_qs;
1566 rdp->qlen_last_fqs_check = rdp->qlen;
1567 }
1436 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) 1568 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1437 force_quiescent_state(rsp, 1); 1569 force_quiescent_state(rsp, 1);
1438 local_irq_restore(flags); 1570 local_irq_restore(flags);
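
The reworked __call_rcu() keeps the enqueue path cheap: it only counts, and the expensive grace-period work runs at most once per qhimark enqueues because the watermark is advanced whenever the check fires. A sketch of that high-water-mark throttle (threshold and names are illustrative):

#include <stdio.h>

#define QHIMARK 10000

static long qlen;                       /* callbacks currently queued    */
static long qlen_last_check;            /* qlen when we last intervened  */
static int expensive_checks;

static void enqueue_callback(void)
{
        qlen++;                                         /* fast path            */
        if (qlen > qlen_last_check + QHIMARK) {         /* rare slow path       */
                expensive_checks++;                     /* e.g. kick a GP along */
                qlen_last_check = qlen;
        }
}

int main(void)
{
        long i;

        for (i = 0; i < 35000; i++)
                enqueue_callback();
        printf("enqueued %ld callbacks, expensive checks: %d\n",
               i, expensive_checks);
        return 0;
}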
@@ -1547,7 +1679,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1547 * or RCU-bh, force a local reschedule. 1679 * or RCU-bh, force a local reschedule.
1548 */ 1680 */
1549 rdp->n_rp_qs_pending++; 1681 rdp->n_rp_qs_pending++;
1550 if (!rdp->preemptable && 1682 if (!rdp->preemptible &&
1551 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, 1683 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1552 jiffies)) 1684 jiffies))
1553 set_need_resched(); 1685 set_need_resched();
@@ -1662,13 +1794,12 @@ static void _rcu_barrier(struct rcu_state *rsp,
1662 * decrement rcu_barrier_cpu_count -- otherwise the first CPU 1794 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1663 * might complete its grace period before all of the other CPUs 1795 * might complete its grace period before all of the other CPUs
1664 * did their increment, causing this function to return too 1796 * did their increment, causing this function to return too
1665 * early. 1797 * early. Note that on_each_cpu() disables irqs, which prevents
1798 * any CPUs from coming online or going offline until each online
1799 * CPU has queued its RCU-barrier callback.
1666 */ 1800 */
1667 atomic_set(&rcu_barrier_cpu_count, 1); 1801 atomic_set(&rcu_barrier_cpu_count, 1);
1668 preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
1669 rcu_adopt_orphan_cbs(rsp);
1670 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); 1802 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1671 preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
1672 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 1803 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1673 complete(&rcu_barrier_completion); 1804 complete(&rcu_barrier_completion);
1674 wait_for_completion(&rcu_barrier_completion); 1805 wait_for_completion(&rcu_barrier_completion);
@@ -1701,7 +1832,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1701{ 1832{
1702 unsigned long flags; 1833 unsigned long flags;
1703 int i; 1834 int i;
1704 struct rcu_data *rdp = rsp->rda[cpu]; 1835 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1705 struct rcu_node *rnp = rcu_get_root(rsp); 1836 struct rcu_node *rnp = rcu_get_root(rsp);
1706 1837
1707 /* Set up local state, ensuring consistent view of global state. */ 1838 /* Set up local state, ensuring consistent view of global state. */
@@ -1725,11 +1856,11 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1725 * that this CPU cannot possibly have any RCU callbacks in flight yet. 1856 * that this CPU cannot possibly have any RCU callbacks in flight yet.
1726 */ 1857 */
1727static void __cpuinit 1858static void __cpuinit
1728rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) 1859rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1729{ 1860{
1730 unsigned long flags; 1861 unsigned long flags;
1731 unsigned long mask; 1862 unsigned long mask;
1732 struct rcu_data *rdp = rsp->rda[cpu]; 1863 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1733 struct rcu_node *rnp = rcu_get_root(rsp); 1864 struct rcu_node *rnp = rcu_get_root(rsp);
1734 1865
1735 /* Set up local state, ensuring consistent view of global state. */ 1866 /* Set up local state, ensuring consistent view of global state. */
@@ -1737,7 +1868,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1737 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1868 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1738 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1869 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1739 rdp->beenonline = 1; /* We have now been online. */ 1870 rdp->beenonline = 1; /* We have now been online. */
1740 rdp->preemptable = preemptable; 1871 rdp->preemptible = preemptible;
1741 rdp->qlen_last_fqs_check = 0; 1872 rdp->qlen_last_fqs_check = 0;
1742 rdp->n_force_qs_snap = rsp->n_force_qs; 1873 rdp->n_force_qs_snap = rsp->n_force_qs;
1743 rdp->blimit = blimit; 1874 rdp->blimit = blimit;
@@ -1771,7 +1902,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1771 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1902 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1772} 1903}
1773 1904
1774static void __cpuinit rcu_online_cpu(int cpu) 1905static void __cpuinit rcu_prepare_cpu(int cpu)
1775{ 1906{
1776 rcu_init_percpu_data(cpu, &rcu_sched_state, 0); 1907 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
1777 rcu_init_percpu_data(cpu, &rcu_bh_state, 0); 1908 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
@@ -1785,27 +1916,34 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1785 unsigned long action, void *hcpu) 1916 unsigned long action, void *hcpu)
1786{ 1917{
1787 long cpu = (long)hcpu; 1918 long cpu = (long)hcpu;
1919 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1920 struct rcu_node *rnp = rdp->mynode;
1788 1921
1789 switch (action) { 1922 switch (action) {
1790 case CPU_UP_PREPARE: 1923 case CPU_UP_PREPARE:
1791 case CPU_UP_PREPARE_FROZEN: 1924 case CPU_UP_PREPARE_FROZEN:
1792 rcu_online_cpu(cpu); 1925 rcu_prepare_cpu(cpu);
1926 rcu_prepare_kthreads(cpu);
1927 break;
1928 case CPU_ONLINE:
1929 case CPU_DOWN_FAILED:
1930 rcu_node_kthread_setaffinity(rnp, -1);
1931 rcu_cpu_kthread_setrt(cpu, 1);
1932 break;
1933 case CPU_DOWN_PREPARE:
1934 rcu_node_kthread_setaffinity(rnp, cpu);
1935 rcu_cpu_kthread_setrt(cpu, 0);
1793 break; 1936 break;
1794 case CPU_DYING: 1937 case CPU_DYING:
1795 case CPU_DYING_FROZEN: 1938 case CPU_DYING_FROZEN:
1796 /* 1939 /*
1797 * preempt_disable() in _rcu_barrier() prevents stop_machine(), 1940 * The whole machine is "stopped" except this CPU, so we can
1798 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" 1941 * touch any data without introducing corruption. We send the
1799 * returns, all online cpus have queued rcu_barrier_func(). 1942 * dying CPU's callbacks to an arbitrarily chosen online CPU.
1800 * The dying CPU clears its cpu_online_mask bit and
1801 * moves all of its RCU callbacks to ->orphan_cbs_list
1802 * in the context of stop_machine(), so subsequent calls
1803 * to _rcu_barrier() will adopt these callbacks and only
1804 * then queue rcu_barrier_func() on all remaining CPUs.
1805 */ 1943 */
1806 rcu_send_cbs_to_orphanage(&rcu_bh_state); 1944 rcu_send_cbs_to_online(&rcu_bh_state);
1807 rcu_send_cbs_to_orphanage(&rcu_sched_state); 1945 rcu_send_cbs_to_online(&rcu_sched_state);
1808 rcu_preempt_send_cbs_to_orphanage(); 1946 rcu_preempt_send_cbs_to_online();
1809 break; 1947 break;
1810 case CPU_DEAD: 1948 case CPU_DEAD:
1811 case CPU_DEAD_FROZEN: 1949 case CPU_DEAD_FROZEN:
@@ -1843,8 +1981,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1843{ 1981{
1844 int i; 1982 int i;
1845 1983
1846 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) 1984 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
1847 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 1985 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1986 rsp->levelspread[0] = RCU_FANOUT_LEAF;
1848} 1987}
1849#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 1988#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1850static void __init rcu_init_levelspread(struct rcu_state *rsp) 1989static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -1865,7 +2004,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1865/* 2004/*
1866 * Helper function for rcu_init() that initializes one rcu_state structure. 2005 * Helper function for rcu_init() that initializes one rcu_state structure.
1867 */ 2006 */
1868static void __init rcu_init_one(struct rcu_state *rsp) 2007static void __init rcu_init_one(struct rcu_state *rsp,
2008 struct rcu_data __percpu *rda)
1869{ 2009{
1870 static char *buf[] = { "rcu_node_level_0", 2010 static char *buf[] = { "rcu_node_level_0",
1871 "rcu_node_level_1", 2011 "rcu_node_level_1",
@@ -1911,46 +2051,29 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1911 j / rsp->levelspread[i - 1]; 2051 j / rsp->levelspread[i - 1];
1912 } 2052 }
1913 rnp->level = i; 2053 rnp->level = i;
1914 INIT_LIST_HEAD(&rnp->blocked_tasks[0]); 2054 INIT_LIST_HEAD(&rnp->blkd_tasks);
1915 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1916 INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
1917 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1918 } 2055 }
1919 } 2056 }
1920 2057
2058 rsp->rda = rda;
1921 rnp = rsp->level[NUM_RCU_LVLS - 1]; 2059 rnp = rsp->level[NUM_RCU_LVLS - 1];
1922 for_each_possible_cpu(i) { 2060 for_each_possible_cpu(i) {
1923 while (i > rnp->grphi) 2061 while (i > rnp->grphi)
1924 rnp++; 2062 rnp++;
1925 rsp->rda[i]->mynode = rnp; 2063 per_cpu_ptr(rsp->rda, i)->mynode = rnp;
1926 rcu_boot_init_percpu_data(i, rsp); 2064 rcu_boot_init_percpu_data(i, rsp);
1927 } 2065 }
1928} 2066}
1929 2067
1930/*
1931 * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
1932 * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
1933 * structure.
1934 */
1935#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1936do { \
1937 int i; \
1938 \
1939 for_each_possible_cpu(i) { \
1940 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1941 } \
1942 rcu_init_one(rsp); \
1943} while (0)
1944
1945void __init rcu_init(void) 2068void __init rcu_init(void)
1946{ 2069{
1947 int cpu; 2070 int cpu;
1948 2071
1949 rcu_bootup_announce(); 2072 rcu_bootup_announce();
1950 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 2073 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
1951 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 2074 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
1952 __rcu_init_preempt(); 2075 __rcu_init_preempt();
1953 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 2076 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1954 2077
1955 /* 2078 /*
1956 * We don't need protection against CPU-hotplug here because 2079 * We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 14c040b18ed0..01b2ccda26fb 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -31,46 +31,51 @@
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. 32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly. 33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this did work well going from three levels to four.
35 * bug somewhere. 35 * Of course, your mileage may vary.
36 */ 36 */
37#define MAX_RCU_LVLS 4 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#if CONFIG_RCU_FANOUT > 16
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_LEAF 16
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) 41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42 42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#if NR_CPUS <= RCU_FANOUT 43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
47
48#if NR_CPUS <= RCU_FANOUT_1
44# define NUM_RCU_LVLS 1 49# define NUM_RCU_LVLS 1
45# define NUM_RCU_LVL_0 1 50# define NUM_RCU_LVL_0 1
46# define NUM_RCU_LVL_1 (NR_CPUS) 51# define NUM_RCU_LVL_1 (NR_CPUS)
47# define NUM_RCU_LVL_2 0 52# define NUM_RCU_LVL_2 0
48# define NUM_RCU_LVL_3 0 53# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0 54# define NUM_RCU_LVL_4 0
50#elif NR_CPUS <= RCU_FANOUT_SQ 55#elif NR_CPUS <= RCU_FANOUT_2
51# define NUM_RCU_LVLS 2 56# define NUM_RCU_LVLS 2
52# define NUM_RCU_LVL_0 1 57# define NUM_RCU_LVL_0 1
53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 58# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
54# define NUM_RCU_LVL_2 (NR_CPUS) 59# define NUM_RCU_LVL_2 (NR_CPUS)
55# define NUM_RCU_LVL_3 0 60# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0 61# define NUM_RCU_LVL_4 0
57#elif NR_CPUS <= RCU_FANOUT_CUBE 62#elif NR_CPUS <= RCU_FANOUT_3
58# define NUM_RCU_LVLS 3 63# define NUM_RCU_LVLS 3
59# define NUM_RCU_LVL_0 1 64# define NUM_RCU_LVL_0 1
60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 65# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 66# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
62# define NUM_RCU_LVL_3 NR_CPUS 67# define NUM_RCU_LVL_3 (NR_CPUS)
63# define NUM_RCU_LVL_4 0 68# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH 69#elif NR_CPUS <= RCU_FANOUT_4
65# define NUM_RCU_LVLS 4 70# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1 71# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) 72# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 73# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 74# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
70# define NUM_RCU_LVL_4 NR_CPUS 75# define NUM_RCU_LVL_4 (NR_CPUS)
71#else 76#else
72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 77# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
73#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 78#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
74 79
75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) 80#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 81#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
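
The new macros cap the leaf fanout at 16 while interior levels keep CONFIG_RCU_FANOUT, and the number of levels is simply how many times NR_CPUS must be divided down before a single root node suffices. A small program that reproduces the arithmetic for a few illustrative configurations:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

static void shape(long nr_cpus, long fanout, long fanout_leaf)
{
        long width = DIV_ROUND_UP(nr_cpus, fanout_leaf);  /* leaf rcu_node count */
        long nodes = width;
        int levels = 1;

        while (width > 1) {                               /* add interior levels */
                width = DIV_ROUND_UP(width, fanout);
                nodes += width;
                levels++;
        }
        printf("NR_CPUS=%-5ld FANOUT=%-3ld LEAF=%-3ld -> %d levels, %ld rcu_node structures\n",
               nr_cpus, fanout, fanout_leaf, levels, nodes);
}

int main(void)
{
        shape(16, 64, 16);      /* one leaf node doubles as the root            */
        shape(256, 64, 16);     /* two levels: 16 leaves under one root         */
        shape(4096, 64, 16);    /* three levels: 256 leaves, 4 interior, 1 root */
        return 0;
}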
@@ -79,13 +84,19 @@
79 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
80 */ 85 */
81struct rcu_dynticks { 86struct rcu_dynticks {
82 int dynticks_nesting; /* Track nesting level, sort of. */ 87 int dynticks_nesting; /* Track irq/process nesting level. */
83 int dynticks; /* Even value for dynticks-idle, else odd. */ 88 int dynticks_nmi_nesting; /* Track NMI nesting level. */
84 int dynticks_nmi; /* Even value for either dynticks-idle or */ 89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */
85 /* not in nmi handler, else odd. So this */
86 /* remains even for nmi from irq handler. */
87}; 90};
88 91
92/* RCU's kthread states for tracing. */
93#define RCU_KTHREAD_STOPPED 0
94#define RCU_KTHREAD_RUNNING 1
95#define RCU_KTHREAD_WAITING 2
96#define RCU_KTHREAD_OFFCPU 3
97#define RCU_KTHREAD_YIELDING 4
98#define RCU_KTHREAD_MAX 4
99
89/* 100/*
90 * Definition for node within the RCU grace-period-detection hierarchy. 101 * Definition for node within the RCU grace-period-detection hierarchy.
91 */ 102 */
@@ -104,10 +115,13 @@ struct rcu_node {
104 /* an rcu_data structure, otherwise, each */ 115 /* an rcu_data structure, otherwise, each */
105 /* bit corresponds to a child rcu_node */ 116 /* bit corresponds to a child rcu_node */
106 /* structure. */ 117 /* structure. */
107 unsigned long expmask; /* Groups that have ->blocked_tasks[] */ 118 unsigned long expmask; /* Groups that have ->blkd_tasks */
108 /* elements that need to drain to allow the */ 119 /* elements that need to drain to allow the */
109 /* current expedited grace period to */ 120 /* current expedited grace period to */
110 /* complete (only for TREE_PREEMPT_RCU). */ 121 /* complete (only for TREE_PREEMPT_RCU). */
122 atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
123 /* Since this has meaning only for leaf */
124 /* rcu_node structures, 32 bits suffices. */
111 unsigned long qsmaskinit; 125 unsigned long qsmaskinit;
112 /* Per-GP initial value for qsmask & expmask. */ 126 /* Per-GP initial value for qsmask & expmask. */
113 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 127 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -117,11 +131,62 @@ struct rcu_node {
117 u8 grpnum; /* CPU/group number for next level up. */ 131 u8 grpnum; /* CPU/group number for next level up. */
118 u8 level; /* root is at level 0. */ 132 u8 level; /* root is at level 0. */
119 struct rcu_node *parent; 133 struct rcu_node *parent;
120 struct list_head blocked_tasks[4]; 134 struct list_head blkd_tasks;
121 /* Tasks blocked in RCU read-side critsect. */ 135 /* Tasks blocked in RCU read-side critical */
122 /* Grace period number (->gpnum) x blocked */ 136 /* section. Tasks are placed at the head */
123 /* by tasks on the (x & 0x1) element of the */ 137 /* of this list and age towards the tail. */
124 /* blocked_tasks[] array. */ 138 struct list_head *gp_tasks;
139 /* Pointer to the first task blocking the */
140 /* current grace period, or NULL if there */
141 /* is no such task. */
142 struct list_head *exp_tasks;
143 /* Pointer to the first task blocking the */
144 /* current expedited grace period, or NULL */
145 /* if there is no such task. If there */
146 /* is no current expedited grace period, */
147 /* then there cannot be any such task. */
148#ifdef CONFIG_RCU_BOOST
149 struct list_head *boost_tasks;
150 /* Pointer to first task that needs to be */
151 /* priority boosted, or NULL if no priority */
152 /* boosting is needed for this rcu_node */
153 /* structure. If there are no tasks */
154 /* queued on this rcu_node structure that */
155 /* are blocking the current grace period, */
156 /* there can be no such task. */
157 unsigned long boost_time;
158 /* When to start boosting (jiffies). */
159 struct task_struct *boost_kthread_task;
160 /* kthread that takes care of priority */
161 /* boosting for this rcu_node structure. */
162 unsigned int boost_kthread_status;
163 /* State of boost_kthread_task for tracing. */
164 unsigned long n_tasks_boosted;
165 /* Total number of tasks boosted. */
166 unsigned long n_exp_boosts;
167 /* Number of tasks boosted for expedited GP. */
168 unsigned long n_normal_boosts;
169 /* Number of tasks boosted for normal GP. */
170 unsigned long n_balk_blkd_tasks;
171 /* Refused to boost: no blocked tasks. */
172 unsigned long n_balk_exp_gp_tasks;
173 /* Refused to boost: nothing blocking GP. */
174 unsigned long n_balk_boost_tasks;
175 /* Refused to boost: already boosting. */
176 unsigned long n_balk_notblocked;
177 /* Refused to boost: RCU RS CS still running. */
178 unsigned long n_balk_notyet;
179 /* Refused to boost: not yet time. */
180 unsigned long n_balk_nos;
181 /* Refused to boost: not sure why, though. */
182 /* This can happen due to race conditions. */
183#endif /* #ifdef CONFIG_RCU_BOOST */
184 struct task_struct *node_kthread_task;
185 /* kthread that takes care of this rcu_node */
186 /* structure, for example, awakening the */
187 /* per-CPU kthreads as needed. */
188 unsigned int node_kthread_status;
189 /* State of node_kthread_task for tracing. */
125} ____cacheline_internodealigned_in_smp; 190} ____cacheline_internodealigned_in_smp;
126 191
127/* 192/*
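The single ->blkd_tasks list above replaces the old four-way ->blocked_tasks[] array: newly preempted readers go in at the head, entries age toward the tail, and ->gp_tasks / ->exp_tasks simply point at the first entry still blocking the current normal or expedited grace period (or are NULL). A simplified sketch of that one-list-plus-cursors shape, with hypothetical types for illustration:

#include <linux/list.h>

/* Hypothetical, stripped-down mirror of the relevant rcu_node fields. */
struct blkd_node_sketch {
	struct list_head blkd_tasks;	/* all readers preempted on this node   */
	struct list_head *gp_tasks;	/* first entry blocking the current GP  */
	struct list_head *exp_tasks;	/* first entry blocking an expedited GP */
};

/* "Do queued readers still block the current grace period?" becomes a
 * single pointer test instead of scanning two of the old per-phase
 * sublists.
 */
static inline int blocked_readers_cgp(struct blkd_node_sketch *rnp)
{
	return rnp->gp_tasks != NULL;
}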
@@ -170,7 +235,7 @@ struct rcu_data {
170 bool passed_quiesc; /* User-mode/idle loop etc. */ 235 bool passed_quiesc; /* User-mode/idle loop etc. */
171 bool qs_pending; /* Core waits for quiesc state. */ 236 bool qs_pending; /* Core waits for quiesc state. */
172 bool beenonline; /* CPU online at least once. */ 237 bool beenonline; /* CPU online at least once. */
173 bool preemptable; /* Preemptable RCU? */ 238 bool preemptible; /* Preemptible RCU? */
174 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 239 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
175 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 240 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
176 241
@@ -202,6 +267,9 @@ struct rcu_data {
202 long qlen; /* # of queued callbacks */ 267 long qlen; /* # of queued callbacks */
203 long qlen_last_fqs_check; 268 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */ 269 /* qlen at last check for QS forcing */
270 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
271 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
272 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
205 unsigned long n_force_qs_snap; 273 unsigned long n_force_qs_snap;
206 /* did other CPU force QS recently? */ 274 /* did other CPU force QS recently? */
207 long blimit; /* Upper limit on a processed batch */ 275 long blimit; /* Upper limit on a processed batch */
@@ -210,7 +278,6 @@ struct rcu_data {
210 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
211 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
212 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
213 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
214#endif /* #ifdef CONFIG_NO_HZ */ 281#endif /* #ifdef CONFIG_NO_HZ */
215 282
216 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 283 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
@@ -246,7 +313,6 @@ struct rcu_data {
246#endif /* #else #ifdef CONFIG_NO_HZ */ 313#endif /* #else #ifdef CONFIG_NO_HZ */
247 314
248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 315#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
249#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
250 316
251#ifdef CONFIG_PROVE_RCU 317#ifdef CONFIG_PROVE_RCU
252#define RCU_STALL_DELAY_DELTA (5 * HZ) 318#define RCU_STALL_DELAY_DELTA (5 * HZ)
@@ -254,19 +320,26 @@ struct rcu_data {
254#define RCU_STALL_DELAY_DELTA 0 320#define RCU_STALL_DELAY_DELTA 0
255#endif 321#endif
256 322
257#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) 323#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
324 RCU_STALL_DELAY_DELTA)
258 /* for rsp->jiffies_stall */ 325 /* for rsp->jiffies_stall */
259#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) 326#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
260 /* for rsp->jiffies_stall */ 327 /* for rsp->jiffies_stall */
261#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 328#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
262 /* to take at least one */ 329 /* to take at least one */
263 /* scheduling clock irq */ 330 /* scheduling clock irq */
264 /* before ratting on them. */ 331 /* before ratting on them. */
265 332
266#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 333#define rcu_wait(cond) \
267 334do { \
268#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) 335 for (;;) { \
269#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) 336 set_current_state(TASK_INTERRUPTIBLE); \
337 if (cond) \
338 break; \
339 schedule(); \
340 } \
341 __set_current_state(TASK_RUNNING); \
342} while (0)
270 343
271/* 344/*
272 * RCU global state, including node hierarchy. This hierarchy is 345 * RCU global state, including node hierarchy. This hierarchy is
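The rcu_wait() macro above is the familiar set-state / test / schedule() kthread wait loop, and the stall macros now scale with CONFIG_RCU_CPU_STALL_TIMEOUT (a timeout of 60, for instance, yields 60 * HZ + RCU_STALL_DELAY_DELTA for the first check and three times that plus 30 jiffies for the recheck). A hedged usage sketch of the wait loop in a hypothetical kthread, in the same style as the per-node and boost kthreads added by this merge, which wait with rcu_wait(rnp->boost_tasks || rnp->exp_tasks):

/* Hypothetical kthread body; example_state and do_example_work() are
 * illustrative stand-ins, not kernel symbols.
 */
static int example_kthread(void *arg)
{
	struct example_state *sp = arg;

	for (;;) {
		rcu_wait(sp->work_pending);	/* sleeps in TASK_INTERRUPTIBLE */
		sp->work_pending = 0;
		do_example_work(sp);
	}
	return 0;	/* NOTREACHED */
}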
@@ -283,7 +356,7 @@ struct rcu_state {
283 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ 356 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
284 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 357 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
285 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ 358 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
286 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ 359 struct rcu_data __percpu *rda; /* pointer to percpu rcu_data. */
287 360
288 /* The following fields are guarded by the root rcu_node's lock. */ 361 /* The following fields are guarded by the root rcu_node's lock. */
289 362
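Replacing the per-state array of NR_CPUS rcu_data pointers with one __percpu pointer means accesses now go through per_cpu_ptr(), as the rcu_preempt_note_context_switch() change later in this diff shows (rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu)). A minimal sketch of that access pattern, using a hypothetical dynamically allocated percpu object rather than the kernel's static per-CPU data:

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/init.h>

struct example_pcpu {			/* hypothetical per-CPU payload */
	unsigned long count;
};

static struct example_pcpu __percpu *example_data;

static int __init example_init(void)
{
	int cpu;

	example_data = alloc_percpu(struct example_pcpu);
	if (!example_data)
		return -ENOMEM;

	for_each_possible_cpu(cpu)
		per_cpu_ptr(example_data, cpu)->count = 0;
	return 0;
}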
@@ -296,21 +369,14 @@ struct rcu_state {
296 /* period because */ 369 /* period because */
297 /* force_quiescent_state() */ 370 /* force_quiescent_state() */
298 /* was running. */ 371 /* was running. */
372 u8 boost; /* Subject to priority boost. */
299 unsigned long gpnum; /* Current gp number. */ 373 unsigned long gpnum; /* Current gp number. */
300 unsigned long completed; /* # of last completed gp. */ 374 unsigned long completed; /* # of last completed gp. */
301 375
302 /* End of fields guarded by root rcu_node's lock. */ 376 /* End of fields guarded by root rcu_node's lock. */
303 377
304 raw_spinlock_t onofflock; /* exclude on/offline and */ 378 raw_spinlock_t onofflock; /* exclude on/offline and */
305 /* starting new GP. Also */ 379 /* starting new GP. */
306 /* protects the following */
307 /* orphan_cbs fields. */
308 struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
309 /* orphaned by all CPUs in */
310 /* a given leaf rcu_node */
311 /* going offline. */
312 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
313 long orphan_qlen; /* Number of orphaned cbs. */
314 raw_spinlock_t fqslock; /* Only one task forcing */ 380 raw_spinlock_t fqslock; /* Only one task forcing */
315 /* quiescent states. */ 381 /* quiescent states. */
316 unsigned long jiffies_force_qs; /* Time at which to invoke */ 382 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -321,12 +387,12 @@ struct rcu_state {
321 /* due to lock unavailable. */ 387 /* due to lock unavailable. */
322 unsigned long n_force_qs_ngp; /* Number of calls leaving */ 388 unsigned long n_force_qs_ngp; /* Number of calls leaving */
323 /* due to no GP active. */ 389 /* due to no GP active. */
324#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
325 unsigned long gp_start; /* Time at which GP started, */ 390 unsigned long gp_start; /* Time at which GP started, */
326 /* but in jiffies. */ 391 /* but in jiffies. */
327 unsigned long jiffies_stall; /* Time at which to check */ 392 unsigned long jiffies_stall; /* Time at which to check */
328 /* for CPU stalls. */ 393 /* for CPU stalls. */
329#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 394 unsigned long gp_max; /* Maximum GP duration in */
395 /* jiffies. */
330 char *name; /* Name of structure. */ 396 char *name; /* Name of structure. */
331}; 397};
332 398
@@ -357,15 +423,15 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
357static void rcu_bootup_announce(void); 423static void rcu_bootup_announce(void);
358long rcu_batches_completed(void); 424long rcu_batches_completed(void);
359static void rcu_preempt_note_context_switch(int cpu); 425static void rcu_preempt_note_context_switch(int cpu);
360static int rcu_preempted_readers(struct rcu_node *rnp); 426static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
361#ifdef CONFIG_HOTPLUG_CPU 427#ifdef CONFIG_HOTPLUG_CPU
362static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 428static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
363 unsigned long flags); 429 unsigned long flags);
430static void rcu_stop_cpu_kthread(int cpu);
364#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 431#endif /* #ifdef CONFIG_HOTPLUG_CPU */
365#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
366static void rcu_print_detail_task_stall(struct rcu_state *rsp); 432static void rcu_print_detail_task_stall(struct rcu_state *rsp);
367static void rcu_print_task_stall(struct rcu_node *rnp); 433static void rcu_print_task_stall(struct rcu_node *rnp);
368#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 434static void rcu_preempt_stall_reset(void);
369static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 435static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
370#ifdef CONFIG_HOTPLUG_CPU 436#ifdef CONFIG_HOTPLUG_CPU
371static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 437static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
@@ -382,8 +448,23 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
382static int rcu_preempt_pending(int cpu); 448static int rcu_preempt_pending(int cpu);
383static int rcu_preempt_needs_cpu(int cpu); 449static int rcu_preempt_needs_cpu(int cpu);
384static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 450static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
385static void rcu_preempt_send_cbs_to_orphanage(void); 451static void rcu_preempt_send_cbs_to_online(void);
386static void __init __rcu_init_preempt(void); 452static void __init __rcu_init_preempt(void);
387static void rcu_needs_cpu_flush(void); 453static void rcu_needs_cpu_flush(void);
454static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
455static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
456static void invoke_rcu_callbacks_kthread(void);
457#ifdef CONFIG_RCU_BOOST
458static void rcu_preempt_do_callbacks(void);
459static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
460 cpumask_var_t cm);
461static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
462 struct rcu_node *rnp,
463 int rnp_index);
464static void invoke_rcu_node_kthread(struct rcu_node *rnp);
465static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
466#endif /* #ifdef CONFIG_RCU_BOOST */
467static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
468static void __cpuinit rcu_prepare_kthreads(int cpu);
388 469
389#endif /* #ifndef RCU_TREE_NONCORE */ 470#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 0e4f420245d9..8aafbb80b8b0 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h>
28 29
29/* 30/*
30 * Check the RCU kernel configuration parameters and print informative 31 * Check the RCU kernel configuration parameters and print informative
@@ -53,11 +54,7 @@ static void __init rcu_bootup_announce_oddness(void)
53#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
54 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
55#endif 56#endif
56#ifndef CONFIG_RCU_CPU_STALL_DETECTOR 57#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
57 printk(KERN_INFO
58 "\tRCU-based detection of stalled CPUs is disabled.\n");
59#endif
60#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 58 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
62#endif 59#endif
63#if NUM_RCU_LVL_4 != 0 60#if NUM_RCU_LVL_4 != 0
@@ -69,7 +66,9 @@ static void __init rcu_bootup_announce_oddness(void)
69 66
70struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
71DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
69static struct rcu_state *rcu_state = &rcu_preempt_state;
72 70
71static void rcu_read_unlock_special(struct task_struct *t);
73static int rcu_preempted_readers_exp(struct rcu_node *rnp); 72static int rcu_preempted_readers_exp(struct rcu_node *rnp);
74 73
75/* 74/*
@@ -77,7 +76,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
77 */ 76 */
78static void __init rcu_bootup_announce(void) 77static void __init rcu_bootup_announce(void)
79{ 78{
80 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); 79 printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n");
81 rcu_bootup_announce_oddness(); 80 rcu_bootup_announce_oddness();
82} 81}
83 82
@@ -110,7 +109,7 @@ void rcu_force_quiescent_state(void)
110EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 109EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
111 110
112/* 111/*
113 * Record a preemptable-RCU quiescent state for the specified CPU. Note 112 * Record a preemptible-RCU quiescent state for the specified CPU. Note
114 * that this just means that the task currently running on the CPU is 113 * that this just means that the task currently running on the CPU is
115 * not in a quiescent state. There might be any number of tasks blocked 114 * not in a quiescent state. There might be any number of tasks blocked
116 * while in an RCU read-side critical section. 115 * while in an RCU read-side critical section.
@@ -133,12 +132,12 @@ static void rcu_preempt_qs(int cpu)
133 * We have entered the scheduler, and the current task might soon be 132 * We have entered the scheduler, and the current task might soon be
134 * context-switched away from. If this task is in an RCU read-side 133 * context-switched away from. If this task is in an RCU read-side
135 * critical section, we will no longer be able to rely on the CPU to 134 * critical section, we will no longer be able to rely on the CPU to
136 * record that fact, so we enqueue the task on the appropriate entry 135 * record that fact, so we enqueue the task on the blkd_tasks list.
137 * of the blocked_tasks[] array. The task will dequeue itself when 136 * The task will dequeue itself when it exits the outermost enclosing
138 * it exits the outermost enclosing RCU read-side critical section. 137 * RCU read-side critical section. Therefore, the current grace period
139 * Therefore, the current grace period cannot be permitted to complete 138 * cannot be permitted to complete until the blkd_tasks list entries
140 * until the blocked_tasks[] entry indexed by the low-order bit of 139 * predating the current grace period drain, in other words, until
141 * rnp->gpnum empties. 140 * rnp->gp_tasks becomes NULL.
142 * 141 *
143 * Caller must disable preemption. 142 * Caller must disable preemption.
144 */ 143 */
@@ -146,15 +145,14 @@ static void rcu_preempt_note_context_switch(int cpu)
146{ 145{
147 struct task_struct *t = current; 146 struct task_struct *t = current;
148 unsigned long flags; 147 unsigned long flags;
149 int phase;
150 struct rcu_data *rdp; 148 struct rcu_data *rdp;
151 struct rcu_node *rnp; 149 struct rcu_node *rnp;
152 150
153 if (t->rcu_read_lock_nesting && 151 if (t->rcu_read_lock_nesting > 0 &&
154 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 152 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
155 153
156 /* Possibly blocking in an RCU read-side critical section. */ 154 /* Possibly blocking in an RCU read-side critical section. */
157 rdp = rcu_preempt_state.rda[cpu]; 155 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
158 rnp = rdp->mynode; 156 rnp = rdp->mynode;
159 raw_spin_lock_irqsave(&rnp->lock, flags); 157 raw_spin_lock_irqsave(&rnp->lock, flags);
160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 158 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -168,16 +166,39 @@ static void rcu_preempt_note_context_switch(int cpu)
168 * (i.e., this CPU has not yet passed through a quiescent 166 * (i.e., this CPU has not yet passed through a quiescent
169 * state for the current grace period), then as long 167 * state for the current grace period), then as long
170 * as that task remains queued, the current grace period 168 * as that task remains queued, the current grace period
171 * cannot end. 169 * cannot end. Note that there is some uncertainty as
170 * to exactly when the current grace period started.
171 * We take a conservative approach, which can result
172 * in unnecessarily waiting on tasks that started very
173 * slightly after the current grace period began. C'est
174 * la vie!!!
172 * 175 *
173 * But first, note that the current CPU must still be 176 * But first, note that the current CPU must still be
174 * on line! 177 * on line!
175 */ 178 */
176 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); 179 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
177 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 180 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
178 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 181 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
179 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 182 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
183 rnp->gp_tasks = &t->rcu_node_entry;
184#ifdef CONFIG_RCU_BOOST
185 if (rnp->boost_tasks != NULL)
186 rnp->boost_tasks = rnp->gp_tasks;
187#endif /* #ifdef CONFIG_RCU_BOOST */
188 } else {
189 list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
190 if (rnp->qsmask & rdp->grpmask)
191 rnp->gp_tasks = &t->rcu_node_entry;
192 }
180 raw_spin_unlock_irqrestore(&rnp->lock, flags); 193 raw_spin_unlock_irqrestore(&rnp->lock, flags);
194 } else if (t->rcu_read_lock_nesting < 0 &&
195 t->rcu_read_unlock_special) {
196
197 /*
198 * Complete exit from RCU read-side critical section on
199 * behalf of preempted instance of __rcu_read_unlock().
200 */
201 rcu_read_unlock_special(t);
181 } 202 }
182 203
183 /* 204 /*
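The enqueue logic above keeps ->blkd_tasks ordered so that everything from ->gp_tasks to the tail blocks the current grace period: a reader whose CPU still owes the grace period a quiescent state is inserted just before the current *gp_tasks entry and becomes the new ->gp_tasks, while any other reader is added at the head, ahead of the blocking region. A condensed sketch of that placement decision (hypothetical simplified types, with locking and the boost_tasks handling elided):

#include <linux/list.h>
#include <linux/types.h>

struct rnp_sketch {			/* hypothetical, simplified */
	struct list_head blkd_tasks;
	struct list_head *gp_tasks;
};

/* blocks_gp: this reader's CPU has not yet reported a quiescent state
 * for the current grace period (rnp->qsmask & rdp->grpmask in the
 * kernel code).
 */
static void enqueue_blocked_reader(struct rnp_sketch *rnp,
				   struct list_head *entry, bool blocks_gp)
{
	if (blocks_gp && rnp->gp_tasks != NULL) {
		/* Insert immediately before the current first GP blocker... */
		list_add(entry, rnp->gp_tasks->prev);
		/* ...and this entry becomes the new first GP blocker. */
		rnp->gp_tasks = entry;
	} else {
		list_add(entry, &rnp->blkd_tasks);	/* head of the list */
		if (blocks_gp)
			rnp->gp_tasks = entry;	/* previously no GP blockers */
	}
}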
@@ -195,13 +216,13 @@ static void rcu_preempt_note_context_switch(int cpu)
195} 216}
196 217
197/* 218/*
198 * Tree-preemptable RCU implementation for rcu_read_lock(). 219 * Tree-preemptible RCU implementation for rcu_read_lock().
199 * Just increment ->rcu_read_lock_nesting, shared state will be updated 220 * Just increment ->rcu_read_lock_nesting, shared state will be updated
200 * if we block. 221 * if we block.
201 */ 222 */
202void __rcu_read_lock(void) 223void __rcu_read_lock(void)
203{ 224{
204 ACCESS_ONCE(current->rcu_read_lock_nesting)++; 225 current->rcu_read_lock_nesting++;
205 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ 226 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
206} 227}
207EXPORT_SYMBOL_GPL(__rcu_read_lock); 228EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -211,12 +232,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
211 * for the specified rcu_node structure. If the caller needs a reliable 232 * for the specified rcu_node structure. If the caller needs a reliable
212 * answer, it must hold the rcu_node's ->lock. 233 * answer, it must hold the rcu_node's ->lock.
213 */ 234 */
214static int rcu_preempted_readers(struct rcu_node *rnp) 235static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
215{ 236{
216 int phase = rnp->gpnum & 0x1; 237 return rnp->gp_tasks != NULL;
217
218 return !list_empty(&rnp->blocked_tasks[phase]) ||
219 !list_empty(&rnp->blocked_tasks[phase + 2]);
220} 238}
221 239
222/* 240/*
@@ -232,7 +250,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
232 unsigned long mask; 250 unsigned long mask;
233 struct rcu_node *rnp_p; 251 struct rcu_node *rnp_p;
234 252
235 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 253 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
236 raw_spin_unlock_irqrestore(&rnp->lock, flags); 254 raw_spin_unlock_irqrestore(&rnp->lock, flags);
237 return; /* Still need more quiescent states! */ 255 return; /* Still need more quiescent states! */
238 } 256 }
@@ -256,15 +274,31 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
256} 274}
257 275
258/* 276/*
277 * Advance a ->blkd_tasks-list pointer to the next entry, instead
278 * returning NULL if at the end of the list.
279 */
280static struct list_head *rcu_next_node_entry(struct task_struct *t,
281 struct rcu_node *rnp)
282{
283 struct list_head *np;
284
285 np = t->rcu_node_entry.next;
286 if (np == &rnp->blkd_tasks)
287 np = NULL;
288 return np;
289}
290
291/*
259 * Handle special cases during rcu_read_unlock(), such as needing to 292 * Handle special cases during rcu_read_unlock(), such as needing to
260 * notify RCU core processing or task having blocked during the RCU 293 * notify RCU core processing or task having blocked during the RCU
261 * read-side critical section. 294 * read-side critical section.
262 */ 295 */
263static void rcu_read_unlock_special(struct task_struct *t) 296static noinline void rcu_read_unlock_special(struct task_struct *t)
264{ 297{
265 int empty; 298 int empty;
266 int empty_exp; 299 int empty_exp;
267 unsigned long flags; 300 unsigned long flags;
301 struct list_head *np;
268 struct rcu_node *rnp; 302 struct rcu_node *rnp;
269 int special; 303 int special;
270 304
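rcu_next_node_entry() above exists because kernel list_heads are circular: the entry "after the last one" is the list head itself, and the ->gp_tasks / ->exp_tasks cursors want NULL there instead. The same idiom in a generic sketch with a hypothetical element type:

#include <linux/list.h>

struct item {				/* hypothetical element type */
	struct list_head link;
};

/* Return the entry following *it, or NULL if *it is the last entry on
 * the list headed by *head.
 */
static struct list_head *next_or_null(struct item *it, struct list_head *head)
{
	struct list_head *np = it->link.next;

	return (np == head) ? NULL : np;
}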
@@ -284,7 +318,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
284 } 318 }
285 319
286 /* Hardware IRQ handlers cannot block. */ 320 /* Hardware IRQ handlers cannot block. */
287 if (in_irq()) { 321 if (in_irq() || in_serving_softirq()) {
288 local_irq_restore(flags); 322 local_irq_restore(flags);
289 return; 323 return;
290 } 324 }
@@ -305,10 +339,24 @@ static void rcu_read_unlock_special(struct task_struct *t)
305 break; 339 break;
306 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 340 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
307 } 341 }
308 empty = !rcu_preempted_readers(rnp); 342 empty = !rcu_preempt_blocked_readers_cgp(rnp);
309 empty_exp = !rcu_preempted_readers_exp(rnp); 343 empty_exp = !rcu_preempted_readers_exp(rnp);
310 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 344 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
345 np = rcu_next_node_entry(t, rnp);
311 list_del_init(&t->rcu_node_entry); 346 list_del_init(&t->rcu_node_entry);
347 if (&t->rcu_node_entry == rnp->gp_tasks)
348 rnp->gp_tasks = np;
349 if (&t->rcu_node_entry == rnp->exp_tasks)
350 rnp->exp_tasks = np;
351#ifdef CONFIG_RCU_BOOST
352 if (&t->rcu_node_entry == rnp->boost_tasks)
353 rnp->boost_tasks = np;
354 /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
355 if (t->rcu_boosted) {
356 special |= RCU_READ_UNLOCK_BOOSTED;
357 t->rcu_boosted = 0;
358 }
359#endif /* #ifdef CONFIG_RCU_BOOST */
312 t->rcu_blocked_node = NULL; 360 t->rcu_blocked_node = NULL;
313 361
314 /* 362 /*
@@ -321,6 +369,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
321 else 369 else
322 rcu_report_unblock_qs_rnp(rnp, flags); 370 rcu_report_unblock_qs_rnp(rnp, flags);
323 371
372#ifdef CONFIG_RCU_BOOST
373 /* Unboost if we were boosted. */
374 if (special & RCU_READ_UNLOCK_BOOSTED) {
375 rt_mutex_unlock(t->rcu_boost_mutex);
376 t->rcu_boost_mutex = NULL;
377 }
378#endif /* #ifdef CONFIG_RCU_BOOST */
379
324 /* 380 /*
325 * If this was the last task on the expedited lists, 381 * If this was the last task on the expedited lists,
326 * then we need to report up the rcu_node hierarchy. 382 * then we need to report up the rcu_node hierarchy.
@@ -333,7 +389,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
333} 389}
334 390
335/* 391/*
336 * Tree-preemptable RCU implementation for rcu_read_unlock(). 392 * Tree-preemptible RCU implementation for rcu_read_unlock().
337 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost 393 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
338 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then 394 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
339 * invoke rcu_read_unlock_special() to clean up after a context switch 395 * invoke rcu_read_unlock_special() to clean up after a context switch
@@ -344,17 +400,26 @@ void __rcu_read_unlock(void)
344 struct task_struct *t = current; 400 struct task_struct *t = current;
345 401
346 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ 402 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
347 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 403 if (t->rcu_read_lock_nesting != 1)
348 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 404 --t->rcu_read_lock_nesting;
349 rcu_read_unlock_special(t); 405 else {
406 t->rcu_read_lock_nesting = INT_MIN;
407 barrier(); /* assign before ->rcu_read_unlock_special load */
408 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
409 rcu_read_unlock_special(t);
410 barrier(); /* ->rcu_read_unlock_special load before assign */
411 t->rcu_read_lock_nesting = 0;
412 }
350#ifdef CONFIG_PROVE_LOCKING 413#ifdef CONFIG_PROVE_LOCKING
351 WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0); 414 {
415 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
416
417 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
418 }
352#endif /* #ifdef CONFIG_PROVE_LOCKING */ 419#endif /* #ifdef CONFIG_PROVE_LOCKING */
353} 420}
354EXPORT_SYMBOL_GPL(__rcu_read_unlock); 421EXPORT_SYMBOL_GPL(__rcu_read_unlock);
355 422
356#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
357
358#ifdef CONFIG_RCU_CPU_STALL_VERBOSE 423#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
359 424
360/* 425/*
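The reworked __rcu_read_unlock() above parks ->rcu_read_lock_nesting at INT_MIN while the special-case processing runs, so the new nesting < 0 branch in rcu_preempt_note_context_switch() can tell "leaving the outermost critical section" apart from both "inside one" (positive) and "fully outside" (zero); the PROVE_LOCKING check correspondingly warns only about negative values that are not near INT_MIN. A user-space rendering of the counter protocol, for illustration only (no barriers or concurrency shown):

#include <limits.h>
#include <stdio.h>

/* Hypothetical, simplified task state: the nesting counter plus a flag
 * standing in for ->rcu_read_unlock_special.
 */
struct task_sketch {
	int nesting;
	int unlock_special;
};

static void sketch_read_unlock(struct task_sketch *t)
{
	if (t->nesting != 1) {
		--t->nesting;			/* still nested, nothing special */
	} else {
		t->nesting = INT_MIN;		/* sentinel: outermost unlock in flight */
		if (t->unlock_special)
			t->unlock_special = 0;	/* kernel: rcu_read_unlock_special(t) */
		t->nesting = 0;			/* fully outside the critical section */
	}
}

int main(void)
{
	struct task_sketch t = { .nesting = 2, .unlock_special = 1 };

	sketch_read_unlock(&t);		/* 2 -> 1 */
	sketch_read_unlock(&t);		/* 1 -> INT_MIN -> 0, special case handled */
	printf("%d %d\n", t.nesting, t.unlock_special);	/* 0 0 */
	return 0;
}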
@@ -364,18 +429,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock);
364static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) 429static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
365{ 430{
366 unsigned long flags; 431 unsigned long flags;
367 struct list_head *lp;
368 int phase;
369 struct task_struct *t; 432 struct task_struct *t;
370 433
371 if (rcu_preempted_readers(rnp)) { 434 if (!rcu_preempt_blocked_readers_cgp(rnp))
372 raw_spin_lock_irqsave(&rnp->lock, flags); 435 return;
373 phase = rnp->gpnum & 0x1; 436 raw_spin_lock_irqsave(&rnp->lock, flags);
374 lp = &rnp->blocked_tasks[phase]; 437 t = list_entry(rnp->gp_tasks,
375 list_for_each_entry(t, lp, rcu_node_entry) 438 struct task_struct, rcu_node_entry);
376 sched_show_task(t); 439 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
377 raw_spin_unlock_irqrestore(&rnp->lock, flags); 440 sched_show_task(t);
378 } 441 raw_spin_unlock_irqrestore(&rnp->lock, flags);
379} 442}
380 443
381/* 444/*
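The stall-dump loops above no longer walk per-phase sublists: they convert ->gp_tasks back into a task_struct with list_entry() and then continue down the remainder of ->blkd_tasks, since everything from there to the tail blocks the current grace period. A generic sketch of resuming an iteration from a saved cursor with list_for_each_entry_continue(), using hypothetical types; note that the _continue variant starts at the entry after the one passed in:

#include <linux/list.h>
#include <linux/kernel.h>

struct blocked_item {			/* hypothetical element type */
	struct list_head link;
	int id;
};

/* Visit every entry after *cursor, to the end of the list. */
static void walk_after_cursor(struct list_head *head, struct list_head *cursor)
{
	struct blocked_item *it;

	if (cursor == NULL)
		return;				/* nothing is blocked */
	it = list_entry(cursor, struct blocked_item, link);
	list_for_each_entry_continue(it, head, link)	/* starts at cursor->next */
		printk(KERN_INFO "blocked: %d\n", it->id);
}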
@@ -405,19 +468,25 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
405 */ 468 */
406static void rcu_print_task_stall(struct rcu_node *rnp) 469static void rcu_print_task_stall(struct rcu_node *rnp)
407{ 470{
408 struct list_head *lp;
409 int phase;
410 struct task_struct *t; 471 struct task_struct *t;
411 472
412 if (rcu_preempted_readers(rnp)) { 473 if (!rcu_preempt_blocked_readers_cgp(rnp))
413 phase = rnp->gpnum & 0x1; 474 return;
414 lp = &rnp->blocked_tasks[phase]; 475 t = list_entry(rnp->gp_tasks,
415 list_for_each_entry(t, lp, rcu_node_entry) 476 struct task_struct, rcu_node_entry);
416 printk(" P%d", t->pid); 477 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
417 } 478 printk(" P%d", t->pid);
418} 479}
419 480
420#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 481/*
482 * Suppress preemptible RCU's CPU stall warnings by pushing the
483 * time of the next stall-warning message comfortably far into the
484 * future.
485 */
486static void rcu_preempt_stall_reset(void)
487{
488 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
489}
421 490
422/* 491/*
423 * Check that the list of blocked tasks for the newly completed grace 492 * Check that the list of blocked tasks for the newly completed grace
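rcu_preempt_stall_reset() above works because stall checks compare jiffies against ->jiffies_stall with the wraparound-safe ULONG_CMP_GE()/ULONG_CMP_LT() macros from rcutree.h, so adding ULONG_MAX / 2 pushes the deadline as far into the future as that comparison can express. A standalone sketch of the modular comparison, with made-up counter values:

#include <stdio.h>
#include <limits.h>

#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long now = ULONG_MAX - 5;	/* counter about to wrap */
	unsigned long deadline = now + 10;	/* wraps around to 4     */

	/* Despite the wrap, the deadline still compares as "in the future". */
	printf("deadline still ahead? %d\n", ULONG_CMP_LT(now, deadline));	/* 1 */
	printf("deadline reached?     %d\n", ULONG_CMP_GE(now, deadline));	/* 0 */

	/* Pushing a deadline out by ULONG_MAX / 2 keeps it ahead for as long
	 * as the unsigned comparison can represent.
	 */
	deadline = now + ULONG_MAX / 2;
	printf("far-future deadline reached? %d\n", ULONG_CMP_GE(now, deadline)); /* 0 */
	return 0;
}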
@@ -425,10 +494,15 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
425 * period that still has RCU readers blocked! This function must be 494 * period that still has RCU readers blocked! This function must be
426 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock 495 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
427 * must be held by the caller. 496 * must be held by the caller.
497 *
498 * Also, if there are blocked tasks on the list, they automatically
499 * block the newly created grace period, so set up ->gp_tasks accordingly.
428 */ 500 */
429static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 501static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
430{ 502{
431 WARN_ON_ONCE(rcu_preempted_readers(rnp)); 503 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
504 if (!list_empty(&rnp->blkd_tasks))
505 rnp->gp_tasks = rnp->blkd_tasks.next;
432 WARN_ON_ONCE(rnp->qsmask); 506 WARN_ON_ONCE(rnp->qsmask);
433} 507}
434 508
@@ -452,50 +526,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
452 struct rcu_node *rnp, 526 struct rcu_node *rnp,
453 struct rcu_data *rdp) 527 struct rcu_data *rdp)
454{ 528{
455 int i;
456 struct list_head *lp; 529 struct list_head *lp;
457 struct list_head *lp_root; 530 struct list_head *lp_root;
458 int retval = 0; 531 int retval = 0;
459 struct rcu_node *rnp_root = rcu_get_root(rsp); 532 struct rcu_node *rnp_root = rcu_get_root(rsp);
460 struct task_struct *tp; 533 struct task_struct *t;
461 534
462 if (rnp == rnp_root) { 535 if (rnp == rnp_root) {
463 WARN_ONCE(1, "Last CPU thought to be offlined?"); 536 WARN_ONCE(1, "Last CPU thought to be offlined?");
464 return 0; /* Shouldn't happen: at least one CPU online. */ 537 return 0; /* Shouldn't happen: at least one CPU online. */
465 } 538 }
466 WARN_ON_ONCE(rnp != rdp->mynode && 539
467 (!list_empty(&rnp->blocked_tasks[0]) || 540 /* If we are on an internal node, complain bitterly. */
468 !list_empty(&rnp->blocked_tasks[1]) || 541 WARN_ON_ONCE(rnp != rdp->mynode);
469 !list_empty(&rnp->blocked_tasks[2]) ||
470 !list_empty(&rnp->blocked_tasks[3])));
471 542
472 /* 543 /*
473 * Move tasks up to root rcu_node. Rely on the fact that the 544 * Move tasks up to root rcu_node. Don't try to get fancy for
474 * root rcu_node can be at most one ahead of the rest of the 545 * this corner-case operation -- just put this node's tasks
475 * rcu_nodes in terms of gp_num value. This fact allows us to 546 * at the head of the root node's list, and update the root node's
476 * move the blocked_tasks[] array directly, element by element. 547 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
548 * if non-NULL. This might result in waiting for more tasks than
549 * absolutely necessary, but this is a good performance/complexity
550 * tradeoff.
477 */ 551 */
478 if (rcu_preempted_readers(rnp)) 552 if (rcu_preempt_blocked_readers_cgp(rnp))
479 retval |= RCU_OFL_TASKS_NORM_GP; 553 retval |= RCU_OFL_TASKS_NORM_GP;
480 if (rcu_preempted_readers_exp(rnp)) 554 if (rcu_preempted_readers_exp(rnp))
481 retval |= RCU_OFL_TASKS_EXP_GP; 555 retval |= RCU_OFL_TASKS_EXP_GP;
482 for (i = 0; i < 4; i++) { 556 lp = &rnp->blkd_tasks;
483 lp = &rnp->blocked_tasks[i]; 557 lp_root = &rnp_root->blkd_tasks;
484 lp_root = &rnp_root->blocked_tasks[i]; 558 while (!list_empty(lp)) {
485 while (!list_empty(lp)) { 559 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
486 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 560 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
487 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 561 list_del(&t->rcu_node_entry);
488 list_del(&tp->rcu_node_entry); 562 t->rcu_blocked_node = rnp_root;
489 tp->rcu_blocked_node = rnp_root; 563 list_add(&t->rcu_node_entry, lp_root);
490 list_add(&tp->rcu_node_entry, lp_root); 564 if (&t->rcu_node_entry == rnp->gp_tasks)
491 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 565 rnp_root->gp_tasks = rnp->gp_tasks;
492 } 566 if (&t->rcu_node_entry == rnp->exp_tasks)
567 rnp_root->exp_tasks = rnp->exp_tasks;
568#ifdef CONFIG_RCU_BOOST
569 if (&t->rcu_node_entry == rnp->boost_tasks)
570 rnp_root->boost_tasks = rnp->boost_tasks;
571#endif /* #ifdef CONFIG_RCU_BOOST */
572 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
493 } 573 }
574
575#ifdef CONFIG_RCU_BOOST
576 /* In case root is being boosted and leaf is not. */
577 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
578 if (rnp_root->boost_tasks != NULL &&
579 rnp_root->boost_tasks != rnp_root->gp_tasks)
580 rnp_root->boost_tasks = rnp_root->gp_tasks;
581 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
582#endif /* #ifdef CONFIG_RCU_BOOST */
583
584 rnp->gp_tasks = NULL;
585 rnp->exp_tasks = NULL;
494 return retval; 586 return retval;
495} 587}
496 588
497/* 589/*
498 * Do CPU-offline processing for preemptable RCU. 590 * Do CPU-offline processing for preemptible RCU.
499 */ 591 */
500static void rcu_preempt_offline_cpu(int cpu) 592static void rcu_preempt_offline_cpu(int cpu)
501{ 593{
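rcu_preempt_offline_tasks() above drains the outgoing leaf's readers one entry at a time onto the root's ->blkd_tasks and, whenever the entry being moved is the one a leaf cursor points at, hands that cursor to the root, so the root ends up waiting for at least the readers the leaf was waiting for. A condensed sketch of the per-entry move (hypothetical simplified types; locking, rcu_blocked_node updates, and the boost_tasks handling elided):

#include <linux/list.h>

struct node_sketch {			/* hypothetical, simplified */
	struct list_head blkd_tasks;
	struct list_head *gp_tasks;
	struct list_head *exp_tasks;
};

/* Move one blocked-reader entry from the dying leaf to the root,
 * carrying the grace-period cursors along when they point at it.
 */
static void move_one_blocked_entry(struct node_sketch *leaf,
				   struct node_sketch *root,
				   struct list_head *entry)
{
	list_del(entry);
	list_add(entry, &root->blkd_tasks);
	if (entry == leaf->gp_tasks)
		root->gp_tasks = entry;
	if (entry == leaf->exp_tasks)
		root->exp_tasks = entry;
}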
@@ -519,12 +611,13 @@ static void rcu_preempt_check_callbacks(int cpu)
519 rcu_preempt_qs(cpu); 611 rcu_preempt_qs(cpu);
520 return; 612 return;
521 } 613 }
522 if (per_cpu(rcu_preempt_data, cpu).qs_pending) 614 if (t->rcu_read_lock_nesting > 0 &&
615 per_cpu(rcu_preempt_data, cpu).qs_pending)
523 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 616 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
524} 617}
525 618
526/* 619/*
527 * Process callbacks for preemptable RCU. 620 * Process callbacks for preemptible RCU.
528 */ 621 */
529static void rcu_preempt_process_callbacks(void) 622static void rcu_preempt_process_callbacks(void)
530{ 623{
@@ -532,8 +625,17 @@ static void rcu_preempt_process_callbacks(void)
532 &__get_cpu_var(rcu_preempt_data)); 625 &__get_cpu_var(rcu_preempt_data));
533} 626}
534 627
628#ifdef CONFIG_RCU_BOOST
629
630static void rcu_preempt_do_callbacks(void)
631{
632 rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data));
633}
634
635#endif /* #ifdef CONFIG_RCU_BOOST */
636
535/* 637/*
536 * Queue a preemptable-RCU callback for invocation after a grace period. 638 * Queue a preemptible-RCU callback for invocation after a grace period.
537 */ 639 */
538void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 640void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
539{ 641{
@@ -546,9 +648,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
546 * 648 *
547 * Control will return to the caller some time after a full grace 649 * Control will return to the caller some time after a full grace
548 * period has elapsed, in other words after all currently executing RCU 650 * period has elapsed, in other words after all currently executing RCU
549 * read-side critical sections have completed. RCU read-side critical 651 * read-side critical sections have completed. Note, however, that
550 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 652 * upon return from synchronize_rcu(), the caller might well be executing
551 * and may be nested. 653 * concurrently with new RCU read-side critical sections that began while
654 * synchronize_rcu() was waiting. RCU read-side critical sections are
655 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
552 */ 656 */
553void synchronize_rcu(void) 657void synchronize_rcu(void)
554{ 658{
@@ -579,8 +683,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
579 */ 683 */
580static int rcu_preempted_readers_exp(struct rcu_node *rnp) 684static int rcu_preempted_readers_exp(struct rcu_node *rnp)
581{ 685{
582 return !list_empty(&rnp->blocked_tasks[2]) || 686 return rnp->exp_tasks != NULL;
583 !list_empty(&rnp->blocked_tasks[3]);
584} 687}
585 688
586/* 689/*
@@ -615,9 +718,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
615 718
616 raw_spin_lock_irqsave(&rnp->lock, flags); 719 raw_spin_lock_irqsave(&rnp->lock, flags);
617 for (;;) { 720 for (;;) {
618 if (!sync_rcu_preempt_exp_done(rnp)) 721 if (!sync_rcu_preempt_exp_done(rnp)) {
722 raw_spin_unlock_irqrestore(&rnp->lock, flags);
619 break; 723 break;
724 }
620 if (rnp->parent == NULL) { 725 if (rnp->parent == NULL) {
726 raw_spin_unlock_irqrestore(&rnp->lock, flags);
621 wake_up(&sync_rcu_preempt_exp_wq); 727 wake_up(&sync_rcu_preempt_exp_wq);
622 break; 728 break;
623 } 729 }
@@ -627,7 +733,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
627 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 733 raw_spin_lock(&rnp->lock); /* irqs already disabled */
628 rnp->expmask &= ~mask; 734 rnp->expmask &= ~mask;
629 } 735 }
630 raw_spin_unlock_irqrestore(&rnp->lock, flags);
631} 736}
632 737
633/* 738/*
@@ -640,13 +745,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
640static void 745static void
641sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 746sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
642{ 747{
643 int must_wait; 748 unsigned long flags;
749 int must_wait = 0;
644 750
645 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 751 raw_spin_lock_irqsave(&rnp->lock, flags);
646 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 752 if (list_empty(&rnp->blkd_tasks))
647 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 753 raw_spin_unlock_irqrestore(&rnp->lock, flags);
648 must_wait = rcu_preempted_readers_exp(rnp); 754 else {
649 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 755 rnp->exp_tasks = rnp->blkd_tasks.next;
756 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
757 must_wait = 1;
758 }
650 if (!must_wait) 759 if (!must_wait)
651 rcu_report_exp_rnp(rsp, rnp); 760 rcu_report_exp_rnp(rsp, rnp);
652} 761}
@@ -654,9 +763,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
654/* 763/*
655 * Wait for an rcu-preempt grace period, but expedite it. The basic idea 764 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
656 * is to invoke synchronize_sched_expedited() to push all the tasks to 765 * is to invoke synchronize_sched_expedited() to push all the tasks to
657 * the ->blocked_tasks[] lists, move all entries from the first set of 766 * the ->blkd_tasks lists and wait for this list to drain.
658 * ->blocked_tasks[] lists to the second set, and finally wait for this
659 * second set to drain.
660 */ 767 */
661void synchronize_rcu_expedited(void) 768void synchronize_rcu_expedited(void)
662{ 769{
@@ -688,7 +795,7 @@ void synchronize_rcu_expedited(void)
688 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) 795 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
689 goto unlock_mb_ret; /* Others did our work for us. */ 796 goto unlock_mb_ret; /* Others did our work for us. */
690 797
691 /* force all RCU readers onto blocked_tasks[]. */ 798 /* force all RCU readers onto ->blkd_tasks lists. */
692 synchronize_sched_expedited(); 799 synchronize_sched_expedited();
693 800
694 raw_spin_lock_irqsave(&rsp->onofflock, flags); 801 raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -700,7 +807,7 @@ void synchronize_rcu_expedited(void)
700 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 807 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
701 } 808 }
702 809
703 /* Snapshot current state of ->blocked_tasks[] lists. */ 810 /* Snapshot current state of ->blkd_tasks lists. */
704 rcu_for_each_leaf_node(rsp, rnp) 811 rcu_for_each_leaf_node(rsp, rnp)
705 sync_rcu_preempt_exp_init(rsp, rnp); 812 sync_rcu_preempt_exp_init(rsp, rnp);
706 if (NUM_RCU_NODES > 1) 813 if (NUM_RCU_NODES > 1)
@@ -708,7 +815,7 @@ void synchronize_rcu_expedited(void)
708 815
709 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 816 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
710 817
711 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 818 /* Wait for snapshotted ->blkd_tasks lists to drain. */
712 rnp = rcu_get_root(rsp); 819 rnp = rcu_get_root(rsp);
713 wait_event(sync_rcu_preempt_exp_wq, 820 wait_event(sync_rcu_preempt_exp_wq,
714 sync_rcu_preempt_exp_done(rnp)); 821 sync_rcu_preempt_exp_done(rnp));
@@ -724,7 +831,7 @@ mb_ret:
724EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 831EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
725 832
726/* 833/*
727 * Check to see if there is any immediate preemptable-RCU-related work 834 * Check to see if there is any immediate preemptible-RCU-related work
728 * to be done. 835 * to be done.
729 */ 836 */
730static int rcu_preempt_pending(int cpu) 837static int rcu_preempt_pending(int cpu)
@@ -734,7 +841,7 @@ static int rcu_preempt_pending(int cpu)
734} 841}
735 842
736/* 843/*
737 * Does preemptable RCU need the CPU to stay out of dynticks mode? 844 * Does preemptible RCU need the CPU to stay out of dynticks mode?
738 */ 845 */
739static int rcu_preempt_needs_cpu(int cpu) 846static int rcu_preempt_needs_cpu(int cpu)
740{ 847{
@@ -751,7 +858,7 @@ void rcu_barrier(void)
751EXPORT_SYMBOL_GPL(rcu_barrier); 858EXPORT_SYMBOL_GPL(rcu_barrier);
752 859
753/* 860/*
754 * Initialize preemptable RCU's per-CPU data. 861 * Initialize preemptible RCU's per-CPU data.
755 */ 862 */
756static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 863static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
757{ 864{
@@ -759,23 +866,23 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
759} 866}
760 867
761/* 868/*
762 * Move preemptable RCU's callbacks to ->orphan_cbs_list. 869 * Move preemptible RCU's callbacks from dying CPU to other online CPU.
763 */ 870 */
764static void rcu_preempt_send_cbs_to_orphanage(void) 871static void rcu_preempt_send_cbs_to_online(void)
765{ 872{
766 rcu_send_cbs_to_orphanage(&rcu_preempt_state); 873 rcu_send_cbs_to_online(&rcu_preempt_state);
767} 874}
768 875
769/* 876/*
770 * Initialize preemptable RCU's state structures. 877 * Initialize preemptible RCU's state structures.
771 */ 878 */
772static void __init __rcu_init_preempt(void) 879static void __init __rcu_init_preempt(void)
773{ 880{
774 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); 881 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
775} 882}
776 883
777/* 884/*
778 * Check for a task exiting while in a preemptable-RCU read-side 885 * Check for a task exiting while in a preemptible-RCU read-side
779 * critical section, clean up if so. No need to issue warnings, 886 * critical section, clean up if so. No need to issue warnings,
780 * as debug_check_no_locks_held() already does this if lockdep 887 * as debug_check_no_locks_held() already does this if lockdep
781 * is enabled. 888 * is enabled.
@@ -787,11 +894,13 @@ void exit_rcu(void)
787 if (t->rcu_read_lock_nesting == 0) 894 if (t->rcu_read_lock_nesting == 0)
788 return; 895 return;
789 t->rcu_read_lock_nesting = 1; 896 t->rcu_read_lock_nesting = 1;
790 rcu_read_unlock(); 897 __rcu_read_unlock();
791} 898}
792 899
793#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 900#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
794 901
902static struct rcu_state *rcu_state = &rcu_sched_state;
903
795/* 904/*
796 * Tell them what RCU they are running. 905 * Tell them what RCU they are running.
797 */ 906 */
@@ -821,7 +930,7 @@ void rcu_force_quiescent_state(void)
821EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 930EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
822 931
823/* 932/*
824 * Because preemptable RCU does not exist, we never have to check for 933 * Because preemptible RCU does not exist, we never have to check for
825 * CPUs being in quiescent states. 934 * CPUs being in quiescent states.
826 */ 935 */
827static void rcu_preempt_note_context_switch(int cpu) 936static void rcu_preempt_note_context_switch(int cpu)
@@ -829,10 +938,10 @@ static void rcu_preempt_note_context_switch(int cpu)
829} 938}
830 939
831/* 940/*
832 * Because preemptable RCU does not exist, there are never any preempted 941 * Because preemptible RCU does not exist, there are never any preempted
833 * RCU readers. 942 * RCU readers.
834 */ 943 */
835static int rcu_preempted_readers(struct rcu_node *rnp) 944static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
836{ 945{
837 return 0; 946 return 0;
838} 947}
@@ -847,10 +956,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
847 956
848#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 957#endif /* #ifdef CONFIG_HOTPLUG_CPU */
849 958
850#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
851
852/* 959/*
853 * Because preemptable RCU does not exist, we never have to check for 960 * Because preemptible RCU does not exist, we never have to check for
854 * tasks blocked within RCU read-side critical sections. 961 * tasks blocked within RCU read-side critical sections.
855 */ 962 */
856static void rcu_print_detail_task_stall(struct rcu_state *rsp) 963static void rcu_print_detail_task_stall(struct rcu_state *rsp)
@@ -858,17 +965,23 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
858} 965}
859 966
860/* 967/*
861 * Because preemptable RCU does not exist, we never have to check for 968 * Because preemptible RCU does not exist, we never have to check for
862 * tasks blocked within RCU read-side critical sections. 969 * tasks blocked within RCU read-side critical sections.
863 */ 970 */
864static void rcu_print_task_stall(struct rcu_node *rnp) 971static void rcu_print_task_stall(struct rcu_node *rnp)
865{ 972{
866} 973}
867 974
868#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 975/*
976 * Because preemptible RCU does not exist, there is no need to suppress
977 * its CPU stall warnings.
978 */
979static void rcu_preempt_stall_reset(void)
980{
981}
869 982
870/* 983/*
871 * Because there is no preemptable RCU, there can be no readers blocked, 984 * Because there is no preemptible RCU, there can be no readers blocked,
872 * so there is no need to check for blocked tasks. So check only for 985 * so there is no need to check for blocked tasks. So check only for
873 * bogus qsmask values. 986 * bogus qsmask values.
874 */ 987 */
@@ -880,7 +993,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
880#ifdef CONFIG_HOTPLUG_CPU 993#ifdef CONFIG_HOTPLUG_CPU
881 994
882/* 995/*
883 * Because preemptable RCU does not exist, it never needs to migrate 996 * Because preemptible RCU does not exist, it never needs to migrate
884 * tasks that were blocked within RCU read-side critical sections, and 997 * tasks that were blocked within RCU read-side critical sections, and
885 * such non-existent tasks cannot possibly have been blocking the current 998 * such non-existent tasks cannot possibly have been blocking the current
886 * grace period. 999 * grace period.
@@ -893,7 +1006,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
893} 1006}
894 1007
895/* 1008/*
896 * Because preemptable RCU does not exist, it never needs CPU-offline 1009 * Because preemptible RCU does not exist, it never needs CPU-offline
897 * processing. 1010 * processing.
898 */ 1011 */
899static void rcu_preempt_offline_cpu(int cpu) 1012static void rcu_preempt_offline_cpu(int cpu)
@@ -903,7 +1016,7 @@ static void rcu_preempt_offline_cpu(int cpu)
903#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1016#endif /* #ifdef CONFIG_HOTPLUG_CPU */
904 1017
905/* 1018/*
906 * Because preemptable RCU does not exist, it never has any callbacks 1019 * Because preemptible RCU does not exist, it never has any callbacks
907 * to check. 1020 * to check.
908 */ 1021 */
909static void rcu_preempt_check_callbacks(int cpu) 1022static void rcu_preempt_check_callbacks(int cpu)
@@ -911,7 +1024,7 @@ static void rcu_preempt_check_callbacks(int cpu)
911} 1024}
912 1025
913/* 1026/*
914 * Because preemptable RCU does not exist, it never has any callbacks 1027 * Because preemptible RCU does not exist, it never has any callbacks
915 * to process. 1028 * to process.
916 */ 1029 */
917static void rcu_preempt_process_callbacks(void) 1030static void rcu_preempt_process_callbacks(void)
@@ -919,17 +1032,8 @@ static void rcu_preempt_process_callbacks(void)
919} 1032}
920 1033
921/* 1034/*
922 * In classic RCU, call_rcu() is just call_rcu_sched().
923 */
924void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
925{
926 call_rcu_sched(head, func);
927}
928EXPORT_SYMBOL_GPL(call_rcu);
929
930/*
931 * Wait for an rcu-preempt grace period, but make it happen quickly. 1035 * Wait for an rcu-preempt grace period, but make it happen quickly.
932 * But because preemptable RCU does not exist, map to rcu-sched. 1036 * But because preemptible RCU does not exist, map to rcu-sched.
933 */ 1037 */
934void synchronize_rcu_expedited(void) 1038void synchronize_rcu_expedited(void)
935{ 1039{
@@ -940,7 +1044,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
940#ifdef CONFIG_HOTPLUG_CPU 1044#ifdef CONFIG_HOTPLUG_CPU
941 1045
942/* 1046/*
943 * Because preemptable RCU does not exist, there is never any need to 1047 * Because preemptible RCU does not exist, there is never any need to
944 * report on tasks preempted in RCU read-side critical sections during 1048 * report on tasks preempted in RCU read-side critical sections during
945 * expedited RCU grace periods. 1049 * expedited RCU grace periods.
946 */ 1050 */
@@ -952,7 +1056,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
952#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1056#endif /* #ifdef CONFIG_HOTPLUG_CPU */
953 1057
954/* 1058/*
955 * Because preemptable RCU does not exist, it never has any work to do. 1059 * Because preemptible RCU does not exist, it never has any work to do.
956 */ 1060 */
957static int rcu_preempt_pending(int cpu) 1061static int rcu_preempt_pending(int cpu)
958{ 1062{
@@ -960,7 +1064,7 @@ static int rcu_preempt_pending(int cpu)
960} 1064}
961 1065
962/* 1066/*
963 * Because preemptable RCU does not exist, it never needs any CPU. 1067 * Because preemptible RCU does not exist, it never needs any CPU.
964 */ 1068 */
965static int rcu_preempt_needs_cpu(int cpu) 1069static int rcu_preempt_needs_cpu(int cpu)
966{ 1070{
@@ -968,7 +1072,7 @@ static int rcu_preempt_needs_cpu(int cpu)
968} 1072}
969 1073
970/* 1074/*
971 * Because preemptable RCU does not exist, rcu_barrier() is just 1075 * Because preemptible RCU does not exist, rcu_barrier() is just
972 * another name for rcu_barrier_sched(). 1076 * another name for rcu_barrier_sched().
973 */ 1077 */
974void rcu_barrier(void) 1078void rcu_barrier(void)
@@ -978,7 +1082,7 @@ void rcu_barrier(void)
978EXPORT_SYMBOL_GPL(rcu_barrier); 1082EXPORT_SYMBOL_GPL(rcu_barrier);
979 1083
980/* 1084/*
981 * Because preemptable RCU does not exist, there is no per-CPU 1085 * Because preemptible RCU does not exist, there is no per-CPU
982 * data to initialize. 1086 * data to initialize.
983 */ 1087 */
984static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 1088static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
@@ -986,14 +1090,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
986} 1090}
987 1091
988/* 1092/*
989 * Because there is no preemptable RCU, there are no callbacks to move. 1093 * Because there is no preemptible RCU, there are no callbacks to move.
990 */ 1094 */
991static void rcu_preempt_send_cbs_to_orphanage(void) 1095static void rcu_preempt_send_cbs_to_online(void)
992{ 1096{
993} 1097}
994 1098
995/* 1099/*
996 * Because preemptable RCU does not exist, it need not be initialized. 1100 * Because preemptible RCU does not exist, it need not be initialized.
997 */ 1101 */
998static void __init __rcu_init_preempt(void) 1102static void __init __rcu_init_preempt(void)
999{ 1103{
@@ -1001,6 +1105,791 @@ static void __init __rcu_init_preempt(void)
1001 1105
1002#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1106#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1003 1107
1108#ifdef CONFIG_RCU_BOOST
1109
1110#include "rtmutex_common.h"
1111
1112#ifdef CONFIG_RCU_TRACE
1113
1114static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1115{
1116 if (list_empty(&rnp->blkd_tasks))
1117 rnp->n_balk_blkd_tasks++;
1118 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1119 rnp->n_balk_exp_gp_tasks++;
1120 else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
1121 rnp->n_balk_boost_tasks++;
1122 else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
1123 rnp->n_balk_notblocked++;
1124 else if (rnp->gp_tasks != NULL &&
1125 ULONG_CMP_LT(jiffies, rnp->boost_time))
1126 rnp->n_balk_notyet++;
1127 else
1128 rnp->n_balk_nos++;
1129}
1130
1131#else /* #ifdef CONFIG_RCU_TRACE */
1132
1133static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1134{
1135}
1136
1137#endif /* #else #ifdef CONFIG_RCU_TRACE */
1138
1139/*
1140 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1141 * or ->boost_tasks, advancing the pointer to the next task in the
1142 * ->blkd_tasks list.
1143 *
1144 * Note that irqs must be enabled: boosting the task can block.
1145 * Returns 1 if there are more tasks needing to be boosted.
1146 */
1147static int rcu_boost(struct rcu_node *rnp)
1148{
1149 unsigned long flags;
1150 struct rt_mutex mtx;
1151 struct task_struct *t;
1152 struct list_head *tb;
1153
1154 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
1155 return 0; /* Nothing left to boost. */
1156
1157 raw_spin_lock_irqsave(&rnp->lock, flags);
1158
1159 /*
1160 * Recheck under the lock: all tasks in need of boosting
1161 * might exit their RCU read-side critical sections on their own.
1162 */
1163 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
1164 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1165 return 0;
1166 }
1167
1168 /*
1169 * Preferentially boost tasks blocking expedited grace periods.
1170 * This cannot starve the normal grace periods because a second
1171 * expedited grace period must boost all blocked tasks, including
1172 * those blocking the pre-existing normal grace period.
1173 */
1174 if (rnp->exp_tasks != NULL) {
1175 tb = rnp->exp_tasks;
1176 rnp->n_exp_boosts++;
1177 } else {
1178 tb = rnp->boost_tasks;
1179 rnp->n_normal_boosts++;
1180 }
1181 rnp->n_tasks_boosted++;
1182
1183 /*
1184 * We boost task t by manufacturing an rt_mutex that appears to
1185 * be held by task t. We leave a pointer to that rt_mutex where
1186 * task t can find it, and task t will release the mutex when it
1187 * exits its outermost RCU read-side critical section. Then
1188 * simply acquiring this artificial rt_mutex will boost task
1189 * t's priority. (Thanks to tglx for suggesting this approach!)
1190 *
1191 * Note that task t must acquire rnp->lock to remove itself from
1192 * the ->blkd_tasks list, which it will do from exit() if from
1193 * nowhere else. We therefore are guaranteed that task t will
1194 * stay around at least until we drop rnp->lock. Note that
1195 * rnp->lock also resolves races between our priority boosting
1196 * and task t's exiting its outermost RCU read-side critical
1197 * section.
1198 */
1199 t = container_of(tb, struct task_struct, rcu_node_entry);
1200 rt_mutex_init_proxy_locked(&mtx, t);
1201 t->rcu_boost_mutex = &mtx;
1202 t->rcu_boosted = 1;
1203 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1204 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1205 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1206
1207 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
1208}
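The boosting step above works purely through priority inheritance: the blocked reader is made to appear to hold an rt_mutex, so when the high-priority booster blocks on that mutex, the reader temporarily runs at the booster's priority until it leaves its read-side critical section and releases the proxy lock. The same mechanism can be sketched in user space with a POSIX priority-inheritance mutex; this is only an analogue of the kernel code, and the SCHED_FIFO priorities that would make the boost visible are omitted because setting them needs RT privileges.

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t pi_lock;

    static void *holder(void *arg)
    {
        pthread_mutex_lock(&pi_lock);
        /* While a higher-priority thread waits on pi_lock, this thread
         * inherits that waiter's priority, the user-space counterpart of
         * boosting the RCU reader through the proxy rt_mutex. */
        sleep(1);
        pthread_mutex_unlock(&pi_lock);
        return NULL;
    }

    int main(void)
    {
        pthread_mutexattr_t attr;
        pthread_t t;

        pthread_mutexattr_init(&attr);
        pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
        pthread_mutex_init(&pi_lock, &attr);

        pthread_create(&t, NULL, holder, NULL);
        usleep(100 * 1000);            /* let the holder take the lock first */
        pthread_mutex_lock(&pi_lock);  /* would boost the holder while we wait */
        pthread_mutex_unlock(&pi_lock);
        pthread_join(t, NULL);
        return 0;
    }

Build with -lpthread; the sketch shows the locking shape only, not the priority arithmetic.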
1209
1210/*
1211 * Timer handler to initiate waking up of boost kthreads that
1212 * have yielded the CPU due to excessive numbers of tasks to
1213 * boost. We wake up the per-rcu_node kthread, which in turn
1214 * will wake up the booster kthread.
1215 */
1216static void rcu_boost_kthread_timer(unsigned long arg)
1217{
1218 invoke_rcu_node_kthread((struct rcu_node *)arg);
1219}
1220
1221/*
1222 * Priority-boosting kthread. One per leaf rcu_node and one for the
1223 * root rcu_node.
1224 */
1225static int rcu_boost_kthread(void *arg)
1226{
1227 struct rcu_node *rnp = (struct rcu_node *)arg;
1228 int spincnt = 0;
1229 int more2boost;
1230
1231 for (;;) {
1232 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1233 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1234 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1235 more2boost = rcu_boost(rnp);
1236 if (more2boost)
1237 spincnt++;
1238 else
1239 spincnt = 0;
1240 if (spincnt > 10) {
1241 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1242 spincnt = 0;
1243 }
1244 }
1245 /* NOTREACHED */
1246 return 0;
1247}
1248
1249/*
1250 * Check to see if it is time to start boosting RCU readers that are
1251 * blocking the current grace period, and, if so, tell the per-rcu_node
1252 * kthread to start boosting them. If there is an expedited grace
1253 * period in progress, it is always time to boost.
1254 *
1255 * The caller must hold rnp->lock, which this function releases,
1256 * but irqs remain disabled. The ->boost_kthread_task is immortal,
1257 * so we don't need to worry about it going away.
1258 */
1259static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1260{
1261 struct task_struct *t;
1262
1263 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1264 rnp->n_balk_exp_gp_tasks++;
1265 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1266 return;
1267 }
1268 if (rnp->exp_tasks != NULL ||
1269 (rnp->gp_tasks != NULL &&
1270 rnp->boost_tasks == NULL &&
1271 rnp->qsmask == 0 &&
1272 ULONG_CMP_GE(jiffies, rnp->boost_time))) {
1273 if (rnp->exp_tasks == NULL)
1274 rnp->boost_tasks = rnp->gp_tasks;
1275 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1276 t = rnp->boost_kthread_task;
1277 if (t != NULL)
1278 wake_up_process(t);
1279 } else {
1280 rcu_initiate_boost_trace(rnp);
1281 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1282 }
1283}
1284
1285/*
1286 * Wake up the per-CPU kthread to invoke RCU callbacks.
1287 */
1288static void invoke_rcu_callbacks_kthread(void)
1289{
1290 unsigned long flags;
1291
1292 local_irq_save(flags);
1293 __this_cpu_write(rcu_cpu_has_work, 1);
1294 if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
1295 local_irq_restore(flags);
1296 return;
1297 }
1298 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1299 local_irq_restore(flags);
1300}
1301
1302/*
1303 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1304 * held, so no one should be messing with the existence of the boost
1305 * kthread.
1306 */
1307static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1308 cpumask_var_t cm)
1309{
1310 struct task_struct *t;
1311
1312 t = rnp->boost_kthread_task;
1313 if (t != NULL)
1314 set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
1315}
1316
1317#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1318
1319/*
1320 * Do priority-boost accounting for the start of a new grace period.
1321 */
1322static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1323{
1324 rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1325}
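RCU_BOOST_DELAY_JIFFIES converts the CONFIG_RCU_BOOST_DELAY value, given in milliseconds, into jiffies, rounding up so that a small but non-zero delay never truncates to zero ticks. A stand-alone check of that arithmetic, assuming a 250 Hz tick rate and a 500 ms delay (both values are illustrative):

    #include <assert.h>

    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

    int main(void)
    {
        const unsigned long hz = 250;        /* assumed tick rate */
        const unsigned long delay_ms = 500;  /* stands in for CONFIG_RCU_BOOST_DELAY */

        /* 500 ms at 250 Hz is exactly 125 ticks... */
        assert(DIV_ROUND_UP(delay_ms * hz, 1000) == 125);
        /* ...and even 1 ms rounds up to a full tick instead of zero. */
        assert(DIV_ROUND_UP(1 * hz, 1000) == 1);
        return 0;
    }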
1326
1327/*
1328 * Create an RCU-boost kthread for the specified node if one does not
1329 * already exist. We only create this kthread for preemptible RCU.
1330 * Returns zero if all is well, a negated errno otherwise.
1331 */
1332static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1333 struct rcu_node *rnp,
1334 int rnp_index)
1335{
1336 unsigned long flags;
1337 struct sched_param sp;
1338 struct task_struct *t;
1339
1340 if (&rcu_preempt_state != rsp)
1341 return 0;
1342 rsp->boost = 1;
1343 if (rnp->boost_kthread_task != NULL)
1344 return 0;
1345 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1346 "rcub%d", rnp_index);
1347 if (IS_ERR(t))
1348 return PTR_ERR(t);
1349 raw_spin_lock_irqsave(&rnp->lock, flags);
1350 rnp->boost_kthread_task = t;
1351 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1352 sp.sched_priority = RCU_KTHREAD_PRIO;
1353 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1354 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1355 return 0;
1356}
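The spawn path publishes the kthread under rnp->lock before raising it to SCHED_FIFO and waking it. The user-space equivalent of the priority step is sched_setscheduler(); the sketch below uses an arbitrary priority of 1 and simply reports an error unless it is run with RT privileges.

    #include <errno.h>
    #include <sched.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        struct sched_param sp = { .sched_priority = 1 };  /* illustrative RT priority */

        /* Promote the calling task to SCHED_FIFO, as the kernel does for the
         * boost kthread via sched_setscheduler_nocheck(). */
        if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0)
            fprintf(stderr, "sched_setscheduler: %s (needs RT privileges)\n",
                    strerror(errno));
        return 0;
    }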
1357
1358#ifdef CONFIG_HOTPLUG_CPU
1359
1360/*
1361 * Stop the RCU's per-CPU kthread when its CPU goes offline.
1362 */
1363static void rcu_stop_cpu_kthread(int cpu)
1364{
1365 struct task_struct *t;
1366
1367 /* Stop the CPU's kthread. */
1368 t = per_cpu(rcu_cpu_kthread_task, cpu);
1369 if (t != NULL) {
1370 per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
1371 kthread_stop(t);
1372 }
1373}
1374
1375#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1376
1377static void rcu_kthread_do_work(void)
1378{
1379 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
1380 rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1381 rcu_preempt_do_callbacks();
1382}
1383
1384/*
1385 * Wake up the specified per-rcu_node-structure kthread.
1386 * Because the per-rcu_node kthreads are immortal, we don't need
1387 * to do anything to keep them alive.
1388 */
1389static void invoke_rcu_node_kthread(struct rcu_node *rnp)
1390{
1391 struct task_struct *t;
1392
1393 t = rnp->node_kthread_task;
1394 if (t != NULL)
1395 wake_up_process(t);
1396}
1397
1398/*
1399 * Set the specified CPU's kthread to run RT or not, as specified by
1400 * the to_rt argument. The CPU-hotplug locks are held, so the task
1401 * is not going away.
1402 */
1403static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1404{
1405 int policy;
1406 struct sched_param sp;
1407 struct task_struct *t;
1408
1409 t = per_cpu(rcu_cpu_kthread_task, cpu);
1410 if (t == NULL)
1411 return;
1412 if (to_rt) {
1413 policy = SCHED_FIFO;
1414 sp.sched_priority = RCU_KTHREAD_PRIO;
1415 } else {
1416 policy = SCHED_NORMAL;
1417 sp.sched_priority = 0;
1418 }
1419 sched_setscheduler_nocheck(t, policy, &sp);
1420}
1421
1422/*
1423 * Timer handler to initiate the waking up of per-CPU kthreads that
1424 * have yielded the CPU due to excess numbers of RCU callbacks.
1425 * We wake up the per-rcu_node kthread, which in turn will wake up
1426 * the booster kthread.
1427 */
1428static void rcu_cpu_kthread_timer(unsigned long arg)
1429{
1430 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1431 struct rcu_node *rnp = rdp->mynode;
1432
1433 atomic_or(rdp->grpmask, &rnp->wakemask);
1434 invoke_rcu_node_kthread(rnp);
1435}
1436
1437/*
1438 * Drop to non-real-time priority and yield, but only after posting a
1439 * timer that will cause us to regain our real-time priority if we
1440 * remain preempted. Either way, we restore our real-time priority
1441 * before returning.
1442 */
1443static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1444{
1445 struct sched_param sp;
1446 struct timer_list yield_timer;
1447
1448 setup_timer_on_stack(&yield_timer, f, arg);
1449 mod_timer(&yield_timer, jiffies + 2);
1450 sp.sched_priority = 0;
1451 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1452 set_user_nice(current, 19);
1453 schedule();
1454 sp.sched_priority = RCU_KTHREAD_PRIO;
1455 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1456 del_timer(&yield_timer);
1457}
1458
1459/*
1460 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
1461 * This can happen while the corresponding CPU is either coming online
1462 * or going offline. We cannot wait until the CPU is fully online
1463 * before starting the kthread, because the various notifier functions
1464 * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
1465 * the corresponding CPU is online.
1466 *
1467 * Return 1 if the kthread needs to stop, 0 otherwise.
1468 *
1469 * Caller must disable bh. This function can momentarily enable it.
1470 */
1471static int rcu_cpu_kthread_should_stop(int cpu)
1472{
1473 while (cpu_is_offline(cpu) ||
1474 !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1475 smp_processor_id() != cpu) {
1476 if (kthread_should_stop())
1477 return 1;
1478 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1479 per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
1480 local_bh_enable();
1481 schedule_timeout_uninterruptible(1);
1482 if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1483 set_cpus_allowed_ptr(current, cpumask_of(cpu));
1484 local_bh_disable();
1485 }
1486 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1487 return 0;
1488}
1489
1490/*
1491 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1492 * earlier RCU softirq.
1493 */
1494static int rcu_cpu_kthread(void *arg)
1495{
1496 int cpu = (int)(long)arg;
1497 unsigned long flags;
1498 int spincnt = 0;
1499 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1500 char work;
1501 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1502
1503 for (;;) {
1504 *statusp = RCU_KTHREAD_WAITING;
1505 rcu_wait(*workp != 0 || kthread_should_stop());
1506 local_bh_disable();
1507 if (rcu_cpu_kthread_should_stop(cpu)) {
1508 local_bh_enable();
1509 break;
1510 }
1511 *statusp = RCU_KTHREAD_RUNNING;
1512 per_cpu(rcu_cpu_kthread_loops, cpu)++;
1513 local_irq_save(flags);
1514 work = *workp;
1515 *workp = 0;
1516 local_irq_restore(flags);
1517 if (work)
1518 rcu_kthread_do_work();
1519 local_bh_enable();
1520 if (*workp != 0)
1521 spincnt++;
1522 else
1523 spincnt = 0;
1524 if (spincnt > 10) {
1525 *statusp = RCU_KTHREAD_YIELDING;
1526 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1527 spincnt = 0;
1528 }
1529 }
1530 *statusp = RCU_KTHREAD_STOPPED;
1531 return 0;
1532}
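The loop above snapshots the per-CPU work flag with interrupts disabled, clears it, and only then runs the callbacks with interrupts enabled, so any work posted while callbacks run is noticed on the next pass. A stand-alone rendering of that "snapshot and clear, then work outside the critical section" pattern with a C11 atomic exchange; the flag name mirrors rcu_cpu_has_work, everything else is illustrative.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_bool has_work;

    /* Producer side: post work, as invoke_rcu_callbacks_kthread() does. */
    static void post_work(void)
    {
        atomic_store(&has_work, true);
    }

    /* Consumer side: one pass of the kthread loop. */
    static bool service_once(void)
    {
        /* Snapshot and clear in one step; the kernel gets the same effect
         * with a plain load/store pair under local_irq_save(). */
        bool work = atomic_exchange(&has_work, false);

        if (work)
            printf("processing callbacks\n");
        return work;
    }

    int main(void)
    {
        post_work();
        service_once();   /* handles the posted work */
        service_once();   /* nothing pending on this pass */
        return 0;
    }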
1533
1534/*
1535 * Spawn a per-CPU kthread, setting up affinity and priority.
1536 * Because the CPU hotplug lock is held, no other CPU will be attempting
1537 * to manipulate rcu_cpu_kthread_task. There might be another CPU
1538 * attempting to access it during boot, but the locking in kthread_bind()
1539 * will enforce sufficient ordering.
1540 *
1541 * Please note that we cannot simply refuse to wake up the per-CPU
1542 * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
1543 * which can result in softlockup complaints if the task ends up being
1544 * idle for more than a couple of minutes.
1545 *
1546 * However, please note also that we cannot bind the per-CPU kthread to its
1547 * CPU until that CPU is fully online. We also cannot wait until the
1548 * CPU is fully online before we create its per-CPU kthread, as this would
1549 * deadlock the system when CPU notifiers tried waiting for grace
1550 * periods. So we bind the per-CPU kthread to its CPU only if the CPU
1551 * is online. If its CPU is not yet fully online, then the code in
1552 * rcu_cpu_kthread() will wait until it is fully online, and then do
1553 * the binding.
1554 */
1555static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1556{
1557 struct sched_param sp;
1558 struct task_struct *t;
1559
1560 if (!rcu_scheduler_fully_active ||
1561 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1562 return 0;
1563 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
1564 if (IS_ERR(t))
1565 return PTR_ERR(t);
1566 if (cpu_online(cpu))
1567 kthread_bind(t, cpu);
1568 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1569 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1570 sp.sched_priority = RCU_KTHREAD_PRIO;
1571 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1572 per_cpu(rcu_cpu_kthread_task, cpu) = t;
1573 wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
1574 return 0;
1575}
1576
1577/*
1578 * Per-rcu_node kthread, which is in charge of waking up the per-CPU
1579 * kthreads when needed. We ignore requests to wake up kthreads
1580 * for offline CPUs, which is OK because force_quiescent_state()
1581 * takes care of this case.
1582 */
1583static int rcu_node_kthread(void *arg)
1584{
1585 int cpu;
1586 unsigned long flags;
1587 unsigned long mask;
1588 struct rcu_node *rnp = (struct rcu_node *)arg;
1589 struct sched_param sp;
1590 struct task_struct *t;
1591
1592 for (;;) {
1593 rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1594 rcu_wait(atomic_read(&rnp->wakemask) != 0);
1595 rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1596 raw_spin_lock_irqsave(&rnp->lock, flags);
1597 mask = atomic_xchg(&rnp->wakemask, 0);
1598 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1599 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1600 if ((mask & 0x1) == 0)
1601 continue;
1602 preempt_disable();
1603 t = per_cpu(rcu_cpu_kthread_task, cpu);
1604 if (!cpu_online(cpu) || t == NULL) {
1605 preempt_enable();
1606 continue;
1607 }
1608 per_cpu(rcu_cpu_has_work, cpu) = 1;
1609 sp.sched_priority = RCU_KTHREAD_PRIO;
1610 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1611 preempt_enable();
1612 }
1613 }
1614 /* NOTREACHED */
1615 rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
1616 return 0;
1617}
1618
1619/*
1620 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1621 * served by the rcu_node in question. The CPU hotplug lock is still
1622 * held, so the value of rnp->qsmaskinit will be stable.
1623 *
1624 * We don't include outgoingcpu in the affinity set; use -1 if there is
1625 * no outgoing CPU. If there are no CPUs left in the affinity set,
1626 * this function allows the kthread to execute on any CPU.
1627 */
1628static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1629{
1630 cpumask_var_t cm;
1631 int cpu;
1632 unsigned long mask = rnp->qsmaskinit;
1633
1634 if (rnp->node_kthread_task == NULL)
1635 return;
1636 if (!alloc_cpumask_var(&cm, GFP_KERNEL))
1637 return;
1638 cpumask_clear(cm);
1639 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1640 if ((mask & 0x1) && cpu != outgoingcpu)
1641 cpumask_set_cpu(cpu, cm);
1642 if (cpumask_weight(cm) == 0) {
1643 cpumask_setall(cm);
1644 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1645 cpumask_clear_cpu(cpu, cm);
1646 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1647 }
1648 set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
1649 rcu_boost_kthread_setaffinity(rnp, cm);
1650 free_cpumask_var(cm);
1651}
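The affinity helper above builds a mask of the node's CPUs minus the one going offline, and widens the mask rather than leave the kthread with an empty CPU set. The same mask construction in user space with cpu_set_t; the grplo/grphi span and the outgoing CPU are made-up values, and the kernel's qsmaskinit filtering and fallback are only noted in a comment.

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
        cpu_set_t cm;
        int grplo = 0, grphi = 3;   /* illustrative rcu_node CPU span */
        int outgoingcpu = 2;        /* CPU being offlined; -1 means none */
        int cpu;

        CPU_ZERO(&cm);
        for (cpu = grplo; cpu <= grphi; cpu++)
            if (cpu != outgoingcpu)
                CPU_SET(cpu, &cm);

        /* The kernel additionally masks by qsmaskinit and, if nothing is
         * left, widens the set instead of leaving it empty. */
        printf("mask weight = %d\n", CPU_COUNT(&cm));
        return 0;
    }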
1652
1653/*
1654 * Spawn a per-rcu_node kthread, setting priority and affinity.
1655 * Called during boot before online/offline can happen, or, if
1656 * during runtime, with the main CPU-hotplug locks held. So only
1657 * one of these can be executing at a time.
1658 */
1659static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1660 struct rcu_node *rnp)
1661{
1662 unsigned long flags;
1663 int rnp_index = rnp - &rsp->node[0];
1664 struct sched_param sp;
1665 struct task_struct *t;
1666
1667 if (!rcu_scheduler_fully_active ||
1668 rnp->qsmaskinit == 0)
1669 return 0;
1670 if (rnp->node_kthread_task == NULL) {
1671 t = kthread_create(rcu_node_kthread, (void *)rnp,
1672 "rcun%d", rnp_index);
1673 if (IS_ERR(t))
1674 return PTR_ERR(t);
1675 raw_spin_lock_irqsave(&rnp->lock, flags);
1676 rnp->node_kthread_task = t;
1677 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1678 sp.sched_priority = 99;
1679 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1680 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1681 }
1682 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1683}
1684
1685/*
1686 * Spawn all kthreads -- called as soon as the scheduler is running.
1687 */
1688static int __init rcu_spawn_kthreads(void)
1689{
1690 int cpu;
1691 struct rcu_node *rnp;
1692
1693 rcu_scheduler_fully_active = 1;
1694 for_each_possible_cpu(cpu) {
1695 per_cpu(rcu_cpu_has_work, cpu) = 0;
1696 if (cpu_online(cpu))
1697 (void)rcu_spawn_one_cpu_kthread(cpu);
1698 }
1699 rnp = rcu_get_root(rcu_state);
1700 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1701 if (NUM_RCU_NODES > 1) {
1702 rcu_for_each_leaf_node(rcu_state, rnp)
1703 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1704 }
1705 return 0;
1706}
1707early_initcall(rcu_spawn_kthreads);
1708
1709static void __cpuinit rcu_prepare_kthreads(int cpu)
1710{
1711 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1712 struct rcu_node *rnp = rdp->mynode;
1713
1714 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1715 if (rcu_scheduler_fully_active) {
1716 (void)rcu_spawn_one_cpu_kthread(cpu);
1717 if (rnp->node_kthread_task == NULL)
1718 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1719 }
1720}
1721
1722#else /* #ifdef CONFIG_RCU_BOOST */
1723
1724static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1725{
1726 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1727}
1728
1729static void invoke_rcu_callbacks_kthread(void)
1730{
1731 WARN_ON_ONCE(1);
1732}
1733
1734static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1735{
1736}
1737
1738#ifdef CONFIG_HOTPLUG_CPU
1739
1740static void rcu_stop_cpu_kthread(int cpu)
1741{
1742}
1743
1744#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1745
1746static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1747{
1748}
1749
1750static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1751{
1752}
1753
1754static int __init rcu_scheduler_really_started(void)
1755{
1756 rcu_scheduler_fully_active = 1;
1757 return 0;
1758}
1759early_initcall(rcu_scheduler_really_started);
1760
1761static void __cpuinit rcu_prepare_kthreads(int cpu)
1762{
1763}
1764
1765#endif /* #else #ifdef CONFIG_RCU_BOOST */
1766
1767#ifndef CONFIG_SMP
1768
1769void synchronize_sched_expedited(void)
1770{
1771 cond_resched();
1772}
1773EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1774
1775#else /* #ifndef CONFIG_SMP */
1776
1777static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1778static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1779
1780static int synchronize_sched_expedited_cpu_stop(void *data)
1781{
1782 /*
1783 * There must be a full memory barrier on each affected CPU
1784 * between the time that try_stop_cpus() is called and the
1785 * time that it returns.
1786 *
1787 * In the current initial implementation of cpu_stop, the
1788 * above condition is already met when the control reaches
1789 * this point and the following smp_mb() is not strictly
1790 * necessary. Do smp_mb() anyway for documentation and
1791 * robustness against future implementation changes.
1792 */
1793 smp_mb(); /* See above comment block. */
1794 return 0;
1795}
1796
1797/*
1798 * Wait for an rcu-sched grace period to elapse, but use a "big hammer"
1799 * approach to force the grace period to end quickly. This consumes
1800 * significant time on all CPUs, and is thus not recommended for
1801 * any sort of common-case code.
1802 *
1803 * Note that it is illegal to call this function while holding any
1804 * lock that is acquired by a CPU-hotplug notifier. Failing to
1805 * observe this restriction will result in deadlock.
1806 *
1807 * This implementation can be thought of as an application of ticket
1808 * locking to RCU, with sync_sched_expedited_started and
1809 * sync_sched_expedited_done taking on the roles of the halves
1810 * of the ticket-lock word. Each task atomically increments
1811 * sync_sched_expedited_started upon entry, snapshotting the old value,
1812 * then attempts to stop all the CPUs. If this succeeds, then each
1813 * CPU will have executed a context switch, resulting in an RCU-sched
1814 * grace period. We are then done, so we use atomic_cmpxchg() to
1815 * update sync_sched_expedited_done to match our snapshot -- but
1816 * only if someone else has not already advanced past our snapshot.
1817 *
1818 * On the other hand, if try_stop_cpus() fails, we check the value
1819 * of sync_sched_expedited_done. If it has advanced past our
1820 * initial snapshot, then someone else must have forced a grace period
1821 * some time after we took our snapshot. In this case, our work is
1822 * done for us, and we can simply return. Otherwise, we try again,
1823 * but keep our initial snapshot for purposes of checking for someone
1824 * doing our work for us.
1825 *
1826 * If we fail too many times in a row, we fall back to synchronize_sched().
1827 */
1828void synchronize_sched_expedited(void)
1829{
1830 int firstsnap, s, snap, trycount = 0;
1831
1832 /* Note that atomic_inc_return() implies full memory barrier. */
1833 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1834 get_online_cpus();
1835
1836 /*
1837 * Each pass through the following loop attempts to force a
1838 * context switch on each CPU.
1839 */
1840 while (try_stop_cpus(cpu_online_mask,
1841 synchronize_sched_expedited_cpu_stop,
1842 NULL) == -EAGAIN) {
1843 put_online_cpus();
1844
1845 /* No joy, try again later. Or just synchronize_sched(). */
1846 if (trycount++ < 10)
1847 udelay(trycount * num_online_cpus());
1848 else {
1849 synchronize_sched();
1850 return;
1851 }
1852
1853 /* Check to see if someone else did our work for us. */
1854 s = atomic_read(&sync_sched_expedited_done);
1855 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1856 smp_mb(); /* ensure test happens before caller kfree */
1857 return;
1858 }
1859
1860 /*
1861 * Refetching sync_sched_expedited_started allows later
1862 * callers to piggyback on our grace period. We subtract
1863 * 1 to get the same token that the last incrementer got.
1864 * We retry after they started, so our grace period works
1865 * for them, and they started after our first try, so their
1866 * grace period works for us.
1867 */
1868 get_online_cpus();
1869 snap = atomic_read(&sync_sched_expedited_started) - 1;
1870 smp_mb(); /* ensure read is before try_stop_cpus(). */
1871 }
1872
1873 /*
1874 * Everyone up to our most recent fetch is covered by our grace
1875 * period. Update the counter, but only if our work is still
1876 * relevant -- which it won't be if someone who started later
1877 * than we did beat us to the punch.
1878 */
1879 do {
1880 s = atomic_read(&sync_sched_expedited_done);
1881 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1882 smp_mb(); /* ensure test happens before caller kfree */
1883 break;
1884 }
1885 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1886
1887 put_online_cpus();
1888}
1889EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
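The comment block above describes the expedited grace period as a ticket lock built from two counters. Stripped of try_stop_cpus() and the CPU-hotplug locking, the counter protocol alone can be modelled in stand-alone C11; the variable names echo the kernel's, but this is a sketch of the idea rather than the kernel algorithm itself.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int expedited_started;
    static atomic_int expedited_done;

    /* Wrap-safe "a >= b" on free-running counters, in the spirit of UINT_CMP_GE(). */
    static bool counter_ge(int a, int b)
    {
        return ((unsigned)a - (unsigned)b) < (1u << 31);
    }

    static void expedited(void)
    {
        int firstsnap = atomic_fetch_add(&expedited_started, 1) + 1;
        int snap, s;

        /* In the kernel this check runs between failed try_stop_cpus() calls:
         * if "done" already passed our ticket, someone else's grace period
         * covers us and we can return immediately. */
        s = atomic_load(&expedited_done);
        if (counter_ge(s, firstsnap)) {
            printf("piggybacked on an earlier caller\n");
            return;
        }

        /* Grace period "forced" here; refetch so later tickets are covered too. */
        snap = atomic_load(&expedited_started);

        /* Advance "done" to our snapshot unless a later caller got there first. */
        s = atomic_load(&expedited_done);
        while (!counter_ge(s, snap) &&
               !atomic_compare_exchange_weak(&expedited_done, &s, snap))
            ;   /* CAS failure reloads s; recheck and retry */
        printf("done counter is now %d\n", atomic_load(&expedited_done));
    }

    int main(void)
    {
        expedited();   /* advances done to 1 */
        expedited();   /* advances done to 2 */
        return 0;
    }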
1890
1891#endif /* #else #ifndef CONFIG_SMP */
1892
1004#if !defined(CONFIG_RCU_FAST_NO_HZ) 1893#if !defined(CONFIG_RCU_FAST_NO_HZ)
1005 1894
1006/* 1895/*
@@ -1047,14 +1936,13 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1047 * 1936 *
1048 * Because it is not legal to invoke rcu_process_callbacks() with irqs 1937 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1049 * disabled, we do one pass of force_quiescent_state(), then do a 1938 * disabled, we do one pass of force_quiescent_state(), then do a
1050 * raise_softirq() to cause rcu_process_callbacks() to be invoked later. 1939 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1051 * The per-cpu rcu_dyntick_drain variable controls the sequencing. 1940 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
1052 */ 1941 */
1053int rcu_needs_cpu(int cpu) 1942int rcu_needs_cpu(int cpu)
1054{ 1943{
1055 int c = 0; 1944 int c = 0;
1056 int snap; 1945 int snap;
1057 int snap_nmi;
1058 int thatcpu; 1946 int thatcpu;
1059 1947
1060 /* Check for being in the holdoff period. */ 1948 /* Check for being in the holdoff period. */
@@ -1065,10 +1953,10 @@ int rcu_needs_cpu(int cpu)
1065 for_each_online_cpu(thatcpu) { 1953 for_each_online_cpu(thatcpu) {
1066 if (thatcpu == cpu) 1954 if (thatcpu == cpu)
1067 continue; 1955 continue;
1068 snap = per_cpu(rcu_dynticks, thatcpu).dynticks; 1956 snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
1069 snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; 1957 thatcpu).dynticks);
1070 smp_mb(); /* Order sampling of snap with end of grace period. */ 1958 smp_mb(); /* Order sampling of snap with end of grace period. */
1071 if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { 1959 if ((snap & 0x1) != 0) {
1072 per_cpu(rcu_dyntick_drain, cpu) = 0; 1960 per_cpu(rcu_dyntick_drain, cpu) = 0;
1073 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 1961 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1074 return rcu_needs_cpu_quick_check(cpu); 1962 return rcu_needs_cpu_quick_check(cpu);
@@ -1099,7 +1987,7 @@ int rcu_needs_cpu(int cpu)
1099 1987
1100 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 1988 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1101 if (c) 1989 if (c)
1102 raise_softirq(RCU_SOFTIRQ); 1990 invoke_rcu_core();
1103 return c; 1991 return c;
1104} 1992}
1105 1993
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 36c95b45738e..4e144876dc68 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,6 +46,22 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49#ifdef CONFIG_RCU_BOOST
50
51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
52DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
53DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
54DECLARE_PER_CPU(char, rcu_cpu_has_work);
55
56static char convert_kthread_status(unsigned int kthread_status)
57{
58 if (kthread_status > RCU_KTHREAD_MAX)
59 return '?';
60 return "SRWOY"[kthread_status];
61}
62
63#endif /* #ifdef CONFIG_RCU_BOOST */
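convert_kthread_status() above, like the ".N"[...] expressions further down, relies on the fact that a string literal is an array and can be indexed directly, turning an enum or a boolean into a single status character. The idiom in isolation, with made-up states:

    #include <stdio.h>

    int main(void)
    {
        enum { IDLE, BUSY, DONE } state = BUSY;   /* illustrative states */
        int has_work = 1;

        /* "IBD"[state] maps each enum value to one status character. */
        printf("state = %c\n", "IBD"[state]);

        /* The two-character form prints '.' for false and a letter for true,
         * just like the ".N"/".R"/".W"/".D" flags in the trace output. */
        printf("work  = %c\n", ".W"[!!has_work]);
        return 0;
    }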
64
49static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 65static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 66{
51 if (!rdp->beenonline) 67 if (!rdp->beenonline)
@@ -57,14 +73,33 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
57 rdp->passed_quiesc, rdp->passed_quiesc_completed, 73 rdp->passed_quiesc, rdp->passed_quiesc_completed,
58 rdp->qs_pending); 74 rdp->qs_pending);
59#ifdef CONFIG_NO_HZ 75#ifdef CONFIG_NO_HZ
60 seq_printf(m, " dt=%d/%d dn=%d df=%lu", 76 seq_printf(m, " dt=%d/%d/%d df=%lu",
61 rdp->dynticks->dynticks, 77 atomic_read(&rdp->dynticks->dynticks),
62 rdp->dynticks->dynticks_nesting, 78 rdp->dynticks->dynticks_nesting,
63 rdp->dynticks->dynticks_nmi, 79 rdp->dynticks->dynticks_nmi_nesting,
64 rdp->dynticks_fqs); 80 rdp->dynticks_fqs);
65#endif /* #ifdef CONFIG_NO_HZ */ 81#endif /* #ifdef CONFIG_NO_HZ */
66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 82 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
67 seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); 83 seq_printf(m, " ql=%ld qs=%c%c%c%c",
84 rdp->qlen,
85 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
86 rdp->nxttail[RCU_NEXT_TAIL]],
87 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
88 rdp->nxttail[RCU_NEXT_READY_TAIL]],
89 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
90 rdp->nxttail[RCU_WAIT_TAIL]],
91 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
92#ifdef CONFIG_RCU_BOOST
93 seq_printf(m, " kt=%d/%c/%d ktl=%x",
94 per_cpu(rcu_cpu_has_work, rdp->cpu),
95 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
96 rdp->cpu)),
97 per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
98 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
99#endif /* #ifdef CONFIG_RCU_BOOST */
100 seq_printf(m, " b=%ld", rdp->blimit);
101 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
102 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
68} 103}
69 104
70#define PRINT_RCU_DATA(name, func, m) \ 105#define PRINT_RCU_DATA(name, func, m) \
@@ -113,22 +148,42 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
113 rdp->qs_pending); 148 rdp->qs_pending);
114#ifdef CONFIG_NO_HZ 149#ifdef CONFIG_NO_HZ
115 seq_printf(m, ",%d,%d,%d,%lu", 150 seq_printf(m, ",%d,%d,%d,%lu",
116 rdp->dynticks->dynticks, 151 atomic_read(&rdp->dynticks->dynticks),
117 rdp->dynticks->dynticks_nesting, 152 rdp->dynticks->dynticks_nesting,
118 rdp->dynticks->dynticks_nmi, 153 rdp->dynticks->dynticks_nmi_nesting,
119 rdp->dynticks_fqs); 154 rdp->dynticks_fqs);
120#endif /* #ifdef CONFIG_NO_HZ */ 155#endif /* #ifdef CONFIG_NO_HZ */
121 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 156 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
122 seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); 157 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
158 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
159 rdp->nxttail[RCU_NEXT_TAIL]],
160 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
161 rdp->nxttail[RCU_NEXT_READY_TAIL]],
162 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
163 rdp->nxttail[RCU_WAIT_TAIL]],
164 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
165#ifdef CONFIG_RCU_BOOST
166 seq_printf(m, ",%d,\"%c\"",
167 per_cpu(rcu_cpu_has_work, rdp->cpu),
168 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
169 rdp->cpu)));
170#endif /* #ifdef CONFIG_RCU_BOOST */
171 seq_printf(m, ",%ld", rdp->blimit);
172 seq_printf(m, ",%lu,%lu,%lu\n",
173 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
123} 174}
124 175
125static int show_rcudata_csv(struct seq_file *m, void *unused) 176static int show_rcudata_csv(struct seq_file *m, void *unused)
126{ 177{
127 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); 178 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
128#ifdef CONFIG_NO_HZ 179#ifdef CONFIG_NO_HZ
129 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 180 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
130#endif /* #ifdef CONFIG_NO_HZ */ 181#endif /* #ifdef CONFIG_NO_HZ */
131 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); 182 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
183#ifdef CONFIG_RCU_BOOST
184 seq_puts(m, "\"kt\",\"ktl\"");
185#endif /* #ifdef CONFIG_RCU_BOOST */
186 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n");
132#ifdef CONFIG_TREE_PREEMPT_RCU 187#ifdef CONFIG_TREE_PREEMPT_RCU
133 seq_puts(m, "\"rcu_preempt:\"\n"); 188 seq_puts(m, "\"rcu_preempt:\"\n");
134 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); 189 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
@@ -153,34 +208,97 @@ static const struct file_operations rcudata_csv_fops = {
153 .release = single_release, 208 .release = single_release,
154}; 209};
155 210
211#ifdef CONFIG_RCU_BOOST
212
213static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
214{
215 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu "
216 "j=%04x bt=%04x\n",
217 rnp->grplo, rnp->grphi,
218 "T."[list_empty(&rnp->blkd_tasks)],
219 "N."[!rnp->gp_tasks],
220 "E."[!rnp->exp_tasks],
221 "B."[!rnp->boost_tasks],
222 convert_kthread_status(rnp->boost_kthread_status),
223 rnp->n_tasks_boosted, rnp->n_exp_boosts,
224 rnp->n_normal_boosts,
225 (int)(jiffies & 0xffff),
226 (int)(rnp->boost_time & 0xffff));
227 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
228 " balk",
229 rnp->n_balk_blkd_tasks,
230 rnp->n_balk_exp_gp_tasks,
231 rnp->n_balk_boost_tasks,
232 rnp->n_balk_notblocked,
233 rnp->n_balk_notyet,
234 rnp->n_balk_nos);
235}
236
237static int show_rcu_node_boost(struct seq_file *m, void *unused)
238{
239 struct rcu_node *rnp;
240
241 rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
242 print_one_rcu_node_boost(m, rnp);
243 return 0;
244}
245
246static int rcu_node_boost_open(struct inode *inode, struct file *file)
247{
248 return single_open(file, show_rcu_node_boost, NULL);
249}
250
251static const struct file_operations rcu_node_boost_fops = {
252 .owner = THIS_MODULE,
253 .open = rcu_node_boost_open,
254 .read = seq_read,
255 .llseek = seq_lseek,
256 .release = single_release,
257};
258
259/*
260 * Create the rcuboost debugfs entry. Standard error return.
261 */
262static int rcu_boost_trace_create_file(struct dentry *rcudir)
263{
264 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
265 &rcu_node_boost_fops);
266}
267
268#else /* #ifdef CONFIG_RCU_BOOST */
269
270static int rcu_boost_trace_create_file(struct dentry *rcudir)
271{
272 return 0; /* There cannot be an error if we didn't create it! */
273}
274
275#endif /* #else #ifdef CONFIG_RCU_BOOST */
276
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 277static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 278{
158 unsigned long gpnum; 279 unsigned long gpnum;
159 int level = 0; 280 int level = 0;
160 int phase;
161 struct rcu_node *rnp; 281 struct rcu_node *rnp;
162 282
163 gpnum = rsp->gpnum; 283 gpnum = rsp->gpnum;
164 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 284 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 285 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
166 rsp->completed, gpnum, rsp->signaled, 286 rsp->completed, gpnum, rsp->signaled,
167 (long)(rsp->jiffies_force_qs - jiffies), 287 (long)(rsp->jiffies_force_qs - jiffies),
168 (int)(jiffies & 0xffff), 288 (int)(jiffies & 0xffff),
169 rsp->n_force_qs, rsp->n_force_qs_ngp, 289 rsp->n_force_qs, rsp->n_force_qs_ngp,
170 rsp->n_force_qs - rsp->n_force_qs_ngp, 290 rsp->n_force_qs - rsp->n_force_qs_ngp,
171 rsp->n_force_qs_lh, rsp->orphan_qlen); 291 rsp->n_force_qs_lh);
172 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 292 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
173 if (rnp->level != level) { 293 if (rnp->level != level) {
174 seq_puts(m, "\n"); 294 seq_puts(m, "\n");
175 level = rnp->level; 295 level = rnp->level;
176 } 296 }
177 phase = gpnum & 0x1; 297 seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ",
178 seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
179 rnp->qsmask, rnp->qsmaskinit, 298 rnp->qsmask, rnp->qsmaskinit,
180 "T."[list_empty(&rnp->blocked_tasks[phase])], 299 ".G"[rnp->gp_tasks != NULL],
181 "E."[list_empty(&rnp->blocked_tasks[phase + 2])], 300 ".E"[rnp->exp_tasks != NULL],
182 "T."[list_empty(&rnp->blocked_tasks[!phase])], 301 ".T"[!list_empty(&rnp->blkd_tasks)],
183 "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
184 rnp->grplo, rnp->grphi, rnp->grpnum); 302 rnp->grplo, rnp->grphi, rnp->grpnum);
185 } 303 }
186 seq_puts(m, "\n"); 304 seq_puts(m, "\n");
@@ -212,16 +330,35 @@ static const struct file_operations rcuhier_fops = {
212 .release = single_release, 330 .release = single_release,
213}; 331};
214 332
333static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
334{
335 unsigned long flags;
336 unsigned long completed;
337 unsigned long gpnum;
338 unsigned long gpage;
339 unsigned long gpmax;
340 struct rcu_node *rnp = &rsp->node[0];
341
342 raw_spin_lock_irqsave(&rnp->lock, flags);
343 completed = rsp->completed;
344 gpnum = rsp->gpnum;
345 if (rsp->completed == rsp->gpnum)
346 gpage = 0;
347 else
348 gpage = jiffies - rsp->gp_start;
349 gpmax = rsp->gp_max;
350 raw_spin_unlock_irqrestore(&rnp->lock, flags);
351 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n",
352 rsp->name, completed, gpnum, gpage, gpmax);
353}
354
215static int show_rcugp(struct seq_file *m, void *unused) 355static int show_rcugp(struct seq_file *m, void *unused)
216{ 356{
217#ifdef CONFIG_TREE_PREEMPT_RCU 357#ifdef CONFIG_TREE_PREEMPT_RCU
218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", 358 show_one_rcugp(m, &rcu_preempt_state);
219 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 359#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", 360 show_one_rcugp(m, &rcu_sched_state);
222 rcu_sched_state.completed, rcu_sched_state.gpnum); 361 show_one_rcugp(m, &rcu_bh_state);
223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
224 rcu_bh_state.completed, rcu_bh_state.gpnum);
225 return 0; 362 return 0;
226} 363}
227 364
@@ -262,7 +399,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
262 struct rcu_data *rdp; 399 struct rcu_data *rdp;
263 400
264 for_each_possible_cpu(cpu) { 401 for_each_possible_cpu(cpu) {
265 rdp = rsp->rda[cpu]; 402 rdp = per_cpu_ptr(rsp->rda, cpu);
266 if (rdp->beenonline) 403 if (rdp->beenonline)
267 print_one_rcu_pending(m, rdp); 404 print_one_rcu_pending(m, rdp);
268 } 405 }
@@ -294,9 +431,32 @@ static const struct file_operations rcu_pending_fops = {
294 .release = single_release, 431 .release = single_release,
295}; 432};
296 433
434static int show_rcutorture(struct seq_file *m, void *unused)
435{
436 seq_printf(m, "rcutorture test sequence: %lu %s\n",
437 rcutorture_testseq >> 1,
438 (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
439 seq_printf(m, "rcutorture update version number: %lu\n",
440 rcutorture_vernum);
441 return 0;
442}
443
444static int rcutorture_open(struct inode *inode, struct file *file)
445{
446 return single_open(file, show_rcutorture, NULL);
447}
448
449static const struct file_operations rcutorture_fops = {
450 .owner = THIS_MODULE,
451 .open = rcutorture_open,
452 .read = seq_read,
453 .llseek = seq_lseek,
454 .release = single_release,
455};
456
297static struct dentry *rcudir; 457static struct dentry *rcudir;
298 458
299static int __init rcuclassic_trace_init(void) 459static int __init rcutree_trace_init(void)
300{ 460{
301 struct dentry *retval; 461 struct dentry *retval;
302 462
@@ -314,6 +474,9 @@ static int __init rcuclassic_trace_init(void)
314 if (!retval) 474 if (!retval)
315 goto free_out; 475 goto free_out;
316 476
477 if (rcu_boost_trace_create_file(rcudir))
478 goto free_out;
479
317 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 480 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
318 if (!retval) 481 if (!retval)
319 goto free_out; 482 goto free_out;
@@ -327,20 +490,25 @@ static int __init rcuclassic_trace_init(void)
327 NULL, &rcu_pending_fops); 490 NULL, &rcu_pending_fops);
328 if (!retval) 491 if (!retval)
329 goto free_out; 492 goto free_out;
493
494 retval = debugfs_create_file("rcutorture", 0444, rcudir,
495 NULL, &rcutorture_fops);
496 if (!retval)
497 goto free_out;
330 return 0; 498 return 0;
331free_out: 499free_out:
332 debugfs_remove_recursive(rcudir); 500 debugfs_remove_recursive(rcudir);
333 return 1; 501 return 1;
334} 502}
335 503
336static void __exit rcuclassic_trace_cleanup(void) 504static void __exit rcutree_trace_cleanup(void)
337{ 505{
338 debugfs_remove_recursive(rcudir); 506 debugfs_remove_recursive(rcudir);
339} 507}
340 508
341 509
342module_init(rcuclassic_trace_init); 510module_init(rcutree_trace_init);
343module_exit(rcuclassic_trace_cleanup); 511module_exit(rcutree_trace_cleanup);
344 512
345MODULE_AUTHOR("Paul E. McKenney"); 513MODULE_AUTHOR("Paul E. McKenney");
346MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); 514MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
diff --git a/kernel/relay.c b/kernel/relay.c
index c7cf397fb929..859ea5a9605f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -70,17 +70,10 @@ static const struct vm_operations_struct relay_file_mmap_ops = {
70 */ 70 */
71static struct page **relay_alloc_page_array(unsigned int n_pages) 71static struct page **relay_alloc_page_array(unsigned int n_pages)
72{ 72{
73 struct page **array; 73 const size_t pa_size = n_pages * sizeof(struct page *);
74 size_t pa_size = n_pages * sizeof(struct page *); 74 if (pa_size > PAGE_SIZE)
75 75 return vzalloc(pa_size);
76 if (pa_size > PAGE_SIZE) { 76 return kzalloc(pa_size, GFP_KERNEL);
77 array = vmalloc(pa_size);
78 if (array)
79 memset(array, 0, pa_size);
80 } else {
81 array = kzalloc(pa_size, GFP_KERNEL);
82 }
83 return array;
84} 77}
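The rewritten relay_alloc_page_array() picks its allocator by size: arrays that fit in a page come from the slab via kzalloc(), larger ones from vmalloc space via vzalloc(), and both paths now hand back zeroed memory. A user-space analogue of that size-based split, with 4096 standing in for PAGE_SIZE and an anonymous mmap() playing the role of vmalloc:

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>

    #define PAGE_SIZE 4096   /* assumed page size, for illustration only */

    /* Zeroed array of n pointers: heap for small arrays, anonymous mmap
     * (already zero-filled) for large ones.  The caller must free with the
     * matching call, just as the kernel frees this array differently
     * depending on which allocator produced it. */
    static void **alloc_ptr_array(size_t n)
    {
        size_t sz = n * sizeof(void *);

        if (sz > PAGE_SIZE) {
            void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            return p == MAP_FAILED ? NULL : p;
        }
        return calloc(n, sizeof(void *));
    }

    int main(void)
    {
        void **small = alloc_ptr_array(16);      /* calloc() path */
        void **large = alloc_ptr_array(4096);    /* mmap() path */

        printf("small=%p large=%p\n", (void *)small, (void *)large);
        free(small);
        if (large)
            munmap(large, 4096 * sizeof(void *));
        return 0;
    }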
85 78
86/* 79/*
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index c7eaa37a768b..34683efa2cce 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
126 pos, buf, s - buf); 126 pos, buf, s - buf);
127} 127}
128 128
129#if BITS_PER_LONG == 32
130u64 res_counter_read_u64(struct res_counter *counter, int member)
131{
132 unsigned long flags;
133 u64 ret;
134
135 spin_lock_irqsave(&counter->lock, flags);
136 ret = *res_counter_member(counter, member);
137 spin_unlock_irqrestore(&counter->lock, flags);
138
139 return ret;
140}
141#else
129u64 res_counter_read_u64(struct res_counter *counter, int member) 142u64 res_counter_read_u64(struct res_counter *counter, int member)
130{ 143{
131 return *res_counter_member(counter, member); 144 return *res_counter_member(counter, member);
132} 145}
146#endif
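The new 32-bit branch exists because a 64-bit counter is loaded in two halves on a 32-bit machine, so a lockless read can observe a torn value; taking counter->lock for the read makes it atomic with respect to concurrent updates. The same guard, sketched with a pthread mutex around a uint64_t:

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    struct counter {
        pthread_mutex_t lock;
        uint64_t usage;
    };

    /* On a 32-bit target a plain load of c->usage may tear; reading under
     * the same lock that writers hold rules that out, as the 32-bit
     * res_counter_read_u64() does with counter->lock. */
    static uint64_t counter_read(struct counter *c)
    {
        uint64_t v;

        pthread_mutex_lock(&c->lock);
        v = c->usage;
        pthread_mutex_unlock(&c->lock);
        return v;
    }

    int main(void)
    {
        struct counter c = { PTHREAD_MUTEX_INITIALIZER, 0x100000000ULL };

        printf("usage = %llu\n", (unsigned long long)counter_read(&c));
        return 0;
    }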
133 147
134int res_counter_memparse_write_strategy(const char *buf, 148int res_counter_memparse_write_strategy(const char *buf,
135 unsigned long long *res) 149 unsigned long long *res)
diff --git a/kernel/resource.c b/kernel/resource.c
index 7b36976e5dea..3ff40178dce7 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -38,6 +38,14 @@ struct resource iomem_resource = {
38}; 38};
39EXPORT_SYMBOL(iomem_resource); 39EXPORT_SYMBOL(iomem_resource);
40 40
41/* constraints to be met while allocating resources */
42struct resource_constraint {
43 resource_size_t min, max, align;
44 resource_size_t (*alignf)(void *, const struct resource *,
45 resource_size_t, resource_size_t);
46 void *alignf_data;
47};
48
41static DEFINE_RWLOCK(resource_lock); 49static DEFINE_RWLOCK(resource_lock);
42 50
43static void *r_next(struct seq_file *m, void *v, loff_t *pos) 51static void *r_next(struct seq_file *m, void *v, loff_t *pos)
@@ -357,57 +365,148 @@ int __weak page_is_ram(unsigned long pfn)
357 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; 365 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
358} 366}
359 367
368void __weak arch_remove_reservations(struct resource *avail)
369{
370}
371
372static resource_size_t simple_align_resource(void *data,
373 const struct resource *avail,
374 resource_size_t size,
375 resource_size_t align)
376{
377 return avail->start;
378}
379
380static void resource_clip(struct resource *res, resource_size_t min,
381 resource_size_t max)
382{
383 if (res->start < min)
384 res->start = min;
385 if (res->end > max)
386 res->end = max;
387}
388
389static bool resource_contains(struct resource *res1, struct resource *res2)
390{
391 return res1->start <= res2->start && res1->end >= res2->end;
392}
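resource_clip() trims a candidate window to the caller's [min, max] constraint, and resource_contains() then verifies that the aligned allocation still fits in what remains. The two helpers restated on plain integer ranges, with arbitrary example values:

    #include <assert.h>
    #include <stdint.h>

    struct range { uint64_t start, end; };   /* inclusive, like struct resource */

    static void range_clip(struct range *r, uint64_t min, uint64_t max)
    {
        if (r->start < min)
            r->start = min;
        if (r->end > max)
            r->end = max;
    }

    static int range_contains(const struct range *a, const struct range *b)
    {
        return a->start <= b->start && a->end >= b->end;
    }

    int main(void)
    {
        struct range avail = { 0x0000, 0xffff };
        struct range alloc = { 0x2000, 0x2fff };

        range_clip(&avail, 0x1000, 0x7fff);       /* avail is now [0x1000, 0x7fff] */
        assert(avail.start == 0x1000 && avail.end == 0x7fff);
        assert(range_contains(&avail, &alloc));   /* the allocation still fits */
        return 0;
    }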
393
360/* 394/*
361 * Find empty slot in the resource tree given range and alignment. 395 * Find empty slot in the resource tree with the given range and
396 * alignment constraints
362 */ 397 */
363static int find_resource(struct resource *root, struct resource *new, 398static int __find_resource(struct resource *root, struct resource *old,
364 resource_size_t size, resource_size_t min, 399 struct resource *new,
365 resource_size_t max, resource_size_t align, 400 resource_size_t size,
366 resource_size_t (*alignf)(void *, 401 struct resource_constraint *constraint)
367 const struct resource *,
368 resource_size_t,
369 resource_size_t),
370 void *alignf_data)
371{ 402{
372 struct resource *this = root->child; 403 struct resource *this = root->child;
373 struct resource tmp = *new; 404 struct resource tmp = *new, avail, alloc;
374 405
406 tmp.flags = new->flags;
375 tmp.start = root->start; 407 tmp.start = root->start;
376 /* 408 /*
377 * Skip past an allocated resource that starts at 0, since the assignment 409 * Skip past an allocated resource that starts at 0, since the assignment
378 * of this->start - 1 to tmp->end below would cause an underflow. 410 * of this->start - 1 to tmp->end below would cause an underflow.
379 */ 411 */
380 if (this && this->start == 0) { 412 if (this && this->start == root->start) {
381 tmp.start = this->end + 1; 413 tmp.start = (this == old) ? old->start : this->end + 1;
382 this = this->sibling; 414 this = this->sibling;
383 } 415 }
384 for(;;) { 416 for(;;) {
385 if (this) 417 if (this)
386 tmp.end = this->start - 1; 418 tmp.end = (this == old) ? this->end : this->start - 1;
387 else 419 else
388 tmp.end = root->end; 420 tmp.end = root->end;
389 if (tmp.start < min) 421
390 tmp.start = min; 422 resource_clip(&tmp, constraint->min, constraint->max);
391 if (tmp.end > max) 423 arch_remove_reservations(&tmp);
392 tmp.end = max; 424
393 tmp.start = ALIGN(tmp.start, align); 425 /* Check for overflow after ALIGN() */
394 if (alignf) 426 avail = *new;
395 tmp.start = alignf(alignf_data, &tmp, size, align); 427 avail.start = ALIGN(tmp.start, constraint->align);
396 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { 428 avail.end = tmp.end;
397 new->start = tmp.start; 429 if (avail.start >= tmp.start) {
398 new->end = tmp.start + size - 1; 430 alloc.start = constraint->alignf(constraint->alignf_data, &avail,
399 return 0; 431 size, constraint->align);
432 alloc.end = alloc.start + size - 1;
433 if (resource_contains(&avail, &alloc)) {
434 new->start = alloc.start;
435 new->end = alloc.end;
436 return 0;
437 }
400 } 438 }
401 if (!this) 439 if (!this)
402 break; 440 break;
403 tmp.start = this->end + 1; 441 if (this != old)
442 tmp.start = this->end + 1;
404 this = this->sibling; 443 this = this->sibling;
405 } 444 }
406 return -EBUSY; 445 return -EBUSY;
407} 446}
408 447
448/*
449 * Find empty slot in the resource tree given range and alignment.
450 */
451static int find_resource(struct resource *root, struct resource *new,
452 resource_size_t size,
453 struct resource_constraint *constraint)
454{
455 return __find_resource(root, NULL, new, size, constraint);
456}
457
458/**
459 * reallocate_resource - allocate a slot in the resource tree given range & alignment.
460 * The resource will be relocated if it cannot be reallocated with the
461 * new size at its current location.
462 *
463 * @root: root resource descriptor
464 * @old: resource descriptor desired by caller
465 * @newsize: new size of the resource descriptor
466 * @constraint: the size and alignment constraints to be met.
467 */
468int reallocate_resource(struct resource *root, struct resource *old,
469 resource_size_t newsize,
470 struct resource_constraint *constraint)
471{
472 int err=0;
473 struct resource new = *old;
474 struct resource *conflict;
475
476 write_lock(&resource_lock);
477
478 if ((err = __find_resource(root, old, &new, newsize, constraint)))
479 goto out;
480
481 if (resource_contains(&new, old)) {
482 old->start = new.start;
483 old->end = new.end;
484 goto out;
485 }
486
487 if (old->child) {
488 err = -EBUSY;
489 goto out;
490 }
491
492 if (resource_contains(old, &new)) {
493 old->start = new.start;
494 old->end = new.end;
495 } else {
496 __release_resource(old);
497 *old = new;
498 conflict = __request_resource(root, old);
499 BUG_ON(conflict);
500 }
501out:
502 write_unlock(&resource_lock);
503 return err;
504}
505
506
409/** 507/**
410 * allocate_resource - allocate empty slot in the resource tree given range & alignment 508 * allocate_resource - allocate empty slot in the resource tree given range & alignment.
509 * The resource will be reallocated with a new size if it was already allocated
411 * @root: root resource descriptor 510 * @root: root resource descriptor
412 * @new: resource descriptor desired by caller 511 * @new: resource descriptor desired by caller
413 * @size: requested resource region size 512 * @size: requested resource region size
@@ -427,9 +526,25 @@ int allocate_resource(struct resource *root, struct resource *new,
427 void *alignf_data) 526 void *alignf_data)
428{ 527{
429 int err; 528 int err;
529 struct resource_constraint constraint;
530
531 if (!alignf)
532 alignf = simple_align_resource;
533
534 constraint.min = min;
535 constraint.max = max;
536 constraint.align = align;
537 constraint.alignf = alignf;
538 constraint.alignf_data = alignf_data;
539
540 if ( new->parent ) {
541 /* resource is already allocated, try reallocating with
542 the new constraints */
543 return reallocate_resource(root, new, size, &constraint);
544 }
430 545
431 write_lock(&resource_lock); 546 write_lock(&resource_lock);
432 err = find_resource(root, new, size, min, max, align, alignf, alignf_data); 547 err = find_resource(root, new, size, &constraint);
433 if (err >= 0 && __request_resource(root, new)) 548 if (err >= 0 && __request_resource(root, new))
434 err = -EBUSY; 549 err = -EBUSY;
435 write_unlock(&resource_lock); 550 write_unlock(&resource_lock);
@@ -453,6 +568,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
453 568
454 if (first == parent) 569 if (first == parent)
455 return first; 570 return first;
571 if (WARN_ON(first == new)) /* duplicated insertion */
572 return first;
456 573
457 if ((first->start > new->start) || (first->end < new->end)) 574 if ((first->start > new->start) || (first->end < new->end))
458 break; 575 break;
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index ddabb54bb5c8..3c7cbc2c33be 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
215 put_pid(waiter->deadlock_task_pid); 215 put_pid(waiter->deadlock_task_pid);
216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); 216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
218 TRACE_WARN_ON(waiter->task);
219 memset(waiter, 0x22, sizeof(*waiter)); 218 memset(waiter, 0x22, sizeof(*waiter));
220} 219}
221 220
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index a56f629b057a..5c9ccd380966 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -9,7 +9,6 @@
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/smp_lock.h>
13#include <linux/spinlock.h> 12#include <linux/spinlock.h>
14#include <linux/sysdev.h> 13#include <linux/sysdev.h>
15#include <linux/timer.h> 14#include <linux/timer.h>
@@ -27,7 +26,6 @@ struct test_thread_data {
27 int opcode; 26 int opcode;
28 int opdata; 27 int opdata;
29 int mutexes[MAX_RT_TEST_MUTEXES]; 28 int mutexes[MAX_RT_TEST_MUTEXES];
30 int bkl;
31 int event; 29 int event;
32 struct sys_device sysdev; 30 struct sys_device sysdev;
33}; 31};
@@ -46,9 +44,8 @@ enum test_opcodes {
46 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ 44 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
47 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ 45 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
48 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ 46 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
49 RTTEST_LOCKBKL, /* 9 Lock BKL */ 47 /* 9, 10 - reserved for BKL commemoration */
50 RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ 48 RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
51 RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
52 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ 49 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
53 RTTEST_RESET = 99, /* 99 Reset all pending operations */ 50 RTTEST_RESET = 99, /* 99 Reset all pending operations */
54}; 51};
@@ -74,11 +71,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
74 td->mutexes[i] = 0; 71 td->mutexes[i] = 0;
75 } 72 }
76 } 73 }
77
78 if (!lockwakeup && td->bkl == 4) {
79 unlock_kernel();
80 td->bkl = 0;
81 }
82 return 0; 74 return 0;
83 75
84 case RTTEST_RESETEVENT: 76 case RTTEST_RESETEVENT:
@@ -129,21 +121,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
129 td->mutexes[id] = 0; 121 td->mutexes[id] = 0;
130 return 0; 122 return 0;
131 123
132 case RTTEST_LOCKBKL:
133 if (td->bkl)
134 return 0;
135 td->bkl = 1;
136 lock_kernel();
137 td->bkl = 4;
138 return 0;
139
140 case RTTEST_UNLOCKBKL:
141 if (td->bkl != 4)
142 break;
143 unlock_kernel();
144 td->bkl = 0;
145 return 0;
146
147 default: 124 default:
148 break; 125 break;
149 } 126 }
@@ -190,7 +167,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
190 td->event = atomic_add_return(1, &rttest_event); 167 td->event = atomic_add_return(1, &rttest_event);
191 break; 168 break;
192 169
193 case RTTEST_LOCKBKL:
194 default: 170 default:
195 break; 171 break;
196 } 172 }
@@ -223,8 +199,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
223 td->event = atomic_add_return(1, &rttest_event); 199 td->event = atomic_add_return(1, &rttest_event);
224 return; 200 return;
225 201
226 case RTTEST_LOCKBKL:
227 return;
228 default: 202 default:
229 return; 203 return;
230 } 204 }
@@ -374,11 +348,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
374 spin_lock(&rttest_lock); 348 spin_lock(&rttest_lock);
375 349
376 curr += sprintf(curr, 350 curr += sprintf(curr,
377 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", 351 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
378 td->opcode, td->event, tsk->state, 352 td->opcode, td->event, tsk->state,
379 (MAX_RT_PRIO - 1) - tsk->prio, 353 (MAX_RT_PRIO - 1) - tsk->prio,
380 (MAX_RT_PRIO - 1) - tsk->normal_prio, 354 (MAX_RT_PRIO - 1) - tsk->normal_prio,
381 tsk->pi_blocked_on, td->bkl); 355 tsk->pi_blocked_on);
382 356
383 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) 357 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
384 curr += sprintf(curr, "%d", td->mutexes[i]); 358 curr += sprintf(curr, "%d", td->mutexes[i]);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a9604815786a..ab449117aaf2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -20,41 +20,34 @@
20/* 20/*
21 * lock->owner state tracking: 21 * lock->owner state tracking:
22 * 22 *
23 * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 23 * lock->owner holds the task_struct pointer of the owner. Bit 0
24 * are used to keep track of the "owner is pending" and "lock has 24 * is used to keep track of the "lock has waiters" state.
25 * waiters" state.
26 * 25 *
27 * owner bit1 bit0 26 * owner bit0
28 * NULL 0 0 lock is free (fast acquire possible) 27 * NULL 0 lock is free (fast acquire possible)
29 * NULL 0 1 invalid state 28 * NULL 1 lock is free and has waiters and the top waiter
30 * NULL 1 0 Transitional State* 29 * is going to take the lock*
31 * NULL 1 1 invalid state 30 * taskpointer 0 lock is held (fast release possible)
32 * taskpointer 0 0 lock is held (fast release possible) 31 * taskpointer 1 lock is held and has waiters**
33 * taskpointer 0 1 task is pending owner
34 * taskpointer 1 0 lock is held and has waiters
35 * taskpointer 1 1 task is pending owner and lock has more waiters
36 *
37 * Pending ownership is assigned to the top (highest priority)
38 * waiter of the lock, when the lock is released. The thread is woken
39 * up and can now take the lock. Until the lock is taken (bit 0
40 * cleared) a competing higher priority thread can steal the lock
41 * which puts the woken up thread back on the waiters list.
42 * 32 *
43 * The fast atomic compare exchange based acquire and release is only 33 * The fast atomic compare exchange based acquire and release is only
44 * possible when bit 0 and 1 of lock->owner are 0. 34 * possible when bit 0 of lock->owner is 0.
35 *
36 * (*) It also can be a transitional state when grabbing the lock
37 * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
38 * we need to set the bit0 before looking at the lock, and the owner may be
39 * NULL in this small time, hence this can be a transitional state.
45 * 40 *
46 * (*) There's a small time where the owner can be NULL and the 41 * (**) There is a small time when bit 0 is set but there are no
47 * "lock has waiters" bit is set. This can happen when grabbing the lock. 42 * waiters. This can happen when grabbing the lock in the slow path.
48 * To prevent a cmpxchg of the owner releasing the lock, we need to set this 43 * To prevent a cmpxchg of the owner releasing the lock, we need to
49 * bit before looking at the lock, hence the reason this is a transitional 44 * set this bit before looking at the lock.
50 * state.
51 */ 45 */
52 46
53static void 47static void
54rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, 48rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
55 unsigned long mask)
56{ 49{
57 unsigned long val = (unsigned long)owner | mask; 50 unsigned long val = (unsigned long)owner;
58 51
59 if (rt_mutex_has_waiters(lock)) 52 if (rt_mutex_has_waiters(lock))
60 val |= RT_MUTEX_HAS_WAITERS; 53 val |= RT_MUTEX_HAS_WAITERS;
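
The rewritten state-tracking comment above collapses the old two-bit pending-owner encoding into a single flag: bit 0 of lock->owner now only records "lock has waiters". As a rough userspace sketch of that pointer-tagging idea (not the kernel code; the struct and helper names are invented here, and it is the pointer's alignment that keeps bit 0 free):

#include <stdint.h>
#include <stdio.h>

#define HAS_WAITERS   1UL   /* bit 0 of the owner word                */
#define OWNER_MASKALL 1UL   /* all state bits kept inside the pointer */

struct fake_task { int id; const char *name; };  /* alignment >= 4, so bit 0 is free */
struct fake_lock { uintptr_t owner; };           /* task pointer | state bit         */

static struct fake_task *lock_owner(const struct fake_lock *l)
{
        return (struct fake_task *)(l->owner & ~OWNER_MASKALL);
}

static void set_owner(struct fake_lock *l, struct fake_task *t, int has_waiters)
{
        l->owner = (uintptr_t)t | (has_waiters ? HAS_WAITERS : 0);
}

int main(void)
{
        static struct fake_task task = { 1, "worker" };
        struct fake_lock lock = { 0 };

        set_owner(&lock, &task, 1);   /* taskpointer | 1: held and has waiters */
        printf("owner=%s waiters=%lu\n", lock_owner(&lock)->name,
               (unsigned long)(lock.owner & HAS_WAITERS));

        set_owner(&lock, NULL, 1);    /* NULL | 1: free, top waiter about to take it */
        printf("owner=%p waiters=%lu\n", (void *)lock_owner(&lock),
               (unsigned long)(lock.owner & HAS_WAITERS));
        return 0;
}

The fast cmpxchg-based acquire and release only work when the whole word is a bare pointer, i.e. when bit 0 is clear, which is exactly the restriction the new comment states.
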
@@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
203 * reached or the state of the chain has changed while we 196 * reached or the state of the chain has changed while we
204 * dropped the locks. 197 * dropped the locks.
205 */ 198 */
206 if (!waiter || !waiter->task) 199 if (!waiter)
207 goto out_unlock_pi; 200 goto out_unlock_pi;
208 201
209 /* 202 /*
210 * Check the orig_waiter state. After we dropped the locks, 203 * Check the orig_waiter state. After we dropped the locks,
211 * the previous owner of the lock might have released the lock 204 * the previous owner of the lock might have released the lock.
212 * and made us the pending owner:
213 */ 205 */
214 if (orig_waiter && !orig_waiter->task) 206 if (orig_waiter && !rt_mutex_owner(orig_lock))
215 goto out_unlock_pi; 207 goto out_unlock_pi;
216 208
217 /* 209 /*
@@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
254 246
255 /* Release the task */ 247 /* Release the task */
256 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 248 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
249 if (!rt_mutex_owner(lock)) {
250 /*
251 * If the requeue above changed the top waiter, then we need
252 * to wake the new top waiter up to try to get the lock.
253 */
254
255 if (top_waiter != rt_mutex_top_waiter(lock))
256 wake_up_process(rt_mutex_top_waiter(lock)->task);
257 raw_spin_unlock(&lock->wait_lock);
258 goto out_put_task;
259 }
257 put_task_struct(task); 260 put_task_struct(task);
258 261
259 /* Grab the next task */ 262 /* Grab the next task */
@@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
296} 299}
297 300
298/* 301/*
299 * Optimization: check if we can steal the lock from the
300 * assigned pending owner [which might not have taken the
301 * lock yet]:
302 */
303static inline int try_to_steal_lock(struct rt_mutex *lock,
304 struct task_struct *task)
305{
306 struct task_struct *pendowner = rt_mutex_owner(lock);
307 struct rt_mutex_waiter *next;
308 unsigned long flags;
309
310 if (!rt_mutex_owner_pending(lock))
311 return 0;
312
313 if (pendowner == task)
314 return 1;
315
316 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
317 if (task->prio >= pendowner->prio) {
318 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 0;
320 }
321
322 /*
323 * Check if a waiter is enqueued on the pending owners
324 * pi_waiters list. Remove it and readjust pending owners
325 * priority.
326 */
327 if (likely(!rt_mutex_has_waiters(lock))) {
328 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
329 return 1;
330 }
331
332 /* No chain handling, pending owner is not blocked on anything: */
333 next = rt_mutex_top_waiter(lock);
334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
335 __rt_mutex_adjust_prio(pendowner);
336 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
337
338 /*
339 * We are going to steal the lock and a waiter was
340 * enqueued on the pending owners pi_waiters queue. So
341 * we have to enqueue this waiter into
342 * task->pi_waiters list. This covers the case,
343 * where task is boosted because it holds another
344 * lock and gets unboosted because the booster is
345 * interrupted, so we would delay a waiter with higher
346 * priority as task->normal_prio.
347 *
348 * Note: in the rare case of a SCHED_OTHER task changing
349 * its priority and thus stealing the lock, next->task
350 * might be task:
351 */
352 if (likely(next->task != task)) {
353 raw_spin_lock_irqsave(&task->pi_lock, flags);
354 plist_add(&next->pi_list_entry, &task->pi_waiters);
355 __rt_mutex_adjust_prio(task);
356 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
357 }
358 return 1;
359}
360
361/*
362 * Try to take an rt-mutex 302 * Try to take an rt-mutex
363 * 303 *
364 * This fails
365 * - when the lock has a real owner
366 * - when a different pending owner exists and has higher priority than current
367 *
368 * Must be called with lock->wait_lock held. 304 * Must be called with lock->wait_lock held.
305 *
306 * @lock: the lock to be acquired.
307 * @task: the task which wants to acquire the lock
308 * @waiter: the waiter that is queued to the lock's wait list. (could be NULL)
369 */ 309 */
370static int try_to_take_rt_mutex(struct rt_mutex *lock) 310static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
311 struct rt_mutex_waiter *waiter)
371{ 312{
372 /* 313 /*
373 * We have to be careful here if the atomic speedups are 314 * We have to be careful here if the atomic speedups are
@@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
390 */ 331 */
391 mark_rt_mutex_waiters(lock); 332 mark_rt_mutex_waiters(lock);
392 333
393 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) 334 if (rt_mutex_owner(lock))
394 return 0; 335 return 0;
395 336
337 /*
338 * It will get the lock because of one of these conditions:
339 * 1) there is no waiter
340 * 2) higher priority than waiters
341 * 3) it is top waiter
342 */
343 if (rt_mutex_has_waiters(lock)) {
344 if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) {
345 if (!waiter || waiter != rt_mutex_top_waiter(lock))
346 return 0;
347 }
348 }
349
350 if (waiter || rt_mutex_has_waiters(lock)) {
351 unsigned long flags;
352 struct rt_mutex_waiter *top;
353
354 raw_spin_lock_irqsave(&task->pi_lock, flags);
355
356 /* remove the queued waiter. */
357 if (waiter) {
358 plist_del(&waiter->list_entry, &lock->wait_list);
359 task->pi_blocked_on = NULL;
360 }
361
362 /*
363 * We have to enqueue the top waiter(if it exists) into
364 * task->pi_waiters list.
365 */
366 if (rt_mutex_has_waiters(lock)) {
367 top = rt_mutex_top_waiter(lock);
368 top->pi_list_entry.prio = top->list_entry.prio;
369 plist_add(&top->pi_list_entry, &task->pi_waiters);
370 }
371 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
372 }
373
396 /* We got the lock. */ 374 /* We got the lock. */
397 debug_rt_mutex_lock(lock); 375 debug_rt_mutex_lock(lock);
398 376
399 rt_mutex_set_owner(lock, current, 0); 377 rt_mutex_set_owner(lock, task);
400 378
401 rt_mutex_deadlock_account_lock(lock, current); 379 rt_mutex_deadlock_account_lock(lock, task);
402 380
403 return 1; 381 return 1;
404} 382}
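
The new try_to_take_rt_mutex() above replaces stealing from a pending owner with a direct check against the wait list. A condensed sketch of just that decision, with invented names and the kernel convention that a numerically lower prio value means higher priority:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct waiter { int prio; };

/* May a task of priority @task_prio take a lock that currently has no
 * owner?  @top is the highest-priority queued waiter (or NULL), @me is
 * our own waiter entry if we are already queued (or NULL). */
static bool can_take(int task_prio, const struct waiter *top,
                     const struct waiter *me)
{
        if (!top)
                return true;          /* 1) there is no waiter            */
        if (task_prio < top->prio)
                return true;          /* 2) higher priority than waiters  */
        return me == top;             /* 3) we are the top waiter         */
}

int main(void)
{
        struct waiter top = { .prio = 10 };

        printf("%d %d %d\n",
               can_take(5, &top, NULL),    /* outranks the top waiter: 1    */
               can_take(20, &top, NULL),   /* lower prio, not queued: 0     */
               can_take(10, &top, &top));  /* is the top waiter itself: 1   */
        return 0;
}
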
@@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
436 414
437 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 415 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
438 416
417 if (!owner)
418 return 0;
419
439 if (waiter == rt_mutex_top_waiter(lock)) { 420 if (waiter == rt_mutex_top_waiter(lock)) {
440 raw_spin_lock_irqsave(&owner->pi_lock, flags); 421 raw_spin_lock_irqsave(&owner->pi_lock, flags);
441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 422 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
@@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
472/* 453/*
473 * Wake up the next waiter on the lock. 454 * Wake up the next waiter on the lock.
474 * 455 *
475 * Remove the top waiter from the current tasks waiter list and from 456 * Remove the top waiter from the current tasks waiter list and wake it up.
476 * the lock waiter list. Set it as pending owner. Then wake it up.
477 * 457 *
478 * Called with lock->wait_lock held. 458 * Called with lock->wait_lock held.
479 */ 459 */
480static void wakeup_next_waiter(struct rt_mutex *lock) 460static void wakeup_next_waiter(struct rt_mutex *lock)
481{ 461{
482 struct rt_mutex_waiter *waiter; 462 struct rt_mutex_waiter *waiter;
483 struct task_struct *pendowner;
484 unsigned long flags; 463 unsigned long flags;
485 464
486 raw_spin_lock_irqsave(&current->pi_lock, flags); 465 raw_spin_lock_irqsave(&current->pi_lock, flags);
487 466
488 waiter = rt_mutex_top_waiter(lock); 467 waiter = rt_mutex_top_waiter(lock);
489 plist_del(&waiter->list_entry, &lock->wait_list);
490 468
491 /* 469 /*
492 * Remove it from current->pi_waiters. We do not adjust a 470 * Remove it from current->pi_waiters. We do not adjust a
@@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
495 * lock->wait_lock. 473 * lock->wait_lock.
496 */ 474 */
497 plist_del(&waiter->pi_list_entry, &current->pi_waiters); 475 plist_del(&waiter->pi_list_entry, &current->pi_waiters);
498 pendowner = waiter->task;
499 waiter->task = NULL;
500 476
501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); 477 rt_mutex_set_owner(lock, NULL);
502 478
503 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 479 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
504 480
505 /* 481 wake_up_process(waiter->task);
506 * Clear the pi_blocked_on variable and enqueue a possible
507 * waiter into the pi_waiters list of the pending owner. This
508 * prevents that in case the pending owner gets unboosted a
509 * waiter with higher priority than pending-owner->normal_prio
510 * is blocked on the unboosted (pending) owner.
511 */
512 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
513
514 WARN_ON(!pendowner->pi_blocked_on);
515 WARN_ON(pendowner->pi_blocked_on != waiter);
516 WARN_ON(pendowner->pi_blocked_on->lock != lock);
517
518 pendowner->pi_blocked_on = NULL;
519
520 if (rt_mutex_has_waiters(lock)) {
521 struct rt_mutex_waiter *next;
522
523 next = rt_mutex_top_waiter(lock);
524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
525 }
526 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
527
528 wake_up_process(pendowner);
529} 482}
530 483
531/* 484/*
532 * Remove a waiter from a lock 485 * Remove a waiter from a lock and give up
533 * 486 *
534 * Must be called with lock->wait_lock held 487 * Must be called with lock->wait_lock held and
488 * have just failed to try_to_take_rt_mutex().
535 */ 489 */
536static void remove_waiter(struct rt_mutex *lock, 490static void remove_waiter(struct rt_mutex *lock,
537 struct rt_mutex_waiter *waiter) 491 struct rt_mutex_waiter *waiter)
@@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock,
543 497
544 raw_spin_lock_irqsave(&current->pi_lock, flags); 498 raw_spin_lock_irqsave(&current->pi_lock, flags);
545 plist_del(&waiter->list_entry, &lock->wait_list); 499 plist_del(&waiter->list_entry, &lock->wait_list);
546 waiter->task = NULL;
547 current->pi_blocked_on = NULL; 500 current->pi_blocked_on = NULL;
548 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 501 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
549 502
550 if (first && owner != current) { 503 if (!owner)
504 return;
505
506 if (first) {
551 507
552 raw_spin_lock_irqsave(&owner->pi_lock, flags); 508 raw_spin_lock_irqsave(&owner->pi_lock, flags);
553 509
@@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task)
614 * or TASK_UNINTERRUPTIBLE) 570 * or TASK_UNINTERRUPTIBLE)
615 * @timeout: the pre-initialized and started timer, or NULL for none 571 * @timeout: the pre-initialized and started timer, or NULL for none
616 * @waiter: the pre-initialized rt_mutex_waiter 572 * @waiter: the pre-initialized rt_mutex_waiter
617 * @detect_deadlock: passed to task_blocks_on_rt_mutex
618 * 573 *
619 * lock->wait_lock must be held by the caller. 574 * lock->wait_lock must be held by the caller.
620 */ 575 */
621static int __sched 576static int __sched
622__rt_mutex_slowlock(struct rt_mutex *lock, int state, 577__rt_mutex_slowlock(struct rt_mutex *lock, int state,
623 struct hrtimer_sleeper *timeout, 578 struct hrtimer_sleeper *timeout,
624 struct rt_mutex_waiter *waiter, 579 struct rt_mutex_waiter *waiter)
625 int detect_deadlock)
626{ 580{
627 int ret = 0; 581 int ret = 0;
628 582
629 for (;;) { 583 for (;;) {
630 /* Try to acquire the lock: */ 584 /* Try to acquire the lock: */
631 if (try_to_take_rt_mutex(lock)) 585 if (try_to_take_rt_mutex(lock, current, waiter))
632 break; 586 break;
633 587
634 /* 588 /*
@@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
645 break; 599 break;
646 } 600 }
647 601
648 /*
649 * waiter->task is NULL the first time we come here and
650 * when we have been woken up by the previous owner
651 * but the lock got stolen by a higher prio task.
652 */
653 if (!waiter->task) {
654 ret = task_blocks_on_rt_mutex(lock, waiter, current,
655 detect_deadlock);
656 /*
657 * If we got woken up by the owner then start loop
658 * all over without going into schedule to try
659 * to get the lock now:
660 */
661 if (unlikely(!waiter->task)) {
662 /*
663 * Reset the return value. We might
664 * have returned with -EDEADLK and the
665 * owner released the lock while we
666 * were walking the pi chain.
667 */
668 ret = 0;
669 continue;
670 }
671 if (unlikely(ret))
672 break;
673 }
674
675 raw_spin_unlock(&lock->wait_lock); 602 raw_spin_unlock(&lock->wait_lock);
676 603
677 debug_rt_mutex_print_deadlock(waiter); 604 debug_rt_mutex_print_deadlock(waiter);
678 605
679 if (waiter->task) 606 schedule_rt_mutex(lock);
680 schedule_rt_mutex(lock);
681 607
682 raw_spin_lock(&lock->wait_lock); 608 raw_spin_lock(&lock->wait_lock);
683 set_current_state(state); 609 set_current_state(state);
@@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
698 int ret = 0; 624 int ret = 0;
699 625
700 debug_rt_mutex_init_waiter(&waiter); 626 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL;
702 627
703 raw_spin_lock(&lock->wait_lock); 628 raw_spin_lock(&lock->wait_lock);
704 629
705 /* Try to acquire the lock again: */ 630 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) { 631 if (try_to_take_rt_mutex(lock, current, NULL)) {
707 raw_spin_unlock(&lock->wait_lock); 632 raw_spin_unlock(&lock->wait_lock);
708 return 0; 633 return 0;
709 } 634 }
@@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
717 timeout->task = NULL; 642 timeout->task = NULL;
718 } 643 }
719 644
720 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, 645 ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
721 detect_deadlock); 646
647 if (likely(!ret))
648 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
722 649
723 set_current_state(TASK_RUNNING); 650 set_current_state(TASK_RUNNING);
724 651
725 if (unlikely(waiter.task)) 652 if (unlikely(ret))
726 remove_waiter(lock, &waiter); 653 remove_waiter(lock, &waiter);
727 654
728 /* 655 /*
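
With the pending-owner handoff gone, the slow path above takes a simpler shape: the task queues itself once via task_blocks_on_rt_mutex(), and the __rt_mutex_slowlock() loop only retries and sleeps until try_to_take_rt_mutex() succeeds. The userspace toy below models that structure with a pthread mutex and condition variable; it is purely illustrative and nothing like the real rt_mutex internals (which never use a condvar):

#include <pthread.h>
#include <stdbool.h>

struct toy_lock {
        pthread_mutex_t wait_lock;   /* plays the role of lock->wait_lock   */
        pthread_cond_t  wakeup;
        bool            held;
        int             nwaiters;
};

static void toy_lock_slow(struct toy_lock *l)
{
        pthread_mutex_lock(&l->wait_lock);
        l->nwaiters++;                         /* ~ task_blocks_on_rt_mutex()   */
        while (l->held)                        /* ~ the __rt_mutex_slowlock loop */
                pthread_cond_wait(&l->wakeup, &l->wait_lock);
        l->held = true;                        /* ~ try_to_take_rt_mutex() wins  */
        l->nwaiters--;                         /* waiter dequeued on success     */
        pthread_mutex_unlock(&l->wait_lock);
}

static void toy_unlock(struct toy_lock *l)
{
        pthread_mutex_lock(&l->wait_lock);
        l->held = false;                       /* owner cleared...               */
        if (l->nwaiters)
                pthread_cond_signal(&l->wakeup);   /* ...and one waiter woken,   */
        pthread_mutex_unlock(&l->wait_lock);       /* ~ wakeup_next_waiter()     */
}

int main(void)
{
        struct toy_lock l = { PTHREAD_MUTEX_INITIALIZER,
                              PTHREAD_COND_INITIALIZER, false, 0 };

        toy_lock_slow(&l);   /* uncontended: takes the lock immediately */
        toy_unlock(&l);
        return 0;
}

The error path in the hunk above mirrors this shape: if task_blocks_on_rt_mutex() fails, remove_waiter() undoes the single enqueue; there is no longer a waiter->task flag to consult.
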
@@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
737 if (unlikely(timeout)) 664 if (unlikely(timeout))
738 hrtimer_cancel(&timeout->timer); 665 hrtimer_cancel(&timeout->timer);
739 666
740 /*
741 * Readjust priority, when we did not get the lock. We might
742 * have been the pending owner and boosted. Since we did not
743 * take the lock, the PI boost has to go.
744 */
745 if (unlikely(ret))
746 rt_mutex_adjust_prio(current);
747
748 debug_rt_mutex_free_waiter(&waiter); 667 debug_rt_mutex_free_waiter(&waiter);
749 668
750 return ret; 669 return ret;
@@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
762 681
763 if (likely(rt_mutex_owner(lock) != current)) { 682 if (likely(rt_mutex_owner(lock) != current)) {
764 683
765 ret = try_to_take_rt_mutex(lock); 684 ret = try_to_take_rt_mutex(lock, current, NULL);
766 /* 685 /*
767 * try_to_take_rt_mutex() sets the lock waiters 686 * try_to_take_rt_mutex() sets the lock waiters
768 * bit unconditionally. Clean this up. 687 * bit unconditionally. Clean this up.
@@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
992{ 911{
993 __rt_mutex_init(lock, NULL); 912 __rt_mutex_init(lock, NULL);
994 debug_rt_mutex_proxy_lock(lock, proxy_owner); 913 debug_rt_mutex_proxy_lock(lock, proxy_owner);
995 rt_mutex_set_owner(lock, proxy_owner, 0); 914 rt_mutex_set_owner(lock, proxy_owner);
996 rt_mutex_deadlock_account_lock(lock, proxy_owner); 915 rt_mutex_deadlock_account_lock(lock, proxy_owner);
997} 916}
998 917
@@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
1008 struct task_struct *proxy_owner) 927 struct task_struct *proxy_owner)
1009{ 928{
1010 debug_rt_mutex_proxy_unlock(lock); 929 debug_rt_mutex_proxy_unlock(lock);
1011 rt_mutex_set_owner(lock, NULL, 0); 930 rt_mutex_set_owner(lock, NULL);
1012 rt_mutex_deadlock_account_unlock(proxy_owner); 931 rt_mutex_deadlock_account_unlock(proxy_owner);
1013} 932}
1014 933
@@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1034 953
1035 raw_spin_lock(&lock->wait_lock); 954 raw_spin_lock(&lock->wait_lock);
1036 955
1037 mark_rt_mutex_waiters(lock); 956 if (try_to_take_rt_mutex(lock, task, NULL)) {
1038
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0);
1043 raw_spin_unlock(&lock->wait_lock); 957 raw_spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1; 958 return 1;
1046 } 959 }
1047 960
1048 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 961 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1049 962
1050 if (ret && !waiter->task) { 963 if (ret && !rt_mutex_owner(lock)) {
1051 /* 964 /*
1052 * Reset the return value. We might have 965 * Reset the return value. We might have
1053 * returned with -EDEADLK and the owner 966 * returned with -EDEADLK and the owner
@@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1056 */ 969 */
1057 ret = 0; 970 ret = 0;
1058 } 971 }
972
973 if (unlikely(ret))
974 remove_waiter(lock, waiter);
975
1059 raw_spin_unlock(&lock->wait_lock); 976 raw_spin_unlock(&lock->wait_lock);
1060 977
1061 debug_rt_mutex_print_deadlock(waiter); 978 debug_rt_mutex_print_deadlock(waiter);
@@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1110 1027
1111 set_current_state(TASK_INTERRUPTIBLE); 1028 set_current_state(TASK_INTERRUPTIBLE);
1112 1029
1113 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, 1030 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
1114 detect_deadlock);
1115 1031
1116 set_current_state(TASK_RUNNING); 1032 set_current_state(TASK_RUNNING);
1117 1033
1118 if (unlikely(waiter->task)) 1034 if (unlikely(ret))
1119 remove_waiter(lock, waiter); 1035 remove_waiter(lock, waiter);
1120 1036
1121 /* 1037 /*
@@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1126 1042
1127 raw_spin_unlock(&lock->wait_lock); 1043 raw_spin_unlock(&lock->wait_lock);
1128 1044
1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been
1131 * the pending owner and boosted. Since we did not take the lock, the
1132 * PI boost has to go.
1133 */
1134 if (unlikely(ret))
1135 rt_mutex_adjust_prio(current);
1136
1137 return ret; 1045 return ret;
1138} 1046}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 97a2f81866af..53a66c85261b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p)
91/* 91/*
92 * lock->owner state tracking: 92 * lock->owner state tracking:
93 */ 93 */
94#define RT_MUTEX_OWNER_PENDING 1UL 94#define RT_MUTEX_HAS_WAITERS 1UL
95#define RT_MUTEX_HAS_WAITERS 2UL 95#define RT_MUTEX_OWNER_MASKALL 1UL
96#define RT_MUTEX_OWNER_MASKALL 3UL
97 96
98static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) 97static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
99{ 98{
@@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
101 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); 100 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
102} 101}
103 102
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{
106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108}
109
110static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
111{
112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
113}
114
115/* 103/*
116 * PI-futex support (proxy locking functions, etc.): 104 * PI-futex support (proxy locking functions, etc.):
117 */ 105 */
diff --git a/kernel/sched.c b/kernel/sched.c
index c5d775079027..935f8e8e6160 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/uaccess.h> 33#include <linux/uaccess.h>
34#include <linux/highmem.h> 34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h> 35#include <asm/mmu_context.h>
37#include <linux/interrupt.h> 36#include <linux/interrupt.h>
38#include <linux/capability.h> 37#include <linux/capability.h>
@@ -75,9 +74,11 @@
75 74
76#include <asm/tlb.h> 75#include <asm/tlb.h>
77#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
77#include <asm/mutex.h>
78 78
79#include "sched_cpupri.h" 79#include "sched_cpupri.h"
80#include "workqueue_sched.h" 80#include "workqueue_sched.h"
81#include "sched_autogroup.h"
81 82
82#include <litmus/sched_trace.h> 83#include <litmus/sched_trace.h>
83#include <litmus/trace.h> 84#include <litmus/trace.h>
@@ -235,7 +236,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
235#endif 236#endif
236 237
237/* 238/*
238 * sched_domains_mutex serializes calls to arch_init_sched_domains, 239 * sched_domains_mutex serializes calls to init_sched_domains,
239 * detach_destroy_domains and partition_sched_domains. 240 * detach_destroy_domains and partition_sched_domains.
240 */ 241 */
241static DEFINE_MUTEX(sched_domains_mutex); 242static DEFINE_MUTEX(sched_domains_mutex);
@@ -258,6 +259,8 @@ struct task_group {
258 /* runqueue "owned" by this group on each cpu */ 259 /* runqueue "owned" by this group on each cpu */
259 struct cfs_rq **cfs_rq; 260 struct cfs_rq **cfs_rq;
260 unsigned long shares; 261 unsigned long shares;
262
263 atomic_t load_weight;
261#endif 264#endif
262 265
263#ifdef CONFIG_RT_GROUP_SCHED 266#ifdef CONFIG_RT_GROUP_SCHED
@@ -273,25 +276,18 @@ struct task_group {
273 struct task_group *parent; 276 struct task_group *parent;
274 struct list_head siblings; 277 struct list_head siblings;
275 struct list_head children; 278 struct list_head children;
276};
277 279
278#define root_task_group init_task_group 280#ifdef CONFIG_SCHED_AUTOGROUP
281 struct autogroup *autogroup;
282#endif
283};
279 284
280/* task_group_lock serializes add/remove of task groups and also changes to 285/* task_group_lock serializes the addition/removal of task groups */
281 * a task group's cpu shares.
282 */
283static DEFINE_SPINLOCK(task_group_lock); 286static DEFINE_SPINLOCK(task_group_lock);
284 287
285#ifdef CONFIG_FAIR_GROUP_SCHED 288#ifdef CONFIG_FAIR_GROUP_SCHED
286 289
287#ifdef CONFIG_SMP 290# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
288static int root_task_group_empty(void)
289{
290 return list_empty(&root_task_group.children);
291}
292#endif
293
294# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
295 291
296/* 292/*
297 * A weight of 0 or 1 can cause arithmetics problems. 293 * A weight of 0 or 1 can cause arithmetics problems.
@@ -301,16 +297,16 @@ static int root_task_group_empty(void)
301 * (The default weight is 1024 - so there's no practical 297 * (The default weight is 1024 - so there's no practical
302 * limitation from this.) 298 * limitation from this.)
303 */ 299 */
304#define MIN_SHARES 2 300#define MIN_SHARES (1UL << 1)
305#define MAX_SHARES (1UL << 18) 301#define MAX_SHARES (1UL << 18)
306 302
307static int init_task_group_load = INIT_TASK_GROUP_LOAD; 303static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
308#endif 304#endif
309 305
310/* Default task group. 306/* Default task group.
311 * Every task in system belong to this group at bootup. 307 * Every task in system belong to this group at bootup.
312 */ 308 */
313struct task_group init_task_group; 309struct task_group root_task_group;
314 310
315#endif /* CONFIG_CGROUP_SCHED */ 311#endif /* CONFIG_CGROUP_SCHED */
316 312
@@ -321,6 +317,9 @@ struct cfs_rq {
321 317
322 u64 exec_clock; 318 u64 exec_clock;
323 u64 min_vruntime; 319 u64 min_vruntime;
320#ifndef CONFIG_64BIT
321 u64 min_vruntime_copy;
322#endif
324 323
325 struct rb_root tasks_timeline; 324 struct rb_root tasks_timeline;
326 struct rb_node *rb_leftmost; 325 struct rb_node *rb_leftmost;
@@ -332,9 +331,11 @@ struct cfs_rq {
332 * 'curr' points to currently running entity on this cfs_rq. 331 * 'curr' points to currently running entity on this cfs_rq.
333 * It is set to NULL otherwise (i.e when none are currently running). 332 * It is set to NULL otherwise (i.e when none are currently running).
334 */ 333 */
335 struct sched_entity *curr, *next, *last; 334 struct sched_entity *curr, *next, *last, *skip;
336 335
336#ifdef CONFIG_SCHED_DEBUG
337 unsigned int nr_spread_over; 337 unsigned int nr_spread_over;
338#endif
338 339
339#ifdef CONFIG_FAIR_GROUP_SCHED 340#ifdef CONFIG_FAIR_GROUP_SCHED
340 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 341 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -347,6 +348,7 @@ struct cfs_rq {
347 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 348 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
348 * list is used during load balance. 349 * list is used during load balance.
349 */ 350 */
351 int on_list;
350 struct list_head leaf_cfs_rq_list; 352 struct list_head leaf_cfs_rq_list;
351 struct task_group *tg; /* group that "owns" this runqueue */ 353 struct task_group *tg; /* group that "owns" this runqueue */
352 354
@@ -365,14 +367,17 @@ struct cfs_rq {
365 unsigned long h_load; 367 unsigned long h_load;
366 368
367 /* 369 /*
368 * this cpu's part of tg->shares 370 * Maintaining per-cpu shares distribution for group scheduling
371 *
372 * load_stamp is the last time we updated the load average
373 * load_last is the last time we updated the load average and saw load
374 * load_unacc_exec_time is currently unaccounted execution time
369 */ 375 */
370 unsigned long shares; 376 u64 load_avg;
377 u64 load_period;
378 u64 load_stamp, load_last, load_unacc_exec_time;
371 379
372 /* 380 unsigned long load_contribution;
373 * load.weight at the time we set shares
374 */
375 unsigned long rq_weight;
376#endif 381#endif
377#endif 382#endif
378}; 383};
@@ -428,6 +433,7 @@ struct litmus_rq {
428 */ 433 */
429struct root_domain { 434struct root_domain {
430 atomic_t refcount; 435 atomic_t refcount;
436 struct rcu_head rcu;
431 cpumask_var_t span; 437 cpumask_var_t span;
432 cpumask_var_t online; 438 cpumask_var_t online;
433 439
@@ -437,9 +443,7 @@ struct root_domain {
437 */ 443 */
438 cpumask_var_t rto_mask; 444 cpumask_var_t rto_mask;
439 atomic_t rto_count; 445 atomic_t rto_count;
440#ifdef CONFIG_SMP
441 struct cpupri cpupri; 446 struct cpupri cpupri;
442#endif
443}; 447};
444 448
445/* 449/*
@@ -448,7 +452,7 @@ struct root_domain {
448 */ 452 */
449static struct root_domain def_root_domain; 453static struct root_domain def_root_domain;
450 454
451#endif 455#endif /* CONFIG_SMP */
452 456
453/* 457/*
454 * This is the main, per-CPU runqueue data structure. 458 * This is the main, per-CPU runqueue data structure.
@@ -473,7 +477,7 @@ struct rq {
473 u64 nohz_stamp; 477 u64 nohz_stamp;
474 unsigned char nohz_balance_kick; 478 unsigned char nohz_balance_kick;
475#endif 479#endif
476 unsigned int skip_clock_update; 480 int skip_clock_update;
477 481
478 /* capture load from *all* tasks on this cpu: */ 482 /* capture load from *all* tasks on this cpu: */
479 struct load_weight load; 483 struct load_weight load;
@@ -500,11 +504,12 @@ struct rq {
500 */ 504 */
501 unsigned long nr_uninterruptible; 505 unsigned long nr_uninterruptible;
502 506
503 struct task_struct *curr, *idle; 507 struct task_struct *curr, *idle, *stop;
504 unsigned long next_balance; 508 unsigned long next_balance;
505 struct mm_struct *prev_mm; 509 struct mm_struct *prev_mm;
506 510
507 u64 clock; 511 u64 clock;
512 u64 clock_task;
508 513
509 atomic_t nr_iowait; 514 atomic_t nr_iowait;
510 515
@@ -532,6 +537,10 @@ struct rq {
532 u64 avg_idle; 537 u64 avg_idle;
533#endif 538#endif
534 539
540#ifdef CONFIG_IRQ_TIME_ACCOUNTING
541 u64 prev_irq_time;
542#endif
543
535 /* calc_load related fields */ 544 /* calc_load related fields */
536 unsigned long calc_load_update; 545 unsigned long calc_load_update;
537 long calc_load_active; 546 long calc_load_active;
@@ -561,32 +570,17 @@ struct rq {
561 /* try_to_wake_up() stats */ 570 /* try_to_wake_up() stats */
562 unsigned int ttwu_count; 571 unsigned int ttwu_count;
563 unsigned int ttwu_local; 572 unsigned int ttwu_local;
573#endif
564 574
565 /* BKL stats */ 575#ifdef CONFIG_SMP
566 unsigned int bkl_count; 576 struct task_struct *wake_list;
567#endif 577#endif
568}; 578};
569 579
570static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 580static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
571 581
572static inline
573void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
574{
575 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
576 582
577 /* 583static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
578 * A queue event has occurred, and we're going to schedule. In
579 * this case, we can save a useless back to back clock update.
580 */
581 /* LITMUS^RT: turning off the clock update is buggy in Linux 2.6.36;
582 * the scheduler can "forget" to renable the runqueue clock in some
583 * cases. LITMUS^RT amplifies the effects of this problem. Hence, we
584 * turn it off to avoid stalling clocks. */
585 /*
586 if (test_tsk_need_resched(p))
587 rq->skip_clock_update = 1;
588 */
589}
590 584
591static inline int cpu_of(struct rq *rq) 585static inline int cpu_of(struct rq *rq)
592{ 586{
@@ -599,7 +593,7 @@ static inline int cpu_of(struct rq *rq)
599 593
600#define rcu_dereference_check_sched_domain(p) \ 594#define rcu_dereference_check_sched_domain(p) \
601 rcu_dereference_check((p), \ 595 rcu_dereference_check((p), \
602 rcu_read_lock_sched_held() || \ 596 rcu_read_lock_held() || \
603 lockdep_is_held(&sched_domains_mutex)) 597 lockdep_is_held(&sched_domains_mutex))
604 598
605/* 599/*
@@ -623,18 +617,22 @@ static inline int cpu_of(struct rq *rq)
623/* 617/*
624 * Return the group to which this tasks belongs. 618 * Return the group to which this tasks belongs.
625 * 619 *
626 * We use task_subsys_state_check() and extend the RCU verification 620 * We use task_subsys_state_check() and extend the RCU verification with
627 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() 621 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
628 * holds that lock for each task it moves into the cgroup. Therefore 622 * task it moves into the cgroup. Therefore by holding either of those locks,
629 * by holding that lock, we pin the task to the current cgroup. 623 * we pin the task to the current cgroup.
630 */ 624 */
631static inline struct task_group *task_group(struct task_struct *p) 625static inline struct task_group *task_group(struct task_struct *p)
632{ 626{
627 struct task_group *tg;
633 struct cgroup_subsys_state *css; 628 struct cgroup_subsys_state *css;
634 629
635 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 630 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
631 lockdep_is_held(&p->pi_lock) ||
636 lockdep_is_held(&task_rq(p)->lock)); 632 lockdep_is_held(&task_rq(p)->lock));
637 return container_of(css, struct task_group, css); 633 tg = container_of(css, struct task_group, css);
634
635 return autogroup_task_group(p, tg);
638} 636}
639 637
640/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 638/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -661,10 +659,18 @@ static inline struct task_group *task_group(struct task_struct *p)
661 659
662#endif /* CONFIG_CGROUP_SCHED */ 660#endif /* CONFIG_CGROUP_SCHED */
663 661
664inline void update_rq_clock(struct rq *rq) 662static void update_rq_clock_task(struct rq *rq, s64 delta);
663
664static void update_rq_clock(struct rq *rq)
665{ 665{
666 if (!rq->skip_clock_update) 666 s64 delta;
667 rq->clock = sched_clock_cpu(cpu_of(rq)); 667
668 if (rq->skip_clock_update > 0)
669 return;
670
671 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
672 rq->clock += delta;
673 update_rq_clock_task(rq, delta);
668} 674}
669 675
670/* 676/*
@@ -677,10 +683,9 @@ inline void update_rq_clock(struct rq *rq)
677#endif 683#endif
678 684
679/** 685/**
680 * runqueue_is_locked 686 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
681 * @cpu: the processor in question. 687 * @cpu: the processor in question.
682 * 688 *
683 * Returns true if the current cpu runqueue is locked.
684 * This interface allows printk to be called with the runqueue lock 689 * This interface allows printk to be called with the runqueue lock
685 * held and know whether or not it is OK to wake up the klogd. 690 * held and know whether or not it is OK to wake up the klogd.
686 */ 691 */
@@ -741,7 +746,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
741 size_t cnt, loff_t *ppos) 746 size_t cnt, loff_t *ppos)
742{ 747{
743 char buf[64]; 748 char buf[64];
744 char *cmp = buf; 749 char *cmp;
745 int neg = 0; 750 int neg = 0;
746 int i; 751 int i;
747 752
@@ -752,16 +757,15 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
752 return -EFAULT; 757 return -EFAULT;
753 758
754 buf[cnt] = 0; 759 buf[cnt] = 0;
760 cmp = strstrip(buf);
755 761
756 if (strncmp(buf, "NO_", 3) == 0) { 762 if (strncmp(cmp, "NO_", 3) == 0) {
757 neg = 1; 763 neg = 1;
758 cmp += 3; 764 cmp += 3;
759 } 765 }
760 766
761 for (i = 0; sched_feat_names[i]; i++) { 767 for (i = 0; sched_feat_names[i]; i++) {
762 int len = strlen(sched_feat_names[i]); 768 if (strcmp(cmp, sched_feat_names[i]) == 0) {
763
764 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
765 if (neg) 769 if (neg)
766 sysctl_sched_features &= ~(1UL << i); 770 sysctl_sched_features &= ~(1UL << i);
767 else 771 else
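
The sched_feat_write() change above swaps a prefix strncmp() for strstrip() plus an exact strcmp(), so a name only toggles its own bit and trailing whitespace no longer breaks matching. A small userspace rendition of that parsing (the feature names and mask below are just examples):

#include <stdio.h>
#include <string.h>

static const char *feat_names[] = { "GENTLE_FAIR_SLEEPERS", "START_DEBIT", NULL };
static unsigned long feat_mask;

static int set_feature(char *buf)
{
        char *cmp = buf + strspn(buf, " \t\n");   /* crude strstrip(): skip leading ws */
        buf[strcspn(buf, "\n")] = '\0';           /* drop a trailing newline           */
        int neg = 0, i;

        if (strncmp(cmp, "NO_", 3) == 0) {
                neg = 1;
                cmp += 3;
        }
        for (i = 0; feat_names[i]; i++) {
                if (strcmp(cmp, feat_names[i]) == 0) {   /* exact match only */
                        if (neg)
                                feat_mask &= ~(1UL << i);
                        else
                                feat_mask |= 1UL << i;
                        return 0;
                }
        }
        return -1;
}

int main(void)
{
        char a[] = "START_DEBIT", b[] = "NO_START_DEBIT\n";

        set_feature(a);
        printf("mask=%#lx\n", feat_mask);
        set_feature(b);
        printf("mask=%#lx\n", feat_mask);
        return 0;
}

The old code compared only strlen(feature) characters, so any string that merely started with a valid feature name would have matched; the exact strcmp() closes that hole.
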
@@ -811,20 +815,6 @@ late_initcall(sched_init_debug);
811const_debug unsigned int sysctl_sched_nr_migrate = 32; 815const_debug unsigned int sysctl_sched_nr_migrate = 32;
812 816
813/* 817/*
814 * ratelimit for updating the group shares.
815 * default: 0.25ms
816 */
817unsigned int sysctl_sched_shares_ratelimit = 250000;
818unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
819
820/*
821 * Inject some fuzzyness into changing the per-cpu group shares
822 * this avoids remote rq-locks at the expense of fairness.
823 * default: 4
824 */
825unsigned int sysctl_sched_shares_thresh = 4;
826
827/*
828 * period over which we average the RT time consumption, measured 818 * period over which we average the RT time consumption, measured
829 * in ms. 819 * in ms.
830 * 820 *
@@ -871,18 +861,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
871 return rq->curr == p; 861 return rq->curr == p;
872} 862}
873 863
874#ifndef __ARCH_WANT_UNLOCKED_CTXSW
875static inline int task_running(struct rq *rq, struct task_struct *p) 864static inline int task_running(struct rq *rq, struct task_struct *p)
876{ 865{
866#ifdef CONFIG_SMP
867 return p->on_cpu;
868#else
877 return task_current(rq, p); 869 return task_current(rq, p);
870#endif
878} 871}
879 872
873#ifndef __ARCH_WANT_UNLOCKED_CTXSW
880static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 874static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
881{ 875{
876#ifdef CONFIG_SMP
877 /*
878 * We can optimise this out completely for !SMP, because the
879 * SMP rebalancing from interrupt is the only thing that cares
880 * here.
881 */
882 next->on_cpu = 1;
883#endif
882} 884}
883 885
884static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 886static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
885{ 887{
888#ifdef CONFIG_SMP
889 /*
890 * After ->on_cpu is cleared, the task can be moved to a different CPU.
891 * We must ensure this doesn't happen until the switch is completely
892 * finished.
893 */
894 smp_wmb();
895 prev->on_cpu = 0;
896#endif
886#ifdef CONFIG_DEBUG_SPINLOCK 897#ifdef CONFIG_DEBUG_SPINLOCK
887 /* this is a valid case when another task releases the spinlock */ 898 /* this is a valid case when another task releases the spinlock */
888 rq->lock.owner = current; 899 rq->lock.owner = current;
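
task_running() above now tests p->on_cpu in all configurations: the flag goes up before the task starts executing on a CPU and is dropped only after the context switch has fully finished, with a write barrier so nothing from the switch can appear to happen after the flag clears. A userspace sketch of that handshake using C11 atomics in place of smp_wmb() (not the scheduler's code, and the ordering is deliberately simplified):

#include <stdatomic.h>
#include <stdbool.h>

struct toy_task {
        atomic_bool on_cpu;
};

static void prepare_switch(struct toy_task *next)
{
        /* raised before @next starts executing on this CPU */
        atomic_store_explicit(&next->on_cpu, true, memory_order_relaxed);
}

static void finish_switch(struct toy_task *prev)
{
        /* release: everything done while running @prev becomes visible
         * before a remote CPU can observe on_cpu == false and migrate it */
        atomic_store_explicit(&prev->on_cpu, false, memory_order_release);
}

static bool task_running(struct toy_task *t)
{
        return atomic_load_explicit(&t->on_cpu, memory_order_acquire);
}

int main(void)
{
        struct toy_task t;
        bool running;

        atomic_init(&t.on_cpu, false);
        prepare_switch(&t);
        running = task_running(&t);   /* true while "on cpu" */
        finish_switch(&t);
        return running ? 0 : 1;
}
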
@@ -898,15 +909,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
898} 909}
899 910
900#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 911#else /* __ARCH_WANT_UNLOCKED_CTXSW */
901static inline int task_running(struct rq *rq, struct task_struct *p)
902{
903#ifdef CONFIG_SMP
904 return p->oncpu;
905#else
906 return task_current(rq, p);
907#endif
908}
909
910static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 912static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
911{ 913{
912#ifdef CONFIG_SMP 914#ifdef CONFIG_SMP
@@ -915,7 +917,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
915 * SMP rebalancing from interrupt is the only thing that cares 917 * SMP rebalancing from interrupt is the only thing that cares
916 * here. 918 * here.
917 */ 919 */
918 next->oncpu = 1; 920 next->on_cpu = 1;
919#endif 921#endif
920#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 922#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
921 raw_spin_unlock_irq(&rq->lock); 923 raw_spin_unlock_irq(&rq->lock);
@@ -928,12 +930,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
928{ 930{
929#ifdef CONFIG_SMP 931#ifdef CONFIG_SMP
930 /* 932 /*
931 * After ->oncpu is cleared, the task can be moved to a different CPU. 933 * After ->on_cpu is cleared, the task can be moved to a different CPU.
932 * We must ensure this doesn't happen until the switch is completely 934 * We must ensure this doesn't happen until the switch is completely
933 * finished. 935 * finished.
934 */ 936 */
935 smp_wmb(); 937 smp_wmb();
936 prev->oncpu = 0; 938 prev->on_cpu = 0;
937#endif 939#endif
938#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 940#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
939 local_irq_enable(); 941 local_irq_enable();
@@ -942,23 +944,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
942#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 944#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
943 945
944/* 946/*
945 * Check whether the task is waking, we use this to synchronize ->cpus_allowed 947 * __task_rq_lock - lock the rq @p resides on.
946 * against ttwu().
947 */
948static inline int task_is_waking(struct task_struct *p)
949{
950 return unlikely(p->state == TASK_WAKING);
951}
952
953/*
954 * __task_rq_lock - lock the runqueue a given task resides on.
955 * Must be called interrupts disabled.
956 */ 948 */
957static inline struct rq *__task_rq_lock(struct task_struct *p) 949static inline struct rq *__task_rq_lock(struct task_struct *p)
958 __acquires(rq->lock) 950 __acquires(rq->lock)
959{ 951{
960 struct rq *rq; 952 struct rq *rq;
961 953
954 lockdep_assert_held(&p->pi_lock);
955
962 for (;;) { 956 for (;;) {
963 rq = task_rq(p); 957 rq = task_rq(p);
964 raw_spin_lock(&rq->lock); 958 raw_spin_lock(&rq->lock);
@@ -969,22 +963,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
969} 963}
970 964
971/* 965/*
972 * task_rq_lock - lock the runqueue a given task resides on and disable 966 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
973 * interrupts. Note the ordering: we can safely lookup the task_rq without
974 * explicitly disabling preemption.
975 */ 967 */
976static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 968static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
969 __acquires(p->pi_lock)
977 __acquires(rq->lock) 970 __acquires(rq->lock)
978{ 971{
979 struct rq *rq; 972 struct rq *rq;
980 973
981 for (;;) { 974 for (;;) {
982 local_irq_save(*flags); 975 raw_spin_lock_irqsave(&p->pi_lock, *flags);
983 rq = task_rq(p); 976 rq = task_rq(p);
984 raw_spin_lock(&rq->lock); 977 raw_spin_lock(&rq->lock);
985 if (likely(rq == task_rq(p))) 978 if (likely(rq == task_rq(p)))
986 return rq; 979 return rq;
987 raw_spin_unlock_irqrestore(&rq->lock, *flags); 980 raw_spin_unlock(&rq->lock);
981 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
988 } 982 }
989} 983}
990 984
@@ -994,10 +988,13 @@ static void __task_rq_unlock(struct rq *rq)
994 raw_spin_unlock(&rq->lock); 988 raw_spin_unlock(&rq->lock);
995} 989}
996 990
997static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 991static inline void
992task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
998 __releases(rq->lock) 993 __releases(rq->lock)
994 __releases(p->pi_lock)
999{ 995{
1000 raw_spin_unlock_irqrestore(&rq->lock, *flags); 996 raw_spin_unlock(&rq->lock);
997 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
1001} 998}
1002 999
1003/* 1000/*
@@ -1227,11 +1224,17 @@ int get_nohz_timer_target(void)
1227 int i; 1224 int i;
1228 struct sched_domain *sd; 1225 struct sched_domain *sd;
1229 1226
1227 rcu_read_lock();
1230 for_each_domain(cpu, sd) { 1228 for_each_domain(cpu, sd) {
1231 for_each_cpu(i, sched_domain_span(sd)) 1229 for_each_cpu(i, sched_domain_span(sd)) {
1232 if (!idle_cpu(i)) 1230 if (!idle_cpu(i)) {
1233 return i; 1231 cpu = i;
1232 goto unlock;
1233 }
1234 }
1234 } 1235 }
1236unlock:
1237 rcu_read_unlock();
1235 return cpu; 1238 return cpu;
1236} 1239}
1237/* 1240/*
@@ -1341,15 +1344,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1341{ 1344{
1342 u64 tmp; 1345 u64 tmp;
1343 1346
1347 /*
1348 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1349 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1350 * 2^SCHED_LOAD_RESOLUTION.
1351 */
1352 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1353 tmp = (u64)delta_exec * scale_load_down(weight);
1354 else
1355 tmp = (u64)delta_exec;
1356
1344 if (!lw->inv_weight) { 1357 if (!lw->inv_weight) {
1345 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1358 unsigned long w = scale_load_down(lw->weight);
1359
1360 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1346 lw->inv_weight = 1; 1361 lw->inv_weight = 1;
1362 else if (unlikely(!w))
1363 lw->inv_weight = WMULT_CONST;
1347 else 1364 else
1348 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1365 lw->inv_weight = WMULT_CONST / w;
1349 / (lw->weight+1);
1350 } 1366 }
1351 1367
1352 tmp = (u64)delta_exec * weight;
1353 /* 1368 /*
1354 * Check whether we'd overflow the 64-bit multiplication: 1369 * Check whether we'd overflow the 64-bit multiplication:
1355 */ 1370 */
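
calc_delta_mine() above is, at heart, "delta * weight / lw->weight" computed without a division per call: the reciprocal of the total weight is cached as a fixed-point inv_weight and applied with a multiply and shift. A simplified userspace version of the idea (the shift and constant are illustrative 32.32 fixed point, not necessarily the kernel's WMULT_CONST/WMULT_SHIFT, and the overflow guard of the real code is omitted):

#include <stdint.h>
#include <stdio.h>

#define INV_SHIFT 32
#define INV_CONST (1ULL << INV_SHIFT)

struct load {
        unsigned long weight;
        uint64_t inv_weight;      /* ~ INV_CONST / weight, cached lazily */
};

static uint64_t weighted_delta(uint64_t delta, unsigned long weight,
                               struct load *lw)
{
        if (!lw->inv_weight)
                lw->inv_weight = INV_CONST / (lw->weight ? lw->weight : 1);

        /* delta * weight / lw->weight, via the cached reciprocal */
        return (delta * weight * lw->inv_weight) >> INV_SHIFT;
}

int main(void)
{
        struct load lw = { .weight = 3072 };    /* e.g. three nice-0 entities */

        /* a weight-1024 entity gets roughly a third of a 6 ms slice */
        printf("%llu ns\n",
               (unsigned long long)weighted_delta(6000000, 1024, &lw));
        return 0;
}
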
@@ -1374,6 +1389,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1374 lw->inv_weight = 0; 1389 lw->inv_weight = 0;
1375} 1390}
1376 1391
1392static inline void update_load_set(struct load_weight *lw, unsigned long w)
1393{
1394 lw->weight = w;
1395 lw->inv_weight = 0;
1396}
1397
1377/* 1398/*
1378 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1399 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1379 * of tasks with abnormal "nice" values across CPUs the contribution that 1400 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1562,101 +1583,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1562 1583
1563#ifdef CONFIG_FAIR_GROUP_SCHED 1584#ifdef CONFIG_FAIR_GROUP_SCHED
1564 1585
1565static __read_mostly unsigned long __percpu *update_shares_data;
1566
1567static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1568
1569/*
1570 * Calculate and set the cpu's group shares.
1571 */
1572static void update_group_shares_cpu(struct task_group *tg, int cpu,
1573 unsigned long sd_shares,
1574 unsigned long sd_rq_weight,
1575 unsigned long *usd_rq_weight)
1576{
1577 unsigned long shares, rq_weight;
1578 int boost = 0;
1579
1580 rq_weight = usd_rq_weight[cpu];
1581 if (!rq_weight) {
1582 boost = 1;
1583 rq_weight = NICE_0_LOAD;
1584 }
1585
1586 /*
1587 * \Sum_j shares_j * rq_weight_i
1588 * shares_i = -----------------------------
1589 * \Sum_j rq_weight_j
1590 */
1591 shares = (sd_shares * rq_weight) / sd_rq_weight;
1592 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1593
1594 if (abs(shares - tg->se[cpu]->load.weight) >
1595 sysctl_sched_shares_thresh) {
1596 struct rq *rq = cpu_rq(cpu);
1597 unsigned long flags;
1598
1599 raw_spin_lock_irqsave(&rq->lock, flags);
1600 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1601 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1602 __set_se_shares(tg->se[cpu], shares);
1603 raw_spin_unlock_irqrestore(&rq->lock, flags);
1604 }
1605}
1606
1607/*
1608 * Re-compute the task group their per cpu shares over the given domain.
1609 * This needs to be done in a bottom-up fashion because the rq weight of a
1610 * parent group depends on the shares of its child groups.
1611 */
1612static int tg_shares_up(struct task_group *tg, void *data)
1613{
1614 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1615 unsigned long *usd_rq_weight;
1616 struct sched_domain *sd = data;
1617 unsigned long flags;
1618 int i;
1619
1620 if (!tg->se[0])
1621 return 0;
1622
1623 local_irq_save(flags);
1624 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1625
1626 for_each_cpu(i, sched_domain_span(sd)) {
1627 weight = tg->cfs_rq[i]->load.weight;
1628 usd_rq_weight[i] = weight;
1629
1630 rq_weight += weight;
1631 /*
1632 * If there are currently no tasks on the cpu pretend there
1633 * is one of average load so that when a new task gets to
1634 * run here it will not get delayed by group starvation.
1635 */
1636 if (!weight)
1637 weight = NICE_0_LOAD;
1638
1639 sum_weight += weight;
1640 shares += tg->cfs_rq[i]->shares;
1641 }
1642
1643 if (!rq_weight)
1644 rq_weight = sum_weight;
1645
1646 if ((!shares && rq_weight) || shares > tg->shares)
1647 shares = tg->shares;
1648
1649 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1650 shares = tg->shares;
1651
1652 for_each_cpu(i, sched_domain_span(sd))
1653 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1654
1655 local_irq_restore(flags);
1656
1657 return 0;
1658}
1659
1660/* 1586/*
1661 * Compute the cpu's hierarchical load factor for each task group. 1587 * Compute the cpu's hierarchical load factor for each task group.
1662 * This needs to be done in a top-down fashion because the load of a child 1588 * This needs to be done in a top-down fashion because the load of a child
@@ -1671,7 +1597,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1671 load = cpu_rq(cpu)->load.weight; 1597 load = cpu_rq(cpu)->load.weight;
1672 } else { 1598 } else {
1673 load = tg->parent->cfs_rq[cpu]->h_load; 1599 load = tg->parent->cfs_rq[cpu]->h_load;
1674 load *= tg->cfs_rq[cpu]->shares; 1600 load *= tg->se[cpu]->load.weight;
1675 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1601 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1676 } 1602 }
1677 1603
@@ -1680,34 +1606,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1680 return 0; 1606 return 0;
1681} 1607}
1682 1608
1683static void update_shares(struct sched_domain *sd)
1684{
1685 s64 elapsed;
1686 u64 now;
1687
1688 if (root_task_group_empty())
1689 return;
1690
1691 now = local_clock();
1692 elapsed = now - sd->last_update;
1693
1694 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1695 sd->last_update = now;
1696 walk_tg_tree(tg_nop, tg_shares_up, sd);
1697 }
1698}
1699
1700static void update_h_load(long cpu) 1609static void update_h_load(long cpu)
1701{ 1610{
1702 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1611 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1703} 1612}
1704 1613
1705#else
1706
1707static inline void update_shares(struct sched_domain *sd)
1708{
1709}
1710
1711#endif 1614#endif
1712 1615
1713#ifdef CONFIG_PREEMPT 1616#ifdef CONFIG_PREEMPT
@@ -1827,15 +1730,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1827 __release(rq2->lock); 1730 __release(rq2->lock);
1828} 1731}
1829 1732
1830#endif 1733#else /* CONFIG_SMP */
1831 1734
1832#ifdef CONFIG_FAIR_GROUP_SCHED 1735/*
1833static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1736 * double_rq_lock - safely lock two runqueues
1737 *
1738 * Note this does not disable interrupts like task_rq_lock,
1739 * you need to do so manually before calling.
1740 */
1741static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1742 __acquires(rq1->lock)
1743 __acquires(rq2->lock)
1834{ 1744{
1835#ifdef CONFIG_SMP 1745 BUG_ON(!irqs_disabled());
1836 cfs_rq->shares = shares; 1746 BUG_ON(rq1 != rq2);
1837#endif 1747 raw_spin_lock(&rq1->lock);
1748 __acquire(rq2->lock); /* Fake it out ;) */
1838} 1749}
1750
1751/*
1752 * double_rq_unlock - safely unlock two runqueues
1753 *
1754 * Note this does not restore interrupts like task_rq_unlock,
1755 * you need to do so manually after calling.
1756 */
1757static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1758 __releases(rq1->lock)
1759 __releases(rq2->lock)
1760{
1761 BUG_ON(rq1 != rq2);
1762 raw_spin_unlock(&rq1->lock);
1763 __release(rq2->lock);
1764}
1765
1839#endif 1766#endif
1840 1767
1841static void calc_load_account_idle(struct rq *this_rq); 1768static void calc_load_account_idle(struct rq *this_rq);
@@ -1877,23 +1804,20 @@ static void dec_nr_running(struct rq *rq)
1877 1804
1878static void set_load_weight(struct task_struct *p) 1805static void set_load_weight(struct task_struct *p)
1879{ 1806{
1880 if (task_has_rt_policy(p)) { 1807 int prio = p->static_prio - MAX_RT_PRIO;
1881 p->se.load.weight = 0; 1808 struct load_weight *load = &p->se.load;
1882 p->se.load.inv_weight = WMULT_CONST;
1883 return;
1884 }
1885 1809
1886 /* 1810 /*
1887 * SCHED_IDLE tasks get minimal weight: 1811 * SCHED_IDLE tasks get minimal weight:
1888 */ 1812 */
1889 if (p->policy == SCHED_IDLE) { 1813 if (p->policy == SCHED_IDLE) {
1890 p->se.load.weight = WEIGHT_IDLEPRIO; 1814 load->weight = scale_load(WEIGHT_IDLEPRIO);
1891 p->se.load.inv_weight = WMULT_IDLEPRIO; 1815 load->inv_weight = WMULT_IDLEPRIO;
1892 return; 1816 return;
1893 } 1817 }
1894 1818
1895 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1819 load->weight = scale_load(prio_to_weight[prio]);
1896 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1820 load->inv_weight = prio_to_wmult[prio];
1897} 1821}
1898 1822
1899static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1823static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
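
set_load_weight() above now scales the table entries with scale_load(); the table itself maps nice levels to weights such that nice 0 is 1024 and each nice step changes the weight by roughly 25%, i.e. about 10% of CPU relative to a peer. The precomputed prio_to_weight[] values are authoritative; the formula below is only an approximation of them, shown for intuition:

#include <math.h>
#include <stdio.h>

/* Rough stand-in for prio_to_weight[]: weight(nice) ~ 1024 / 1.25^nice. */
static long approx_weight(int nice)
{
        return lround(1024.0 / pow(1.25, nice));
}

int main(void)
{
        for (int nice = -5; nice <= 5; nice += 5)
                printf("nice %+d  ~weight %ld\n", nice, approx_weight(nice));
        return 0;          /* build with -lm */
}
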
@@ -1901,7 +1825,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1901 update_rq_clock(rq); 1825 update_rq_clock(rq);
1902 sched_info_queued(p); 1826 sched_info_queued(p);
1903 p->sched_class->enqueue_task(rq, p, flags); 1827 p->sched_class->enqueue_task(rq, p, flags);
1904 p->se.on_rq = 1;
1905} 1828}
1906 1829
1907static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1830static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1909,7 +1832,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1909 update_rq_clock(rq); 1832 update_rq_clock(rq);
1910 sched_info_dequeued(p); 1833 sched_info_dequeued(p);
1911 p->sched_class->dequeue_task(rq, p, flags); 1834 p->sched_class->dequeue_task(rq, p, flags);
1912 p->se.on_rq = 0;
1913} 1835}
1914 1836
1915/* 1837/*
@@ -1936,14 +1858,227 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1936 dec_nr_running(rq); 1858 dec_nr_running(rq);
1937} 1859}
1938 1860
1861#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1862
1863/*
1864 * There are no locks covering percpu hardirq/softirq time.
1865 * They are only modified in account_system_vtime, on corresponding CPU
1866 * with interrupts disabled. So, writes are safe.
1867 * They are read and saved off onto struct rq in update_rq_clock().
1868 * This may result in other CPU reading this CPU's irq time and can
1869 * race with irq/account_system_vtime on this CPU. We would either get old
1870 * or new value with a side effect of accounting a slice of irq time to wrong
1871 * task when irq is in progress while we read rq->clock. That is a worthy
1872 * compromise in place of having locks on each irq in account_system_time.
1873 */
1874static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1875static DEFINE_PER_CPU(u64, cpu_softirq_time);
1876
1877static DEFINE_PER_CPU(u64, irq_start_time);
1878static int sched_clock_irqtime;
1879
1880void enable_sched_clock_irqtime(void)
1881{
1882 sched_clock_irqtime = 1;
1883}
1884
1885void disable_sched_clock_irqtime(void)
1886{
1887 sched_clock_irqtime = 0;
1888}
1889
1890#ifndef CONFIG_64BIT
1891static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1892
1893static inline void irq_time_write_begin(void)
1894{
1895 __this_cpu_inc(irq_time_seq.sequence);
1896 smp_wmb();
1897}
1898
1899static inline void irq_time_write_end(void)
1900{
1901 smp_wmb();
1902 __this_cpu_inc(irq_time_seq.sequence);
1903}
1904
1905static inline u64 irq_time_read(int cpu)
1906{
1907 u64 irq_time;
1908 unsigned seq;
1909
1910 do {
1911 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1912 irq_time = per_cpu(cpu_softirq_time, cpu) +
1913 per_cpu(cpu_hardirq_time, cpu);
1914 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1915
1916 return irq_time;
1917}
1918#else /* CONFIG_64BIT */
1919static inline void irq_time_write_begin(void)
1920{
1921}
1922
1923static inline void irq_time_write_end(void)
1924{
1925}
1926
1927static inline u64 irq_time_read(int cpu)
1928{
1929 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1930}
1931#endif /* CONFIG_64BIT */
1932
1933/*
1934 * Called before incrementing preempt_count on {soft,}irq_enter
1935 * and before decrementing preempt_count on {soft,}irq_exit.
1936 */
1937void account_system_vtime(struct task_struct *curr)
1938{
1939 unsigned long flags;
1940 s64 delta;
1941 int cpu;
1942
1943 if (!sched_clock_irqtime)
1944 return;
1945
1946 local_irq_save(flags);
1947
1948 cpu = smp_processor_id();
1949 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1950 __this_cpu_add(irq_start_time, delta);
1951
1952 irq_time_write_begin();
1953 /*
1954 * We do not account for softirq time from ksoftirqd here.
1955 * We want to continue accounting softirq time to ksoftirqd thread
1956 * in that case, so as not to confuse scheduler with a special task
1957 * that do not consume any time, but still wants to run.
1958 */
1959 if (hardirq_count())
1960 __this_cpu_add(cpu_hardirq_time, delta);
1961 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
1962 __this_cpu_add(cpu_softirq_time, delta);
1963
1964 irq_time_write_end();
1965 local_irq_restore(flags);
1966}
1967EXPORT_SYMBOL_GPL(account_system_vtime);
1968
1969static void update_rq_clock_task(struct rq *rq, s64 delta)
1970{
1971 s64 irq_delta;
1972
1973 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1974
1975 /*
1976 * Since irq_time is only updated on {soft,}irq_exit, we might run into
1977 * this case when a previous update_rq_clock() happened inside a
1978 * {soft,}irq region.
1979 *
1980 * When this happens, we stop ->clock_task and only update the
1981 * prev_irq_time stamp to account for the part that fit, so that a next
1982 * update will consume the rest. This ensures ->clock_task is
1983 * monotonic.
1984 *
1985 * It does however cause some slight miss-attribution of {soft,}irq
1986 * time, a more accurate solution would be to update the irq_time using
1987 * the current rq->clock timestamp, except that would require using
1988 * atomic ops.
1989 */
1990 if (irq_delta > delta)
1991 irq_delta = delta;
1992
1993 rq->prev_irq_time += irq_delta;
1994 delta -= irq_delta;
1995 rq->clock_task += delta;
1996
1997 if (irq_delta && sched_feat(NONIRQ_POWER))
1998 sched_rt_avg_update(rq, irq_delta);
1999}
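
Aside, not part of the diff: a tiny standalone illustration of the clamping above. If 7 ms of irq time accumulated while rq->clock only advanced 5 ms, ->clock_task does not advance at all and the remaining 2 ms is consumed by the next update (the numbers are made up):

#include <assert.h>
#include <stdint.h>

int main(void)
{
        int64_t clock_task = 0, prev_irq_time = 0;
        int64_t delta = 5000000, irq_delta = 7000000;   /* ns since last update */

        if (irq_delta > delta)          /* clamp, as in update_rq_clock_task() */
                irq_delta = delta;

        prev_irq_time += irq_delta;
        delta -= irq_delta;
        clock_task += delta;

        assert(clock_task == 0);            /* task clock did not advance */
        assert(prev_irq_time == 5000000);   /* 2 ms of irq time left for the next update */
        return 0;
}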
2000
2001static int irqtime_account_hi_update(void)
2002{
2003 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2004 unsigned long flags;
2005 u64 latest_ns;
2006 int ret = 0;
2007
2008 local_irq_save(flags);
2009 latest_ns = this_cpu_read(cpu_hardirq_time);
2010 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
2011 ret = 1;
2012 local_irq_restore(flags);
2013 return ret;
2014}
2015
2016static int irqtime_account_si_update(void)
2017{
2018 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2019 unsigned long flags;
2020 u64 latest_ns;
2021 int ret = 0;
2022
2023 local_irq_save(flags);
2024 latest_ns = this_cpu_read(cpu_softirq_time);
2025 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
2026 ret = 1;
2027 local_irq_restore(flags);
2028 return ret;
2029}
2030
2031#else /* CONFIG_IRQ_TIME_ACCOUNTING */
2032
2033#define sched_clock_irqtime (0)
2034
2035static void update_rq_clock_task(struct rq *rq, s64 delta)
2036{
2037 rq->clock_task += delta;
2038}
2039
2040#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2041
1939#include "sched_idletask.c" 2042#include "sched_idletask.c"
1940#include "sched_fair.c" 2043#include "sched_fair.c"
1941#include "sched_rt.c" 2044#include "sched_rt.c"
2045#include "sched_autogroup.c"
2046#include "sched_stoptask.c"
1942#include "../litmus/sched_litmus.c" 2047#include "../litmus/sched_litmus.c"
1943#ifdef CONFIG_SCHED_DEBUG 2048#ifdef CONFIG_SCHED_DEBUG
1944# include "sched_debug.c" 2049# include "sched_debug.c"
1945#endif 2050#endif
1946 2051
2052void sched_set_stop_task(int cpu, struct task_struct *stop)
2053{
2054 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2055 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2056
2057 if (stop) {
2058 /*
2059 * Make it appear like a SCHED_FIFO task; it's something
2060 * userspace knows about and won't get confused about.
2061 *
2062 * Also, it will make PI more or less work without too
2063 * much confusion -- but then, stop work should not
2064 * rely on PI working anyway.
2065 */
2066 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2067
2068 stop->sched_class = &stop_sched_class;
2069 }
2070
2071 cpu_rq(cpu)->stop = stop;
2072
2073 if (old_stop) {
2074 /*
2075 * Reset it back to a normal scheduling class so that
2076 * it can die in pieces.
2077 */
2078 old_stop->sched_class = &rt_sched_class;
2079 }
2080}
2081
1947/* 2082/*
1948 * __normal_prio - return the priority that is based on the static prio 2083 * __normal_prio - return the priority that is based on the static prio
1949 */ 2084 */
@@ -2001,14 +2136,43 @@ inline int task_curr(const struct task_struct *p)
2001 2136
2002static inline void check_class_changed(struct rq *rq, struct task_struct *p, 2137static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2003 const struct sched_class *prev_class, 2138 const struct sched_class *prev_class,
2004 int oldprio, int running) 2139 int oldprio)
2005{ 2140{
2006 if (prev_class != p->sched_class) { 2141 if (prev_class != p->sched_class) {
2007 if (prev_class->switched_from) 2142 if (prev_class->switched_from)
2008 prev_class->switched_from(rq, p, running); 2143 prev_class->switched_from(rq, p);
2009 p->sched_class->switched_to(rq, p, running); 2144 p->sched_class->switched_to(rq, p);
2010 } else 2145 } else if (oldprio != p->prio)
2011 p->sched_class->prio_changed(rq, p, oldprio, running); 2146 p->sched_class->prio_changed(rq, p, oldprio);
2147}
2148
2149static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2150{
2151 const struct sched_class *class;
2152
2153 if (p->sched_class == rq->curr->sched_class) {
2154 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2155 } else {
2156 for_each_class(class) {
2157 if (class == rq->curr->sched_class)
2158 break;
2159 if (class == p->sched_class) {
2160 resched_task(rq->curr);
2161 break;
2162 }
2163 }
2164 }
2165
2166 /*
2167 * A queue event has occurred, and we're going to schedule. In
2168 * this case, we can save a useless back-to-back clock update.
2169 */
2170 /* LITMUS^RT:
2171 * The "disable-clock-update" approach was buggy in Linux 2.6.36.
2172 * The issue has been solved in 2.6.37.
2173 */
2174 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2175 rq->skip_clock_update = 1;
2012} 2176}
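
Aside, not part of the diff: the cross-class branch of check_preempt_curr() reduces to "whichever class appears first in the priority-ordered class list wins". A hypothetical standalone sketch; the class names and their order are illustrative only:

#include <stdio.h>

/* indices ordered from highest to lowest scheduling-class priority */
enum demo_class { DEMO_STOP, DEMO_RT, DEMO_FAIR, DEMO_IDLE, DEMO_NR_CLASSES };

/* returns 1 if a waking task of class 'waking' should preempt a running
 * task of a *different* class 'curr' -- mirrors the for_each_class() walk */
static int demo_cross_class_preempts(enum demo_class waking, enum demo_class curr)
{
        for (int c = 0; c < DEMO_NR_CLASSES; c++) {
                if (c == curr)
                        return 0;       /* met the current class first: no resched */
                if (c == waking)
                        return 1;       /* met the waking class first: resched */
        }
        return 0;
}

int main(void)
{
        printf("%d\n", demo_cross_class_preempts(DEMO_RT, DEMO_FAIR));  /* prints 1 */
        printf("%d\n", demo_cross_class_preempts(DEMO_FAIR, DEMO_RT));  /* prints 0 */
        return 0;
}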
2013 2177
2014#ifdef CONFIG_SMP 2178#ifdef CONFIG_SMP
@@ -2023,6 +2187,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2023 if (p->sched_class != &fair_sched_class) 2187 if (p->sched_class != &fair_sched_class)
2024 return 0; 2188 return 0;
2025 2189
2190 if (unlikely(p->policy == SCHED_IDLE))
2191 return 0;
2192
2026 /* 2193 /*
2027 * Buddy candidates are cache hot: 2194 * Buddy candidates are cache hot:
2028 */ 2195 */
@@ -2050,6 +2217,21 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2050 */ 2217 */
2051 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2218 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2052 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2219 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2220
2221#ifdef CONFIG_LOCKDEP
2222 /*
2223 * The caller should hold either p->pi_lock or rq->lock, when changing
2224 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2225 *
2226 * sched_move_task() holds both and thus holding either pins the cgroup,
2227 * see set_task_rq().
2228 *
2229 * Furthermore, all task_rq users should acquire both locks, see
2230 * task_rq_lock().
2231 */
2232 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2233 lockdep_is_held(&task_rq(p)->lock)));
2234#endif
2053#endif 2235#endif
2054 2236
2055 trace_sched_migrate_task(p, new_cpu); 2237 trace_sched_migrate_task(p, new_cpu);
@@ -2070,21 +2252,6 @@ struct migration_arg {
2070static int migration_cpu_stop(void *data); 2252static int migration_cpu_stop(void *data);
2071 2253
2072/* 2254/*
2073 * The task's runqueue lock must be held.
2074 * Returns true if you have to wait for migration thread.
2075 */
2076static bool migrate_task(struct task_struct *p, int dest_cpu)
2077{
2078 struct rq *rq = task_rq(p);
2079
2080 /*
2081 * If the task is not on a runqueue (and not running), then
2082 * the next wake-up will properly place the task.
2083 */
2084 return p->se.on_rq || task_running(rq, p);
2085}
2086
2087/*
2088 * wait_task_inactive - wait for a thread to unschedule. 2255 * wait_task_inactive - wait for a thread to unschedule.
2089 * 2256 *
2090 * If @match_state is nonzero, it's the @p->state value just checked and 2257 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2141,11 +2308,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2141 rq = task_rq_lock(p, &flags); 2308 rq = task_rq_lock(p, &flags);
2142 trace_sched_wait_task(p); 2309 trace_sched_wait_task(p);
2143 running = task_running(rq, p); 2310 running = task_running(rq, p);
2144 on_rq = p->se.on_rq; 2311 on_rq = p->on_rq;
2145 ncsw = 0; 2312 ncsw = 0;
2146 if (!match_state || p->state == match_state) 2313 if (!match_state || p->state == match_state)
2147 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2314 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2148 task_rq_unlock(rq, &flags); 2315 task_rq_unlock(rq, p, &flags);
2149 2316
2150 /* 2317 /*
2151 * If it changed from the expected state, bail out now. 2318 * If it changed from the expected state, bail out now.
@@ -2174,7 +2341,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2174 * yield - it could be a while. 2341 * yield - it could be a while.
2175 */ 2342 */
2176 if (unlikely(on_rq)) { 2343 if (unlikely(on_rq)) {
2177 schedule_timeout_uninterruptible(1); 2344 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2345
2346 set_current_state(TASK_UNINTERRUPTIBLE);
2347 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2178 continue; 2348 continue;
2179 } 2349 }
2180 2350
@@ -2196,7 +2366,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2196 * Cause a process which is running on another CPU to enter 2366 * Cause a process which is running on another CPU to enter
2197 * kernel-mode, without any delay. (to get signals handled.) 2367 * kernel-mode, without any delay. (to get signals handled.)
2198 * 2368 *
2199 * NOTE: this function doesnt have to take the runqueue lock, 2369 * NOTE: this function doesn't have to take the runqueue lock,
2200 * because all it wants to ensure is that the remote task enters 2370 * because all it wants to ensure is that the remote task enters
2201 * the kernel. If the IPI races and the task has been migrated 2371 * the kernel. If the IPI races and the task has been migrated
2202 * to another CPU then no harm is done and the purpose has been 2372 * to another CPU then no harm is done and the purpose has been
@@ -2215,30 +2385,9 @@ void kick_process(struct task_struct *p)
2215EXPORT_SYMBOL_GPL(kick_process); 2385EXPORT_SYMBOL_GPL(kick_process);
2216#endif /* CONFIG_SMP */ 2386#endif /* CONFIG_SMP */
2217 2387
2218/**
2219 * task_oncpu_function_call - call a function on the cpu on which a task runs
2220 * @p: the task to evaluate
2221 * @func: the function to be called
2222 * @info: the function call argument
2223 *
2224 * Calls the function @func when the task is currently running. This might
2225 * be on the current CPU, which just calls the function directly
2226 */
2227void task_oncpu_function_call(struct task_struct *p,
2228 void (*func) (void *info), void *info)
2229{
2230 int cpu;
2231
2232 preempt_disable();
2233 cpu = task_cpu(p);
2234 if (task_curr(p))
2235 smp_call_function_single(cpu, func, info, 1);
2236 preempt_enable();
2237}
2238
2239#ifdef CONFIG_SMP 2388#ifdef CONFIG_SMP
2240/* 2389/*
2241 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2390 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2242 */ 2391 */
2243static int select_fallback_rq(int cpu, struct task_struct *p) 2392static int select_fallback_rq(int cpu, struct task_struct *p)
2244{ 2393{
@@ -2256,30 +2405,27 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2256 return dest_cpu; 2405 return dest_cpu;
2257 2406
2258 /* No more Mr. Nice Guy. */ 2407 /* No more Mr. Nice Guy. */
2259 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2408 dest_cpu = cpuset_cpus_allowed_fallback(p);
2260 dest_cpu = cpuset_cpus_allowed_fallback(p); 2409 /*
2261 /* 2410 * Don't tell them about moving exiting tasks or
2262 * Don't tell them about moving exiting tasks or 2411 * kernel threads (both mm NULL), since they never
2263 * kernel threads (both mm NULL), since they never 2412 * leave kernel.
2264 * leave kernel. 2413 */
2265 */ 2414 if (p->mm && printk_ratelimit()) {
2266 if (p->mm && printk_ratelimit()) { 2415 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2267 printk(KERN_INFO "process %d (%s) no " 2416 task_pid_nr(p), p->comm, cpu);
2268 "longer affine to cpu%d\n",
2269 task_pid_nr(p), p->comm, cpu);
2270 }
2271 } 2417 }
2272 2418
2273 return dest_cpu; 2419 return dest_cpu;
2274} 2420}
2275 2421
2276/* 2422/*
2277 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2423 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2278 */ 2424 */
2279static inline 2425static inline
2280int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2426int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2281{ 2427{
2282 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2428 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2283 2429
2284 /* 2430 /*
2285 * In order not to call set_task_cpu() on a blocking task we need 2431 * In order not to call set_task_cpu() on a blocking task we need
@@ -2305,27 +2451,63 @@ static void update_avg(u64 *avg, u64 sample)
2305} 2451}
2306#endif 2452#endif
2307 2453
2308static inline void ttwu_activate(struct task_struct *p, struct rq *rq, 2454static void
2309 bool is_sync, bool is_migrate, bool is_local, 2455ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2310 unsigned long en_flags)
2311{ 2456{
2312 schedstat_inc(p, se.statistics.nr_wakeups); 2457#ifdef CONFIG_SCHEDSTATS
2313 if (is_sync) 2458 struct rq *rq = this_rq();
2314 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2459
2315 if (is_migrate) 2460#ifdef CONFIG_SMP
2316 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2461 int this_cpu = smp_processor_id();
2317 if (is_local) 2462
2463 if (cpu == this_cpu) {
2464 schedstat_inc(rq, ttwu_local);
2318 schedstat_inc(p, se.statistics.nr_wakeups_local); 2465 schedstat_inc(p, se.statistics.nr_wakeups_local);
2319 else 2466 } else {
2467 struct sched_domain *sd;
2468
2320 schedstat_inc(p, se.statistics.nr_wakeups_remote); 2469 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2470 rcu_read_lock();
2471 for_each_domain(this_cpu, sd) {
2472 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2473 schedstat_inc(sd, ttwu_wake_remote);
2474 break;
2475 }
2476 }
2477 rcu_read_unlock();
2478 }
2479
2480 if (wake_flags & WF_MIGRATED)
2481 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2482
2483#endif /* CONFIG_SMP */
2484
2485 schedstat_inc(rq, ttwu_count);
2486 schedstat_inc(p, se.statistics.nr_wakeups);
2487
2488 if (wake_flags & WF_SYNC)
2489 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2321 2490
2491#endif /* CONFIG_SCHEDSTATS */
2492}
2493
2494static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2495{
2322 activate_task(rq, p, en_flags); 2496 activate_task(rq, p, en_flags);
2497 p->on_rq = 1;
2498
2499 /* if a worker is waking up, notify workqueue */
2500 if (p->flags & PF_WQ_WORKER)
2501 wq_worker_waking_up(p, cpu_of(rq));
2323} 2502}
2324 2503
2325static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, 2504/*
2326 int wake_flags, bool success) 2505 * Mark the task runnable and perform wakeup-preemption.
2506 */
2507static void
2508ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2327{ 2509{
2328 trace_sched_wakeup(p, success); 2510 trace_sched_wakeup(p, true);
2329 check_preempt_curr(rq, p, wake_flags); 2511 check_preempt_curr(rq, p, wake_flags);
2330 2512
2331 p->state = TASK_RUNNING; 2513 p->state = TASK_RUNNING;
@@ -2344,9 +2526,151 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2344 rq->idle_stamp = 0; 2526 rq->idle_stamp = 0;
2345 } 2527 }
2346#endif 2528#endif
2347 /* if a worker is waking up, notify workqueue */ 2529}
2348 if ((p->flags & PF_WQ_WORKER) && success) 2530
2349 wq_worker_waking_up(p, cpu_of(rq)); 2531static void
2532ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2533{
2534#ifdef CONFIG_SMP
2535 if (p->sched_contributes_to_load)
2536 rq->nr_uninterruptible--;
2537#endif
2538
2539 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2540 ttwu_do_wakeup(rq, p, wake_flags);
2541}
2542
2543/*
2544 * Called in case the task @p isn't fully descheduled from its runqueue;
2545 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
2546 * since all we need to do is flip p->state to TASK_RUNNING, as
2547 * the task is still ->on_rq.
2548 */
2549static int ttwu_remote(struct task_struct *p, int wake_flags)
2550{
2551 struct rq *rq;
2552 int ret = 0;
2553
2554 rq = __task_rq_lock(p);
2555 if (p->on_rq) {
2556 ttwu_do_wakeup(rq, p, wake_flags);
2557 ret = 1;
2558 }
2559 __task_rq_unlock(rq);
2560
2561 return ret;
2562}
2563
2564#ifdef CONFIG_SMP
2565static void sched_ttwu_do_pending(struct task_struct *list)
2566{
2567 struct rq *rq = this_rq();
2568
2569 raw_spin_lock(&rq->lock);
2570
2571 while (list) {
2572 struct task_struct *p = list;
2573 list = list->wake_entry;
2574 ttwu_do_activate(rq, p, 0);
2575 }
2576
2577 raw_spin_unlock(&rq->lock);
2578}
2579
2580#ifdef CONFIG_HOTPLUG_CPU
2581
2582static void sched_ttwu_pending(void)
2583{
2584 struct rq *rq = this_rq();
2585 struct task_struct *list = xchg(&rq->wake_list, NULL);
2586
2587 if (!list)
2588 return;
2589
2590 sched_ttwu_do_pending(list);
2591}
2592
2593#endif /* CONFIG_HOTPLUG_CPU */
2594
2595void scheduler_ipi(void)
2596{
2597 struct rq *rq = this_rq();
2598 struct task_struct *list = xchg(&rq->wake_list, NULL);
2599
2600 if (!list)
2601 return;
2602
2603 /*
2604 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
2605 * traditionally all their work was done from the interrupt return
2606 * path. Now that we actually do some work, we need to make sure
2607 * we do call them.
2608 *
2609 * Some archs already do call them, luckily irq_enter/exit nest
2610 * properly.
2611 *
2612 * Arguably we should visit all archs and update all handlers,
2613 * however a fair share of IPIs are still resched only so this would
2614 * somewhat pessimize the simple resched case.
2615 */
2616 irq_enter();
2617 sched_ttwu_do_pending(list);
2618 irq_exit();
2619}
2620
2621static void ttwu_queue_remote(struct task_struct *p, int cpu)
2622{
2623 struct rq *rq = cpu_rq(cpu);
2624 struct task_struct *next = rq->wake_list;
2625
2626 for (;;) {
2627 struct task_struct *old = next;
2628
2629 p->wake_entry = next;
2630 next = cmpxchg(&rq->wake_list, old, p);
2631 if (next == old)
2632 break;
2633 }
2634
2635 if (!next)
2636 smp_send_reschedule(cpu);
2637}
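
Aside, not part of the diff: ttwu_queue_remote() is a lock-free push onto rq->wake_list, with the reschedule IPI sent only when the list goes from empty to non-empty; scheduler_ipi() later drains the whole list with one xchg(). A user-space analogue with C11 atomics (the demo_* names are invented):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct demo_task {
        struct demo_task *wake_entry;
        int id;
};

static _Atomic(struct demo_task *) demo_wake_list;

/* push p; return true if the caller should send the "IPI" because the
 * list was empty before (mirrors the cmpxchg loop in ttwu_queue_remote) */
static bool demo_queue_remote(struct demo_task *p)
{
        struct demo_task *head = atomic_load(&demo_wake_list);

        do {
                p->wake_entry = head;   /* head is refreshed on CAS failure */
        } while (!atomic_compare_exchange_weak(&demo_wake_list, &head, p));

        return head == NULL;
}

/* drain everything at once, as scheduler_ipi()/sched_ttwu_pending() do */
static struct demo_task *demo_drain(void)
{
        return atomic_exchange(&demo_wake_list, NULL);
}

Sending the IPI only on the empty-to-non-empty transition keeps the IPI rate bounded: concurrent wakers that find a non-empty list simply piggyback on the IPI already in flight.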
2638
2639#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2640static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2641{
2642 struct rq *rq;
2643 int ret = 0;
2644
2645 rq = __task_rq_lock(p);
2646 if (p->on_cpu) {
2647 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2648 ttwu_do_wakeup(rq, p, wake_flags);
2649 ret = 1;
2650 }
2651 __task_rq_unlock(rq);
2652
2653 return ret;
2654
2655}
2656#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2657#endif /* CONFIG_SMP */
2658
2659static void ttwu_queue(struct task_struct *p, int cpu)
2660{
2661 struct rq *rq = cpu_rq(cpu);
2662
2663#if defined(CONFIG_SMP)
2664 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2665 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2666 ttwu_queue_remote(p, cpu);
2667 return;
2668 }
2669#endif
2670
2671 raw_spin_lock(&rq->lock);
2672 ttwu_do_activate(rq, p, 0);
2673 raw_spin_unlock(&rq->lock);
2350} 2674}
2351 2675
2352/** 2676/**
@@ -2364,97 +2688,79 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2364 * Returns %true if @p was woken up, %false if it was already running 2688 * Returns %true if @p was woken up, %false if it was already running
2365 * or @state didn't match @p's state. 2689 * or @state didn't match @p's state.
2366 */ 2690 */
2367static int try_to_wake_up(struct task_struct *p, unsigned int state, 2691static int
2368 int wake_flags) 2692try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2369{ 2693{
2370 int cpu, orig_cpu, this_cpu, success = 0;
2371 unsigned long flags; 2694 unsigned long flags;
2372 unsigned long en_flags = ENQUEUE_WAKEUP; 2695 int cpu, success = 0;
2373 struct rq *rq;
2374 2696
2375 if (is_realtime(p)) 2697 if (is_realtime(p))
2376 TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state); 2698 TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state);
2377 2699
2378 this_cpu = get_cpu();
2379
2380 smp_wmb(); 2700 smp_wmb();
2381 rq = task_rq_lock(p, &flags); 2701 raw_spin_lock_irqsave(&p->pi_lock, flags);
2382 if (!(p->state & state)) 2702 if (!(p->state & state))
2383 goto out; 2703 goto out;
2384 2704
2385 if (p->se.on_rq) 2705 success = 1; /* we're going to change ->state */
2386 goto out_running;
2387
2388 cpu = task_cpu(p); 2706 cpu = task_cpu(p);
2389 orig_cpu = cpu;
2390 2707
2391#ifdef CONFIG_SMP 2708 if (p->on_rq && ttwu_remote(p, wake_flags))
2392 if (unlikely(task_running(rq, p)) || is_realtime(p)) 2709 goto stat;
2393 goto out_activate;
2394 2710
2711#ifdef CONFIG_SMP
2395 /* 2712 /*
2396 * In order to handle concurrent wakeups and release the rq->lock 2713 * If the owning (remote) cpu is still in the middle of schedule() with
2397 * we put the task in TASK_WAKING state. 2714 * this task as prev, wait until it's done referencing the task.
2398 *
2399 * First fix up the nr_uninterruptible count:
2400 */ 2715 */
2401 if (task_contributes_to_load(p)) { 2716 while (p->on_cpu) {
2402 if (likely(cpu_online(orig_cpu))) 2717#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2403 rq->nr_uninterruptible--; 2718 /*
2404 else 2719 * In case the architecture enables interrupts in
2405 this_rq()->nr_uninterruptible--; 2720 * context_switch(), we cannot busy wait, since that
2406 } 2721 * would lead to deadlocks when an interrupt hits and
2407 p->state = TASK_WAKING; 2722 * tries to wake up @prev. So bail and do a complete
2408 2723 * remote wakeup.
2409 if (p->sched_class->task_waking) { 2724 */
2410 p->sched_class->task_waking(rq, p); 2725 if (ttwu_activate_remote(p, wake_flags))
2411 en_flags |= ENQUEUE_WAKING; 2726 goto stat;
2727#else
2728 cpu_relax();
2729#endif
2412 } 2730 }
2731 /*
2732 * Pairs with the smp_wmb() in finish_lock_switch().
2733 */
2734 smp_rmb();
2413 2735
2414 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); 2736 /* LITMUS^RT: once the task can be safely referenced by this
2415 if (cpu != orig_cpu) 2737 * CPU, don't mess with the Linux load-balancing code.
2416 set_task_cpu(p, cpu); 2738 */
2417 __task_rq_unlock(rq); 2739 if (is_realtime(p))
2740 goto litmus_out_activate;
2418 2741
2419 rq = cpu_rq(cpu); 2742 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2420 raw_spin_lock(&rq->lock); 2743 p->state = TASK_WAKING;
2421 2744
2422 /* 2745 if (p->sched_class->task_waking)
2423 * We migrated the task without holding either rq->lock, however 2746 p->sched_class->task_waking(p);
2424 * since the task is not on the task list itself, nobody else
2425 * will try and migrate the task, hence the rq should match the
2426 * cpu we just moved it to.
2427 */
2428 WARN_ON(task_cpu(p) != cpu);
2429 WARN_ON(p->state != TASK_WAKING);
2430 2747
2431#ifdef CONFIG_SCHEDSTATS 2748 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2432 schedstat_inc(rq, ttwu_count); 2749 if (task_cpu(p) != cpu) {
2433 if (cpu == this_cpu) 2750 wake_flags |= WF_MIGRATED;
2434 schedstat_inc(rq, ttwu_local); 2751 set_task_cpu(p, cpu);
2435 else {
2436 struct sched_domain *sd;
2437 for_each_domain(this_cpu, sd) {
2438 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2439 schedstat_inc(sd, ttwu_wake_remote);
2440 break;
2441 }
2442 }
2443 } 2752 }
2444#endif /* CONFIG_SCHEDSTATS */
2445 2753
2446out_activate: 2754litmus_out_activate:
2447#endif /* CONFIG_SMP */ 2755#endif /* CONFIG_SMP */
2448 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, 2756
2449 cpu == this_cpu, en_flags); 2757 ttwu_queue(p, cpu);
2450 success = 1; 2758stat:
2451out_running: 2759 ttwu_stat(p, cpu, wake_flags);
2452 ttwu_post_activation(p, rq, wake_flags, success);
2453out: 2760out:
2454 if (is_realtime(p)) 2761 if (is_realtime(p))
2455 TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state); 2762 TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state);
2456 task_rq_unlock(rq, &flags); 2763 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2457 put_cpu();
2458 2764
2459 return success; 2765 return success;
2460} 2766}
@@ -2463,31 +2769,34 @@ out:
2463 * try_to_wake_up_local - try to wake up a local task with rq lock held 2769 * try_to_wake_up_local - try to wake up a local task with rq lock held
2464 * @p: the thread to be awakened 2770 * @p: the thread to be awakened
2465 * 2771 *
2466 * Put @p on the run-queue if it's not alredy there. The caller must 2772 * Put @p on the run-queue if it's not already there. The caller must
2467 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2773 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2468 * the current task. this_rq() stays locked over invocation. 2774 * the current task.
2469 */ 2775 */
2470static void try_to_wake_up_local(struct task_struct *p) 2776static void try_to_wake_up_local(struct task_struct *p)
2471{ 2777{
2472 struct rq *rq = task_rq(p); 2778 struct rq *rq = task_rq(p);
2473 bool success = false;
2474 2779
2475 BUG_ON(rq != this_rq()); 2780 BUG_ON(rq != this_rq());
2476 BUG_ON(p == current); 2781 BUG_ON(p == current);
2477 lockdep_assert_held(&rq->lock); 2782 lockdep_assert_held(&rq->lock);
2478 2783
2784 if (!raw_spin_trylock(&p->pi_lock)) {
2785 raw_spin_unlock(&rq->lock);
2786 raw_spin_lock(&p->pi_lock);
2787 raw_spin_lock(&rq->lock);
2788 }
2789
2479 if (!(p->state & TASK_NORMAL)) 2790 if (!(p->state & TASK_NORMAL))
2480 return; 2791 goto out;
2481 2792
2482 if (!p->se.on_rq) { 2793 if (!p->on_rq)
2483 if (likely(!task_running(rq, p))) { 2794 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2484 schedstat_inc(rq, ttwu_count); 2795
2485 schedstat_inc(rq, ttwu_local); 2796 ttwu_do_wakeup(rq, p, 0);
2486 } 2797 ttwu_stat(p, smp_processor_id(), 0);
2487 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); 2798out:
2488 success = true; 2799 raw_spin_unlock(&p->pi_lock);
2489 }
2490 ttwu_post_activation(p, rq, 0, success);
2491} 2800}
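
Aside, not part of the diff: the trylock dance at the top of try_to_wake_up_local() exists because the nesting order is p->pi_lock outside rq->lock, yet the caller already holds rq->lock. A generic sketch of that pattern with POSIX mutexes (the names are hypothetical); note that after dropping the inner lock the state it protects must be revalidated, which is why the function re-checks p->state afterwards:

#include <pthread.h>

/* We already hold 'inner', but the global ordering is outer-then-inner.
 * Try the cheap path first; if it fails, drop and re-take in order. */
static void demo_lock_outer_while_holding_inner(pthread_mutex_t *outer,
                                                pthread_mutex_t *inner)
{
        if (pthread_mutex_trylock(outer) != 0) {
                pthread_mutex_unlock(inner);
                pthread_mutex_lock(outer);
                pthread_mutex_lock(inner);
        }
}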
2492 2801
2493/** 2802/**
@@ -2520,18 +2829,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
2520 */ 2829 */
2521static void __sched_fork(struct task_struct *p) 2830static void __sched_fork(struct task_struct *p)
2522{ 2831{
2832 p->on_rq = 0;
2833
2834 p->se.on_rq = 0;
2523 p->se.exec_start = 0; 2835 p->se.exec_start = 0;
2524 p->se.sum_exec_runtime = 0; 2836 p->se.sum_exec_runtime = 0;
2525 p->se.prev_sum_exec_runtime = 0; 2837 p->se.prev_sum_exec_runtime = 0;
2526 p->se.nr_migrations = 0; 2838 p->se.nr_migrations = 0;
2839 p->se.vruntime = 0;
2840 INIT_LIST_HEAD(&p->se.group_node);
2527 2841
2528#ifdef CONFIG_SCHEDSTATS 2842#ifdef CONFIG_SCHEDSTATS
2529 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2843 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2530#endif 2844#endif
2531 2845
2532 INIT_LIST_HEAD(&p->rt.run_list); 2846 INIT_LIST_HEAD(&p->rt.run_list);
2533 p->se.on_rq = 0;
2534 INIT_LIST_HEAD(&p->se.group_node);
2535 2847
2536#ifdef CONFIG_PREEMPT_NOTIFIERS 2848#ifdef CONFIG_PREEMPT_NOTIFIERS
2537 INIT_HLIST_HEAD(&p->preempt_notifiers); 2849 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2541,8 +2853,9 @@ static void __sched_fork(struct task_struct *p)
2541/* 2853/*
2542 * fork()/clone()-time setup: 2854 * fork()/clone()-time setup:
2543 */ 2855 */
2544void sched_fork(struct task_struct *p, int clone_flags) 2856void sched_fork(struct task_struct *p)
2545{ 2857{
2858 unsigned long flags;
2546 int cpu = get_cpu(); 2859 int cpu = get_cpu();
2547 2860
2548 __sched_fork(p); 2861 __sched_fork(p);
@@ -2594,22 +2907,24 @@ void sched_fork(struct task_struct *p, int clone_flags)
2594 * 2907 *
2595 * Silence PROVE_RCU. 2908 * Silence PROVE_RCU.
2596 */ 2909 */
2597 rcu_read_lock(); 2910 raw_spin_lock_irqsave(&p->pi_lock, flags);
2598 set_task_cpu(p, cpu); 2911 set_task_cpu(p, cpu);
2599 rcu_read_unlock(); 2912 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2600 2913
2601#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2914#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2602 if (likely(sched_info_on())) 2915 if (likely(sched_info_on()))
2603 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2916 memset(&p->sched_info, 0, sizeof(p->sched_info));
2604#endif 2917#endif
2605#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2918#if defined(CONFIG_SMP)
2606 p->oncpu = 0; 2919 p->on_cpu = 0;
2607#endif 2920#endif
2608#ifdef CONFIG_PREEMPT 2921#ifdef CONFIG_PREEMPT
2609 /* Want to start with kernel preemption disabled. */ 2922 /* Want to start with kernel preemption disabled. */
2610 task_thread_info(p)->preempt_count = 1; 2923 task_thread_info(p)->preempt_count = 1;
2611#endif 2924#endif
2925#ifdef CONFIG_SMP
2612 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2926 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2927#endif
2613 2928
2614 put_cpu(); 2929 put_cpu();
2615} 2930}
@@ -2621,41 +2936,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
2621 * that must be done for every newly created context, then puts the task 2936 * that must be done for every newly created context, then puts the task
2622 * on the runqueue and wakes it. 2937 * on the runqueue and wakes it.
2623 */ 2938 */
2624void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2939void wake_up_new_task(struct task_struct *p)
2625{ 2940{
2626 unsigned long flags; 2941 unsigned long flags;
2627 struct rq *rq; 2942 struct rq *rq;
2628 int cpu __maybe_unused = get_cpu();
2629 2943
2944 raw_spin_lock_irqsave(&p->pi_lock, flags);
2630#ifdef CONFIG_SMP 2945#ifdef CONFIG_SMP
2631 rq = task_rq_lock(p, &flags);
2632 p->state = TASK_WAKING;
2633
2634 /* 2946 /*
2635 * Fork balancing, do it here and not earlier because: 2947 * Fork balancing, do it here and not earlier because:
2636 * - cpus_allowed can change in the fork path 2948 * - cpus_allowed can change in the fork path
2637 * - any previously selected cpu might disappear through hotplug 2949 * - any previously selected cpu might disappear through hotplug
2638 *
2639 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2640 * without people poking at ->cpus_allowed.
2641 */ 2950 */
2642 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2951 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2643 set_task_cpu(p, cpu);
2644
2645 p->state = TASK_RUNNING;
2646 task_rq_unlock(rq, &flags);
2647#endif 2952#endif
2648 2953
2649 rq = task_rq_lock(p, &flags); 2954 rq = __task_rq_lock(p);
2650 activate_task(rq, p, 0); 2955 activate_task(rq, p, 0);
2651 trace_sched_wakeup_new(p, 1); 2956 p->on_rq = 1;
2957 trace_sched_wakeup_new(p, true);
2652 check_preempt_curr(rq, p, WF_FORK); 2958 check_preempt_curr(rq, p, WF_FORK);
2653#ifdef CONFIG_SMP 2959#ifdef CONFIG_SMP
2654 if (p->sched_class->task_woken) 2960 if (p->sched_class->task_woken)
2655 p->sched_class->task_woken(rq, p); 2961 p->sched_class->task_woken(rq, p);
2656#endif 2962#endif
2657 task_rq_unlock(rq, &flags); 2963 task_rq_unlock(rq, p, &flags);
2658 put_cpu();
2659} 2964}
2660 2965
2661#ifdef CONFIG_PREEMPT_NOTIFIERS 2966#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2733,9 +3038,12 @@ static inline void
2733prepare_task_switch(struct rq *rq, struct task_struct *prev, 3038prepare_task_switch(struct rq *rq, struct task_struct *prev,
2734 struct task_struct *next) 3039 struct task_struct *next)
2735{ 3040{
3041 sched_info_switch(prev, next);
3042 perf_event_task_sched_out(prev, next);
2736 fire_sched_out_preempt_notifiers(prev, next); 3043 fire_sched_out_preempt_notifiers(prev, next);
2737 prepare_lock_switch(rq, next); 3044 prepare_lock_switch(rq, next);
2738 prepare_arch_switch(next); 3045 prepare_arch_switch(next);
3046 trace_sched_switch(prev, next);
2739} 3047}
2740 3048
2741/** 3049/**
@@ -2879,7 +3187,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2879 struct mm_struct *mm, *oldmm; 3187 struct mm_struct *mm, *oldmm;
2880 3188
2881 prepare_task_switch(rq, prev, next); 3189 prepare_task_switch(rq, prev, next);
2882 trace_sched_switch(prev, next); 3190
2883 mm = next->mm; 3191 mm = next->mm;
2884 oldmm = prev->active_mm; 3192 oldmm = prev->active_mm;
2885 /* 3193 /*
@@ -2889,14 +3197,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2889 */ 3197 */
2890 arch_start_context_switch(prev); 3198 arch_start_context_switch(prev);
2891 3199
2892 if (likely(!mm)) { 3200 if (!mm) {
2893 next->active_mm = oldmm; 3201 next->active_mm = oldmm;
2894 atomic_inc(&oldmm->mm_count); 3202 atomic_inc(&oldmm->mm_count);
2895 enter_lazy_tlb(oldmm, next); 3203 enter_lazy_tlb(oldmm, next);
2896 } else 3204 } else
2897 switch_mm(oldmm, mm, next); 3205 switch_mm(oldmm, mm, next);
2898 3206
2899 if (likely(!prev->mm)) { 3207 if (!prev->mm) {
2900 prev->active_mm = NULL; 3208 prev->active_mm = NULL;
2901 rq->prev_mm = oldmm; 3209 rq->prev_mm = oldmm;
2902 } 3210 }
@@ -3011,6 +3319,15 @@ static long calc_load_fold_active(struct rq *this_rq)
3011 return delta; 3319 return delta;
3012} 3320}
3013 3321
3322static unsigned long
3323calc_load(unsigned long load, unsigned long exp, unsigned long active)
3324{
3325 load *= exp;
3326 load += active * (FIXED_1 - exp);
3327 load += 1UL << (FSHIFT - 1);
3328 return load >> FSHIFT;
3329}
3330
3014#ifdef CONFIG_NO_HZ 3331#ifdef CONFIG_NO_HZ
3015/* 3332/*
3016 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 3333 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3040,6 +3357,128 @@ static long calc_load_fold_idle(void)
3040 3357
3041 return delta; 3358 return delta;
3042} 3359}
3360
3361/**
3362 * fixed_power_int - compute: x^n, in O(log n) time
3363 *
3364 * @x: base of the power
3365 * @frac_bits: fractional bits of @x
3366 * @n: power to raise @x to.
3367 *
3368 * By exploiting the relation between the definition of the natural power
3369 * function: x^n := x*x*...*x (x multiplied by itself n times), and
3370 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3371 * (where: n_i \elem {0, 1}, the binary vector representing n),
3372 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3373 * of course trivially computable in O(log_2 n), the length of our binary
3374 * vector.
3375 */
3376static unsigned long
3377fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3378{
3379 unsigned long result = 1UL << frac_bits;
3380
3381 if (n) for (;;) {
3382 if (n & 1) {
3383 result *= x;
3384 result += 1UL << (frac_bits - 1);
3385 result >>= frac_bits;
3386 }
3387 n >>= 1;
3388 if (!n)
3389 break;
3390 x *= x;
3391 x += 1UL << (frac_bits - 1);
3392 x >>= frac_bits;
3393 }
3394
3395 return result;
3396}
3397
3398/*
3399 * a1 = a0 * e + a * (1 - e)
3400 *
3401 * a2 = a1 * e + a * (1 - e)
3402 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3403 * = a0 * e^2 + a * (1 - e) * (1 + e)
3404 *
3405 * a3 = a2 * e + a * (1 - e)
3406 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3407 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3408 *
3409 * ...
3410 *
3411 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3412 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3413 * = a0 * e^n + a * (1 - e^n)
3414 *
3415 * [1] application of the geometric series:
3416 *
3417 * n 1 - x^(n+1)
3418 * S_n := \Sum x^i = -------------
3419 * i=0 1 - x
3420 */
3421static unsigned long
3422calc_load_n(unsigned long load, unsigned long exp,
3423 unsigned long active, unsigned int n)
3424{
3425
3426 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3427}
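
Aside, not part of the diff: a self-contained fixed-point example of the math above, decaying a 1-minute load of 2.00 toward an active count of 0 for one cycle and then for four missed cycles via the O(log n) power helper. FSHIFT, FIXED_1 and EXP_1 are assumed to match their include/linux/sched.h values of the era (11, 2048 and 1884):

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)
#define EXP_1   1884UL          /* ~ FIXED_1 / exp(5 s / 1 min), assumed value */

/* fixed-point x^n, same idea as fixed_power_int() above */
static unsigned long fp_pow(unsigned long x, unsigned int frac_bits, unsigned int n)
{
        unsigned long result = 1UL << frac_bits;

        while (n) {
                if (n & 1) {
                        result *= x;
                        result += 1UL << (frac_bits - 1);
                        result >>= frac_bits;
                }
                n >>= 1;
                x *= x;
                x += 1UL << (frac_bits - 1);
                x >>= frac_bits;
        }
        return result;
}

/* one step of the exponential moving average, as in calc_load() above */
static unsigned long load_step(unsigned long load, unsigned long exp, unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long load = 2 * FIXED_1;   /* 1-min average of 2.00, CPU now idle */

        printf("start          = %lu\n", load);                                        /* 4096 */
        printf("after 1 cycle  = %lu\n", load_step(load, EXP_1, 0));                    /* ~3768 */
        printf("after 4 cycles = %lu\n", load_step(load, fp_pow(EXP_1, FSHIFT, 4), 0)); /* ~2932 */
        return 0;
}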
3428
3429/*
3430 * NO_HZ can leave us missing all per-cpu ticks calling
3431 * calc_load_account_active(), but since an idle CPU folds its delta into
3432 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3433 * in the pending idle delta if our idle period crossed a load cycle boundary.
3434 *
3435 * Once we've updated the global active value, we need to apply the exponential
3436 * weights adjusted to the number of cycles missed.
3437 */
3438static void calc_global_nohz(unsigned long ticks)
3439{
3440 long delta, active, n;
3441
3442 if (time_before(jiffies, calc_load_update))
3443 return;
3444
3445 /*
3446 * If we crossed a calc_load_update boundary, make sure to fold
3447 * any pending idle changes, the respective CPUs might have
3448 * missed the tick driven calc_load_account_active() update
3449 * due to NO_HZ.
3450 */
3451 delta = calc_load_fold_idle();
3452 if (delta)
3453 atomic_long_add(delta, &calc_load_tasks);
3454
3455 /*
3456 * If we were idle for multiple load cycles, apply them.
3457 */
3458 if (ticks >= LOAD_FREQ) {
3459 n = ticks / LOAD_FREQ;
3460
3461 active = atomic_long_read(&calc_load_tasks);
3462 active = active > 0 ? active * FIXED_1 : 0;
3463
3464 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3465 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3466 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3467
3468 calc_load_update += n * LOAD_FREQ;
3469 }
3470
3471 /*
3472 * It's possible the remainder of the above division also crosses
3473 * a LOAD_FREQ period; the regular check in calc_global_load(),
3474 * which comes after this, will take care of that.
3475 *
3476 * Consider us being 11 ticks before a cycle completion and
3477 * sleeping for 4*LOAD_FREQ + 22 ticks; then the above code will
3478 * age us 4 cycles, and the test in calc_global_load() will
3479 * pick up the final one.
3480 */
3481}
3043#else 3482#else
3044static void calc_load_account_idle(struct rq *this_rq) 3483static void calc_load_account_idle(struct rq *this_rq)
3045{ 3484{
@@ -3049,6 +3488,10 @@ static inline long calc_load_fold_idle(void)
3049{ 3488{
3050 return 0; 3489 return 0;
3051} 3490}
3491
3492static void calc_global_nohz(unsigned long ticks)
3493{
3494}
3052#endif 3495#endif
3053 3496
3054/** 3497/**
@@ -3066,24 +3509,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3066 loads[2] = (avenrun[2] + offset) << shift; 3509 loads[2] = (avenrun[2] + offset) << shift;
3067} 3510}
3068 3511
3069static unsigned long
3070calc_load(unsigned long load, unsigned long exp, unsigned long active)
3071{
3072 load *= exp;
3073 load += active * (FIXED_1 - exp);
3074 return load >> FSHIFT;
3075}
3076
3077/* 3512/*
3078 * calc_load - update the avenrun load estimates 10 ticks after the 3513 * calc_load - update the avenrun load estimates 10 ticks after the
3079 * CPUs have updated calc_load_tasks. 3514 * CPUs have updated calc_load_tasks.
3080 */ 3515 */
3081void calc_global_load(void) 3516void calc_global_load(unsigned long ticks)
3082{ 3517{
3083 unsigned long upd = calc_load_update + 10;
3084 long active; 3518 long active;
3085 3519
3086 if (time_before(jiffies, upd)) 3520 calc_global_nohz(ticks);
3521
3522 if (time_before(jiffies, calc_load_update + 10))
3087 return; 3523 return;
3088 3524
3089 active = atomic_long_read(&calc_load_tasks); 3525 active = atomic_long_read(&calc_load_tasks);
@@ -3244,27 +3680,22 @@ void sched_exec(void)
3244{ 3680{
3245 struct task_struct *p = current; 3681 struct task_struct *p = current;
3246 unsigned long flags; 3682 unsigned long flags;
3247 struct rq *rq;
3248 int dest_cpu; 3683 int dest_cpu;
3249 3684
3250 rq = task_rq_lock(p, &flags); 3685 raw_spin_lock_irqsave(&p->pi_lock, flags);
3251 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3686 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3252 if (dest_cpu == smp_processor_id()) 3687 if (dest_cpu == smp_processor_id())
3253 goto unlock; 3688 goto unlock;
3254 3689
3255 /* 3690 if (likely(cpu_active(dest_cpu))) {
3256 * select_task_rq() can race against ->cpus_allowed
3257 */
3258 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3259 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3260 struct migration_arg arg = { p, dest_cpu }; 3691 struct migration_arg arg = { p, dest_cpu };
3261 3692
3262 task_rq_unlock(rq, &flags); 3693 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3263 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 3694 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3264 return; 3695 return;
3265 } 3696 }
3266unlock: 3697unlock:
3267 task_rq_unlock(rq, &flags); 3698 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3268} 3699}
3269 3700
3270#endif 3701#endif
@@ -3285,7 +3716,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3285 3716
3286 if (task_current(rq, p)) { 3717 if (task_current(rq, p)) {
3287 update_rq_clock(rq); 3718 update_rq_clock(rq);
3288 ns = rq->clock - p->se.exec_start; 3719 ns = rq->clock_task - p->se.exec_start;
3289 if ((s64)ns < 0) 3720 if ((s64)ns < 0)
3290 ns = 0; 3721 ns = 0;
3291 } 3722 }
@@ -3301,7 +3732,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
3301 3732
3302 rq = task_rq_lock(p, &flags); 3733 rq = task_rq_lock(p, &flags);
3303 ns = do_task_delta_exec(p, rq); 3734 ns = do_task_delta_exec(p, rq);
3304 task_rq_unlock(rq, &flags); 3735 task_rq_unlock(rq, p, &flags);
3305 3736
3306 return ns; 3737 return ns;
3307} 3738}
@@ -3319,7 +3750,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3319 3750
3320 rq = task_rq_lock(p, &flags); 3751 rq = task_rq_lock(p, &flags);
3321 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3752 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3322 task_rq_unlock(rq, &flags); 3753 task_rq_unlock(rq, p, &flags);
3323 3754
3324 return ns; 3755 return ns;
3325} 3756}
@@ -3343,7 +3774,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
3343 rq = task_rq_lock(p, &flags); 3774 rq = task_rq_lock(p, &flags);
3344 thread_group_cputime(p, &totals); 3775 thread_group_cputime(p, &totals);
3345 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3776 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3346 task_rq_unlock(rq, &flags); 3777 task_rq_unlock(rq, p, &flags);
3347 3778
3348 return ns; 3779 return ns;
3349} 3780}
@@ -3408,6 +3839,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3408} 3839}
3409 3840
3410/* 3841/*
3842 * Account system cpu time to a process and desired cpustat field
3843 * @p: the process that the cpu time gets accounted to
3844 * @cputime: the cpu time spent in kernel space since the last update
3845 * @cputime_scaled: cputime scaled by cpu frequency
3846 * @target_cputime64: pointer to cpustat field that has to be updated
3847 */
3848static inline
3849void __account_system_time(struct task_struct *p, cputime_t cputime,
3850 cputime_t cputime_scaled, cputime64_t *target_cputime64)
3851{
3852 cputime64_t tmp = cputime_to_cputime64(cputime);
3853
3854 /* Add system time to process. */
3855 p->stime = cputime_add(p->stime, cputime);
3856 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3857 account_group_system_time(p, cputime);
3858
3859 /* Add system time to cpustat. */
3860 *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3861 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3862
3863 /* Account for system time used */
3864 acct_update_integrals(p);
3865}
3866
3867/*
3411 * Account system cpu time to a process. 3868 * Account system cpu time to a process.
3412 * @p: the process that the cpu time gets accounted to 3869 * @p: the process that the cpu time gets accounted to
3413 * @hardirq_offset: the offset to subtract from hardirq_count() 3870 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3418,36 +3875,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3418 cputime_t cputime, cputime_t cputime_scaled) 3875 cputime_t cputime, cputime_t cputime_scaled)
3419{ 3876{
3420 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3877 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3421 cputime64_t tmp; 3878 cputime64_t *target_cputime64;
3422 3879
3423 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3880 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3424 account_guest_time(p, cputime, cputime_scaled); 3881 account_guest_time(p, cputime, cputime_scaled);
3425 return; 3882 return;
3426 } 3883 }
3427 3884
3428 /* Add system time to process. */
3429 p->stime = cputime_add(p->stime, cputime);
3430 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3431 account_group_system_time(p, cputime);
3432
3433 /* Add system time to cpustat. */
3434 tmp = cputime_to_cputime64(cputime);
3435 if (hardirq_count() - hardirq_offset) 3885 if (hardirq_count() - hardirq_offset)
3436 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3886 target_cputime64 = &cpustat->irq;
3437 else if (softirq_count()) 3887 else if (in_serving_softirq())
3438 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3888 target_cputime64 = &cpustat->softirq;
3439 else 3889 else
3440 cpustat->system = cputime64_add(cpustat->system, tmp); 3890 target_cputime64 = &cpustat->system;
3441
3442 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3443 3891
3444 /* Account for system time used */ 3892 __account_system_time(p, cputime, cputime_scaled, target_cputime64);
3445 acct_update_integrals(p);
3446} 3893}
3447 3894
3448/* 3895/*
3449 * Account for involuntary wait time. 3896 * Account for involuntary wait time.
3450 * @steal: the cpu time spent in involuntary wait 3897 * @cputime: the cpu time spent in involuntary wait
3451 */ 3898 */
3452void account_steal_time(cputime_t cputime) 3899void account_steal_time(cputime_t cputime)
3453{ 3900{
@@ -3475,6 +3922,73 @@ void account_idle_time(cputime_t cputime)
3475 3922
3476#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3923#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3477 3924
3925#ifdef CONFIG_IRQ_TIME_ACCOUNTING
3926/*
3927 * Account a tick to a process and cpustat
3928 * @p: the process that the cpu time gets accounted to
3929 * @user_tick: whether the tick is from userspace
3930 * @rq: the pointer to rq
3931 *
3932 * Tick demultiplexing follows the order
3933 * - pending hardirq update
3934 * - pending softirq update
3935 * - user_time
3936 * - idle_time
3937 * - system time
3938 * - check for guest_time
3939 * - else account as system_time
3940 *
3941 * The check for hardirq is done for both system and user time, as there is
3942 * no timer going off while we are in a hardirq and hence we may never get an
3943 * opportunity to update it solely on a system-time tick.
3944 * p->stime and friends are updated only on system time, not on irq or
3945 * softirq time, as those no longer count toward task exec_runtime.
3946 */
3947static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3948 struct rq *rq)
3949{
3950 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3951 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3952 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3953
3954 if (irqtime_account_hi_update()) {
3955 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3956 } else if (irqtime_account_si_update()) {
3957 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3958 } else if (this_cpu_ksoftirqd() == p) {
3959 /*
3960 * ksoftirqd time does not get accounted in cpu_softirq_time,
3961 * so we have to handle it separately here.
3962 * Also, p->stime needs to be updated for ksoftirqd.
3963 */
3964 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3965 &cpustat->softirq);
3966 } else if (user_tick) {
3967 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3968 } else if (p == rq->idle) {
3969 account_idle_time(cputime_one_jiffy);
3970 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3971 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3972 } else {
3973 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3974 &cpustat->system);
3975 }
3976}
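
Aside, not part of the diff: the demultiplexing order above restated as a small pure function. The enum and parameter names are made up; the real code consults per-cpu state instead of taking flags:

enum demo_tick_target {
        DEMO_TICK_HARDIRQ, DEMO_TICK_SOFTIRQ, DEMO_TICK_KSOFTIRQD,
        DEMO_TICK_USER, DEMO_TICK_IDLE, DEMO_TICK_GUEST, DEMO_TICK_SYSTEM,
};

static enum demo_tick_target demo_classify_tick(int hi_pending, int si_pending,
                                                int is_ksoftirqd, int user_tick,
                                                int is_idle, int is_guest)
{
        if (hi_pending)   return DEMO_TICK_HARDIRQ;   /* pending hardirq update */
        if (si_pending)   return DEMO_TICK_SOFTIRQ;   /* pending softirq update */
        if (is_ksoftirqd) return DEMO_TICK_KSOFTIRQD; /* charged via __account_system_time() */
        if (user_tick)    return DEMO_TICK_USER;
        if (is_idle)      return DEMO_TICK_IDLE;
        if (is_guest)     return DEMO_TICK_GUEST;
        return DEMO_TICK_SYSTEM;
}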
3977
3978static void irqtime_account_idle_ticks(int ticks)
3979{
3980 int i;
3981 struct rq *rq = this_rq();
3982
3983 for (i = 0; i < ticks; i++)
3984 irqtime_account_process_tick(current, 0, rq);
3985}
3986#else /* CONFIG_IRQ_TIME_ACCOUNTING */
3987static void irqtime_account_idle_ticks(int ticks) {}
3988static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3989 struct rq *rq) {}
3990#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3991
3478/* 3992/*
3479 * Account a single tick of cpu time. 3993 * Account a single tick of cpu time.
3480 * @p: the process that the cpu time gets accounted to 3994 * @p: the process that the cpu time gets accounted to
@@ -3485,6 +3999,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
3485 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3999 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3486 struct rq *rq = this_rq(); 4000 struct rq *rq = this_rq();
3487 4001
4002 if (sched_clock_irqtime) {
4003 irqtime_account_process_tick(p, user_tick, rq);
4004 return;
4005 }
4006
3488 if (user_tick) 4007 if (user_tick)
3489 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 4008 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3490 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 4009 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3510,6 +4029,12 @@ void account_steal_ticks(unsigned long ticks)
3510 */ 4029 */
3511void account_idle_ticks(unsigned long ticks) 4030void account_idle_ticks(unsigned long ticks)
3512{ 4031{
4032
4033 if (sched_clock_irqtime) {
4034 irqtime_account_idle_ticks(ticks);
4035 return;
4036 }
4037
3513 account_idle_time(jiffies_to_cputime(ticks)); 4038 account_idle_time(jiffies_to_cputime(ticks));
3514} 4039}
3515 4040
@@ -3603,9 +4128,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3603/* 4128/*
3604 * This function gets called by the timer code, with HZ frequency. 4129 * This function gets called by the timer code, with HZ frequency.
3605 * We call it with interrupts disabled. 4130 * We call it with interrupts disabled.
3606 *
3607 * It also gets called by the fork code, when changing the parent's
3608 * timeslices.
3609 */ 4131 */
3610void scheduler_tick(void) 4132void scheduler_tick(void)
3611{ 4133{
@@ -3627,7 +4149,7 @@ void scheduler_tick(void)
3627 4149
3628 raw_spin_unlock(&rq->lock); 4150 raw_spin_unlock(&rq->lock);
3629 4151
3630 perf_event_task_tick(curr); 4152 perf_event_task_tick();
3631 4153
3632#ifdef CONFIG_SMP 4154#ifdef CONFIG_SMP
3633 rq->idle_at_tick = idle_cpu(cpu); 4155 rq->idle_at_tick = idle_cpu(cpu);
@@ -3733,19 +4255,12 @@ static inline void schedule_debug(struct task_struct *prev)
3733 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4255 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3734 4256
3735 schedstat_inc(this_rq(), sched_count); 4257 schedstat_inc(this_rq(), sched_count);
3736#ifdef CONFIG_SCHEDSTATS
3737 if (unlikely(prev->lock_depth >= 0)) {
3738 schedstat_inc(this_rq(), bkl_count);
3739 schedstat_inc(prev, sched_info.bkl_count);
3740 }
3741#endif
3742} 4258}
3743 4259
3744static void put_prev_task(struct rq *rq, struct task_struct *prev) 4260static void put_prev_task(struct rq *rq, struct task_struct *prev)
3745{ 4261{
3746 if (prev->se.on_rq) 4262 if (prev->on_rq || rq->skip_clock_update < 0)
3747 update_rq_clock(rq); 4263 update_rq_clock(rq);
3748 rq->skip_clock_update = 0;
3749 prev->sched_class->put_prev_task(rq, prev); 4264 prev->sched_class->put_prev_task(rq, prev);
3750} 4265}
3751 4266
@@ -3776,17 +4291,13 @@ pick_next_task(struct rq *rq)
3776 } 4291 }
3777 */ 4292 */
3778 4293
3779 class = sched_class_highest; 4294 for_each_class(class) {
3780 for ( ; ; ) {
3781 p = class->pick_next_task(rq); 4295 p = class->pick_next_task(rq);
3782 if (p) 4296 if (p)
3783 return p; 4297 return p;
3784 /*
3785 * Will never be NULL as the idle class always
3786 * returns a non-NULL p:
3787 */
3788 class = class->next;
3789 } 4298 }
4299
4300 BUG(); /* the idle class will always have a runnable task */
3790} 4301}
3791 4302
3792/* 4303/*
@@ -3807,8 +4318,10 @@ need_resched:
3807 rcu_note_context_switch(cpu); 4318 rcu_note_context_switch(cpu);
3808 prev = rq->curr; 4319 prev = rq->curr;
3809 4320
3810 release_kernel_lock(prev); 4321 /* LITMUS^RT: quickly re-evaluate the scheduling decision
3811need_resched_nonpreemptible: 4322 * if the previous one is no longer valid after the context switch.
4323 */
4324litmus_need_resched_nonpreemptible:
3812 TS_SCHED_START; 4325 TS_SCHED_START;
3813 sched_trace_task_switch_away(prev); 4326 sched_trace_task_switch_away(prev);
3814 4327
@@ -3818,18 +4331,19 @@ need_resched_nonpreemptible:
3818 hrtick_clear(rq); 4331 hrtick_clear(rq);
3819 4332
3820 raw_spin_lock_irq(&rq->lock); 4333 raw_spin_lock_irq(&rq->lock);
3821 clear_tsk_need_resched(prev);
3822 4334
3823 switch_count = &prev->nivcsw; 4335 switch_count = &prev->nivcsw;
3824 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 4336 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3825 if (unlikely(signal_pending_state(prev->state, prev))) { 4337 if (unlikely(signal_pending_state(prev->state, prev))) {
3826 prev->state = TASK_RUNNING; 4338 prev->state = TASK_RUNNING;
3827 } else { 4339 } else {
4340 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4341 prev->on_rq = 0;
4342
3828 /* 4343 /*
3829 * If a worker is going to sleep, notify and 4344 * If a worker went to sleep, notify and ask workqueue
3830 * ask workqueue whether it wants to wake up a 4345 * whether it wants to wake up a task to maintain
3831 * task to maintain concurrency. If so, wake 4346 * concurrency.
3832 * up the task.
3833 */ 4347 */
3834 if (prev->flags & PF_WQ_WORKER) { 4348 if (prev->flags & PF_WQ_WORKER) {
3835 struct task_struct *to_wakeup; 4349 struct task_struct *to_wakeup;
@@ -3838,7 +4352,16 @@ need_resched_nonpreemptible:
3838 if (to_wakeup) 4352 if (to_wakeup)
3839 try_to_wake_up_local(to_wakeup); 4353 try_to_wake_up_local(to_wakeup);
3840 } 4354 }
3841 deactivate_task(rq, prev, DEQUEUE_SLEEP); 4355
4356 /*
4357 * If we are going to sleep and we have plugged IO
4358 * queued, make sure to submit it to avoid deadlocks.
4359 */
4360 if (blk_needs_flush_plug(prev)) {
4361 raw_spin_unlock(&rq->lock);
4362 blk_schedule_flush_plug(prev);
4363 raw_spin_lock(&rq->lock);
4364 }
3842 } 4365 }
3843 switch_count = &prev->nvcsw; 4366 switch_count = &prev->nvcsw;
3844 } 4367 }
@@ -3850,11 +4373,10 @@ need_resched_nonpreemptible:
3850 4373
3851 put_prev_task(rq, prev); 4374 put_prev_task(rq, prev);
3852 next = pick_next_task(rq); 4375 next = pick_next_task(rq);
4376 clear_tsk_need_resched(prev);
4377 rq->skip_clock_update = 0;
3853 4378
3854 if (likely(prev != next)) { 4379 if (likely(prev != next)) {
3855 sched_info_switch(prev, next);
3856 perf_event_task_sched_out(prev, next);
3857
3858 rq->nr_switches++; 4380 rq->nr_switches++;
3859 rq->curr = next; 4381 rq->curr = next;
3860 ++*switch_count; 4382 ++*switch_count;
@@ -3880,8 +4402,8 @@ need_resched_nonpreemptible:
3880 4402
3881 post_schedule(rq); 4403 post_schedule(rq);
3882 4404
3883 if (sched_state_validate_switch() || unlikely(reacquire_kernel_lock(prev))) 4405 if (sched_state_validate_switch())
3884 goto need_resched_nonpreemptible; 4406 goto litmus_need_resched_nonpreemptible;
3885 4407
3886 preempt_enable_no_resched(); 4408 preempt_enable_no_resched();
3887 if (need_resched()) 4409 if (need_resched())
@@ -3892,70 +4414,53 @@ need_resched_nonpreemptible:
3892EXPORT_SYMBOL(schedule); 4414EXPORT_SYMBOL(schedule);
3893 4415
3894#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4416#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4417
4418static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4419{
4420 bool ret = false;
4421
4422 rcu_read_lock();
4423 if (lock->owner != owner)
4424 goto fail;
4425
4426 /*
4427 * Ensure we emit the owner->on_cpu dereference _after_ checking that
4428 * lock->owner still matches owner. If that fails, owner might
4429 * point to free()d memory; if it still matches, the rcu_read_lock()
4430 * ensures the memory stays valid.
4431 */
4432 barrier();
4433
4434 ret = owner->on_cpu;
4435fail:
4436 rcu_read_unlock();
4437
4438 return ret;
4439}
4440
3895/* 4441/*
3896 * Look out! "owner" is an entirely speculative pointer 4442 * Look out! "owner" is an entirely speculative pointer
3897 * access and not reliable. 4443 * access and not reliable.
3898 */ 4444 */
3899int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) 4445int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
3900{ 4446{
3901 unsigned int cpu;
3902 struct rq *rq;
3903
3904 if (!sched_feat(OWNER_SPIN)) 4447 if (!sched_feat(OWNER_SPIN))
3905 return 0; 4448 return 0;
3906 4449
3907#ifdef CONFIG_DEBUG_PAGEALLOC 4450 while (owner_running(lock, owner)) {
3908 /* 4451 if (need_resched())
3909 * Need to access the cpu field knowing that 4452 return 0;
3910 * DEBUG_PAGEALLOC could have unmapped it if
3911 * the mutex owner just released it and exited.
3912 */
3913 if (probe_kernel_address(&owner->cpu, cpu))
3914 return 0;
3915#else
3916 cpu = owner->cpu;
3917#endif
3918 4453
3919 /* 4454 arch_mutex_cpu_relax();
3920 * Even if the access succeeded (likely case), 4455 }
3921 * the cpu field may no longer be valid.
3922 */
3923 if (cpu >= nr_cpumask_bits)
3924 return 0;
3925 4456
3926 /* 4457 /*
3927 * We need to validate that we can do a 4458 * If the owner changed to another task, there is likely
3928 * get_cpu() and that we have the percpu area. 4459 * heavy contention; stop spinning.
3929 */ 4460 */
3930 if (!cpu_online(cpu)) 4461 if (lock->owner)
3931 return 0; 4462 return 0;
3932 4463
3933 rq = cpu_rq(cpu);
3934
3935 for (;;) {
3936 /*
3937 * Owner changed, break to re-assess state.
3938 */
3939 if (lock->owner != owner) {
3940 /*
3941 * If the lock has switched to a different owner,
3942 * we likely have heavy contention. Return 0 to quit
3943 * optimistic spinning and not contend further:
3944 */
3945 if (lock->owner)
3946 return 0;
3947 break;
3948 }
3949
3950 /*
3951 * Is that owner really running on that cpu?
3952 */
3953 if (task_thread_info(rq->curr) != owner || need_resched())
3954 return 0;
3955
3956 cpu_relax();
3957 }
3958
3959 return 1; 4464 return 1;
3960} 4465}
3961#endif 4466#endif
@@ -4085,6 +4590,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4085{ 4590{
4086 __wake_up_common(q, mode, 1, 0, key); 4591 __wake_up_common(q, mode, 1, 0, key);
4087} 4592}
4593EXPORT_SYMBOL_GPL(__wake_up_locked_key);
4088 4594
4089/** 4595/**
4090 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 4596 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4276,7 +4782,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4276 * This waits for either a completion of a specific task to be signaled or for a 4782 * This waits for either a completion of a specific task to be signaled or for a
4277 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4783 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4278 */ 4784 */
4279unsigned long __sched 4785long __sched
4280wait_for_completion_interruptible_timeout(struct completion *x, 4786wait_for_completion_interruptible_timeout(struct completion *x,
4281 unsigned long timeout) 4787 unsigned long timeout)
4282{ 4788{
@@ -4309,7 +4815,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4309 * signaled or for a specified timeout to expire. It can be 4815 * signaled or for a specified timeout to expire. It can be
4310 * interrupted by a kill signal. The timeout is in jiffies. 4816 * interrupted by a kill signal. The timeout is in jiffies.
4311 */ 4817 */
4312unsigned long __sched 4818long __sched
4313wait_for_completion_killable_timeout(struct completion *x, 4819wait_for_completion_killable_timeout(struct completion *x,
4314 unsigned long timeout) 4820 unsigned long timeout)
4315{ 4821{
@@ -4425,18 +4931,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
4425 */ 4931 */
4426void rt_mutex_setprio(struct task_struct *p, int prio) 4932void rt_mutex_setprio(struct task_struct *p, int prio)
4427{ 4933{
4428 unsigned long flags;
4429 int oldprio, on_rq, running; 4934 int oldprio, on_rq, running;
4430 struct rq *rq; 4935 struct rq *rq;
4431 const struct sched_class *prev_class; 4936 const struct sched_class *prev_class;
4432 4937
4433 BUG_ON(prio < 0 || prio > MAX_PRIO); 4938 BUG_ON(prio < 0 || prio > MAX_PRIO);
4434 4939
4435 rq = task_rq_lock(p, &flags); 4940 rq = __task_rq_lock(p);
4436 4941
4942 trace_sched_pi_setprio(p, prio);
4437 oldprio = p->prio; 4943 oldprio = p->prio;
4438 prev_class = p->sched_class; 4944 prev_class = p->sched_class;
4439 on_rq = p->se.on_rq; 4945 on_rq = p->on_rq;
4440 running = task_current(rq, p); 4946 running = task_current(rq, p);
4441 if (on_rq) 4947 if (on_rq)
4442 dequeue_task(rq, p, 0); 4948 dequeue_task(rq, p, 0);
@@ -4452,12 +4958,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4452 4958
4453 if (running) 4959 if (running)
4454 p->sched_class->set_curr_task(rq); 4960 p->sched_class->set_curr_task(rq);
4455 if (on_rq) { 4961 if (on_rq)
4456 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4962 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4457 4963
4458 check_class_changed(rq, p, prev_class, oldprio, running); 4964 check_class_changed(rq, p, prev_class, oldprio);
4459 } 4965 __task_rq_unlock(rq);
4460 task_rq_unlock(rq, &flags);
4461} 4966}
4462 4967
4463#endif 4968#endif
@@ -4485,7 +4990,7 @@ void set_user_nice(struct task_struct *p, long nice)
4485 p->static_prio = NICE_TO_PRIO(nice); 4990 p->static_prio = NICE_TO_PRIO(nice);
4486 goto out_unlock; 4991 goto out_unlock;
4487 } 4992 }
4488 on_rq = p->se.on_rq; 4993 on_rq = p->on_rq;
4489 if (on_rq) 4994 if (on_rq)
4490 dequeue_task(rq, p, 0); 4995 dequeue_task(rq, p, 0);
4491 4996
@@ -4505,7 +5010,7 @@ void set_user_nice(struct task_struct *p, long nice)
4505 resched_task(rq->curr); 5010 resched_task(rq->curr);
4506 } 5011 }
4507out_unlock: 5012out_unlock:
4508 task_rq_unlock(rq, &flags); 5013 task_rq_unlock(rq, p, &flags);
4509} 5014}
4510EXPORT_SYMBOL(set_user_nice); 5015EXPORT_SYMBOL(set_user_nice);
4511 5016
@@ -4619,8 +5124,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4619static void 5124static void
4620__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 5125__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4621{ 5126{
4622 BUG_ON(p->se.on_rq);
4623
4624 p->policy = policy; 5127 p->policy = policy;
4625 p->rt_priority = prio; 5128 p->rt_priority = prio;
4626 p->normal_prio = normal_prio(p); 5129 p->normal_prio = normal_prio(p);
@@ -4645,14 +5148,17 @@ static bool check_same_owner(struct task_struct *p)
4645 5148
4646 rcu_read_lock(); 5149 rcu_read_lock();
4647 pcred = __task_cred(p); 5150 pcred = __task_cred(p);
4648 match = (cred->euid == pcred->euid || 5151 if (cred->user->user_ns == pcred->user->user_ns)
4649 cred->euid == pcred->uid); 5152 match = (cred->euid == pcred->euid ||
5153 cred->euid == pcred->uid);
5154 else
5155 match = false;
4650 rcu_read_unlock(); 5156 rcu_read_unlock();
4651 return match; 5157 return match;
4652} 5158}
4653 5159
4654static int __sched_setscheduler(struct task_struct *p, int policy, 5160static int __sched_setscheduler(struct task_struct *p, int policy,
4655 struct sched_param *param, bool user) 5161 const struct sched_param *param, bool user)
4656{ 5162{
4657 int retval, oldprio, oldpolicy = -1, on_rq, running; 5163 int retval, oldprio, oldpolicy = -1, on_rq, running;
4658 unsigned long flags; 5164 unsigned long flags;
@@ -4708,12 +5214,15 @@ recheck:
4708 param->sched_priority > rlim_rtprio) 5214 param->sched_priority > rlim_rtprio)
4709 return -EPERM; 5215 return -EPERM;
4710 } 5216 }
5217
4711 /* 5218 /*
4712 * Like positive nice levels, dont allow tasks to 5219 * Treat SCHED_IDLE as nice 20. Only allow a switch to
4713 * move out of SCHED_IDLE either: 5220 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4714 */ 5221 */
4715 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 5222 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4716 return -EPERM; 5223 if (!can_nice(p, TASK_NICE(p)))
5224 return -EPERM;
5225 }
4717 5226
4718 /* can't change other user's priorities */ 5227 /* can't change other user's priorities */
4719 if (!check_same_owner(p)) 5228 if (!check_same_owner(p))
@@ -4725,7 +5234,7 @@ recheck:
4725 } 5234 }
4726 5235
4727 if (user) { 5236 if (user) {
4728 retval = security_task_setscheduler(p, policy, param); 5237 retval = security_task_setscheduler(p);
4729 if (retval) 5238 if (retval)
4730 return retval; 5239 return retval;
4731 } 5240 }
@@ -4739,13 +5248,30 @@ recheck:
4739 /* 5248 /*
4740 * make sure no PI-waiters arrive (or leave) while we are 5249 * make sure no PI-waiters arrive (or leave) while we are
4741 * changing the priority of the task: 5250 * changing the priority of the task:
5251 *
5252 * To be able to change p->policy safely, the appropriate
5253 * runqueue lock must be held.
4742 */ 5254 */
4743 raw_spin_lock_irqsave(&p->pi_lock, flags); 5255 rq = task_rq_lock(p, &flags);
5256
4744 /* 5257 /*
4745 * To be able to change p->policy safely, the apropriate 5258 * Changing the policy of the stop threads is a very bad idea
4746 * runqueue lock must be held.
4747 */ 5259 */
4748 rq = __task_rq_lock(p); 5260 if (p == rq->stop) {
5261 task_rq_unlock(rq, p, &flags);
5262 return -EINVAL;
5263 }
5264
5265 /*
5266 * If not changing anything there's no need to proceed further:
5267 */
5268 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
5269 param->sched_priority == p->rt_priority))) {
5270
5271 __task_rq_unlock(rq);
5272 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5273 return 0;
5274 }
4749 5275
4750#ifdef CONFIG_RT_GROUP_SCHED 5276#ifdef CONFIG_RT_GROUP_SCHED
4751 if (user) { 5277 if (user) {
@@ -4754,9 +5280,9 @@ recheck:
4754 * assigned. 5280 * assigned.
4755 */ 5281 */
4756 if (rt_bandwidth_enabled() && rt_policy(policy) && 5282 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4757 task_group(p)->rt_bandwidth.rt_runtime == 0) { 5283 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4758 __task_rq_unlock(rq); 5284 !task_group_is_autogroup(task_group(p))) {
4759 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5285 task_rq_unlock(rq, p, &flags);
4760 return -EPERM; 5286 return -EPERM;
4761 } 5287 }
4762 } 5288 }
@@ -4765,11 +5291,10 @@ recheck:
4765 /* recheck policy now with rq lock held */ 5291 /* recheck policy now with rq lock held */
4766 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5292 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4767 policy = oldpolicy = -1; 5293 policy = oldpolicy = -1;
4768 __task_rq_unlock(rq); 5294 task_rq_unlock(rq, p, &flags);
4769 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4770 goto recheck; 5295 goto recheck;
4771 } 5296 }
4772 on_rq = p->se.on_rq; 5297 on_rq = p->on_rq;
4773 running = task_current(rq, p); 5298 running = task_current(rq, p);
4774 if (on_rq) 5299 if (on_rq)
4775 deactivate_task(rq, p, 0); 5300 deactivate_task(rq, p, 0);
@@ -4793,13 +5318,11 @@ recheck:
4793 5318
4794 if (running) 5319 if (running)
4795 p->sched_class->set_curr_task(rq); 5320 p->sched_class->set_curr_task(rq);
4796 if (on_rq) { 5321 if (on_rq)
4797 activate_task(rq, p, 0); 5322 activate_task(rq, p, 0);
4798 5323
4799 check_class_changed(rq, p, prev_class, oldprio, running); 5324 check_class_changed(rq, p, prev_class, oldprio);
4800 } 5325 task_rq_unlock(rq, p, &flags);
4801 __task_rq_unlock(rq);
4802 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4803 5326
4804 rt_mutex_adjust_pi(p); 5327 rt_mutex_adjust_pi(p);
4805 5328
@@ -4815,7 +5338,7 @@ recheck:
4815 * NOTE that the task may be already dead. 5338 * NOTE that the task may be already dead.
4816 */ 5339 */
4817int sched_setscheduler(struct task_struct *p, int policy, 5340int sched_setscheduler(struct task_struct *p, int policy,
4818 struct sched_param *param) 5341 const struct sched_param *param)
4819{ 5342{
4820 return __sched_setscheduler(p, policy, param, true); 5343 return __sched_setscheduler(p, policy, param, true);
4821} 5344}
@@ -4833,7 +5356,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
4833 * but our caller might not have that capability. 5356 * but our caller might not have that capability.
4834 */ 5357 */
4835int sched_setscheduler_nocheck(struct task_struct *p, int policy, 5358int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4836 struct sched_param *param) 5359 const struct sched_param *param)
4837{ 5360{
4838 return __sched_setscheduler(p, policy, param, false); 5361 return __sched_setscheduler(p, policy, param, false);
4839} 5362}
@@ -4980,16 +5503,16 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4980 goto out_free_cpus_allowed; 5503 goto out_free_cpus_allowed;
4981 } 5504 }
4982 retval = -EPERM; 5505 retval = -EPERM;
4983 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5506 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
4984 goto out_unlock; 5507 goto out_unlock;
4985 5508
4986 retval = security_task_setscheduler(p, 0, NULL); 5509 retval = security_task_setscheduler(p);
4987 if (retval) 5510 if (retval)
4988 goto out_unlock; 5511 goto out_unlock;
4989 5512
4990 cpuset_cpus_allowed(p, cpus_allowed); 5513 cpuset_cpus_allowed(p, cpus_allowed);
4991 cpumask_and(new_mask, in_mask, cpus_allowed); 5514 cpumask_and(new_mask, in_mask, cpus_allowed);
4992 again: 5515again:
4993 retval = set_cpus_allowed_ptr(p, new_mask); 5516 retval = set_cpus_allowed_ptr(p, new_mask);
4994 5517
4995 if (!retval) { 5518 if (!retval) {
@@ -5051,7 +5574,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5051{ 5574{
5052 struct task_struct *p; 5575 struct task_struct *p;
5053 unsigned long flags; 5576 unsigned long flags;
5054 struct rq *rq;
5055 int retval; 5577 int retval;
5056 5578
5057 get_online_cpus(); 5579 get_online_cpus();
@@ -5066,9 +5588,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5066 if (retval) 5588 if (retval)
5067 goto out_unlock; 5589 goto out_unlock;
5068 5590
5069 rq = task_rq_lock(p, &flags); 5591 raw_spin_lock_irqsave(&p->pi_lock, flags);
5070 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5592 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5071 task_rq_unlock(rq, &flags); 5593 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5072 5594
5073out_unlock: 5595out_unlock:
5074 rcu_read_unlock(); 5596 rcu_read_unlock();
@@ -5215,6 +5737,67 @@ void __sched yield(void)
5215} 5737}
5216EXPORT_SYMBOL(yield); 5738EXPORT_SYMBOL(yield);
5217 5739
5740/**
5741 * yield_to - yield the current processor to another thread in
5742 * your thread group, or accelerate that thread toward the
5743 * processor it's on.
5744 * @p: target task
5745 * @preempt: whether task preemption is allowed or not
5746 *
5747 * It's the caller's job to ensure that the target task struct
5748 * can't go away on us before we can do any checks.
5749 *
5750 * Returns true if we indeed boosted the target task.
5751 */
5752bool __sched yield_to(struct task_struct *p, bool preempt)
5753{
5754 struct task_struct *curr = current;
5755 struct rq *rq, *p_rq;
5756 unsigned long flags;
5757 bool yielded = 0;
5758
5759 local_irq_save(flags);
5760 rq = this_rq();
5761
5762again:
5763 p_rq = task_rq(p);
5764 double_rq_lock(rq, p_rq);
5765 while (task_rq(p) != p_rq) {
5766 double_rq_unlock(rq, p_rq);
5767 goto again;
5768 }
5769
5770 if (!curr->sched_class->yield_to_task)
5771 goto out;
5772
5773 if (curr->sched_class != p->sched_class)
5774 goto out;
5775
5776 if (task_running(p_rq, p) || p->state)
5777 goto out;
5778
5779 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5780 if (yielded) {
5781 schedstat_inc(rq, yld_count);
5782 /*
5783 * Make p's CPU reschedule; pick_next_entity takes care of
5784 * fairness.
5785 */
5786 if (preempt && rq != p_rq)
5787 resched_task(p_rq->curr);
5788 }
5789
5790out:
5791 double_rq_unlock(rq, p_rq);
5792 local_irq_restore(flags);
5793
5794 if (yielded)
5795 schedule();
5796
5797 return yielded;
5798}
5799EXPORT_SYMBOL_GPL(yield_to);
5800
5218/* 5801/*
5219 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5802 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5220 * that process accounting knows that this is a task in IO wait state. 5803 * that process accounting knows that this is a task in IO wait state.
@@ -5225,6 +5808,7 @@ void __sched io_schedule(void)
5225 5808
5226 delayacct_blkio_start(); 5809 delayacct_blkio_start();
5227 atomic_inc(&rq->nr_iowait); 5810 atomic_inc(&rq->nr_iowait);
5811 blk_flush_plug(current);
5228 current->in_iowait = 1; 5812 current->in_iowait = 1;
5229 schedule(); 5813 schedule();
5230 current->in_iowait = 0; 5814 current->in_iowait = 0;
@@ -5240,6 +5824,7 @@ long __sched io_schedule_timeout(long timeout)
5240 5824
5241 delayacct_blkio_start(); 5825 delayacct_blkio_start();
5242 atomic_inc(&rq->nr_iowait); 5826 atomic_inc(&rq->nr_iowait);
5827 blk_flush_plug(current);
5243 current->in_iowait = 1; 5828 current->in_iowait = 1;
5244 ret = schedule_timeout(timeout); 5829 ret = schedule_timeout(timeout);
5245 current->in_iowait = 0; 5830 current->in_iowait = 0;
@@ -5330,7 +5915,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5330 5915
5331 rq = task_rq_lock(p, &flags); 5916 rq = task_rq_lock(p, &flags);
5332 time_slice = p->sched_class->get_rr_interval(rq, p); 5917 time_slice = p->sched_class->get_rr_interval(rq, p);
5333 task_rq_unlock(rq, &flags); 5918 task_rq_unlock(rq, p, &flags);
5334 5919
5335 rcu_read_unlock(); 5920 rcu_read_unlock();
5336 jiffies_to_timespec(time_slice, &t); 5921 jiffies_to_timespec(time_slice, &t);
@@ -5350,7 +5935,7 @@ void sched_show_task(struct task_struct *p)
5350 unsigned state; 5935 unsigned state;
5351 5936
5352 state = p->state ? __ffs(p->state) + 1 : 0; 5937 state = p->state ? __ffs(p->state) + 1 : 0;
5353 printk(KERN_INFO "%-13.13s %c", p->comm, 5938 printk(KERN_INFO "%-15.15s %c", p->comm,
5354 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5939 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5355#if BITS_PER_LONG == 32 5940#if BITS_PER_LONG == 32
5356 if (state == TASK_RUNNING) 5941 if (state == TASK_RUNNING)
@@ -5388,7 +5973,7 @@ void show_state_filter(unsigned long state_filter)
5388 do_each_thread(g, p) { 5973 do_each_thread(g, p) {
5389 /* 5974 /*
5390 * reset the NMI-timeout, listing all files on a slow 5975 * reset the NMI-timeout, listing all files on a slow
5391 * console might take alot of time: 5976 * console might take a lot of time:
5392 */ 5977 */
5393 touch_nmi_watchdog(); 5978 touch_nmi_watchdog();
5394 if (!state_filter || (p->state & state_filter)) 5979 if (!state_filter || (p->state & state_filter))
@@ -5432,26 +6017,35 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5432 idle->state = TASK_RUNNING; 6017 idle->state = TASK_RUNNING;
5433 idle->se.exec_start = sched_clock(); 6018 idle->se.exec_start = sched_clock();
5434 6019
5435 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 6020 do_set_cpus_allowed(idle, cpumask_of(cpu));
6021 /*
6022 * We're having a chicken and egg problem, even though we are
6023 * holding rq->lock, the cpu isn't yet set to this cpu so the
6024 * lockdep check in task_group() will fail.
6025 *
6026 * Similar case to sched_fork(). / Alternatively we could
6027 * use task_rq_lock() here and obtain the other rq->lock.
6028 *
6029 * Silence PROVE_RCU
6030 */
6031 rcu_read_lock();
5436 __set_task_cpu(idle, cpu); 6032 __set_task_cpu(idle, cpu);
6033 rcu_read_unlock();
5437 6034
5438 rq->curr = rq->idle = idle; 6035 rq->curr = rq->idle = idle;
5439#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 6036#if defined(CONFIG_SMP)
5440 idle->oncpu = 1; 6037 idle->on_cpu = 1;
5441#endif 6038#endif
5442 raw_spin_unlock_irqrestore(&rq->lock, flags); 6039 raw_spin_unlock_irqrestore(&rq->lock, flags);
5443 6040
5444 /* Set the preempt count _outside_ the spinlocks! */ 6041 /* Set the preempt count _outside_ the spinlocks! */
5445#if defined(CONFIG_PREEMPT)
5446 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5447#else
5448 task_thread_info(idle)->preempt_count = 0; 6042 task_thread_info(idle)->preempt_count = 0;
5449#endif 6043
5450 /* 6044 /*
5451 * The idle tasks have their own, simple scheduling class: 6045 * The idle tasks have their own, simple scheduling class:
5452 */ 6046 */
5453 idle->sched_class = &idle_sched_class; 6047 idle->sched_class = &idle_sched_class;
5454 ftrace_graph_init_task(idle); 6048 ftrace_graph_init_idle_task(idle, cpu);
5455} 6049}
5456 6050
5457/* 6051/*
@@ -5502,7 +6096,6 @@ static void update_sysctl(void)
5502 SET_SYSCTL(sched_min_granularity); 6096 SET_SYSCTL(sched_min_granularity);
5503 SET_SYSCTL(sched_latency); 6097 SET_SYSCTL(sched_latency);
5504 SET_SYSCTL(sched_wakeup_granularity); 6098 SET_SYSCTL(sched_wakeup_granularity);
5505 SET_SYSCTL(sched_shares_ratelimit);
5506#undef SET_SYSCTL 6099#undef SET_SYSCTL
5507} 6100}
5508 6101
@@ -5512,6 +6105,16 @@ static inline void sched_init_granularity(void)
5512} 6105}
5513 6106
5514#ifdef CONFIG_SMP 6107#ifdef CONFIG_SMP
6108void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6109{
6110 if (p->sched_class && p->sched_class->set_cpus_allowed)
6111 p->sched_class->set_cpus_allowed(p, new_mask);
6112 else {
6113 cpumask_copy(&p->cpus_allowed, new_mask);
6114 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
6115 }
6116}
6117
5515/* 6118/*
5516 * This is how migration works: 6119 * This is how migration works:
5517 * 6120 *
@@ -5542,52 +6145,38 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5542 unsigned int dest_cpu; 6145 unsigned int dest_cpu;
5543 int ret = 0; 6146 int ret = 0;
5544 6147
5545 /*
5546 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5547 * drop the rq->lock and still rely on ->cpus_allowed.
5548 */
5549again:
5550 while (task_is_waking(p))
5551 cpu_relax();
5552 rq = task_rq_lock(p, &flags); 6148 rq = task_rq_lock(p, &flags);
5553 if (task_is_waking(p)) { 6149
5554 task_rq_unlock(rq, &flags); 6150 if (cpumask_equal(&p->cpus_allowed, new_mask))
5555 goto again; 6151 goto out;
5556 }
5557 6152
5558 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 6153 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5559 ret = -EINVAL; 6154 ret = -EINVAL;
5560 goto out; 6155 goto out;
5561 } 6156 }
5562 6157
5563 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 6158 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5564 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5565 ret = -EINVAL; 6159 ret = -EINVAL;
5566 goto out; 6160 goto out;
5567 } 6161 }
5568 6162
5569 if (p->sched_class->set_cpus_allowed) 6163 do_set_cpus_allowed(p, new_mask);
5570 p->sched_class->set_cpus_allowed(p, new_mask);
5571 else {
5572 cpumask_copy(&p->cpus_allowed, new_mask);
5573 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5574 }
5575 6164
5576 /* Can the task run on the task's current CPU? If so, we're done */ 6165 /* Can the task run on the task's current CPU? If so, we're done */
5577 if (cpumask_test_cpu(task_cpu(p), new_mask)) 6166 if (cpumask_test_cpu(task_cpu(p), new_mask))
5578 goto out; 6167 goto out;
5579 6168
5580 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 6169 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5581 if (migrate_task(p, dest_cpu)) { 6170 if (p->on_rq) {
5582 struct migration_arg arg = { p, dest_cpu }; 6171 struct migration_arg arg = { p, dest_cpu };
5583 /* Need help from migration thread: drop lock and wait. */ 6172 /* Need help from migration thread: drop lock and wait. */
5584 task_rq_unlock(rq, &flags); 6173 task_rq_unlock(rq, p, &flags);
5585 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 6174 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5586 tlb_migrate_finish(p->mm); 6175 tlb_migrate_finish(p->mm);
5587 return 0; 6176 return 0;
5588 } 6177 }
5589out: 6178out:
5590 task_rq_unlock(rq, &flags); 6179 task_rq_unlock(rq, p, &flags);
5591 6180
5592 return ret; 6181 return ret;
5593} 6182}
@@ -5615,6 +6204,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5615 rq_src = cpu_rq(src_cpu); 6204 rq_src = cpu_rq(src_cpu);
5616 rq_dest = cpu_rq(dest_cpu); 6205 rq_dest = cpu_rq(dest_cpu);
5617 6206
6207 raw_spin_lock(&p->pi_lock);
5618 double_rq_lock(rq_src, rq_dest); 6208 double_rq_lock(rq_src, rq_dest);
5619 /* Already moved. */ 6209 /* Already moved. */
5620 if (task_cpu(p) != src_cpu) 6210 if (task_cpu(p) != src_cpu)
@@ -5627,7 +6217,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5627 * If we're not on a rq, the next wake-up will ensure we're 6217 * If we're not on a rq, the next wake-up will ensure we're
5628 * placed properly. 6218 * placed properly.
5629 */ 6219 */
5630 if (p->se.on_rq) { 6220 if (p->on_rq) {
5631 deactivate_task(rq_src, p, 0); 6221 deactivate_task(rq_src, p, 0);
5632 set_task_cpu(p, dest_cpu); 6222 set_task_cpu(p, dest_cpu);
5633 activate_task(rq_dest, p, 0); 6223 activate_task(rq_dest, p, 0);
@@ -5637,6 +6227,7 @@ done:
5637 ret = 1; 6227 ret = 1;
5638fail: 6228fail:
5639 double_rq_unlock(rq_src, rq_dest); 6229 double_rq_unlock(rq_src, rq_dest);
6230 raw_spin_unlock(&p->pi_lock);
5640 return ret; 6231 return ret;
5641} 6232}
5642 6233
@@ -5660,29 +6251,20 @@ static int migration_cpu_stop(void *data)
5660} 6251}
5661 6252
5662#ifdef CONFIG_HOTPLUG_CPU 6253#ifdef CONFIG_HOTPLUG_CPU
6254
5663/* 6255/*
5664 * Figure out where task on dead CPU should go, use force if necessary. 6256 * Ensures that the idle task is using init_mm right before its cpu goes
6257 * offline.
5665 */ 6258 */
5666void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 6259void idle_task_exit(void)
5667{ 6260{
5668 struct rq *rq = cpu_rq(dead_cpu); 6261 struct mm_struct *mm = current->active_mm;
5669 int needs_cpu, uninitialized_var(dest_cpu);
5670 unsigned long flags;
5671 6262
5672 local_irq_save(flags); 6263 BUG_ON(cpu_online(smp_processor_id()));
5673 6264
5674 raw_spin_lock(&rq->lock); 6265 if (mm != &init_mm)
5675 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 6266 switch_mm(mm, &init_mm, current);
5676 if (needs_cpu) 6267 mmdrop(mm);
5677 dest_cpu = select_fallback_rq(dead_cpu, p);
5678 raw_spin_unlock(&rq->lock);
5679 /*
5680 * It can only fail if we race with set_cpus_allowed(),
5681 * in the racer should migrate the task anyway.
5682 */
5683 if (needs_cpu)
5684 __migrate_task(p, dead_cpu, dest_cpu);
5685 local_irq_restore(flags);
5686} 6268}
5687 6269
5688/* 6270/*
@@ -5695,128 +6277,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5695static void migrate_nr_uninterruptible(struct rq *rq_src) 6277static void migrate_nr_uninterruptible(struct rq *rq_src)
5696{ 6278{
5697 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 6279 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5698 unsigned long flags;
5699 6280
5700 local_irq_save(flags);
5701 double_rq_lock(rq_src, rq_dest);
5702 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 6281 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5703 rq_src->nr_uninterruptible = 0; 6282 rq_src->nr_uninterruptible = 0;
5704 double_rq_unlock(rq_src, rq_dest);
5705 local_irq_restore(flags);
5706}
5707
5708/* Run through task list and migrate tasks from the dead cpu. */
5709static void migrate_live_tasks(int src_cpu)
5710{
5711 struct task_struct *p, *t;
5712
5713 read_lock(&tasklist_lock);
5714
5715 do_each_thread(t, p) {
5716 if (p == current)
5717 continue;
5718
5719 if (task_cpu(p) == src_cpu)
5720 move_task_off_dead_cpu(src_cpu, p);
5721 } while_each_thread(t, p);
5722
5723 read_unlock(&tasklist_lock);
5724} 6283}
5725 6284
5726/* 6285/*
5727 * Schedules idle task to be the next runnable task on current CPU. 6286 * remove the tasks which were accounted by rq from calc_load_tasks.
5728 * It does so by boosting its priority to highest possible.
5729 * Used by CPU offline code.
5730 */ 6287 */
5731void sched_idle_next(void) 6288static void calc_global_load_remove(struct rq *rq)
5732{ 6289{
5733 int this_cpu = smp_processor_id(); 6290 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5734 struct rq *rq = cpu_rq(this_cpu); 6291 rq->calc_load_active = 0;
5735 struct task_struct *p = rq->idle;
5736 unsigned long flags;
5737
5738 /* cpu has to be offline */
5739 BUG_ON(cpu_online(this_cpu));
5740
5741 /*
5742 * Strictly not necessary since rest of the CPUs are stopped by now
5743 * and interrupts disabled on the current cpu.
5744 */
5745 raw_spin_lock_irqsave(&rq->lock, flags);
5746
5747 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5748
5749 activate_task(rq, p, 0);
5750
5751 raw_spin_unlock_irqrestore(&rq->lock, flags);
5752} 6292}
5753 6293
5754/* 6294/*
5755 * Ensures that the idle task is using init_mm right before its cpu goes 6295 * Migrate all tasks from the rq, sleeping tasks will be migrated by
5756 * offline. 6296 * try_to_wake_up()->select_task_rq().
6297 *
 6298 * Called with rq->lock held even though we're in stop_machine() and
6299 * there's no concurrency possible, we hold the required locks anyway
6300 * because of lock validation efforts.
5757 */ 6301 */
5758void idle_task_exit(void) 6302static void migrate_tasks(unsigned int dead_cpu)
5759{
5760 struct mm_struct *mm = current->active_mm;
5761
5762 BUG_ON(cpu_online(smp_processor_id()));
5763
5764 if (mm != &init_mm)
5765 switch_mm(mm, &init_mm, current);
5766 mmdrop(mm);
5767}
5768
5769/* called under rq->lock with disabled interrupts */
5770static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5771{ 6303{
5772 struct rq *rq = cpu_rq(dead_cpu); 6304 struct rq *rq = cpu_rq(dead_cpu);
5773 6305 struct task_struct *next, *stop = rq->stop;
5774 /* Must be exiting, otherwise would be on tasklist. */ 6306 int dest_cpu;
5775 BUG_ON(!p->exit_state);
5776
5777 /* Cannot have done final schedule yet: would have vanished. */
5778 BUG_ON(p->state == TASK_DEAD);
5779
5780 get_task_struct(p);
5781 6307
5782 /* 6308 /*
5783 * Drop lock around migration; if someone else moves it, 6309 * Fudge the rq selection such that the below task selection loop
5784 * that's OK. No task can be added to this CPU, so iteration is 6310 * doesn't get stuck on the currently eligible stop task.
5785 * fine. 6311 *
6312 * We're currently inside stop_machine() and the rq is either stuck
6313 * in the stop_machine_cpu_stop() loop, or we're executing this code,
6314 * either way we should never end up calling schedule() until we're
6315 * done here.
5786 */ 6316 */
5787 raw_spin_unlock_irq(&rq->lock); 6317 rq->stop = NULL;
5788 move_task_off_dead_cpu(dead_cpu, p);
5789 raw_spin_lock_irq(&rq->lock);
5790
5791 put_task_struct(p);
5792}
5793
5794/* release_task() removes task from tasklist, so we won't find dead tasks. */
5795static void migrate_dead_tasks(unsigned int dead_cpu)
5796{
5797 struct rq *rq = cpu_rq(dead_cpu);
5798 struct task_struct *next;
5799 6318
5800 for ( ; ; ) { 6319 for ( ; ; ) {
5801 if (!rq->nr_running) 6320 /*
6321 * There's this thread running, bail when that's the only
6322 * remaining thread.
6323 */
6324 if (rq->nr_running == 1)
5802 break; 6325 break;
6326
5803 next = pick_next_task(rq); 6327 next = pick_next_task(rq);
5804 if (!next) 6328 BUG_ON(!next);
5805 break;
5806 next->sched_class->put_prev_task(rq, next); 6329 next->sched_class->put_prev_task(rq, next);
5807 migrate_dead(dead_cpu, next);
5808 6330
6331 /* Find suitable destination for @next, with force if needed. */
6332 dest_cpu = select_fallback_rq(dead_cpu, next);
6333 raw_spin_unlock(&rq->lock);
6334
6335 __migrate_task(next, dead_cpu, dest_cpu);
6336
6337 raw_spin_lock(&rq->lock);
5809 } 6338 }
5810}
5811 6339
5812/* 6340 rq->stop = stop;
5813 * remove the tasks which were accounted by rq from calc_load_tasks.
5814 */
5815static void calc_global_load_remove(struct rq *rq)
5816{
5817 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5818 rq->calc_load_active = 0;
5819} 6341}
6342
5820#endif /* CONFIG_HOTPLUG_CPU */ 6343#endif /* CONFIG_HOTPLUG_CPU */
5821 6344
5822#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 6345#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6026,15 +6549,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6026 unsigned long flags; 6549 unsigned long flags;
6027 struct rq *rq = cpu_rq(cpu); 6550 struct rq *rq = cpu_rq(cpu);
6028 6551
6029 switch (action) { 6552 switch (action & ~CPU_TASKS_FROZEN) {
6030 6553
6031 case CPU_UP_PREPARE: 6554 case CPU_UP_PREPARE:
6032 case CPU_UP_PREPARE_FROZEN:
6033 rq->calc_load_update = calc_load_update; 6555 rq->calc_load_update = calc_load_update;
6034 break; 6556 break;
6035 6557
6036 case CPU_ONLINE: 6558 case CPU_ONLINE:
6037 case CPU_ONLINE_FROZEN:
6038 /* Update our root-domain */ 6559 /* Update our root-domain */
6039 raw_spin_lock_irqsave(&rq->lock, flags); 6560 raw_spin_lock_irqsave(&rq->lock, flags);
6040 if (rq->rd) { 6561 if (rq->rd) {
@@ -6046,33 +6567,26 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6046 break; 6567 break;
6047 6568
6048#ifdef CONFIG_HOTPLUG_CPU 6569#ifdef CONFIG_HOTPLUG_CPU
6049 case CPU_DEAD:
6050 case CPU_DEAD_FROZEN:
6051 migrate_live_tasks(cpu);
6052 /* Idle task back to normal (off runqueue, low prio) */
6053 raw_spin_lock_irq(&rq->lock);
6054 deactivate_task(rq, rq->idle, 0);
6055 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6056 rq->idle->sched_class = &idle_sched_class;
6057 migrate_dead_tasks(cpu);
6058 raw_spin_unlock_irq(&rq->lock);
6059 migrate_nr_uninterruptible(rq);
6060 BUG_ON(rq->nr_running != 0);
6061 calc_global_load_remove(rq);
6062 break;
6063
6064 case CPU_DYING: 6570 case CPU_DYING:
6065 case CPU_DYING_FROZEN: 6571 sched_ttwu_pending();
6066 /* Update our root-domain */ 6572 /* Update our root-domain */
6067 raw_spin_lock_irqsave(&rq->lock, flags); 6573 raw_spin_lock_irqsave(&rq->lock, flags);
6068 if (rq->rd) { 6574 if (rq->rd) {
6069 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6575 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6070 set_rq_offline(rq); 6576 set_rq_offline(rq);
6071 } 6577 }
6578 migrate_tasks(cpu);
6579 BUG_ON(rq->nr_running != 1); /* the migration thread */
6072 raw_spin_unlock_irqrestore(&rq->lock, flags); 6580 raw_spin_unlock_irqrestore(&rq->lock, flags);
6581
6582 migrate_nr_uninterruptible(rq);
6583 calc_global_load_remove(rq);
6073 break; 6584 break;
6074#endif 6585#endif
6075 } 6586 }
6587
6588 update_max_interval();
6589
6076 return NOTIFY_OK; 6590 return NOTIFY_OK;
6077} 6591}
6078 6592
@@ -6133,6 +6647,8 @@ early_initcall(migration_init);
6133 6647
6134#ifdef CONFIG_SMP 6648#ifdef CONFIG_SMP
6135 6649
6650static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6651
6136#ifdef CONFIG_SCHED_DEBUG 6652#ifdef CONFIG_SCHED_DEBUG
6137 6653
6138static __read_mostly int sched_domain_debug_enabled; 6654static __read_mostly int sched_domain_debug_enabled;
@@ -6183,7 +6699,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6183 break; 6699 break;
6184 } 6700 }
6185 6701
6186 if (!group->cpu_power) { 6702 if (!group->sgp->power) {
6187 printk(KERN_CONT "\n"); 6703 printk(KERN_CONT "\n");
6188 printk(KERN_ERR "ERROR: domain->cpu_power not " 6704 printk(KERN_ERR "ERROR: domain->cpu_power not "
6189 "set\n"); 6705 "set\n");
@@ -6207,9 +6723,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6207 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6723 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6208 6724
6209 printk(KERN_CONT " %s", str); 6725 printk(KERN_CONT " %s", str);
6210 if (group->cpu_power != SCHED_LOAD_SCALE) { 6726 if (group->sgp->power != SCHED_POWER_SCALE) {
6211 printk(KERN_CONT " (cpu_power = %d)", 6727 printk(KERN_CONT " (cpu_power = %d)",
6212 group->cpu_power); 6728 group->sgp->power);
6213 } 6729 }
6214 6730
6215 group = group->next; 6731 group = group->next;
@@ -6228,7 +6744,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6228 6744
6229static void sched_domain_debug(struct sched_domain *sd, int cpu) 6745static void sched_domain_debug(struct sched_domain *sd, int cpu)
6230{ 6746{
6231 cpumask_var_t groupmask;
6232 int level = 0; 6747 int level = 0;
6233 6748
6234 if (!sched_domain_debug_enabled) 6749 if (!sched_domain_debug_enabled)
@@ -6241,20 +6756,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6241 6756
6242 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6757 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6243 6758
6244 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6245 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6246 return;
6247 }
6248
6249 for (;;) { 6759 for (;;) {
6250 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6760 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6251 break; 6761 break;
6252 level++; 6762 level++;
6253 sd = sd->parent; 6763 sd = sd->parent;
6254 if (!sd) 6764 if (!sd)
6255 break; 6765 break;
6256 } 6766 }
6257 free_cpumask_var(groupmask);
6258} 6767}
6259#else /* !CONFIG_SCHED_DEBUG */ 6768#else /* !CONFIG_SCHED_DEBUG */
6260# define sched_domain_debug(sd, cpu) do { } while (0) 6769# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6311,12 +6820,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6311 return 1; 6820 return 1;
6312} 6821}
6313 6822
6314static void free_rootdomain(struct root_domain *rd) 6823static void free_rootdomain(struct rcu_head *rcu)
6315{ 6824{
6316 synchronize_sched(); 6825 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6317 6826
6318 cpupri_cleanup(&rd->cpupri); 6827 cpupri_cleanup(&rd->cpupri);
6319
6320 free_cpumask_var(rd->rto_mask); 6828 free_cpumask_var(rd->rto_mask);
6321 free_cpumask_var(rd->online); 6829 free_cpumask_var(rd->online);
6322 free_cpumask_var(rd->span); 6830 free_cpumask_var(rd->span);
@@ -6357,7 +6865,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6357 raw_spin_unlock_irqrestore(&rq->lock, flags); 6865 raw_spin_unlock_irqrestore(&rq->lock, flags);
6358 6866
6359 if (old_rd) 6867 if (old_rd)
6360 free_rootdomain(old_rd); 6868 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6361} 6869}
6362 6870
6363static int init_rootdomain(struct root_domain *rd) 6871static int init_rootdomain(struct root_domain *rd)
@@ -6408,6 +6916,53 @@ static struct root_domain *alloc_rootdomain(void)
6408 return rd; 6916 return rd;
6409} 6917}
6410 6918
6919static void free_sched_groups(struct sched_group *sg, int free_sgp)
6920{
6921 struct sched_group *tmp, *first;
6922
6923 if (!sg)
6924 return;
6925
6926 first = sg;
6927 do {
6928 tmp = sg->next;
6929
6930 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
6931 kfree(sg->sgp);
6932
6933 kfree(sg);
6934 sg = tmp;
6935 } while (sg != first);
6936}
6937
6938static void free_sched_domain(struct rcu_head *rcu)
6939{
6940 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6941
6942 /*
 6943 * If it's an overlapping domain it has private groups, iterate and
6944 * nuke them all.
6945 */
6946 if (sd->flags & SD_OVERLAP) {
6947 free_sched_groups(sd->groups, 1);
6948 } else if (atomic_dec_and_test(&sd->groups->ref)) {
6949 kfree(sd->groups->sgp);
6950 kfree(sd->groups);
6951 }
6952 kfree(sd);
6953}
6954
6955static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6956{
6957 call_rcu(&sd->rcu, free_sched_domain);
6958}
6959
6960static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6961{
6962 for (; sd; sd = sd->parent)
6963 destroy_sched_domain(sd, cpu);
6964}
6965
6411/* 6966/*
6412 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6967 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6413 * hold the hotplug lock. 6968 * hold the hotplug lock.
@@ -6418,9 +6973,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6418 struct rq *rq = cpu_rq(cpu); 6973 struct rq *rq = cpu_rq(cpu);
6419 struct sched_domain *tmp; 6974 struct sched_domain *tmp;
6420 6975
6421 for (tmp = sd; tmp; tmp = tmp->parent)
6422 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6423
6424 /* Remove the sched domains which do not contribute to scheduling. */ 6976 /* Remove the sched domains which do not contribute to scheduling. */
6425 for (tmp = sd; tmp; ) { 6977 for (tmp = sd; tmp; ) {
6426 struct sched_domain *parent = tmp->parent; 6978 struct sched_domain *parent = tmp->parent;
@@ -6431,12 +6983,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6431 tmp->parent = parent->parent; 6983 tmp->parent = parent->parent;
6432 if (parent->parent) 6984 if (parent->parent)
6433 parent->parent->child = tmp; 6985 parent->parent->child = tmp;
6986 destroy_sched_domain(parent, cpu);
6434 } else 6987 } else
6435 tmp = tmp->parent; 6988 tmp = tmp->parent;
6436 } 6989 }
6437 6990
6438 if (sd && sd_degenerate(sd)) { 6991 if (sd && sd_degenerate(sd)) {
6992 tmp = sd;
6439 sd = sd->parent; 6993 sd = sd->parent;
6994 destroy_sched_domain(tmp, cpu);
6440 if (sd) 6995 if (sd)
6441 sd->child = NULL; 6996 sd->child = NULL;
6442 } 6997 }
@@ -6444,7 +6999,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6444 sched_domain_debug(sd, cpu); 6999 sched_domain_debug(sd, cpu);
6445 7000
6446 rq_attach_root(rq, rd); 7001 rq_attach_root(rq, rd);
7002 tmp = rq->sd;
6447 rcu_assign_pointer(rq->sd, sd); 7003 rcu_assign_pointer(rq->sd, sd);
7004 destroy_sched_domains(tmp, cpu);
6448} 7005}
6449 7006
6450/* cpus with isolated domains */ 7007/* cpus with isolated domains */
@@ -6460,56 +7017,6 @@ static int __init isolated_cpu_setup(char *str)
6460 7017
6461__setup("isolcpus=", isolated_cpu_setup); 7018__setup("isolcpus=", isolated_cpu_setup);
6462 7019
6463/*
6464 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6465 * to a function which identifies what group(along with sched group) a CPU
6466 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6467 * (due to the fact that we keep track of groups covered with a struct cpumask).
6468 *
6469 * init_sched_build_groups will build a circular linked list of the groups
6470 * covered by the given span, and will set each group's ->cpumask correctly,
6471 * and ->cpu_power to 0.
6472 */
6473static void
6474init_sched_build_groups(const struct cpumask *span,
6475 const struct cpumask *cpu_map,
6476 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6477 struct sched_group **sg,
6478 struct cpumask *tmpmask),
6479 struct cpumask *covered, struct cpumask *tmpmask)
6480{
6481 struct sched_group *first = NULL, *last = NULL;
6482 int i;
6483
6484 cpumask_clear(covered);
6485
6486 for_each_cpu(i, span) {
6487 struct sched_group *sg;
6488 int group = group_fn(i, cpu_map, &sg, tmpmask);
6489 int j;
6490
6491 if (cpumask_test_cpu(i, covered))
6492 continue;
6493
6494 cpumask_clear(sched_group_cpus(sg));
6495 sg->cpu_power = 0;
6496
6497 for_each_cpu(j, span) {
6498 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6499 continue;
6500
6501 cpumask_set_cpu(j, covered);
6502 cpumask_set_cpu(j, sched_group_cpus(sg));
6503 }
6504 if (!first)
6505 first = sg;
6506 if (last)
6507 last->next = sg;
6508 last = sg;
6509 }
6510 last->next = first;
6511}
6512
6513#define SD_NODES_PER_DOMAIN 16 7020#define SD_NODES_PER_DOMAIN 16
6514 7021
6515#ifdef CONFIG_NUMA 7022#ifdef CONFIG_NUMA
@@ -6526,7 +7033,7 @@ init_sched_build_groups(const struct cpumask *span,
6526 */ 7033 */
6527static int find_next_best_node(int node, nodemask_t *used_nodes) 7034static int find_next_best_node(int node, nodemask_t *used_nodes)
6528{ 7035{
6529 int i, n, val, min_val, best_node = 0; 7036 int i, n, val, min_val, best_node = -1;
6530 7037
6531 min_val = INT_MAX; 7038 min_val = INT_MAX;
6532 7039
@@ -6550,7 +7057,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6550 } 7057 }
6551 } 7058 }
6552 7059
6553 node_set(best_node, *used_nodes); 7060 if (best_node != -1)
7061 node_set(best_node, *used_nodes);
6554 return best_node; 7062 return best_node;
6555} 7063}
6556 7064
@@ -6576,293 +7084,197 @@ static void sched_domain_node_span(int node, struct cpumask *span)
6576 7084
6577 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 7085 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6578 int next_node = find_next_best_node(node, &used_nodes); 7086 int next_node = find_next_best_node(node, &used_nodes);
6579 7087 if (next_node < 0)
7088 break;
6580 cpumask_or(span, span, cpumask_of_node(next_node)); 7089 cpumask_or(span, span, cpumask_of_node(next_node));
6581 } 7090 }
6582} 7091}
7092
7093static const struct cpumask *cpu_node_mask(int cpu)
7094{
7095 lockdep_assert_held(&sched_domains_mutex);
7096
7097 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
7098
7099 return sched_domains_tmpmask;
7100}
7101
7102static const struct cpumask *cpu_allnodes_mask(int cpu)
7103{
7104 return cpu_possible_mask;
7105}
6583#endif /* CONFIG_NUMA */ 7106#endif /* CONFIG_NUMA */
6584 7107
6585int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 7108static const struct cpumask *cpu_cpu_mask(int cpu)
7109{
7110 return cpumask_of_node(cpu_to_node(cpu));
7111}
6586 7112
6587/* 7113int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6588 * The cpus mask in sched_group and sched_domain hangs off the end.
6589 *
6590 * ( See the the comments in include/linux/sched.h:struct sched_group
6591 * and struct sched_domain. )
6592 */
6593struct static_sched_group {
6594 struct sched_group sg;
6595 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6596};
6597 7114
6598struct static_sched_domain { 7115struct sd_data {
6599 struct sched_domain sd; 7116 struct sched_domain **__percpu sd;
6600 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 7117 struct sched_group **__percpu sg;
7118 struct sched_group_power **__percpu sgp;
6601}; 7119};
6602 7120
6603struct s_data { 7121struct s_data {
6604#ifdef CONFIG_NUMA 7122 struct sched_domain ** __percpu sd;
6605 int sd_allnodes;
6606 cpumask_var_t domainspan;
6607 cpumask_var_t covered;
6608 cpumask_var_t notcovered;
6609#endif
6610 cpumask_var_t nodemask;
6611 cpumask_var_t this_sibling_map;
6612 cpumask_var_t this_core_map;
6613 cpumask_var_t send_covered;
6614 cpumask_var_t tmpmask;
6615 struct sched_group **sched_group_nodes;
6616 struct root_domain *rd; 7123 struct root_domain *rd;
6617}; 7124};
6618 7125
6619enum s_alloc { 7126enum s_alloc {
6620 sa_sched_groups = 0,
6621 sa_rootdomain, 7127 sa_rootdomain,
6622 sa_tmpmask, 7128 sa_sd,
6623 sa_send_covered, 7129 sa_sd_storage,
6624 sa_this_core_map,
6625 sa_this_sibling_map,
6626 sa_nodemask,
6627 sa_sched_group_nodes,
6628#ifdef CONFIG_NUMA
6629 sa_notcovered,
6630 sa_covered,
6631 sa_domainspan,
6632#endif
6633 sa_none, 7130 sa_none,
6634}; 7131};
6635 7132
6636/* 7133struct sched_domain_topology_level;
6637 * SMT sched-domains:
6638 */
6639#ifdef CONFIG_SCHED_SMT
6640static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6641static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6642 7134
6643static int 7135typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6644cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 7136typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6645 struct sched_group **sg, struct cpumask *unused)
6646{
6647 if (sg)
6648 *sg = &per_cpu(sched_groups, cpu).sg;
6649 return cpu;
6650}
6651#endif /* CONFIG_SCHED_SMT */
6652 7137
6653/* 7138#define SDTL_OVERLAP 0x01
6654 * multi-core sched-domains:
6655 */
6656#ifdef CONFIG_SCHED_MC
6657static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6658static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6659#endif /* CONFIG_SCHED_MC */
6660 7139
6661#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7140struct sched_domain_topology_level {
6662static int 7141 sched_domain_init_f init;
6663cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 7142 sched_domain_mask_f mask;
6664 struct sched_group **sg, struct cpumask *mask) 7143 int flags;
6665{ 7144 struct sd_data data;
6666 int group; 7145};
6667 7146
6668 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6669 group = cpumask_first(mask);
6670 if (sg)
6671 *sg = &per_cpu(sched_group_core, group).sg;
6672 return group;
6673}
6674#elif defined(CONFIG_SCHED_MC)
6675static int 7147static int
6676cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 7148build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6677 struct sched_group **sg, struct cpumask *unused)
6678{ 7149{
6679 if (sg) 7150 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
6680 *sg = &per_cpu(sched_group_core, cpu).sg; 7151 const struct cpumask *span = sched_domain_span(sd);
6681 return cpu; 7152 struct cpumask *covered = sched_domains_tmpmask;
6682} 7153 struct sd_data *sdd = sd->private;
6683#endif 7154 struct sched_domain *child;
7155 int i;
6684 7156
6685static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 7157 cpumask_clear(covered);
6686static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
6687 7158
6688static int 7159 for_each_cpu(i, span) {
6689cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, 7160 struct cpumask *sg_span;
6690 struct sched_group **sg, struct cpumask *mask)
6691{
6692 int group;
6693#ifdef CONFIG_SCHED_MC
6694 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6695 group = cpumask_first(mask);
6696#elif defined(CONFIG_SCHED_SMT)
6697 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6698 group = cpumask_first(mask);
6699#else
6700 group = cpu;
6701#endif
6702 if (sg)
6703 *sg = &per_cpu(sched_group_phys, group).sg;
6704 return group;
6705}
6706 7161
6707#ifdef CONFIG_NUMA 7162 if (cpumask_test_cpu(i, covered))
6708/* 7163 continue;
6709 * The init_sched_build_groups can't handle what we want to do with node
6710 * groups, so roll our own. Now each node has its own list of groups which
6711 * gets dynamically allocated.
6712 */
6713static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
6714static struct sched_group ***sched_group_nodes_bycpu;
6715 7164
6716static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); 7165 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6717static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); 7166 GFP_KERNEL, cpu_to_node(i));
6718 7167
6719static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, 7168 if (!sg)
6720 struct sched_group **sg, 7169 goto fail;
6721 struct cpumask *nodemask)
6722{
6723 int group;
6724 7170
6725 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); 7171 sg_span = sched_group_cpus(sg);
6726 group = cpumask_first(nodemask);
6727 7172
6728 if (sg) 7173 child = *per_cpu_ptr(sdd->sd, i);
6729 *sg = &per_cpu(sched_group_allnodes, group).sg; 7174 if (child->child) {
6730 return group; 7175 child = child->child;
6731} 7176 cpumask_copy(sg_span, sched_domain_span(child));
7177 } else
7178 cpumask_set_cpu(i, sg_span);
6732 7179
6733static void init_numa_sched_groups_power(struct sched_group *group_head) 7180 cpumask_or(covered, covered, sg_span);
6734{
6735 struct sched_group *sg = group_head;
6736 int j;
6737 7181
6738 if (!sg) 7182 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
6739 return; 7183 atomic_inc(&sg->sgp->ref);
6740 do {
6741 for_each_cpu(j, sched_group_cpus(sg)) {
6742 struct sched_domain *sd;
6743 7184
6744 sd = &per_cpu(phys_domains, j).sd; 7185 if (cpumask_test_cpu(cpu, sg_span))
6745 if (j != group_first_cpu(sd->groups)) { 7186 groups = sg;
6746 /*
6747 * Only add "power" once for each
6748 * physical package.
6749 */
6750 continue;
6751 }
6752 7187
6753 sg->cpu_power += sd->groups->cpu_power; 7188 if (!first)
6754 } 7189 first = sg;
6755 sg = sg->next; 7190 if (last)
6756 } while (sg != group_head); 7191 last->next = sg;
7192 last = sg;
7193 last->next = first;
7194 }
7195 sd->groups = groups;
7196
7197 return 0;
7198
7199fail:
7200 free_sched_groups(first, 0);
7201
7202 return -ENOMEM;
6757} 7203}
6758 7204
6759static int build_numa_sched_groups(struct s_data *d, 7205static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6760 const struct cpumask *cpu_map, int num)
6761{ 7206{
6762 struct sched_domain *sd; 7207 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6763 struct sched_group *sg, *prev; 7208 struct sched_domain *child = sd->child;
6764 int n, j;
6765 7209
6766 cpumask_clear(d->covered); 7210 if (child)
6767 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); 7211 cpu = cpumask_first(sched_domain_span(child));
6768 if (cpumask_empty(d->nodemask)) { 7212
6769 d->sched_group_nodes[num] = NULL; 7213 if (sg) {
6770 goto out; 7214 *sg = *per_cpu_ptr(sdd->sg, cpu);
7215 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
7216 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
6771 } 7217 }
6772 7218
6773 sched_domain_node_span(num, d->domainspan); 7219 return cpu;
6774 cpumask_and(d->domainspan, d->domainspan, cpu_map); 7220}
6775 7221
6776 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 7222/*
6777 GFP_KERNEL, num); 7223 * build_sched_groups will build a circular linked list of the groups
6778 if (!sg) { 7224 * covered by the given span, and will set each group's ->cpumask correctly,
6779 printk(KERN_WARNING "Can not alloc domain group for node %d\n", 7225 * and ->cpu_power to 0.
6780 num); 7226 *
6781 return -ENOMEM; 7227 * Assumes the sched_domain tree is fully constructed
6782 } 7228 */
6783 d->sched_group_nodes[num] = sg; 7229static int
7230build_sched_groups(struct sched_domain *sd, int cpu)
7231{
7232 struct sched_group *first = NULL, *last = NULL;
7233 struct sd_data *sdd = sd->private;
7234 const struct cpumask *span = sched_domain_span(sd);
7235 struct cpumask *covered;
7236 int i;
6784 7237
6785 for_each_cpu(j, d->nodemask) { 7238 get_group(cpu, sdd, &sd->groups);
6786 sd = &per_cpu(node_domains, j).sd; 7239 atomic_inc(&sd->groups->ref);
6787 sd->groups = sg;
6788 }
6789 7240
6790 sg->cpu_power = 0; 7241 if (cpu != cpumask_first(sched_domain_span(sd)))
6791 cpumask_copy(sched_group_cpus(sg), d->nodemask); 7242 return 0;
6792 sg->next = sg;
6793 cpumask_or(d->covered, d->covered, d->nodemask);
6794 7243
6795 prev = sg; 7244 lockdep_assert_held(&sched_domains_mutex);
6796 for (j = 0; j < nr_node_ids; j++) { 7245 covered = sched_domains_tmpmask;
6797 n = (num + j) % nr_node_ids;
6798 cpumask_complement(d->notcovered, d->covered);
6799 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
6800 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
6801 if (cpumask_empty(d->tmpmask))
6802 break;
6803 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
6804 if (cpumask_empty(d->tmpmask))
6805 continue;
6806 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6807 GFP_KERNEL, num);
6808 if (!sg) {
6809 printk(KERN_WARNING
6810 "Can not alloc domain group for node %d\n", j);
6811 return -ENOMEM;
6812 }
6813 sg->cpu_power = 0;
6814 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
6815 sg->next = prev->next;
6816 cpumask_or(d->covered, d->covered, d->tmpmask);
6817 prev->next = sg;
6818 prev = sg;
6819 }
6820out:
6821 return 0;
6822}
6823#endif /* CONFIG_NUMA */
6824 7246
6825#ifdef CONFIG_NUMA 7247 cpumask_clear(covered);
6826/* Free memory allocated for various sched_group structures */
6827static void free_sched_groups(const struct cpumask *cpu_map,
6828 struct cpumask *nodemask)
6829{
6830 int cpu, i;
6831 7248
6832 for_each_cpu(cpu, cpu_map) { 7249 for_each_cpu(i, span) {
6833 struct sched_group **sched_group_nodes 7250 struct sched_group *sg;
6834 = sched_group_nodes_bycpu[cpu]; 7251 int group = get_group(i, sdd, &sg);
7252 int j;
6835 7253
6836 if (!sched_group_nodes) 7254 if (cpumask_test_cpu(i, covered))
6837 continue; 7255 continue;
6838 7256
6839 for (i = 0; i < nr_node_ids; i++) { 7257 cpumask_clear(sched_group_cpus(sg));
6840 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7258 sg->sgp->power = 0;
6841 7259
6842 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 7260 for_each_cpu(j, span) {
6843 if (cpumask_empty(nodemask)) 7261 if (get_group(j, sdd, NULL) != group)
6844 continue; 7262 continue;
6845 7263
6846 if (sg == NULL) 7264 cpumask_set_cpu(j, covered);
6847 continue; 7265 cpumask_set_cpu(j, sched_group_cpus(sg));
6848 sg = sg->next;
6849next_sg:
6850 oldsg = sg;
6851 sg = sg->next;
6852 kfree(oldsg);
6853 if (oldsg != sched_group_nodes[i])
6854 goto next_sg;
6855 } 7266 }
6856 kfree(sched_group_nodes); 7267
6857 sched_group_nodes_bycpu[cpu] = NULL; 7268 if (!first)
7269 first = sg;
7270 if (last)
7271 last->next = sg;
7272 last = sg;
6858 } 7273 }
7274 last->next = first;
7275
7276 return 0;
6859} 7277}
6860#else /* !CONFIG_NUMA */
6861static void free_sched_groups(const struct cpumask *cpu_map,
6862 struct cpumask *nodemask)
6863{
6864}
6865#endif /* CONFIG_NUMA */
6866 7278
6867/* 7279/*
6868 * Initialize sched groups cpu_power. 7280 * Initialize sched groups cpu_power.
@@ -6876,46 +7288,19 @@ static void free_sched_groups(const struct cpumask *cpu_map,
6876 */ 7288 */
6877static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7289static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6878{ 7290{
6879 struct sched_domain *child; 7291 struct sched_group *sg = sd->groups;
6880 struct sched_group *group;
6881 long power;
6882 int weight;
6883
6884 WARN_ON(!sd || !sd->groups);
6885
6886 if (cpu != group_first_cpu(sd->groups))
6887 return;
6888 7292
6889 child = sd->child; 7293 WARN_ON(!sd || !sg);
6890 7294
6891 sd->groups->cpu_power = 0; 7295 do {
7296 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
7297 sg = sg->next;
7298 } while (sg != sd->groups);
6892 7299
6893 if (!child) { 7300 if (cpu != group_first_cpu(sg))
6894 power = SCHED_LOAD_SCALE;
6895 weight = cpumask_weight(sched_domain_span(sd));
6896 /*
6897 * SMT siblings share the power of a single core.
6898 * Usually multiple threads get a better yield out of
6899 * that one core than a single thread would have,
6900 * reflect that in sd->smt_gain.
6901 */
6902 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
6903 power *= sd->smt_gain;
6904 power /= weight;
6905 power >>= SCHED_LOAD_SHIFT;
6906 }
6907 sd->groups->cpu_power += power;
6908 return; 7301 return;
6909 }
6910 7302
6911 /* 7303 update_group_power(sd, cpu);
6912 * Add cpu_power of each child group to this groups cpu_power.
6913 */
6914 group = child->groups;
6915 do {
6916 sd->groups->cpu_power += group->cpu_power;
6917 group = group->next;
6918 } while (group != child->groups);
6919} 7304}
6920 7305
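
The reworked init_sched_groups_power() above now walks sd->groups as a circular singly-linked list, using a do/while that stops once it returns to the starting group. A minimal standalone sketch of that traversal shape; struct group, make_ring() and the weights are made-up stand-ins, and allocation checks are omitted.

#include <stdio.h>
#include <stdlib.h>

struct group {
        int weight;                     /* stands in for sg->group_weight */
        struct group *next;             /* circular: last->next == first */
};

static struct group *make_ring(int n)
{
        struct group *first = NULL, *last = NULL;

        for (int i = 0; i < n; i++) {
                struct group *g = malloc(sizeof(*g));

                g->weight = i + 1;
                g->next = NULL;
                if (!first)
                        first = g;
                else
                        last->next = g;
                last = g;
        }
        last->next = first;             /* close the ring */
        return first;
}

int main(void)
{
        struct group *start = make_ring(4);
        struct group *g = start;

        /* Same do/while shape as the weight pass added above. */
        do {
                printf("group weight %d\n", g->weight);
                g = g->next;
        } while (g != start);

        g = start->next;                /* tear the ring down */
        while (g != start) {
                struct group *next = g->next;

                free(g);
                g = next;
        }
        free(start);
        return 0;
}
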
6921/* 7306/*
@@ -6929,15 +7314,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6929# define SD_INIT_NAME(sd, type) do { } while (0) 7314# define SD_INIT_NAME(sd, type) do { } while (0)
6930#endif 7315#endif
6931 7316
6932#define SD_INIT(sd, type) sd_init_##type(sd) 7317#define SD_INIT_FUNC(type) \
6933 7318static noinline struct sched_domain * \
6934#define SD_INIT_FUNC(type) \ 7319sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6935static noinline void sd_init_##type(struct sched_domain *sd) \ 7320{ \
6936{ \ 7321 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
6937 memset(sd, 0, sizeof(*sd)); \ 7322 *sd = SD_##type##_INIT; \
6938 *sd = SD_##type##_INIT; \ 7323 SD_INIT_NAME(sd, type); \
6939 sd->level = SD_LV_##type; \ 7324 sd->private = &tl->data; \
6940 SD_INIT_NAME(sd, type); \ 7325 return sd; \
6941} 7326}
6942 7327
6943SD_INIT_FUNC(CPU) 7328SD_INIT_FUNC(CPU)
@@ -6951,15 +7336,19 @@ SD_INIT_FUNC(CPU)
6951#ifdef CONFIG_SCHED_MC 7336#ifdef CONFIG_SCHED_MC
6952 SD_INIT_FUNC(MC) 7337 SD_INIT_FUNC(MC)
6953#endif 7338#endif
7339#ifdef CONFIG_SCHED_BOOK
7340 SD_INIT_FUNC(BOOK)
7341#endif
6954 7342
6955static int default_relax_domain_level = -1; 7343static int default_relax_domain_level = -1;
7344int sched_domain_level_max;
6956 7345
6957static int __init setup_relax_domain_level(char *str) 7346static int __init setup_relax_domain_level(char *str)
6958{ 7347{
6959 unsigned long val; 7348 unsigned long val;
6960 7349
6961 val = simple_strtoul(str, NULL, 0); 7350 val = simple_strtoul(str, NULL, 0);
6962 if (val < SD_LV_MAX) 7351 if (val < sched_domain_level_max)
6963 default_relax_domain_level = val; 7352 default_relax_domain_level = val;
6964 7353
6965 return 1; 7354 return 1;
@@ -6987,35 +7376,20 @@ static void set_domain_attribute(struct sched_domain *sd,
6987 } 7376 }
6988} 7377}
6989 7378
7379static void __sdt_free(const struct cpumask *cpu_map);
7380static int __sdt_alloc(const struct cpumask *cpu_map);
7381
6990static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7382static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6991 const struct cpumask *cpu_map) 7383 const struct cpumask *cpu_map)
6992{ 7384{
6993 switch (what) { 7385 switch (what) {
6994 case sa_sched_groups:
6995 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
6996 d->sched_group_nodes = NULL;
6997 case sa_rootdomain: 7386 case sa_rootdomain:
6998 free_rootdomain(d->rd); /* fall through */ 7387 if (!atomic_read(&d->rd->refcount))
6999 case sa_tmpmask: 7388 free_rootdomain(&d->rd->rcu); /* fall through */
7000 free_cpumask_var(d->tmpmask); /* fall through */ 7389 case sa_sd:
7001 case sa_send_covered: 7390 free_percpu(d->sd); /* fall through */
7002 free_cpumask_var(d->send_covered); /* fall through */ 7391 case sa_sd_storage:
7003 case sa_this_core_map: 7392 __sdt_free(cpu_map); /* fall through */
7004 free_cpumask_var(d->this_core_map); /* fall through */
7005 case sa_this_sibling_map:
7006 free_cpumask_var(d->this_sibling_map); /* fall through */
7007 case sa_nodemask:
7008 free_cpumask_var(d->nodemask); /* fall through */
7009 case sa_sched_group_nodes:
7010#ifdef CONFIG_NUMA
7011 kfree(d->sched_group_nodes); /* fall through */
7012 case sa_notcovered:
7013 free_cpumask_var(d->notcovered); /* fall through */
7014 case sa_covered:
7015 free_cpumask_var(d->covered); /* fall through */
7016 case sa_domainspan:
7017 free_cpumask_var(d->domainspan); /* fall through */
7018#endif
7019 case sa_none: 7393 case sa_none:
7020 break; 7394 break;
7021 } 7395 }
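
The slimmed-down __free_domain_allocs() above keeps the long-standing idiom of an enum naming how far allocation got, with switch fall-through so each later stage also releases the earlier ones. A standalone sketch of that staged-unwind pattern; the enum values, struct state and do_allocs() here are illustrative, not the kernel's.

#include <stdio.h>
#include <stdlib.h>

enum s_alloc { sa_none, sa_a, sa_b, sa_all };

struct state { void *a, *b; };

static void free_allocs(struct state *s, enum s_alloc what)
{
        switch (what) {
        case sa_all:
        case sa_b:
                free(s->b);             /* fall through */
        case sa_a:
                free(s->a);             /* fall through */
        case sa_none:
                break;
        }
}

/* Returns the last stage that succeeded. */
static enum s_alloc do_allocs(struct state *s)
{
        s->a = malloc(16);
        if (!s->a)
                return sa_none;
        s->b = malloc(16);
        if (!s->b)
                return sa_a;
        return sa_all;
}

int main(void)
{
        struct state s = { 0 };
        enum s_alloc got = do_allocs(&s);

        if (got != sa_all)
                printf("partial allocation, unwinding\n");
        free_allocs(&s, got);
        return 0;
}

The pay-off is a single error path: callers just hand back whatever stage they reached, as __visit_domain_allocation_hell() does above.
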
@@ -7024,270 +7398,233 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7024static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7398static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7025 const struct cpumask *cpu_map) 7399 const struct cpumask *cpu_map)
7026{ 7400{
7027#ifdef CONFIG_NUMA 7401 memset(d, 0, sizeof(*d));
7028 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7402
7029 return sa_none; 7403 if (__sdt_alloc(cpu_map))
7030 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7404 return sa_sd_storage;
7031 return sa_domainspan; 7405 d->sd = alloc_percpu(struct sched_domain *);
7032 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7406 if (!d->sd)
7033 return sa_covered; 7407 return sa_sd_storage;
7034 /* Allocate the per-node list of sched groups */
7035 d->sched_group_nodes = kcalloc(nr_node_ids,
7036 sizeof(struct sched_group *), GFP_KERNEL);
7037 if (!d->sched_group_nodes) {
7038 printk(KERN_WARNING "Can not alloc sched group node list\n");
7039 return sa_notcovered;
7040 }
7041 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7042#endif
7043 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7044 return sa_sched_group_nodes;
7045 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7046 return sa_nodemask;
7047 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7048 return sa_this_sibling_map;
7049 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7050 return sa_this_core_map;
7051 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7052 return sa_send_covered;
7053 d->rd = alloc_rootdomain(); 7408 d->rd = alloc_rootdomain();
7054 if (!d->rd) { 7409 if (!d->rd)
7055 printk(KERN_WARNING "Cannot alloc root domain\n"); 7410 return sa_sd;
7056 return sa_tmpmask;
7057 }
7058 return sa_rootdomain; 7411 return sa_rootdomain;
7059} 7412}
7060 7413
7061static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7414/*
7062 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7415 * NULL the sd_data elements we've used to build the sched_domain and
7416 * sched_group structure so that the subsequent __free_domain_allocs()
7417 * will not free the data we're using.
7418 */
7419static void claim_allocations(int cpu, struct sched_domain *sd)
7063{ 7420{
7064 struct sched_domain *sd = NULL; 7421 struct sd_data *sdd = sd->private;
7065#ifdef CONFIG_NUMA
7066 struct sched_domain *parent;
7067
7068 d->sd_allnodes = 0;
7069 if (cpumask_weight(cpu_map) >
7070 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7071 sd = &per_cpu(allnodes_domains, i).sd;
7072 SD_INIT(sd, ALLNODES);
7073 set_domain_attribute(sd, attr);
7074 cpumask_copy(sched_domain_span(sd), cpu_map);
7075 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7076 d->sd_allnodes = 1;
7077 }
7078 parent = sd;
7079
7080 sd = &per_cpu(node_domains, i).sd;
7081 SD_INIT(sd, NODE);
7082 set_domain_attribute(sd, attr);
7083 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7084 sd->parent = parent;
7085 if (parent)
7086 parent->child = sd;
7087 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7088#endif
7089 return sd;
7090}
7091 7422
7092static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7423 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7093 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7424 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7094 struct sched_domain *parent, int i)
7095{
7096 struct sched_domain *sd;
7097 sd = &per_cpu(phys_domains, i).sd;
7098 SD_INIT(sd, CPU);
7099 set_domain_attribute(sd, attr);
7100 cpumask_copy(sched_domain_span(sd), d->nodemask);
7101 sd->parent = parent;
7102 if (parent)
7103 parent->child = sd;
7104 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7105 return sd;
7106}
7107 7425
7108static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7426 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
7109 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7427 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7110 struct sched_domain *parent, int i) 7428
7111{ 7429 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
7112 struct sched_domain *sd = parent; 7430 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
7113#ifdef CONFIG_SCHED_MC
7114 sd = &per_cpu(core_domains, i).sd;
7115 SD_INIT(sd, MC);
7116 set_domain_attribute(sd, attr);
7117 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7118 sd->parent = parent;
7119 parent->child = sd;
7120 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7121#endif
7122 return sd;
7123} 7431}
7124 7432
7125static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7126 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7127 struct sched_domain *parent, int i)
7128{
7129 struct sched_domain *sd = parent;
7130#ifdef CONFIG_SCHED_SMT 7433#ifdef CONFIG_SCHED_SMT
7131 sd = &per_cpu(cpu_domains, i).sd; 7434static const struct cpumask *cpu_smt_mask(int cpu)
7132 SD_INIT(sd, SIBLING); 7435{
7133 set_domain_attribute(sd, attr); 7436 return topology_thread_cpumask(cpu);
7134 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7135 sd->parent = parent;
7136 parent->child = sd;
7137 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7138#endif
7139 return sd;
7140} 7437}
7438#endif
7141 7439
7142static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7440/*
7143 const struct cpumask *cpu_map, int cpu) 7441 * Topology list, bottom-up.
7144{ 7442 */
7145 switch (l) { 7443static struct sched_domain_topology_level default_topology[] = {
7146#ifdef CONFIG_SCHED_SMT 7444#ifdef CONFIG_SCHED_SMT
7147 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7445 { sd_init_SIBLING, cpu_smt_mask, },
7148 cpumask_and(d->this_sibling_map, cpu_map,
7149 topology_thread_cpumask(cpu));
7150 if (cpu == cpumask_first(d->this_sibling_map))
7151 init_sched_build_groups(d->this_sibling_map, cpu_map,
7152 &cpu_to_cpu_group,
7153 d->send_covered, d->tmpmask);
7154 break;
7155#endif 7446#endif
7156#ifdef CONFIG_SCHED_MC 7447#ifdef CONFIG_SCHED_MC
7157 case SD_LV_MC: /* set up multi-core groups */ 7448 { sd_init_MC, cpu_coregroup_mask, },
7158 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7159 if (cpu == cpumask_first(d->this_core_map))
7160 init_sched_build_groups(d->this_core_map, cpu_map,
7161 &cpu_to_core_group,
7162 d->send_covered, d->tmpmask);
7163 break;
7164#endif 7449#endif
7165 case SD_LV_CPU: /* set up physical groups */ 7450#ifdef CONFIG_SCHED_BOOK
7166 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7451 { sd_init_BOOK, cpu_book_mask, },
7167 if (!cpumask_empty(d->nodemask)) 7452#endif
7168 init_sched_build_groups(d->nodemask, cpu_map, 7453 { sd_init_CPU, cpu_cpu_mask, },
7169 &cpu_to_phys_group,
7170 d->send_covered, d->tmpmask);
7171 break;
7172#ifdef CONFIG_NUMA 7454#ifdef CONFIG_NUMA
7173 case SD_LV_ALLNODES: 7455 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
7174 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7456 { sd_init_ALLNODES, cpu_allnodes_mask, },
7175 d->send_covered, d->tmpmask);
7176 break;
7177#endif 7457#endif
7178 default: 7458 { NULL, },
7179 break; 7459};
7460
7461static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7462
7463static int __sdt_alloc(const struct cpumask *cpu_map)
7464{
7465 struct sched_domain_topology_level *tl;
7466 int j;
7467
7468 for (tl = sched_domain_topology; tl->init; tl++) {
7469 struct sd_data *sdd = &tl->data;
7470
7471 sdd->sd = alloc_percpu(struct sched_domain *);
7472 if (!sdd->sd)
7473 return -ENOMEM;
7474
7475 sdd->sg = alloc_percpu(struct sched_group *);
7476 if (!sdd->sg)
7477 return -ENOMEM;
7478
7479 sdd->sgp = alloc_percpu(struct sched_group_power *);
7480 if (!sdd->sgp)
7481 return -ENOMEM;
7482
7483 for_each_cpu(j, cpu_map) {
7484 struct sched_domain *sd;
7485 struct sched_group *sg;
7486 struct sched_group_power *sgp;
7487
7488 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7489 GFP_KERNEL, cpu_to_node(j));
7490 if (!sd)
7491 return -ENOMEM;
7492
7493 *per_cpu_ptr(sdd->sd, j) = sd;
7494
7495 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7496 GFP_KERNEL, cpu_to_node(j));
7497 if (!sg)
7498 return -ENOMEM;
7499
7500 *per_cpu_ptr(sdd->sg, j) = sg;
7501
7502 sgp = kzalloc_node(sizeof(struct sched_group_power),
7503 GFP_KERNEL, cpu_to_node(j));
7504 if (!sgp)
7505 return -ENOMEM;
7506
7507 *per_cpu_ptr(sdd->sgp, j) = sgp;
7508 }
7509 }
7510
7511 return 0;
7512}
7513
7514static void __sdt_free(const struct cpumask *cpu_map)
7515{
7516 struct sched_domain_topology_level *tl;
7517 int j;
7518
7519 for (tl = sched_domain_topology; tl->init; tl++) {
7520 struct sd_data *sdd = &tl->data;
7521
7522 for_each_cpu(j, cpu_map) {
7523 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
7524 if (sd && (sd->flags & SD_OVERLAP))
7525 free_sched_groups(sd->groups, 0);
7526 kfree(*per_cpu_ptr(sdd->sg, j));
7527 kfree(*per_cpu_ptr(sdd->sgp, j));
7528 }
7529 free_percpu(sdd->sd);
7530 free_percpu(sdd->sg);
7531 free_percpu(sdd->sgp);
7532 }
7533}
7534
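
__sdt_alloc()/__sdt_free() above keep one sched_domain, sched_group and sched_group_power pointer per CPU for every topology level. A standalone sketch with a plain array standing in for the alloc_percpu() slots; NCPUS, struct level_data and level_alloc()/level_free() are assumptions for illustration only.

#include <stdlib.h>
#include <string.h>

#define NCPUS 4

struct domain { int level; };

struct level_data {
        struct domain *sd[NCPUS];       /* stands in for the per-cpu sdd->sd slots */
};

static int level_alloc(struct level_data *d)
{
        memset(d, 0, sizeof(*d));
        for (int cpu = 0; cpu < NCPUS; cpu++) {
                d->sd[cpu] = calloc(1, sizeof(struct domain));
                if (!d->sd[cpu])
                        return -1;      /* caller unwinds via level_free() */
        }
        return 0;
}

static void level_free(struct level_data *d)
{
        for (int cpu = 0; cpu < NCPUS; cpu++)
                free(d->sd[cpu]);       /* free(NULL) is a no-op */
}

int main(void)
{
        struct level_data d;

        if (level_alloc(&d) == 0)
                d.sd[0]->level = 1;
        level_free(&d);
        return 0;
}
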
7535struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7536 struct s_data *d, const struct cpumask *cpu_map,
7537 struct sched_domain_attr *attr, struct sched_domain *child,
7538 int cpu)
7539{
7540 struct sched_domain *sd = tl->init(tl, cpu);
7541 if (!sd)
7542 return child;
7543
7544 set_domain_attribute(sd, attr);
7545 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7546 if (child) {
7547 sd->level = child->level + 1;
7548 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7549 child->parent = sd;
7180 } 7550 }
7551 sd->child = child;
7552
7553 return sd;
7181} 7554}
7182 7555
7183/* 7556/*
7184 * Build sched domains for a given set of cpus and attach the sched domains 7557 * Build sched domains for a given set of cpus and attach the sched domains
7185 * to the individual cpus 7558 * to the individual cpus
7186 */ 7559 */
7187static int __build_sched_domains(const struct cpumask *cpu_map, 7560static int build_sched_domains(const struct cpumask *cpu_map,
7188 struct sched_domain_attr *attr) 7561 struct sched_domain_attr *attr)
7189{ 7562{
7190 enum s_alloc alloc_state = sa_none; 7563 enum s_alloc alloc_state = sa_none;
7191 struct s_data d;
7192 struct sched_domain *sd; 7564 struct sched_domain *sd;
7193 int i; 7565 struct s_data d;
7194#ifdef CONFIG_NUMA 7566 int i, ret = -ENOMEM;
7195 d.sd_allnodes = 0;
7196#endif
7197 7567
7198 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7568 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7199 if (alloc_state != sa_rootdomain) 7569 if (alloc_state != sa_rootdomain)
7200 goto error; 7570 goto error;
7201 alloc_state = sa_sched_groups;
7202
7203 /*
7204 * Set up domains for cpus specified by the cpu_map.
7205 */
7206 for_each_cpu(i, cpu_map) {
7207 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
7208 cpu_map);
7209
7210 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7211 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7212 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7213 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7214 }
7215 7571
7572 /* Set up domains for cpus specified by the cpu_map. */
7216 for_each_cpu(i, cpu_map) { 7573 for_each_cpu(i, cpu_map) {
7217 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7574 struct sched_domain_topology_level *tl;
7218 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7575
7219 } 7576 sd = NULL;
7220 7577 for (tl = sched_domain_topology; tl->init; tl++) {
7221 /* Set up physical groups */ 7578 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7222 for (i = 0; i < nr_node_ids; i++) 7579 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
7223 build_sched_groups(&d, SD_LV_CPU, cpu_map, i); 7580 sd->flags |= SD_OVERLAP;
7224 7581 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
7225#ifdef CONFIG_NUMA 7582 break;
7226 /* Set up node groups */ 7583 }
7227 if (d.sd_allnodes)
7228 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7229 7584
7230 for (i = 0; i < nr_node_ids; i++) 7585 while (sd->child)
7231 if (build_numa_sched_groups(&d, cpu_map, i)) 7586 sd = sd->child;
7232 goto error;
7233#endif
7234 7587
7235 /* Calculate CPU power for physical packages and nodes */ 7588 *per_cpu_ptr(d.sd, i) = sd;
7236#ifdef CONFIG_SCHED_SMT
7237 for_each_cpu(i, cpu_map) {
7238 sd = &per_cpu(cpu_domains, i).sd;
7239 init_sched_groups_power(i, sd);
7240 }
7241#endif
7242#ifdef CONFIG_SCHED_MC
7243 for_each_cpu(i, cpu_map) {
7244 sd = &per_cpu(core_domains, i).sd;
7245 init_sched_groups_power(i, sd);
7246 } 7589 }
7247#endif
7248 7590
7591 /* Build the groups for the domains */
7249 for_each_cpu(i, cpu_map) { 7592 for_each_cpu(i, cpu_map) {
7250 sd = &per_cpu(phys_domains, i).sd; 7593 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7251 init_sched_groups_power(i, sd); 7594 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7595 if (sd->flags & SD_OVERLAP) {
7596 if (build_overlap_sched_groups(sd, i))
7597 goto error;
7598 } else {
7599 if (build_sched_groups(sd, i))
7600 goto error;
7601 }
7602 }
7252 } 7603 }
7253 7604
7254#ifdef CONFIG_NUMA 7605 /* Calculate CPU power for physical packages and nodes */
7255 for (i = 0; i < nr_node_ids; i++) 7606 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7256 init_numa_sched_groups_power(d.sched_group_nodes[i]); 7607 if (!cpumask_test_cpu(i, cpu_map))
7257 7608 continue;
7258 if (d.sd_allnodes) {
7259 struct sched_group *sg;
7260 7609
7261 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7610 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7262 d.tmpmask); 7611 claim_allocations(i, sd);
7263 init_numa_sched_groups_power(sg); 7612 init_sched_groups_power(i, sd);
7613 }
7264 } 7614 }
7265#endif
7266 7615
7267 /* Attach the domains */ 7616 /* Attach the domains */
7617 rcu_read_lock();
7268 for_each_cpu(i, cpu_map) { 7618 for_each_cpu(i, cpu_map) {
7269#ifdef CONFIG_SCHED_SMT 7619 sd = *per_cpu_ptr(d.sd, i);
7270 sd = &per_cpu(cpu_domains, i).sd;
7271#elif defined(CONFIG_SCHED_MC)
7272 sd = &per_cpu(core_domains, i).sd;
7273#else
7274 sd = &per_cpu(phys_domains, i).sd;
7275#endif
7276 cpu_attach_domain(sd, d.rd, i); 7620 cpu_attach_domain(sd, d.rd, i);
7277 } 7621 }
7622 rcu_read_unlock();
7278 7623
7279 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7624 ret = 0;
7280 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7281 return 0;
7282
7283error: 7625error:
7284 __free_domain_allocs(&d, alloc_state, cpu_map); 7626 __free_domain_allocs(&d, alloc_state, cpu_map);
7285 return -ENOMEM; 7627 return ret;
7286}
7287
7288static int build_sched_domains(const struct cpumask *cpu_map)
7289{
7290 return __build_sched_domains(cpu_map, NULL);
7291} 7628}
7292 7629
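
The heart of the rewrite: build_sched_domains() above no longer hard-codes SMT/MC/CPU/NUMA helper functions but walks the NULL-terminated, bottom-up sched_domain_topology table, chaining one domain per level through child/parent pointers. A standalone C sketch of that table walk; struct domain, struct topo_level and the level names are simplified stand-ins, not the kernel types.

#include <stdio.h>
#include <stdlib.h>

struct domain {
        const char *name;
        struct domain *parent, *child;
        int level;
};

struct topo_level {
        const char *name;               /* NULL entry terminates the table */
};

static const struct topo_level topology[] = {
        { "SMT" }, { "MC" }, { "CPU" }, { NULL },
};

int main(void)
{
        struct domain *child = NULL, *sd = NULL;

        /* Bottom-up walk: each level's domain becomes the next one's child. */
        for (const struct topo_level *tl = topology; tl->name; tl++) {
                sd = calloc(1, sizeof(*sd));
                sd->name = tl->name;
                sd->child = child;
                if (child) {
                        sd->level = child->level + 1;
                        child->parent = sd;
                }
                child = sd;
        }

        /* Walk back down, the way cpu_attach_domain() would see the chain. */
        for (struct domain *d = sd; d; d = d->child)
                printf("level %d: %s\n", d->level, d->name);

        while (sd) {                    /* tear down */
                struct domain *c = sd->child;

                free(sd);
                sd = c;
        }
        return 0;
}

Adding a level, as the new CONFIG_SCHED_BOOK entry in default_topology does, then only means adding a row to the table.
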
7293static cpumask_var_t *doms_cur; /* current sched domains */ 7630static cpumask_var_t *doms_cur; /* current sched domains */
@@ -7342,7 +7679,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7342 * For now this just excludes isolated cpus, but could be used to 7679 * For now this just excludes isolated cpus, but could be used to
7343 * exclude other special cases in the future. 7680 * exclude other special cases in the future.
7344 */ 7681 */
7345static int arch_init_sched_domains(const struct cpumask *cpu_map) 7682static int init_sched_domains(const struct cpumask *cpu_map)
7346{ 7683{
7347 int err; 7684 int err;
7348 7685
@@ -7353,32 +7690,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
7353 doms_cur = &fallback_doms; 7690 doms_cur = &fallback_doms;
7354 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7691 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7355 dattr_cur = NULL; 7692 dattr_cur = NULL;
7356 err = build_sched_domains(doms_cur[0]); 7693 err = build_sched_domains(doms_cur[0], NULL);
7357 register_sched_domain_sysctl(); 7694 register_sched_domain_sysctl();
7358 7695
7359 return err; 7696 return err;
7360} 7697}
7361 7698
7362static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7363 struct cpumask *tmpmask)
7364{
7365 free_sched_groups(cpu_map, tmpmask);
7366}
7367
7368/* 7699/*
7369 * Detach sched domains from a group of cpus specified in cpu_map 7700 * Detach sched domains from a group of cpus specified in cpu_map
7370 * These cpus will now be attached to the NULL domain 7701 * These cpus will now be attached to the NULL domain
7371 */ 7702 */
7372static void detach_destroy_domains(const struct cpumask *cpu_map) 7703static void detach_destroy_domains(const struct cpumask *cpu_map)
7373{ 7704{
7374 /* Save because hotplug lock held. */
7375 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7376 int i; 7705 int i;
7377 7706
7707 rcu_read_lock();
7378 for_each_cpu(i, cpu_map) 7708 for_each_cpu(i, cpu_map)
7379 cpu_attach_domain(NULL, &def_root_domain, i); 7709 cpu_attach_domain(NULL, &def_root_domain, i);
7380 synchronize_sched(); 7710 rcu_read_unlock();
7381 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7382} 7711}
7383 7712
7384/* handle null as "default" */ 7713/* handle null as "default" */
@@ -7467,8 +7796,7 @@ match1:
7467 goto match2; 7796 goto match2;
7468 } 7797 }
7469 /* no match - add a new doms_new */ 7798 /* no match - add a new doms_new */
7470 __build_sched_domains(doms_new[i], 7799 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7471 dattr_new ? dattr_new + i : NULL);
7472match2: 7800match2:
7473 ; 7801 ;
7474 } 7802 }
@@ -7487,7 +7815,7 @@ match2:
7487} 7815}
7488 7816
7489#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7817#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7490static void arch_reinit_sched_domains(void) 7818static void reinit_sched_domains(void)
7491{ 7819{
7492 get_online_cpus(); 7820 get_online_cpus();
7493 7821
@@ -7520,7 +7848,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7520 else 7848 else
7521 sched_mc_power_savings = level; 7849 sched_mc_power_savings = level;
7522 7850
7523 arch_reinit_sched_domains(); 7851 reinit_sched_domains();
7524 7852
7525 return count; 7853 return count;
7526} 7854}
@@ -7639,14 +7967,9 @@ void __init sched_init_smp(void)
7639 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7967 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7640 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7968 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7641 7969
7642#if defined(CONFIG_NUMA)
7643 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7644 GFP_KERNEL);
7645 BUG_ON(sched_group_nodes_bycpu == NULL);
7646#endif
7647 get_online_cpus(); 7970 get_online_cpus();
7648 mutex_lock(&sched_domains_mutex); 7971 mutex_lock(&sched_domains_mutex);
7649 arch_init_sched_domains(cpu_active_mask); 7972 init_sched_domains(cpu_active_mask);
7650 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7973 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7651 if (cpumask_empty(non_isolated_cpus)) 7974 if (cpumask_empty(non_isolated_cpus))
7652 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7975 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -7691,8 +8014,15 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7691 INIT_LIST_HEAD(&cfs_rq->tasks); 8014 INIT_LIST_HEAD(&cfs_rq->tasks);
7692#ifdef CONFIG_FAIR_GROUP_SCHED 8015#ifdef CONFIG_FAIR_GROUP_SCHED
7693 cfs_rq->rq = rq; 8016 cfs_rq->rq = rq;
8017 /* allow initial update_cfs_load() to truncate */
8018#ifdef CONFIG_SMP
8019 cfs_rq->load_stamp = 1;
8020#endif
7694#endif 8021#endif
7695 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 8022 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8023#ifndef CONFIG_64BIT
8024 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8025#endif
7696} 8026}
7697 8027
7698static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 8028static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
@@ -7733,18 +8063,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7733 8063
7734#ifdef CONFIG_FAIR_GROUP_SCHED 8064#ifdef CONFIG_FAIR_GROUP_SCHED
7735static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 8065static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7736 struct sched_entity *se, int cpu, int add, 8066 struct sched_entity *se, int cpu,
7737 struct sched_entity *parent) 8067 struct sched_entity *parent)
7738{ 8068{
7739 struct rq *rq = cpu_rq(cpu); 8069 struct rq *rq = cpu_rq(cpu);
7740 tg->cfs_rq[cpu] = cfs_rq; 8070 tg->cfs_rq[cpu] = cfs_rq;
7741 init_cfs_rq(cfs_rq, rq); 8071 init_cfs_rq(cfs_rq, rq);
7742 cfs_rq->tg = tg; 8072 cfs_rq->tg = tg;
7743 if (add)
7744 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7745 8073
7746 tg->se[cpu] = se; 8074 tg->se[cpu] = se;
7747 /* se could be NULL for init_task_group */ 8075 /* se could be NULL for root_task_group */
7748 if (!se) 8076 if (!se)
7749 return; 8077 return;
7750 8078
@@ -7754,15 +8082,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7754 se->cfs_rq = parent->my_q; 8082 se->cfs_rq = parent->my_q;
7755 8083
7756 se->my_q = cfs_rq; 8084 se->my_q = cfs_rq;
7757 se->load.weight = tg->shares; 8085 update_load_set(&se->load, 0);
7758 se->load.inv_weight = 0;
7759 se->parent = parent; 8086 se->parent = parent;
7760} 8087}
7761#endif 8088#endif
7762 8089
7763#ifdef CONFIG_RT_GROUP_SCHED 8090#ifdef CONFIG_RT_GROUP_SCHED
7764static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 8091static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7765 struct sched_rt_entity *rt_se, int cpu, int add, 8092 struct sched_rt_entity *rt_se, int cpu,
7766 struct sched_rt_entity *parent) 8093 struct sched_rt_entity *parent)
7767{ 8094{
7768 struct rq *rq = cpu_rq(cpu); 8095 struct rq *rq = cpu_rq(cpu);
@@ -7771,8 +8098,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7771 init_rt_rq(rt_rq, rq); 8098 init_rt_rq(rt_rq, rq);
7772 rt_rq->tg = tg; 8099 rt_rq->tg = tg;
7773 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 8100 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7774 if (add)
7775 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7776 8101
7777 tg->rt_se[cpu] = rt_se; 8102 tg->rt_se[cpu] = rt_se;
7778 if (!rt_se) 8103 if (!rt_se)
@@ -7807,18 +8132,18 @@ void __init sched_init(void)
7807 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 8132 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7808 8133
7809#ifdef CONFIG_FAIR_GROUP_SCHED 8134#ifdef CONFIG_FAIR_GROUP_SCHED
7810 init_task_group.se = (struct sched_entity **)ptr; 8135 root_task_group.se = (struct sched_entity **)ptr;
7811 ptr += nr_cpu_ids * sizeof(void **); 8136 ptr += nr_cpu_ids * sizeof(void **);
7812 8137
7813 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 8138 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7814 ptr += nr_cpu_ids * sizeof(void **); 8139 ptr += nr_cpu_ids * sizeof(void **);
7815 8140
7816#endif /* CONFIG_FAIR_GROUP_SCHED */ 8141#endif /* CONFIG_FAIR_GROUP_SCHED */
7817#ifdef CONFIG_RT_GROUP_SCHED 8142#ifdef CONFIG_RT_GROUP_SCHED
7818 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 8143 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7819 ptr += nr_cpu_ids * sizeof(void **); 8144 ptr += nr_cpu_ids * sizeof(void **);
7820 8145
7821 init_task_group.rt_rq = (struct rt_rq **)ptr; 8146 root_task_group.rt_rq = (struct rt_rq **)ptr;
7822 ptr += nr_cpu_ids * sizeof(void **); 8147 ptr += nr_cpu_ids * sizeof(void **);
7823 8148
7824#endif /* CONFIG_RT_GROUP_SCHED */ 8149#endif /* CONFIG_RT_GROUP_SCHED */
@@ -7838,20 +8163,16 @@ void __init sched_init(void)
7838 global_rt_period(), global_rt_runtime()); 8163 global_rt_period(), global_rt_runtime());
7839 8164
7840#ifdef CONFIG_RT_GROUP_SCHED 8165#ifdef CONFIG_RT_GROUP_SCHED
7841 init_rt_bandwidth(&init_task_group.rt_bandwidth, 8166 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7842 global_rt_period(), global_rt_runtime()); 8167 global_rt_period(), global_rt_runtime());
7843#endif /* CONFIG_RT_GROUP_SCHED */ 8168#endif /* CONFIG_RT_GROUP_SCHED */
7844 8169
7845#ifdef CONFIG_CGROUP_SCHED 8170#ifdef CONFIG_CGROUP_SCHED
7846 list_add(&init_task_group.list, &task_groups); 8171 list_add(&root_task_group.list, &task_groups);
7847 INIT_LIST_HEAD(&init_task_group.children); 8172 INIT_LIST_HEAD(&root_task_group.children);
7848 8173 autogroup_init(&init_task);
7849#endif /* CONFIG_CGROUP_SCHED */ 8174#endif /* CONFIG_CGROUP_SCHED */
7850 8175
7851#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7852 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
7853 __alignof__(unsigned long));
7854#endif
7855 for_each_possible_cpu(i) { 8176 for_each_possible_cpu(i) {
7856 struct rq *rq; 8177 struct rq *rq;
7857 8178
@@ -7863,38 +8184,34 @@ void __init sched_init(void)
7863 init_cfs_rq(&rq->cfs, rq); 8184 init_cfs_rq(&rq->cfs, rq);
7864 init_rt_rq(&rq->rt, rq); 8185 init_rt_rq(&rq->rt, rq);
7865#ifdef CONFIG_FAIR_GROUP_SCHED 8186#ifdef CONFIG_FAIR_GROUP_SCHED
7866 init_task_group.shares = init_task_group_load; 8187 root_task_group.shares = root_task_group_load;
7867 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 8188 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7868#ifdef CONFIG_CGROUP_SCHED
7869 /* 8189 /*
7870 * How much cpu bandwidth does init_task_group get? 8190 * How much cpu bandwidth does root_task_group get?
7871 * 8191 *
7872 * In case of task-groups formed through the cgroup filesystem, it 8192 * In case of task-groups formed through the cgroup filesystem, it
7873 * gets 100% of the cpu resources in the system. This overall 8193 * gets 100% of the cpu resources in the system. This overall
7874 * system cpu resource is divided among the tasks of 8194 * system cpu resource is divided among the tasks of
7875 * init_task_group and its child task-groups in a fair manner, 8195 * root_task_group and its child task-groups in a fair manner,
7876 * based on each entity's (task or task-group's) weight 8196 * based on each entity's (task or task-group's) weight
7877 * (se->load.weight). 8197 * (se->load.weight).
7878 * 8198 *
7879 * In other words, if init_task_group has 10 tasks of weight 8199 * In other words, if root_task_group has 10 tasks of weight
7880 * 1024) and two child groups A0 and A1 (of weight 1024 each), 8200 * 1024) and two child groups A0 and A1 (of weight 1024 each),
7881 * then A0's share of the cpu resource is: 8201 * then A0's share of the cpu resource is:
7882 * 8202 *
7883 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 8203 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
7884 * 8204 *
7885 * We achieve this by letting init_task_group's tasks sit 8205 * We achieve this by letting root_task_group's tasks sit
7886 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 8206 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
7887 */ 8207 */
7888 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 8208 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
7889#endif
7890#endif /* CONFIG_FAIR_GROUP_SCHED */ 8209#endif /* CONFIG_FAIR_GROUP_SCHED */
7891 8210
7892 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 8211 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7893#ifdef CONFIG_RT_GROUP_SCHED 8212#ifdef CONFIG_RT_GROUP_SCHED
7894 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 8213 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7895#ifdef CONFIG_CGROUP_SCHED 8214 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
7896 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
7897#endif
7898#endif 8215#endif
7899 8216
7900 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 8217 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
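
A quick standalone check of the 8.33% figure quoted in the root_task_group bandwidth comment earlier in this hunk; plain userspace C, not kernel code.

#include <stdio.h>

int main(void)
{
        /* 10 tasks of weight 1024 plus two child groups of weight 1024 each. */
        double total = 10 * 1024 + 1024 + 1024;

        printf("A0 share = %.2f%%\n", 100.0 * 1024 / total);   /* prints 8.33% */
        return 0;
}
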
@@ -7905,7 +8222,7 @@ void __init sched_init(void)
7905#ifdef CONFIG_SMP 8222#ifdef CONFIG_SMP
7906 rq->sd = NULL; 8223 rq->sd = NULL;
7907 rq->rd = NULL; 8224 rq->rd = NULL;
7908 rq->cpu_power = SCHED_LOAD_SCALE; 8225 rq->cpu_power = SCHED_POWER_SCALE;
7909 rq->post_schedule = 0; 8226 rq->post_schedule = 0;
7910 rq->active_balance = 0; 8227 rq->active_balance = 0;
7911 rq->next_balance = jiffies; 8228 rq->next_balance = jiffies;
@@ -7962,6 +8279,7 @@ void __init sched_init(void)
7962 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 8279 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
7963 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 8280 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7964#ifdef CONFIG_SMP 8281#ifdef CONFIG_SMP
8282 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
7965#ifdef CONFIG_NO_HZ 8283#ifdef CONFIG_NO_HZ
7966 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 8284 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7967 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 8285 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -7974,8 +8292,6 @@ void __init sched_init(void)
7974 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8292 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7975#endif /* SMP */ 8293#endif /* SMP */
7976 8294
7977 perf_event_init();
7978
7979 scheduler_running = 1; 8295 scheduler_running = 1;
7980} 8296}
7981 8297
@@ -7984,7 +8300,7 @@ static inline int preempt_count_equals(int preempt_offset)
7984{ 8300{
7985 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8301 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7986 8302
7987 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 8303 return (nested == preempt_offset);
7988} 8304}
7989 8305
7990void __might_sleep(const char *file, int line, int preempt_offset) 8306void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8019,9 +8335,11 @@ EXPORT_SYMBOL(__might_sleep);
8019#ifdef CONFIG_MAGIC_SYSRQ 8335#ifdef CONFIG_MAGIC_SYSRQ
8020static void normalize_task(struct rq *rq, struct task_struct *p) 8336static void normalize_task(struct rq *rq, struct task_struct *p)
8021{ 8337{
8338 const struct sched_class *prev_class = p->sched_class;
8339 int old_prio = p->prio;
8022 int on_rq; 8340 int on_rq;
8023 8341
8024 on_rq = p->se.on_rq; 8342 on_rq = p->on_rq;
8025 if (on_rq) 8343 if (on_rq)
8026 deactivate_task(rq, p, 0); 8344 deactivate_task(rq, p, 0);
8027 __setscheduler(rq, p, SCHED_NORMAL, 0); 8345 __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8029,6 +8347,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8029 activate_task(rq, p, 0); 8347 activate_task(rq, p, 0);
8030 resched_task(rq->curr); 8348 resched_task(rq->curr);
8031 } 8349 }
8350
8351 check_class_changed(rq, p, prev_class, old_prio);
8032} 8352}
8033 8353
8034void normalize_rt_tasks(void) 8354void normalize_rt_tasks(void)
@@ -8144,7 +8464,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8144{ 8464{
8145 struct cfs_rq *cfs_rq; 8465 struct cfs_rq *cfs_rq;
8146 struct sched_entity *se; 8466 struct sched_entity *se;
8147 struct rq *rq;
8148 int i; 8467 int i;
8149 8468
8150 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8469 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8157,8 +8476,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8157 tg->shares = NICE_0_LOAD; 8476 tg->shares = NICE_0_LOAD;
8158 8477
8159 for_each_possible_cpu(i) { 8478 for_each_possible_cpu(i) {
8160 rq = cpu_rq(i);
8161
8162 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8479 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8163 GFP_KERNEL, cpu_to_node(i)); 8480 GFP_KERNEL, cpu_to_node(i));
8164 if (!cfs_rq) 8481 if (!cfs_rq)
@@ -8169,26 +8486,32 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8169 if (!se) 8486 if (!se)
8170 goto err_free_rq; 8487 goto err_free_rq;
8171 8488
8172 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8489 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8173 } 8490 }
8174 8491
8175 return 1; 8492 return 1;
8176 8493
8177 err_free_rq: 8494err_free_rq:
8178 kfree(cfs_rq); 8495 kfree(cfs_rq);
8179 err: 8496err:
8180 return 0; 8497 return 0;
8181} 8498}
8182 8499
8183static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8184{
8185 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8186 &cpu_rq(cpu)->leaf_cfs_rq_list);
8187}
8188
8189static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8500static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8190{ 8501{
8191 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8502 struct rq *rq = cpu_rq(cpu);
8503 unsigned long flags;
8504
8505 /*
8506 * Only empty task groups can be destroyed; so we can speculatively
8507 * check on_list without danger of it being re-added.
8508 */
8509 if (!tg->cfs_rq[cpu]->on_list)
8510 return;
8511
8512 raw_spin_lock_irqsave(&rq->lock, flags);
8513 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8514 raw_spin_unlock_irqrestore(&rq->lock, flags);
8192} 8515}
8193#else /* !CONFIG_FAIR_GROUP_SCHED */ 8516#else /* !CONFIG_FAIR_GROUP_SCHED */
8194static inline void free_fair_sched_group(struct task_group *tg) 8517static inline void free_fair_sched_group(struct task_group *tg)
@@ -8201,10 +8524,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8201 return 1; 8524 return 1;
8202} 8525}
8203 8526
8204static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8205{
8206}
8207
8208static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8527static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8209{ 8528{
8210} 8529}
@@ -8233,7 +8552,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8233{ 8552{
8234 struct rt_rq *rt_rq; 8553 struct rt_rq *rt_rq;
8235 struct sched_rt_entity *rt_se; 8554 struct sched_rt_entity *rt_se;
8236 struct rq *rq;
8237 int i; 8555 int i;
8238 8556
8239 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8557 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8247,8 +8565,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8247 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8565 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8248 8566
8249 for_each_possible_cpu(i) { 8567 for_each_possible_cpu(i) {
8250 rq = cpu_rq(i);
8251
8252 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8568 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8253 GFP_KERNEL, cpu_to_node(i)); 8569 GFP_KERNEL, cpu_to_node(i));
8254 if (!rt_rq) 8570 if (!rt_rq)
@@ -8259,27 +8575,16 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8259 if (!rt_se) 8575 if (!rt_se)
8260 goto err_free_rq; 8576 goto err_free_rq;
8261 8577
8262 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8578 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8263 } 8579 }
8264 8580
8265 return 1; 8581 return 1;
8266 8582
8267 err_free_rq: 8583err_free_rq:
8268 kfree(rt_rq); 8584 kfree(rt_rq);
8269 err: 8585err:
8270 return 0; 8586 return 0;
8271} 8587}
8272
8273static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8274{
8275 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8276 &cpu_rq(cpu)->leaf_rt_rq_list);
8277}
8278
8279static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8280{
8281 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8282}
8283#else /* !CONFIG_RT_GROUP_SCHED */ 8588#else /* !CONFIG_RT_GROUP_SCHED */
8284static inline void free_rt_sched_group(struct task_group *tg) 8589static inline void free_rt_sched_group(struct task_group *tg)
8285{ 8590{
@@ -8290,14 +8595,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8290{ 8595{
8291 return 1; 8596 return 1;
8292} 8597}
8293
8294static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8295{
8296}
8297
8298static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8299{
8300}
8301#endif /* CONFIG_RT_GROUP_SCHED */ 8598#endif /* CONFIG_RT_GROUP_SCHED */
8302 8599
8303#ifdef CONFIG_CGROUP_SCHED 8600#ifdef CONFIG_CGROUP_SCHED
@@ -8305,6 +8602,7 @@ static void free_sched_group(struct task_group *tg)
8305{ 8602{
8306 free_fair_sched_group(tg); 8603 free_fair_sched_group(tg);
8307 free_rt_sched_group(tg); 8604 free_rt_sched_group(tg);
8605 autogroup_free(tg);
8308 kfree(tg); 8606 kfree(tg);
8309} 8607}
8310 8608
@@ -8313,7 +8611,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8313{ 8611{
8314 struct task_group *tg; 8612 struct task_group *tg;
8315 unsigned long flags; 8613 unsigned long flags;
8316 int i;
8317 8614
8318 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8615 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8319 if (!tg) 8616 if (!tg)
@@ -8326,10 +8623,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8326 goto err; 8623 goto err;
8327 8624
8328 spin_lock_irqsave(&task_group_lock, flags); 8625 spin_lock_irqsave(&task_group_lock, flags);
8329 for_each_possible_cpu(i) {
8330 register_fair_sched_group(tg, i);
8331 register_rt_sched_group(tg, i);
8332 }
8333 list_add_rcu(&tg->list, &task_groups); 8626 list_add_rcu(&tg->list, &task_groups);
8334 8627
8335 WARN_ON(!parent); /* root should already exist */ 8628 WARN_ON(!parent); /* root should already exist */
@@ -8359,11 +8652,11 @@ void sched_destroy_group(struct task_group *tg)
8359 unsigned long flags; 8652 unsigned long flags;
8360 int i; 8653 int i;
8361 8654
8362 spin_lock_irqsave(&task_group_lock, flags); 8655 /* end participation in shares distribution */
8363 for_each_possible_cpu(i) { 8656 for_each_possible_cpu(i)
8364 unregister_fair_sched_group(tg, i); 8657 unregister_fair_sched_group(tg, i);
8365 unregister_rt_sched_group(tg, i); 8658
8366 } 8659 spin_lock_irqsave(&task_group_lock, flags);
8367 list_del_rcu(&tg->list); 8660 list_del_rcu(&tg->list);
8368 list_del_rcu(&tg->siblings); 8661 list_del_rcu(&tg->siblings);
8369 spin_unlock_irqrestore(&task_group_lock, flags); 8662 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8386,57 +8679,30 @@ void sched_move_task(struct task_struct *tsk)
8386 rq = task_rq_lock(tsk, &flags); 8679 rq = task_rq_lock(tsk, &flags);
8387 8680
8388 running = task_current(rq, tsk); 8681 running = task_current(rq, tsk);
8389 on_rq = tsk->se.on_rq; 8682 on_rq = tsk->on_rq;
8390 8683
8391 if (on_rq) 8684 if (on_rq)
8392 dequeue_task(rq, tsk, 0); 8685 dequeue_task(rq, tsk, 0);
8393 if (unlikely(running)) 8686 if (unlikely(running))
8394 tsk->sched_class->put_prev_task(rq, tsk); 8687 tsk->sched_class->put_prev_task(rq, tsk);
8395 8688
8396 set_task_rq(tsk, task_cpu(tsk));
8397
8398#ifdef CONFIG_FAIR_GROUP_SCHED 8689#ifdef CONFIG_FAIR_GROUP_SCHED
8399 if (tsk->sched_class->moved_group) 8690 if (tsk->sched_class->task_move_group)
8400 tsk->sched_class->moved_group(tsk, on_rq); 8691 tsk->sched_class->task_move_group(tsk, on_rq);
8692 else
8401#endif 8693#endif
8694 set_task_rq(tsk, task_cpu(tsk));
8402 8695
8403 if (unlikely(running)) 8696 if (unlikely(running))
8404 tsk->sched_class->set_curr_task(rq); 8697 tsk->sched_class->set_curr_task(rq);
8405 if (on_rq) 8698 if (on_rq)
8406 enqueue_task(rq, tsk, 0); 8699 enqueue_task(rq, tsk, 0);
8407 8700
8408 task_rq_unlock(rq, &flags); 8701 task_rq_unlock(rq, tsk, &flags);
8409} 8702}
8410#endif /* CONFIG_CGROUP_SCHED */ 8703#endif /* CONFIG_CGROUP_SCHED */
8411 8704
8412#ifdef CONFIG_FAIR_GROUP_SCHED 8705#ifdef CONFIG_FAIR_GROUP_SCHED
8413static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8414{
8415 struct cfs_rq *cfs_rq = se->cfs_rq;
8416 int on_rq;
8417
8418 on_rq = se->on_rq;
8419 if (on_rq)
8420 dequeue_entity(cfs_rq, se, 0);
8421
8422 se->load.weight = shares;
8423 se->load.inv_weight = 0;
8424
8425 if (on_rq)
8426 enqueue_entity(cfs_rq, se, 0);
8427}
8428
8429static void set_se_shares(struct sched_entity *se, unsigned long shares)
8430{
8431 struct cfs_rq *cfs_rq = se->cfs_rq;
8432 struct rq *rq = cfs_rq->rq;
8433 unsigned long flags;
8434
8435 raw_spin_lock_irqsave(&rq->lock, flags);
8436 __set_se_shares(se, shares);
8437 raw_spin_unlock_irqrestore(&rq->lock, flags);
8438}
8439
8440static DEFINE_MUTEX(shares_mutex); 8706static DEFINE_MUTEX(shares_mutex);
8441 8707
8442int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8708int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8450,46 +8716,25 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8450 if (!tg->se[0]) 8716 if (!tg->se[0])
8451 return -EINVAL; 8717 return -EINVAL;
8452 8718
8453 if (shares < MIN_SHARES) 8719 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8454 shares = MIN_SHARES;
8455 else if (shares > MAX_SHARES)
8456 shares = MAX_SHARES;
8457 8720
8458 mutex_lock(&shares_mutex); 8721 mutex_lock(&shares_mutex);
8459 if (tg->shares == shares) 8722 if (tg->shares == shares)
8460 goto done; 8723 goto done;
8461 8724
8462 spin_lock_irqsave(&task_group_lock, flags);
8463 for_each_possible_cpu(i)
8464 unregister_fair_sched_group(tg, i);
8465 list_del_rcu(&tg->siblings);
8466 spin_unlock_irqrestore(&task_group_lock, flags);
8467
8468 /* wait for any ongoing reference to this group to finish */
8469 synchronize_sched();
8470
8471 /*
8472 * Now we are free to modify the group's share on each cpu
8473 * w/o tripping rebalance_share or load_balance_fair.
8474 */
8475 tg->shares = shares; 8725 tg->shares = shares;
8476 for_each_possible_cpu(i) { 8726 for_each_possible_cpu(i) {
8477 /* 8727 struct rq *rq = cpu_rq(i);
8478 * force a rebalance 8728 struct sched_entity *se;
8479 */ 8729
8480 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8730 se = tg->se[i];
8481 set_se_shares(tg->se[i], shares); 8731 /* Propagate contribution to hierarchy */
8732 raw_spin_lock_irqsave(&rq->lock, flags);
8733 for_each_sched_entity(se)
8734 update_cfs_shares(group_cfs_rq(se));
8735 raw_spin_unlock_irqrestore(&rq->lock, flags);
8482 } 8736 }
8483 8737
8484 /*
8485 * Enable load balance activity on this group, by inserting it back on
8486 * each cpu's rq->leaf_cfs_rq_list.
8487 */
8488 spin_lock_irqsave(&task_group_lock, flags);
8489 for_each_possible_cpu(i)
8490 register_fair_sched_group(tg, i);
8491 list_add_rcu(&tg->siblings, &tg->parent->children);
8492 spin_unlock_irqrestore(&task_group_lock, flags);
8493done: 8738done:
8494 mutex_unlock(&shares_mutex); 8739 mutex_unlock(&shares_mutex);
8495 return 0; 8740 return 0;
@@ -8624,7 +8869,7 @@ static int tg_set_bandwidth(struct task_group *tg,
8624 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8869 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8625 } 8870 }
8626 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8871 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8627 unlock: 8872unlock:
8628 read_unlock(&tasklist_lock); 8873 read_unlock(&tasklist_lock);
8629 mutex_unlock(&rt_constraints_mutex); 8874 mutex_unlock(&rt_constraints_mutex);
8630 8875
@@ -8788,7 +9033,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8788 9033
8789 if (!cgrp->parent) { 9034 if (!cgrp->parent) {
8790 /* This is early initialization for the top cgroup */ 9035 /* This is early initialization for the top cgroup */
8791 return &init_task_group.css; 9036 return &root_task_group.css;
8792 } 9037 }
8793 9038
8794 parent = cgroup_tg(cgrp->parent); 9039 parent = cgroup_tg(cgrp->parent);
@@ -8821,56 +9066,39 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8821 return 0; 9066 return 0;
8822} 9067}
8823 9068
8824static int 9069static void
8825cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 9070cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8826 struct task_struct *tsk, bool threadgroup)
8827{ 9071{
8828 int retval = cpu_cgroup_can_attach_task(cgrp, tsk); 9072 sched_move_task(tsk);
8829 if (retval)
8830 return retval;
8831 if (threadgroup) {
8832 struct task_struct *c;
8833 rcu_read_lock();
8834 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8835 retval = cpu_cgroup_can_attach_task(cgrp, c);
8836 if (retval) {
8837 rcu_read_unlock();
8838 return retval;
8839 }
8840 }
8841 rcu_read_unlock();
8842 }
8843 return 0;
8844} 9073}
8845 9074
8846static void 9075static void
8847cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 9076cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
8848 struct cgroup *old_cont, struct task_struct *tsk, 9077 struct cgroup *old_cgrp, struct task_struct *task)
8849 bool threadgroup)
8850{ 9078{
8851 sched_move_task(tsk); 9079 /*
8852 if (threadgroup) { 9080 * cgroup_exit() is called in the copy_process() failure path.
8853 struct task_struct *c; 9081 * Ignore this case since the task hasn't run yet; this avoids
8854 rcu_read_lock(); 9082 * trying to poke a half freed task state from generic code.
8855 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 9083 */
8856 sched_move_task(c); 9084 if (!(task->flags & PF_EXITING))
8857 } 9085 return;
8858 rcu_read_unlock(); 9086
8859 } 9087 sched_move_task(task);
8860} 9088}
8861 9089
8862#ifdef CONFIG_FAIR_GROUP_SCHED 9090#ifdef CONFIG_FAIR_GROUP_SCHED
8863static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 9091static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8864 u64 shareval) 9092 u64 shareval)
8865{ 9093{
8866 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 9094 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
8867} 9095}
8868 9096
8869static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 9097static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8870{ 9098{
8871 struct task_group *tg = cgroup_tg(cgrp); 9099 struct task_group *tg = cgroup_tg(cgrp);
8872 9100
8873 return (u64) tg->shares; 9101 return (u64) scale_load_down(tg->shares);
8874} 9102}
8875#endif /* CONFIG_FAIR_GROUP_SCHED */ 9103#endif /* CONFIG_FAIR_GROUP_SCHED */
8876 9104
@@ -8929,8 +9157,9 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8929 .name = "cpu", 9157 .name = "cpu",
8930 .create = cpu_cgroup_create, 9158 .create = cpu_cgroup_create,
8931 .destroy = cpu_cgroup_destroy, 9159 .destroy = cpu_cgroup_destroy,
8932 .can_attach = cpu_cgroup_can_attach, 9160 .can_attach_task = cpu_cgroup_can_attach_task,
8933 .attach = cpu_cgroup_attach, 9161 .attach_task = cpu_cgroup_attach_task,
9162 .exit = cpu_cgroup_exit,
8934 .populate = cpu_cgroup_populate, 9163 .populate = cpu_cgroup_populate,
8935 .subsys_id = cpu_cgroup_subsys_id, 9164 .subsys_id = cpu_cgroup_subsys_id,
8936 .early_init = 1, 9165 .early_init = 1,
@@ -9215,72 +9444,3 @@ struct cgroup_subsys cpuacct_subsys = {
9215}; 9444};
9216#endif /* CONFIG_CGROUP_CPUACCT */ 9445#endif /* CONFIG_CGROUP_CPUACCT */
9217 9446
9218#ifndef CONFIG_SMP
9219
9220void synchronize_sched_expedited(void)
9221{
9222 barrier();
9223}
9224EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9225
9226#else /* #ifndef CONFIG_SMP */
9227
9228static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9229
9230static int synchronize_sched_expedited_cpu_stop(void *data)
9231{
9232 /*
9233 * There must be a full memory barrier on each affected CPU
9234 * between the time that try_stop_cpus() is called and the
9235 * time that it returns.
9236 *
9237 * In the current initial implementation of cpu_stop, the
9238 * above condition is already met when the control reaches
9239 * this point and the following smp_mb() is not strictly
9240 * necessary. Do smp_mb() anyway for documentation and
9241 * robustness against future implementation changes.
9242 */
9243 smp_mb(); /* See above comment block. */
9244 return 0;
9245}
9246
9247/*
9248 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9249 * approach to force grace period to end quickly. This consumes
9250 * significant time on all CPUs, and is thus not recommended for
9251 * any sort of common-case code.
9252 *
9253 * Note that it is illegal to call this function while holding any
9254 * lock that is acquired by a CPU-hotplug notifier. Failing to
9255 * observe this restriction will result in deadlock.
9256 */
9257void synchronize_sched_expedited(void)
9258{
9259 int snap, trycount = 0;
9260
9261 smp_mb(); /* ensure prior mod happens before capturing snap. */
9262 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9263 get_online_cpus();
9264 while (try_stop_cpus(cpu_online_mask,
9265 synchronize_sched_expedited_cpu_stop,
9266 NULL) == -EAGAIN) {
9267 put_online_cpus();
9268 if (trycount++ < 10)
9269 udelay(trycount * num_online_cpus());
9270 else {
9271 synchronize_sched();
9272 return;
9273 }
9274 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9275 smp_mb(); /* ensure test happens before caller kfree */
9276 return;
9277 }
9278 get_online_cpus();
9279 }
9280 atomic_inc(&synchronize_sched_expedited_count);
9281 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9282 put_online_cpus();
9283}
9284EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9285
9286#endif /* #else #ifndef CONFIG_SMP */
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644
index 000000000000..429242f3c484
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,275 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include <linux/proc_fs.h>
4#include <linux/seq_file.h>
5#include <linux/kallsyms.h>
6#include <linux/utsname.h>
7
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr;
11
12static void __init autogroup_init(struct task_struct *init_task)
13{
14 autogroup_default.tg = &root_task_group;
15 kref_init(&autogroup_default.kref);
16 init_rwsem(&autogroup_default.lock);
17 init_task->signal->autogroup = &autogroup_default;
18}
19
20static inline void autogroup_free(struct task_group *tg)
21{
22 kfree(tg->autogroup);
23}
24
25static inline void autogroup_destroy(struct kref *kref)
26{
27 struct autogroup *ag = container_of(kref, struct autogroup, kref);
28
29#ifdef CONFIG_RT_GROUP_SCHED
30 /* We've redirected RT tasks to the root task group... */
31 ag->tg->rt_se = NULL;
32 ag->tg->rt_rq = NULL;
33#endif
34 sched_destroy_group(ag->tg);
35}
36
37static inline void autogroup_kref_put(struct autogroup *ag)
38{
39 kref_put(&ag->kref, autogroup_destroy);
40}
41
42static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
43{
44 kref_get(&ag->kref);
45 return ag;
46}
47
48static inline struct autogroup *autogroup_task_get(struct task_struct *p)
49{
50 struct autogroup *ag;
51 unsigned long flags;
52
53 if (!lock_task_sighand(p, &flags))
54 return autogroup_kref_get(&autogroup_default);
55
56 ag = autogroup_kref_get(p->signal->autogroup);
57 unlock_task_sighand(p, &flags);
58
59 return ag;
60}
61
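
sched_autogroup.c above ties autogroup lifetime to a kref: every user takes a reference, and the destructor runs when the last one is put. A simplified, non-atomic standalone sketch of that get/put-with-release pattern; struct ref, struct box and the helpers are illustrative, not the kernel kref API.

#include <stdio.h>
#include <stdlib.h>

struct ref {
        int count;                      /* the real kref uses an atomic counter */
        void (*release)(struct ref *r);
};

static void ref_init(struct ref *r, void (*release)(struct ref *))
{
        r->count = 1;
        r->release = release;
}

static void ref_get(struct ref *r)
{
        r->count++;
}

static void ref_put(struct ref *r)
{
        if (--r->count == 0)
                r->release(r);          /* last reference: destroy the object */
}

struct box {
        struct ref ref;                 /* first member, so the cast below is valid */
        int payload;
};

static void box_release(struct ref *r)
{
        struct box *b = (struct box *)r;

        printf("releasing box with payload %d\n", b->payload);
        free(b);
}

int main(void)
{
        struct box *b = malloc(sizeof(*b));

        ref_init(&b->ref, box_release);
        b->payload = 42;

        ref_get(&b->ref);               /* second user */
        ref_put(&b->ref);               /* first user drops */
        ref_put(&b->ref);               /* last reference: box_release() runs */
        return 0;
}
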
62#ifdef CONFIG_RT_GROUP_SCHED
63static void free_rt_sched_group(struct task_group *tg);
64#endif
65
66static inline struct autogroup *autogroup_create(void)
67{
68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
69 struct task_group *tg;
70
71 if (!ag)
72 goto out_fail;
73
74 tg = sched_create_group(&root_task_group);
75
76 if (IS_ERR(tg))
77 goto out_free;
78
79 kref_init(&ag->kref);
80 init_rwsem(&ag->lock);
81 ag->id = atomic_inc_return(&autogroup_seq_nr);
82 ag->tg = tg;
83#ifdef CONFIG_RT_GROUP_SCHED
84 /*
85 * Autogroup RT tasks are redirected to the root task group
86 * so we don't have to move tasks around upon policy change,
87 * or flail around trying to allocate bandwidth on the fly.
88 * A bandwidth exception in __sched_setscheduler() allows
89 * the policy change to proceed. Thereafter, task_group()
90 * returns &root_task_group, so zero bandwidth is required.
91 */
92 free_rt_sched_group(tg);
93 tg->rt_se = root_task_group.rt_se;
94 tg->rt_rq = root_task_group.rt_rq;
95#endif
96 tg->autogroup = ag;
97
98 return ag;
99
100out_free:
101 kfree(ag);
102out_fail:
103 if (printk_ratelimit()) {
104 printk(KERN_WARNING "autogroup_create: %s failure.\n",
105 ag ? "sched_create_group()" : "kmalloc()");
106 }
107
108 return autogroup_kref_get(&autogroup_default);
109}
110
111static inline bool
112task_wants_autogroup(struct task_struct *p, struct task_group *tg)
113{
114 if (tg != &root_task_group)
115 return false;
116
117 if (p->sched_class != &fair_sched_class)
118 return false;
119
120 /*
121 * We can only assume the task group can't go away on us if
122 * autogroup_move_group() can see us on ->thread_group list.
123 */
124 if (p->flags & PF_EXITING)
125 return false;
126
127 return true;
128}
129
130static inline bool task_group_is_autogroup(struct task_group *tg)
131{
132 return !!tg->autogroup;
133}
134
135static inline struct task_group *
136autogroup_task_group(struct task_struct *p, struct task_group *tg)
137{
138 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
139
140 if (enabled && task_wants_autogroup(p, tg))
141 return p->signal->autogroup->tg;
142
143 return tg;
144}
145
146static void
147autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148{
149 struct autogroup *prev;
150 struct task_struct *t;
151 unsigned long flags;
152
153 BUG_ON(!lock_task_sighand(p, &flags));
154
155 prev = p->signal->autogroup;
156 if (prev == ag) {
157 unlock_task_sighand(p, &flags);
158 return;
159 }
160
161 p->signal->autogroup = autogroup_kref_get(ag);
162
163 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
164 goto out;
165
166 t = p;
167 do {
168 sched_move_task(t);
169 } while_each_thread(p, t);
170
171out:
172 unlock_task_sighand(p, &flags);
173 autogroup_kref_put(prev);
174}
175
176/* Allocates GFP_KERNEL, cannot be called under any spinlock */
177void sched_autogroup_create_attach(struct task_struct *p)
178{
179 struct autogroup *ag = autogroup_create();
180
181 autogroup_move_group(p, ag);
182 /* drop extra reference added by autogroup_create() */
183 autogroup_kref_put(ag);
184}
185EXPORT_SYMBOL(sched_autogroup_create_attach);
186
187/* Cannot be called under siglock. Currently has no users */
188void sched_autogroup_detach(struct task_struct *p)
189{
190 autogroup_move_group(p, &autogroup_default);
191}
192EXPORT_SYMBOL(sched_autogroup_detach);
193
194void sched_autogroup_fork(struct signal_struct *sig)
195{
196 sig->autogroup = autogroup_task_get(current);
197}
198
199void sched_autogroup_exit(struct signal_struct *sig)
200{
201 autogroup_kref_put(sig->autogroup);
202}
203
204static int __init setup_autogroup(char *str)
205{
206 sysctl_sched_autogroup_enabled = 0;
207
208 return 1;
209}
210
211__setup("noautogroup", setup_autogroup);
212
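setup_autogroup() only covers the boot-time "noautogroup" parameter; the same flag, sysctl_sched_autogroup_enabled, is also exported as a sysctl, so autogrouping can be toggled at run time with roughly the same effect. A small userspace sketch, assuming the usual /proc/sys/kernel/sched_autogroup_enabled path:

#include <stdio.h>

/* Toggle autogroup at run time; writing 0 is roughly equivalent to
 * booting with "noautogroup", writing 1 re-enables it. */
int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/sched_autogroup_enabled", "w");

	if (!f) {
		perror("sched_autogroup_enabled");
		return 1;
	}
	fputs("0\n", f);
	fclose(f);
	return 0;
}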
213#ifdef CONFIG_PROC_FS
214
215int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
216{
217 static unsigned long next = INITIAL_JIFFIES;
218 struct autogroup *ag;
219 int err;
220
221 if (*nice < -20 || *nice > 19)
222 return -EINVAL;
223
224 err = security_task_setnice(current, *nice);
225 if (err)
226 return err;
227
228 if (*nice < 0 && !can_nice(current, *nice))
229 return -EPERM;
230
231 /* this is a heavy operation taking global locks.. */
232 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
233 return -EAGAIN;
234
235 next = HZ / 10 + jiffies;
236 ag = autogroup_task_get(p);
237
238 down_write(&ag->lock);
239 err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
240 if (!err)
241 ag->nice = *nice;
242 up_write(&ag->lock);
243
244 autogroup_kref_put(ag);
245
246 return err;
247}
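proc_sched_autogroup_set_nice() reuses the per-task prio_to_weight[] table, so an autogroup's nice value maps to group shares exactly like a task's nice maps to its load weight (nice 0 = 1024, roughly a 1.25x step per nice level). A standalone sketch with two illustrative weights taken from that standard table:

#include <stdio.h>

/* Two sessions competing: one autogroup left at nice 0 (weight 1024),
 * one reniced to +5 (weight 335 in the standard prio_to_weight table).
 * The reniced session ends up with roughly a quarter of the CPU. */
int main(void)
{
	double w_default = 1024.0;	/* autogroup at nice 0 */
	double w_niced = 335.0;		/* autogroup at nice +5 */

	printf("reniced group share: %.1f%%\n",
	       100.0 * w_niced / (w_niced + w_default));
	return 0;
}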
248
249void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
250{
251 struct autogroup *ag = autogroup_task_get(p);
252
253 if (!task_group_is_autogroup(ag->tg))
254 goto out;
255
256 down_read(&ag->lock);
257 seq_printf(m, "/autogroup-%lu nice %d\n", ag->id, ag->nice);
258 up_read(&ag->lock);
259
260out:
261 autogroup_kref_put(ag);
262}
263#endif /* CONFIG_PROC_FS */
264
265#ifdef CONFIG_SCHED_DEBUG
266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
267{
268 if (!task_group_is_autogroup(tg))
269 return 0;
270
271 return snprintf(buf, buflen, "%s-%lu", "/autogroup", tg->autogroup->id);
272}
273#endif /* CONFIG_SCHED_DEBUG */
274
275#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
new file mode 100644
index 000000000000..05577055cfca
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,41 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3struct autogroup {
4 /*
5 * The reference count does not track how many threads are
6 * attached to this autogroup right now; it counts how many
7 * tasks could still use this autogroup.
8 */
9 struct kref kref;
10 struct task_group *tg;
11 struct rw_semaphore lock;
12 unsigned long id;
13 int nice;
14};
15
16static inline struct task_group *
17autogroup_task_group(struct task_struct *p, struct task_group *tg);
18
19#else /* !CONFIG_SCHED_AUTOGROUP */
20
21static inline void autogroup_init(struct task_struct *init_task) { }
22static inline void autogroup_free(struct task_group *tg) { }
23static inline bool task_group_is_autogroup(struct task_group *tg)
24{
25 return false;
26}
27
28static inline struct task_group *
29autogroup_task_group(struct task_struct *p, struct task_group *tg)
30{
31 return tg;
32}
33
34#ifdef CONFIG_SCHED_DEBUG
35static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
36{
37 return 0;
38}
39#endif
40
41#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 52f1a149bfb1..9d8af0b3fb64 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
79} 79}
80EXPORT_SYMBOL_GPL(sched_clock); 80EXPORT_SYMBOL_GPL(sched_clock);
81 81
82static __read_mostly int sched_clock_running; 82__read_mostly int sched_clock_running;
83 83
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable; 85__read_mostly int sched_clock_stable;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d17dd9b..a6710a112b4f 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19static DEFINE_SPINLOCK(sched_debug_lock);
20
19/* 21/*
20 * This allows printing both to /proc/sched_debug and 22 * This allows printing both to /proc/sched_debug and
21 * to the console 23 * to the console
@@ -54,8 +56,7 @@ static unsigned long nsec_low(unsigned long long nsec)
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 56#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 57
56#ifdef CONFIG_FAIR_GROUP_SCHED 58#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu, 59static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
58 struct task_group *tg)
59{ 60{
60 struct sched_entity *se = tg->se[cpu]; 61 struct sched_entity *se = tg->se[cpu];
61 if (!se) 62 if (!se)
@@ -87,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
87} 88}
88#endif 89#endif
89 90
91#ifdef CONFIG_CGROUP_SCHED
92static char group_path[PATH_MAX];
93
94static char *task_group_path(struct task_group *tg)
95{
96 if (autogroup_path(tg, group_path, PATH_MAX))
97 return group_path;
98
99 /*
100 * May be NULL if the underlying cgroup isn't fully-created yet
101 */
102 if (!tg->css.cgroup) {
103 group_path[0] = '\0';
104 return group_path;
105 }
106 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
107 return group_path;
108}
109#endif
110
90static void 111static void
91print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 112print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
92{ 113{
@@ -109,17 +130,10 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 130 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 131 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
111#endif 132#endif
112
113#ifdef CONFIG_CGROUP_SCHED 133#ifdef CONFIG_CGROUP_SCHED
114 { 134 SEQ_printf(m, " %s", task_group_path(task_group(p)));
115 char path[64];
116
117 rcu_read_lock();
118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
120 SEQ_printf(m, " %s", path);
121 }
122#endif 135#endif
136
123 SEQ_printf(m, "\n"); 137 SEQ_printf(m, "\n");
124} 138}
125 139
@@ -138,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
138 read_lock_irqsave(&tasklist_lock, flags); 152 read_lock_irqsave(&tasklist_lock, flags);
139 153
140 do_each_thread(g, p) { 154 do_each_thread(g, p) {
141 if (!p->se.on_rq || task_cpu(p) != rq_cpu) 155 if (!p->on_rq || task_cpu(p) != rq_cpu)
142 continue; 156 continue;
143 157
144 print_task(m, rq, p); 158 print_task(m, rq, p);
@@ -147,19 +161,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
147 read_unlock_irqrestore(&tasklist_lock, flags); 161 read_unlock_irqrestore(&tasklist_lock, flags);
148} 162}
149 163
150#if defined(CONFIG_CGROUP_SCHED) && \
151 (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
152static void task_group_path(struct task_group *tg, char *buf, int buflen)
153{
154 /* may be NULL if the underlying cgroup isn't fully-created yet */
155 if (!tg->css.cgroup) {
156 buf[0] = '\0';
157 return;
158 }
159 cgroup_path(tg->css.cgroup, buf, buflen);
160}
161#endif
162
163void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 164void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
164{ 165{
165 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 166 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,13 +169,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
168 struct sched_entity *last; 169 struct sched_entity *last;
169 unsigned long flags; 170 unsigned long flags;
170 171
171#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) 172#ifdef CONFIG_FAIR_GROUP_SCHED
172 char path[128]; 173 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
173 struct task_group *tg = cfs_rq->tg;
174
175 task_group_path(tg, path, sizeof(path));
176
177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
178#else 174#else
179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 175 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
180#endif 176#endif
@@ -183,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
183 179
184 raw_spin_lock_irqsave(&rq->lock, flags); 180 raw_spin_lock_irqsave(&rq->lock, flags);
185 if (cfs_rq->rb_leftmost) 181 if (cfs_rq->rb_leftmost)
186 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 182 MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
187 last = __pick_last_entity(cfs_rq); 183 last = __pick_last_entity(cfs_rq);
188 if (last) 184 if (last)
189 max_vruntime = last->vruntime; 185 max_vruntime = last->vruntime;
@@ -202,33 +198,34 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 spread0 = min_vruntime - rq0_min_vruntime; 198 spread0 = min_vruntime - rq0_min_vruntime;
203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", 199 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
204 SPLIT_NS(spread0)); 200 SPLIT_NS(spread0));
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207
208 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 201 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
209 cfs_rq->nr_spread_over); 202 cfs_rq->nr_spread_over);
203 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
204 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
210#ifdef CONFIG_FAIR_GROUP_SCHED 205#ifdef CONFIG_FAIR_GROUP_SCHED
211#ifdef CONFIG_SMP 206#ifdef CONFIG_SMP
212 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 207 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
208 SPLIT_NS(cfs_rq->load_avg));
209 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
210 SPLIT_NS(cfs_rq->load_period));
211 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
212 cfs_rq->load_contribution);
213 SEQ_printf(m, " .%-30s: %d\n", "load_tg",
214 atomic_read(&cfs_rq->tg->load_weight));
213#endif 215#endif
216
214 print_cfs_group_stats(m, cpu, cfs_rq->tg); 217 print_cfs_group_stats(m, cpu, cfs_rq->tg);
215#endif 218#endif
216} 219}
217 220
218void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 221void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
219{ 222{
220#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) 223#ifdef CONFIG_RT_GROUP_SCHED
221 char path[128]; 224 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
222 struct task_group *tg = rt_rq->tg;
223
224 task_group_path(tg, path, sizeof(path));
225
226 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
227#else 225#else
228 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); 226 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
229#endif 227#endif
230 228
231
232#define P(x) \ 229#define P(x) \
233 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 230 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
234#define PN(x) \ 231#define PN(x) \
@@ -243,9 +240,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
243#undef P 240#undef P
244} 241}
245 242
243extern __read_mostly int sched_clock_running;
244
246static void print_cpu(struct seq_file *m, int cpu) 245static void print_cpu(struct seq_file *m, int cpu)
247{ 246{
248 struct rq *rq = cpu_rq(cpu); 247 struct rq *rq = cpu_rq(cpu);
248 unsigned long flags;
249 249
250#ifdef CONFIG_X86 250#ifdef CONFIG_X86
251 { 251 {
@@ -296,14 +296,17 @@ static void print_cpu(struct seq_file *m, int cpu)
296 P(ttwu_count); 296 P(ttwu_count);
297 P(ttwu_local); 297 P(ttwu_local);
298 298
299 P(bkl_count);
300
301#undef P 299#undef P
300#undef P64
302#endif 301#endif
302 spin_lock_irqsave(&sched_debug_lock, flags);
303 print_cfs_stats(m, cpu); 303 print_cfs_stats(m, cpu);
304 print_rt_stats(m, cpu); 304 print_rt_stats(m, cpu);
305 305
306 rcu_read_lock();
306 print_rq(m, rq, cpu); 307 print_rq(m, rq, cpu);
308 rcu_read_unlock();
309 spin_unlock_irqrestore(&sched_debug_lock, flags);
307} 310}
308 311
309static const char *sched_tunable_scaling_names[] = { 312static const char *sched_tunable_scaling_names[] = {
@@ -314,21 +317,42 @@ static const char *sched_tunable_scaling_names[] = {
314 317
315static int sched_debug_show(struct seq_file *m, void *v) 318static int sched_debug_show(struct seq_file *m, void *v)
316{ 319{
317 u64 now = ktime_to_ns(ktime_get()); 320 u64 ktime, sched_clk, cpu_clk;
321 unsigned long flags;
318 int cpu; 322 int cpu;
319 323
320 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", 324 local_irq_save(flags);
325 ktime = ktime_to_ns(ktime_get());
326 sched_clk = sched_clock();
327 cpu_clk = local_clock();
328 local_irq_restore(flags);
329
330 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
321 init_utsname()->release, 331 init_utsname()->release,
322 (int)strcspn(init_utsname()->version, " "), 332 (int)strcspn(init_utsname()->version, " "),
323 init_utsname()->version); 333 init_utsname()->version);
324 334
325 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); 335#define P(x) \
336 SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
337#define PN(x) \
338 SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
339 PN(ktime);
340 PN(sched_clk);
341 PN(cpu_clk);
342 P(jiffies);
343#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
344 P(sched_clock_stable);
345#endif
346#undef PN
347#undef P
348
349 SEQ_printf(m, "\n");
350 SEQ_printf(m, "sysctl_sched\n");
326 351
327#define P(x) \ 352#define P(x) \
328 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) 353 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
329#define PN(x) \ 354#define PN(x) \
330 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 355 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
331 P(jiffies);
332 PN(sysctl_sched_latency); 356 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 357 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 358 PN(sysctl_sched_wakeup_granularity);
@@ -414,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
414 P(se.statistics.wait_count); 438 P(se.statistics.wait_count);
415 PN(se.statistics.iowait_sum); 439 PN(se.statistics.iowait_sum);
416 P(se.statistics.iowait_count); 440 P(se.statistics.iowait_count);
417 P(sched_info.bkl_count);
418 P(se.nr_migrations); 441 P(se.nr_migrations);
419 P(se.statistics.nr_migrations_cold); 442 P(se.statistics.nr_migrations_cold);
420 P(se.statistics.nr_failed_migrations_affine); 443 P(se.statistics.nr_failed_migrations_affine);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e0e8d5ca3c98..334eb474af93 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -22,10 +22,11 @@
22 22
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h>
25 26
26/* 27/*
27 * Targeted preemption latency for CPU-bound tasks: 28 * Targeted preemption latency for CPU-bound tasks:
28 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) 29 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
29 * 30 *
30 * NOTE: this latency value is not the same as the concept of 31 * NOTE: this latency value is not the same as the concept of
31 * 'timeslice length' - timeslices in CFS are of variable length 32 * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,7 +53,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 53
53/* 54/*
54 * Minimal preemption granularity for CPU-bound tasks: 55 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) 56 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 57 */
57unsigned int sysctl_sched_min_granularity = 750000ULL; 58unsigned int sysctl_sched_min_granularity = 750000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; 59unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
@@ -69,14 +70,6 @@ static unsigned int sched_nr_latency = 8;
69unsigned int sysctl_sched_child_runs_first __read_mostly; 70unsigned int sysctl_sched_child_runs_first __read_mostly;
70 71
71/* 72/*
72 * sys_sched_yield() compat mode
73 *
74 * This option switches the agressive yield implementation of the
75 * old scheduler back on.
76 */
77unsigned int __read_mostly sysctl_sched_compat_yield;
78
79/*
80 * SCHED_OTHER wake-up granularity. 73 * SCHED_OTHER wake-up granularity.
81 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 74 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
82 * 75 *
@@ -89,6 +82,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89 82
90const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 83const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91 84
85/*
86 * The exponential sliding window over which load is averaged for shares
87 * distribution.
88 * (default: 10msec)
89 */
90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
91
92static const struct sched_class fair_sched_class; 92static const struct sched_class fair_sched_class;
93 93
94/************************************************************** 94/**************************************************************
@@ -143,6 +143,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
143 return cfs_rq->tg->cfs_rq[this_cpu]; 143 return cfs_rq->tg->cfs_rq[this_cpu];
144} 144}
145 145
146static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
147{
148 if (!cfs_rq->on_list) {
149 /*
150 * Ensure we either appear before our parent (if already
151 * enqueued) or force our parent to appear after us when it is
152 * enqueued. The fact that we always enqueue bottom-up
153 * reduces this to two cases.
154 */
155 if (cfs_rq->tg->parent &&
156 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
157 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
158 &rq_of(cfs_rq)->leaf_cfs_rq_list);
159 } else {
160 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
161 &rq_of(cfs_rq)->leaf_cfs_rq_list);
162 }
163
164 cfs_rq->on_list = 1;
165 }
166}
167
168static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
169{
170 if (cfs_rq->on_list) {
171 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
172 cfs_rq->on_list = 0;
173 }
174}
175
146/* Iterate through all leaf cfs_rq's on a runqueue */ 176/* Iterate through all leaf cfs_rq's on a runqueue */
147#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 177#define for_each_leaf_cfs_rq(rq, cfs_rq) \
148 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 178 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
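The new on_list handling above guarantees that a child cfs_rq is always linked ahead of its parent on rq->leaf_cfs_rq_list, so a single for_each_leaf_cfs_rq() pass visits groups bottom-up. A sketch of such a walk; this is essentially what update_shares(), added to the load-balancing code further down in this diff, does, minus the RCU read lock:

/* Walk every queued group cfs_rq on this runqueue, children before
 * parents, folding each one's load into its task group. Sketch only;
 * update_shares() below is the real, RCU-protected version. */
static void walk_leaf_cfs_rqs(struct rq *rq)
{
	struct cfs_rq *cfs_rq;

	for_each_leaf_cfs_rq(rq, cfs_rq)
		update_shares_cpu(cfs_rq->tg, cpu_of(rq));
}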
@@ -246,6 +276,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
246 return &cpu_rq(this_cpu)->cfs; 276 return &cpu_rq(this_cpu)->cfs;
247} 277}
248 278
279static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
280{
281}
282
283static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
284{
285}
286
249#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 287#define for_each_leaf_cfs_rq(rq, cfs_rq) \
250 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 288 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
251 289
@@ -320,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
320 } 358 }
321 359
322 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 360 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
361#ifndef CONFIG_64BIT
362 smp_wmb();
363 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
364#endif
323} 365}
324 366
325/* 367/*
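The min_vruntime_copy update above is one half of a torn-read guard for 32-bit builds: the writer publishes the 64-bit value, issues smp_wmb(), then mirrors it into the copy; the reader (task_waking_fair(), further down in this diff) retries until value and copy agree. A minimal sketch of the idiom, assuming kernel context for u64 and the barrier primitives; it is the shape of the pattern, not a drop-in:

struct published_u64 {
	u64 val;
#ifndef CONFIG_64BIT
	u64 copy;
#endif
};

/* Writer side: mirrors update_min_vruntime() above. */
static void publish_u64(struct published_u64 *p, u64 v)
{
	p->val = v;
#ifndef CONFIG_64BIT
	smp_wmb();		/* order val before copy */
	p->copy = p->val;
#endif
}

/* Reader side: mirrors the retry loop in task_waking_fair(). */
static u64 read_u64(struct published_u64 *p)
{
#ifndef CONFIG_64BIT
	u64 v, copy;

	do {
		copy = p->copy;
		smp_rmb();	/* pairs with the smp_wmb() above */
		v = p->val;
	} while (v != copy);

	return v;
#else
	return p->val;
#endif
}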
@@ -374,7 +416,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
374 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 416 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
375} 417}
376 418
377static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 419static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
378{ 420{
379 struct rb_node *left = cfs_rq->rb_leftmost; 421 struct rb_node *left = cfs_rq->rb_leftmost;
380 422
@@ -384,6 +426,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
384 return rb_entry(left, struct sched_entity, run_node); 426 return rb_entry(left, struct sched_entity, run_node);
385} 427}
386 428
429static struct sched_entity *__pick_next_entity(struct sched_entity *se)
430{
431 struct rb_node *next = rb_next(&se->run_node);
432
433 if (!next)
434 return NULL;
435
436 return rb_entry(next, struct sched_entity, run_node);
437}
438
439#ifdef CONFIG_SCHED_DEBUG
387static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 440static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
388{ 441{
389 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 442 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -398,7 +451,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
398 * Scheduling class statistics methods: 451 * Scheduling class statistics methods:
399 */ 452 */
400 453
401#ifdef CONFIG_SCHED_DEBUG
402int sched_proc_update_handler(struct ctl_table *table, int write, 454int sched_proc_update_handler(struct ctl_table *table, int write,
403 void __user *buffer, size_t *lenp, 455 void __user *buffer, size_t *lenp,
404 loff_t *ppos) 456 loff_t *ppos)
@@ -417,7 +469,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
417 WRT_SYSCTL(sched_min_granularity); 469 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency); 470 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity); 471 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL 472#undef WRT_SYSCTL
422 473
423 return 0; 474 return 0;
@@ -495,6 +546,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
495 return calc_delta_fair(sched_slice(cfs_rq, se), se); 546 return calc_delta_fair(sched_slice(cfs_rq, se), se);
496} 547}
497 548
549static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
550static void update_cfs_shares(struct cfs_rq *cfs_rq);
551
498/* 552/*
499 * Update the current task's runtime statistics. Skip current tasks that 553 * Update the current task's runtime statistics. Skip current tasks that
500 * are not in our scheduling class. 554 * are not in our scheduling class.
@@ -514,12 +568,16 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
514 568
515 curr->vruntime += delta_exec_weighted; 569 curr->vruntime += delta_exec_weighted;
516 update_min_vruntime(cfs_rq); 570 update_min_vruntime(cfs_rq);
571
572#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
573 cfs_rq->load_unacc_exec_time += delta_exec;
574#endif
517} 575}
518 576
519static void update_curr(struct cfs_rq *cfs_rq) 577static void update_curr(struct cfs_rq *cfs_rq)
520{ 578{
521 struct sched_entity *curr = cfs_rq->curr; 579 struct sched_entity *curr = cfs_rq->curr;
522 u64 now = rq_of(cfs_rq)->clock; 580 u64 now = rq_of(cfs_rq)->clock_task;
523 unsigned long delta_exec; 581 unsigned long delta_exec;
524 582
525 if (unlikely(!curr)) 583 if (unlikely(!curr))
@@ -602,7 +660,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
602 /* 660 /*
603 * We are starting a new run period: 661 * We are starting a new run period:
604 */ 662 */
605 se->exec_start = rq_of(cfs_rq)->clock; 663 se->exec_start = rq_of(cfs_rq)->clock_task;
606} 664}
607 665
608/************************************************** 666/**************************************************
@@ -633,7 +691,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 list_add(&se->group_node, &cfs_rq->tasks); 691 list_add(&se->group_node, &cfs_rq->tasks);
634 } 692 }
635 cfs_rq->nr_running++; 693 cfs_rq->nr_running++;
636 se->on_rq = 1;
637} 694}
638 695
639static void 696static void
@@ -647,9 +704,164 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
647 list_del_init(&se->group_node); 704 list_del_init(&se->group_node);
648 } 705 }
649 cfs_rq->nr_running--; 706 cfs_rq->nr_running--;
650 se->on_rq = 0;
651} 707}
652 708
709#ifdef CONFIG_FAIR_GROUP_SCHED
710# ifdef CONFIG_SMP
711static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
712 int global_update)
713{
714 struct task_group *tg = cfs_rq->tg;
715 long load_avg;
716
717 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
718 load_avg -= cfs_rq->load_contribution;
719
720 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
721 atomic_add(load_avg, &tg->load_weight);
722 cfs_rq->load_contribution += load_avg;
723 }
724}
725
726static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
727{
728 u64 period = sysctl_sched_shares_window;
729 u64 now, delta;
730 unsigned long load = cfs_rq->load.weight;
731
732 if (cfs_rq->tg == &root_task_group)
733 return;
734
735 now = rq_of(cfs_rq)->clock_task;
736 delta = now - cfs_rq->load_stamp;
737
738 /* truncate load history at 4 idle periods */
739 if (cfs_rq->load_stamp > cfs_rq->load_last &&
740 now - cfs_rq->load_last > 4 * period) {
741 cfs_rq->load_period = 0;
742 cfs_rq->load_avg = 0;
743 delta = period - 1;
744 }
745
746 cfs_rq->load_stamp = now;
747 cfs_rq->load_unacc_exec_time = 0;
748 cfs_rq->load_period += delta;
749 if (load) {
750 cfs_rq->load_last = now;
751 cfs_rq->load_avg += delta * load;
752 }
753
754 /* consider updating load contribution on each fold or truncate */
755 if (global_update || cfs_rq->load_period > period
756 || !cfs_rq->load_period)
757 update_cfs_rq_load_contribution(cfs_rq, global_update);
758
759 while (cfs_rq->load_period > period) {
760 /*
761 * Inline assembly required to prevent the compiler
762 * optimising this loop into a divmod call.
763 * See __iter_div_u64_rem() for another example of this.
764 */
765 asm("" : "+rm" (cfs_rq->load_period));
766 cfs_rq->load_period /= 2;
767 cfs_rq->load_avg /= 2;
768 }
769
770 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
771 list_del_leaf_cfs_rq(cfs_rq);
772}
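The tail of update_cfs_load() folds old history out of the sliding window by repeatedly halving both the accumulated period and the load sum, which keeps their ratio (the average) intact up to rounding while avoiding a 64-bit division or modulo; the empty asm() is only there so the compiler cannot collapse the loop back into a divmod. A stripped-down sketch of the same decay step (hypothetical helper, kernel-style u64):

/* Halve period and sum together until the period fits into one
 * window again; load_avg / load_period stays the same throughout. */
static void decay_load_window(u64 *load_period, u64 *load_avg, u64 period)
{
	while (*load_period > period) {
		/* the real code adds asm("" : "+rm"(*load_period)) here */
		*load_period >>= 1;
		*load_avg >>= 1;
	}
}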
773
774static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
775{
776 long load_weight, load, shares;
777
778 load = cfs_rq->load.weight;
779
780 load_weight = atomic_read(&tg->load_weight);
781 load_weight += load;
782 load_weight -= cfs_rq->load_contribution;
783
784 shares = (tg->shares * load);
785 if (load_weight)
786 shares /= load_weight;
787
788 if (shares < MIN_SHARES)
789 shares = MIN_SHARES;
790 if (shares > tg->shares)
791 shares = tg->shares;
792
793 return shares;
794}
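calc_cfs_shares() sizes this CPU's slice of the group weight as tg->shares scaled by the fraction of the group's total load that sits on this runqueue, then clamps the result to [MIN_SHARES, tg->shares]. A worked instance with made-up numbers:

/* Arithmetic-only sketch: a group with tg->shares = 1024 whose local
 * cfs_rq carries half of the group's total load gets half the shares;
 * the clamp to [MIN_SHARES, tg->shares] does not trigger here. */
static long calc_cfs_shares_example(void)
{
	long tg_shares = 1024;	/* group's configured weight */
	long load = 2048;	/* this cpu's cfs_rq load.weight */
	long others = 2048;	/* load contributed by the other cpus */
	long load_weight = others + load;

	return tg_shares * load / load_weight;	/* = 512 */
}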
795
796static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
797{
798 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
799 update_cfs_load(cfs_rq, 0);
800 update_cfs_shares(cfs_rq);
801 }
802}
803# else /* CONFIG_SMP */
804static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
805{
806}
807
808static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
809{
810 return tg->shares;
811}
812
813static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
814{
815}
816# endif /* CONFIG_SMP */
817static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
818 unsigned long weight)
819{
820 if (se->on_rq) {
821 /* commit outstanding execution time */
822 if (cfs_rq->curr == se)
823 update_curr(cfs_rq);
824 account_entity_dequeue(cfs_rq, se);
825 }
826
827 update_load_set(&se->load, weight);
828
829 if (se->on_rq)
830 account_entity_enqueue(cfs_rq, se);
831}
832
833static void update_cfs_shares(struct cfs_rq *cfs_rq)
834{
835 struct task_group *tg;
836 struct sched_entity *se;
837 long shares;
838
839 tg = cfs_rq->tg;
840 se = tg->se[cpu_of(rq_of(cfs_rq))];
841 if (!se)
842 return;
843#ifndef CONFIG_SMP
844 if (likely(se->load.weight == tg->shares))
845 return;
846#endif
847 shares = calc_cfs_shares(cfs_rq, tg);
848
849 reweight_entity(cfs_rq_of(se), se, shares);
850}
851#else /* CONFIG_FAIR_GROUP_SCHED */
852static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
853{
854}
855
856static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
857{
858}
859
860static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
861{
862}
863#endif /* CONFIG_FAIR_GROUP_SCHED */
864
653static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 865static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
654{ 866{
655#ifdef CONFIG_SCHEDSTATS 867#ifdef CONFIG_SCHEDSTATS
@@ -771,7 +983,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
771 * Update run-time statistics of the 'current'. 983 * Update run-time statistics of the 'current'.
772 */ 984 */
773 update_curr(cfs_rq); 985 update_curr(cfs_rq);
986 update_cfs_load(cfs_rq, 0);
774 account_entity_enqueue(cfs_rq, se); 987 account_entity_enqueue(cfs_rq, se);
988 update_cfs_shares(cfs_rq);
775 989
776 if (flags & ENQUEUE_WAKEUP) { 990 if (flags & ENQUEUE_WAKEUP) {
777 place_entity(cfs_rq, se, 0); 991 place_entity(cfs_rq, se, 0);
@@ -782,21 +996,55 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
782 check_spread(cfs_rq, se); 996 check_spread(cfs_rq, se);
783 if (se != cfs_rq->curr) 997 if (se != cfs_rq->curr)
784 __enqueue_entity(cfs_rq, se); 998 __enqueue_entity(cfs_rq, se);
999 se->on_rq = 1;
1000
1001 if (cfs_rq->nr_running == 1)
1002 list_add_leaf_cfs_rq(cfs_rq);
1003}
1004
1005static void __clear_buddies_last(struct sched_entity *se)
1006{
1007 for_each_sched_entity(se) {
1008 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1009 if (cfs_rq->last == se)
1010 cfs_rq->last = NULL;
1011 else
1012 break;
1013 }
785} 1014}
786 1015
787static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1016static void __clear_buddies_next(struct sched_entity *se)
788{ 1017{
789 if (!se || cfs_rq->last == se) 1018 for_each_sched_entity(se) {
790 cfs_rq->last = NULL; 1019 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1020 if (cfs_rq->next == se)
1021 cfs_rq->next = NULL;
1022 else
1023 break;
1024 }
1025}
791 1026
792 if (!se || cfs_rq->next == se) 1027static void __clear_buddies_skip(struct sched_entity *se)
793 cfs_rq->next = NULL; 1028{
1029 for_each_sched_entity(se) {
1030 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1031 if (cfs_rq->skip == se)
1032 cfs_rq->skip = NULL;
1033 else
1034 break;
1035 }
794} 1036}
795 1037
796static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1038static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
797{ 1039{
798 for_each_sched_entity(se) 1040 if (cfs_rq->last == se)
799 __clear_buddies(cfs_rq_of(se), se); 1041 __clear_buddies_last(se);
1042
1043 if (cfs_rq->next == se)
1044 __clear_buddies_next(se);
1045
1046 if (cfs_rq->skip == se)
1047 __clear_buddies_skip(se);
800} 1048}
801 1049
802static void 1050static void
@@ -825,8 +1073,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
825 1073
826 if (se != cfs_rq->curr) 1074 if (se != cfs_rq->curr)
827 __dequeue_entity(cfs_rq, se); 1075 __dequeue_entity(cfs_rq, se);
1076 se->on_rq = 0;
1077 update_cfs_load(cfs_rq, 0);
828 account_entity_dequeue(cfs_rq, se); 1078 account_entity_dequeue(cfs_rq, se);
829 update_min_vruntime(cfs_rq);
830 1079
831 /* 1080 /*
832 * Normalize the entity after updating the min_vruntime because the 1081 * Normalize the entity after updating the min_vruntime because the
@@ -835,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
835 */ 1084 */
836 if (!(flags & DEQUEUE_SLEEP)) 1085 if (!(flags & DEQUEUE_SLEEP))
837 se->vruntime -= cfs_rq->min_vruntime; 1086 se->vruntime -= cfs_rq->min_vruntime;
1087
1088 update_min_vruntime(cfs_rq);
1089 update_cfs_shares(cfs_rq);
838} 1090}
839 1091
840/* 1092/*
@@ -869,9 +1121,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
869 return; 1121 return;
870 1122
871 if (cfs_rq->nr_running > 1) { 1123 if (cfs_rq->nr_running > 1) {
872 struct sched_entity *se = __pick_next_entity(cfs_rq); 1124 struct sched_entity *se = __pick_first_entity(cfs_rq);
873 s64 delta = curr->vruntime - se->vruntime; 1125 s64 delta = curr->vruntime - se->vruntime;
874 1126
1127 if (delta < 0)
1128 return;
1129
875 if (delta > ideal_runtime) 1130 if (delta > ideal_runtime)
876 resched_task(rq_of(cfs_rq)->curr); 1131 resched_task(rq_of(cfs_rq)->curr);
877 } 1132 }
@@ -910,13 +1165,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
910static int 1165static int
911wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); 1166wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
912 1167
1168/*
1169 * Pick the next process, keeping these things in mind, in this order:
1170 * 1) keep things fair between processes/task groups
1171 * 2) pick the "next" process, since someone really wants that to run
1172 * 3) pick the "last" process, for cache locality
1173 * 4) do not run the "skip" process, if something else is available
1174 */
913static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 1175static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
914{ 1176{
915 struct sched_entity *se = __pick_next_entity(cfs_rq); 1177 struct sched_entity *se = __pick_first_entity(cfs_rq);
916 struct sched_entity *left = se; 1178 struct sched_entity *left = se;
917 1179
918 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) 1180 /*
919 se = cfs_rq->next; 1181 * Avoid running the skip buddy, if running something else can
1182 * be done without getting too unfair.
1183 */
1184 if (cfs_rq->skip == se) {
1185 struct sched_entity *second = __pick_next_entity(se);
1186 if (second && wakeup_preempt_entity(second, left) < 1)
1187 se = second;
1188 }
920 1189
921 /* 1190 /*
922 * Prefer last buddy, try to return the CPU to a preempted task. 1191 * Prefer last buddy, try to return the CPU to a preempted task.
@@ -924,6 +1193,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
924 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) 1193 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
925 se = cfs_rq->last; 1194 se = cfs_rq->last;
926 1195
1196 /*
1197 * Someone really wants this to run. If it's not unfair, run it.
1198 */
1199 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1200 se = cfs_rq->next;
1201
927 clear_buddies(cfs_rq, se); 1202 clear_buddies(cfs_rq, se);
928 1203
929 return se; 1204 return se;
@@ -955,6 +1230,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
955 */ 1230 */
956 update_curr(cfs_rq); 1231 update_curr(cfs_rq);
957 1232
1233 /*
1234 * Update share accounting for long-running entities.
1235 */
1236 update_entity_shares_tick(cfs_rq);
1237
958#ifdef CONFIG_SCHED_HRTICK 1238#ifdef CONFIG_SCHED_HRTICK
959 /* 1239 /*
960 * queued ticks are scheduled to match the slice, so don't bother 1240 * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,9 +1335,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1055 flags = ENQUEUE_WAKEUP; 1335 flags = ENQUEUE_WAKEUP;
1056 } 1336 }
1057 1337
1338 for_each_sched_entity(se) {
1339 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1340
1341 update_cfs_load(cfs_rq, 0);
1342 update_cfs_shares(cfs_rq);
1343 }
1344
1058 hrtick_update(rq); 1345 hrtick_update(rq);
1059} 1346}
1060 1347
1348static void set_next_buddy(struct sched_entity *se);
1349
1061/* 1350/*
1062 * The dequeue_task method is called before nr_running is 1351 * The dequeue_task method is called before nr_running is
1063 * decreased. We remove the task from the rbtree and 1352 * decreased. We remove the task from the rbtree and
@@ -1067,73 +1356,56 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1067{ 1356{
1068 struct cfs_rq *cfs_rq; 1357 struct cfs_rq *cfs_rq;
1069 struct sched_entity *se = &p->se; 1358 struct sched_entity *se = &p->se;
1359 int task_sleep = flags & DEQUEUE_SLEEP;
1070 1360
1071 for_each_sched_entity(se) { 1361 for_each_sched_entity(se) {
1072 cfs_rq = cfs_rq_of(se); 1362 cfs_rq = cfs_rq_of(se);
1073 dequeue_entity(cfs_rq, se, flags); 1363 dequeue_entity(cfs_rq, se, flags);
1364
1074 /* Don't dequeue parent if it has other entities besides us */ 1365 /* Don't dequeue parent if it has other entities besides us */
1075 if (cfs_rq->load.weight) 1366 if (cfs_rq->load.weight) {
1367 /*
1368 * Bias pick_next to pick a task from this cfs_rq, as
1369 * p is sleeping when it is within its sched_slice.
1370 */
1371 if (task_sleep && parent_entity(se))
1372 set_next_buddy(parent_entity(se));
1076 break; 1373 break;
1374 }
1077 flags |= DEQUEUE_SLEEP; 1375 flags |= DEQUEUE_SLEEP;
1078 } 1376 }
1079 1377
1080 hrtick_update(rq); 1378 for_each_sched_entity(se) {
1081} 1379 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1082
1083/*
1084 * sched_yield() support is very simple - we dequeue and enqueue.
1085 *
1086 * If compat_yield is turned on then we requeue to the end of the tree.
1087 */
1088static void yield_task_fair(struct rq *rq)
1089{
1090 struct task_struct *curr = rq->curr;
1091 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1092 struct sched_entity *rightmost, *se = &curr->se;
1093
1094 /*
1095 * Are we the only task in the tree?
1096 */
1097 if (unlikely(cfs_rq->nr_running == 1))
1098 return;
1099
1100 clear_buddies(cfs_rq, se);
1101
1102 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
1103 update_rq_clock(rq);
1104 /*
1105 * Update run-time statistics of the 'current'.
1106 */
1107 update_curr(cfs_rq);
1108 1380
1109 return; 1381 update_cfs_load(cfs_rq, 0);
1382 update_cfs_shares(cfs_rq);
1110 } 1383 }
1111 /*
1112 * Find the rightmost entry in the rbtree:
1113 */
1114 rightmost = __pick_last_entity(cfs_rq);
1115 /*
1116 * Already in the rightmost position?
1117 */
1118 if (unlikely(!rightmost || entity_before(rightmost, se)))
1119 return;
1120 1384
1121 /* 1385 hrtick_update(rq);
1122 * Minimally necessary key value to be last in the tree:
1123 * Upon rescheduling, sched_class::put_prev_task() will place
1124 * 'current' within the tree based on its new key value.
1125 */
1126 se->vruntime = rightmost->vruntime + 1;
1127} 1386}
1128 1387
1129#ifdef CONFIG_SMP 1388#ifdef CONFIG_SMP
1130 1389
1131static void task_waking_fair(struct rq *rq, struct task_struct *p) 1390static void task_waking_fair(struct task_struct *p)
1132{ 1391{
1133 struct sched_entity *se = &p->se; 1392 struct sched_entity *se = &p->se;
1134 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1393 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1394 u64 min_vruntime;
1135 1395
1136 se->vruntime -= cfs_rq->min_vruntime; 1396#ifndef CONFIG_64BIT
1397 u64 min_vruntime_copy;
1398
1399 do {
1400 min_vruntime_copy = cfs_rq->min_vruntime_copy;
1401 smp_rmb();
1402 min_vruntime = cfs_rq->min_vruntime;
1403 } while (min_vruntime != min_vruntime_copy);
1404#else
1405 min_vruntime = cfs_rq->min_vruntime;
1406#endif
1407
1408 se->vruntime -= min_vruntime;
1137} 1409}
1138 1410
1139#ifdef CONFIG_FAIR_GROUP_SCHED 1411#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1143,67 +1415,36 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
1143 * Adding load to a group doesn't make a group heavier, but can cause movement 1415 * Adding load to a group doesn't make a group heavier, but can cause movement
1144 * of group shares between cpus. Assuming the shares were perfectly aligned one 1416 * of group shares between cpus. Assuming the shares were perfectly aligned one
1145 * can calculate the shift in shares. 1417 * can calculate the shift in shares.
1146 *
1147 * The problem is that perfectly aligning the shares is rather expensive, hence
1148 * we try to avoid doing that too often - see update_shares(), which ratelimits
1149 * this change.
1150 *
1151 * We compensate this by not only taking the current delta into account, but
1152 * also considering the delta between when the shares were last adjusted and
1153 * now.
1154 *
1155 * We still saw a performance dip, some tracing learned us that between
1156 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
1157 * significantly. Therefore try to bias the error in direction of failing
1158 * the affine wakeup.
1159 *
1160 */ 1418 */
1161static long effective_load(struct task_group *tg, int cpu, 1419static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1162 long wl, long wg)
1163{ 1420{
1164 struct sched_entity *se = tg->se[cpu]; 1421 struct sched_entity *se = tg->se[cpu];
1165 1422
1166 if (!tg->parent) 1423 if (!tg->parent)
1167 return wl; 1424 return wl;
1168 1425
1169 /*
1170 * By not taking the decrease of shares on the other cpu into
1171 * account our error leans towards reducing the affine wakeups.
1172 */
1173 if (!wl && sched_feat(ASYM_EFF_LOAD))
1174 return wl;
1175
1176 for_each_sched_entity(se) { 1426 for_each_sched_entity(se) {
1177 long S, rw, s, a, b; 1427 long lw, w;
1178 long more_w;
1179 1428
1180 /* 1429 tg = se->my_q->tg;
1181 * Instead of using this increment, also add the difference 1430 w = se->my_q->load.weight;
1182 * between when the shares were last updated and now.
1183 */
1184 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1185 wl += more_w;
1186 wg += more_w;
1187 1431
1188 S = se->my_q->tg->shares; 1432 /* use this cpu's instantaneous contribution */
1189 s = se->my_q->shares; 1433 lw = atomic_read(&tg->load_weight);
1190 rw = se->my_q->rq_weight; 1434 lw -= se->my_q->load_contribution;
1435 lw += w + wg;
1191 1436
1192 a = S*(rw + wl); 1437 wl += w;
1193 b = S*rw + s*wg;
1194 1438
1195 wl = s*(a-b); 1439 if (lw > 0 && wl < lw)
1196 1440 wl = (wl * tg->shares) / lw;
1197 if (likely(b)) 1441 else
1198 wl /= b; 1442 wl = tg->shares;
1199 1443
1200 /* 1444 /* zero point is MIN_SHARES */
1201 * Assume the group is already running and will 1445 if (wl < MIN_SHARES)
1202 * thus already be accounted for in the weight. 1446 wl = MIN_SHARES;
1203 * 1447 wl -= se->load.weight;
1204 * That is, moving shares between CPUs, does not
1205 * alter the group weight.
1206 */
1207 wg = 0; 1448 wg = 0;
1208 } 1449 }
1209 1450
@@ -1222,7 +1463,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1222 1463
1223static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 1464static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1224{ 1465{
1225 unsigned long this_load, load; 1466 s64 this_load, load;
1226 int idx, this_cpu, prev_cpu; 1467 int idx, this_cpu, prev_cpu;
1227 unsigned long tl_per_task; 1468 unsigned long tl_per_task;
1228 struct task_group *tg; 1469 struct task_group *tg;
@@ -1261,8 +1502,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1261 * Otherwise check if either cpus are near enough in load to allow this 1502 * Otherwise check if either cpus are near enough in load to allow this
1262 * task to be woken on this_cpu. 1503 * task to be woken on this_cpu.
1263 */ 1504 */
1264 if (this_load) { 1505 if (this_load > 0) {
1265 unsigned long this_eff_load, prev_eff_load; 1506 s64 this_eff_load, prev_eff_load;
1266 1507
1267 this_eff_load = 100; 1508 this_eff_load = 100;
1268 this_eff_load *= power_of(prev_cpu); 1509 this_eff_load *= power_of(prev_cpu);
@@ -1344,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1344 } 1585 }
1345 1586
1346 /* Adjust by relative CPU power of the group */ 1587 /* Adjust by relative CPU power of the group */
1347 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 1588 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
1348 1589
1349 if (local_group) { 1590 if (local_group) {
1350 this_load = avg_load; 1591 this_load = avg_load;
@@ -1409,6 +1650,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1409 /* 1650 /*
1410 * Otherwise, iterate the domains and find an eligible idle cpu. 1651 * Otherwise, iterate the domains and find an eligible idle cpu.
1411 */ 1652 */
1653 rcu_read_lock();
1412 for_each_domain(target, sd) { 1654 for_each_domain(target, sd) {
1413 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 1655 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1414 break; 1656 break;
@@ -1428,6 +1670,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1428 cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 1670 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1429 break; 1671 break;
1430 } 1672 }
1673 rcu_read_unlock();
1431 1674
1432 return target; 1675 return target;
1433} 1676}
@@ -1444,7 +1687,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1444 * preempt must be disabled. 1687 * preempt must be disabled.
1445 */ 1688 */
1446static int 1689static int
1447select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) 1690select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1448{ 1691{
1449 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1692 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1450 int cpu = smp_processor_id(); 1693 int cpu = smp_processor_id();
@@ -1460,6 +1703,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1460 new_cpu = prev_cpu; 1703 new_cpu = prev_cpu;
1461 } 1704 }
1462 1705
1706 rcu_read_lock();
1463 for_each_domain(cpu, tmp) { 1707 for_each_domain(cpu, tmp) {
1464 if (!(tmp->flags & SD_LOAD_BALANCE)) 1708 if (!(tmp->flags & SD_LOAD_BALANCE))
1465 continue; 1709 continue;
@@ -1479,7 +1723,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1479 nr_running += cpu_rq(i)->cfs.nr_running; 1723 nr_running += cpu_rq(i)->cfs.nr_running;
1480 } 1724 }
1481 1725
1482 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 1726 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
1483 1727
1484 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1728 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1485 nr_running /= 2; 1729 nr_running /= 2;
@@ -1508,28 +1752,12 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1508 sd = tmp; 1752 sd = tmp;
1509 } 1753 }
1510 1754
1511#ifdef CONFIG_FAIR_GROUP_SCHED
1512 if (sched_feat(LB_SHARES_UPDATE)) {
1513 /*
1514 * Pick the largest domain to update shares over
1515 */
1516 tmp = sd;
1517 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1518 tmp = affine_sd;
1519
1520 if (tmp) {
1521 raw_spin_unlock(&rq->lock);
1522 update_shares(tmp);
1523 raw_spin_lock(&rq->lock);
1524 }
1525 }
1526#endif
1527
1528 if (affine_sd) { 1755 if (affine_sd) {
1529 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1756 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1530 return select_idle_sibling(p, cpu); 1757 prev_cpu = cpu;
1531 else 1758
1532 return select_idle_sibling(p, prev_cpu); 1759 new_cpu = select_idle_sibling(p, prev_cpu);
1760 goto unlock;
1533 } 1761 }
1534 1762
1535 while (sd) { 1763 while (sd) {
@@ -1570,6 +1798,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1570 } 1798 }
1571 /* while loop will break here if sd == NULL */ 1799 /* while loop will break here if sd == NULL */
1572 } 1800 }
1801unlock:
1802 rcu_read_unlock();
1573 1803
1574 return new_cpu; 1804 return new_cpu;
1575} 1805}
@@ -1593,10 +1823,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1593 * This is especially important for buddies when the leftmost 1823 * This is especially important for buddies when the leftmost
1594 * task is higher priority than the buddy. 1824 * task is higher priority than the buddy.
1595 */ 1825 */
1596 if (unlikely(se->load.weight != NICE_0_LOAD)) 1826 return calc_delta_fair(gran, se);
1597 gran = calc_delta_fair(gran, se);
1598
1599 return gran;
1600} 1827}
1601 1828
1602/* 1829/*
@@ -1630,18 +1857,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1630 1857
1631static void set_last_buddy(struct sched_entity *se) 1858static void set_last_buddy(struct sched_entity *se)
1632{ 1859{
1633 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1860 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1634 for_each_sched_entity(se) 1861 return;
1635 cfs_rq_of(se)->last = se; 1862
1636 } 1863 for_each_sched_entity(se)
1864 cfs_rq_of(se)->last = se;
1637} 1865}
1638 1866
1639static void set_next_buddy(struct sched_entity *se) 1867static void set_next_buddy(struct sched_entity *se)
1640{ 1868{
1641 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1869 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1642 for_each_sched_entity(se) 1870 return;
1643 cfs_rq_of(se)->next = se; 1871
1644 } 1872 for_each_sched_entity(se)
1873 cfs_rq_of(se)->next = se;
1874}
1875
1876static void set_skip_buddy(struct sched_entity *se)
1877{
1878 for_each_sched_entity(se)
1879 cfs_rq_of(se)->skip = se;
1645} 1880}
1646 1881
1647/* 1882/*
@@ -1653,18 +1888,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1653 struct sched_entity *se = &curr->se, *pse = &p->se; 1888 struct sched_entity *se = &curr->se, *pse = &p->se;
1654 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1889 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1655 int scale = cfs_rq->nr_running >= sched_nr_latency; 1890 int scale = cfs_rq->nr_running >= sched_nr_latency;
1891 int next_buddy_marked = 0;
1656 1892
1657 if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS) 1893 if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS)
1658 goto preempt; 1894 goto preempt;
1659 1895
1660 if (unlikely(p->sched_class != &fair_sched_class))
1661 return;
1662
1663 if (unlikely(se == pse)) 1896 if (unlikely(se == pse))
1664 return; 1897 return;
1665 1898
1666 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) 1899 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1667 set_next_buddy(pse); 1900 set_next_buddy(pse);
1901 next_buddy_marked = 1;
1902 }
1668 1903
1669 /* 1904 /*
1670 * We can come here with TIF_NEED_RESCHED already set from new task 1905 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1673,16 +1908,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1673 if (test_tsk_need_resched(curr)) 1908 if (test_tsk_need_resched(curr))
1674 return; 1909 return;
1675 1910
1911 /* Idle tasks are by definition preempted by non-idle tasks. */
1912 if (unlikely(curr->policy == SCHED_IDLE) &&
1913 likely(p->policy != SCHED_IDLE))
1914 goto preempt;
1915
1676 /* 1916 /*
1677 * Batch and idle tasks do not preempt (their preemption is driven by 1917 * Batch and idle tasks do not preempt non-idle tasks (their preemption
1678 * the tick): 1918 * is driven by the tick):
1679 */ 1919 */
1680 if (unlikely(p->policy != SCHED_NORMAL)) 1920 if (unlikely(p->policy != SCHED_NORMAL))
1681 return; 1921 return;
1682 1922
1683 /* Idle tasks are by definition preempted by everybody. */
1684 if (unlikely(curr->policy == SCHED_IDLE))
1685 goto preempt;
1686 1923
1687 if (!sched_feat(WAKEUP_PREEMPT)) 1924 if (!sched_feat(WAKEUP_PREEMPT))
1688 return; 1925 return;
@@ -1690,8 +1927,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1690 update_curr(cfs_rq); 1927 update_curr(cfs_rq);
1691 find_matching_se(&se, &pse); 1928 find_matching_se(&se, &pse);
1692 BUG_ON(!pse); 1929 BUG_ON(!pse);
1693 if (wakeup_preempt_entity(se, pse) == 1) 1930 if (wakeup_preempt_entity(se, pse) == 1) {
1931 /*
1932 * Bias pick_next to pick the sched entity that is
1933 * triggering this preemption.
1934 */
1935 if (!next_buddy_marked)
1936 set_next_buddy(pse);
1694 goto preempt; 1937 goto preempt;
1938 }
1695 1939
1696 return; 1940 return;
1697 1941
@@ -1748,6 +1992,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1748 } 1992 }
1749} 1993}
1750 1994
1995/*
1996 * sched_yield() is very simple
1997 *
1998 * The magic of dealing with the ->skip buddy is in pick_next_entity.
1999 */
2000static void yield_task_fair(struct rq *rq)
2001{
2002 struct task_struct *curr = rq->curr;
2003 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
2004 struct sched_entity *se = &curr->se;
2005
2006 /*
2007 * Are we the only task in the tree?
2008 */
2009 if (unlikely(rq->nr_running == 1))
2010 return;
2011
2012 clear_buddies(cfs_rq, se);
2013
2014 if (curr->policy != SCHED_BATCH) {
2015 update_rq_clock(rq);
2016 /*
2017 * Update run-time statistics of the 'current'.
2018 */
2019 update_curr(cfs_rq);
2020 }
2021
2022 set_skip_buddy(se);
2023}
2024
2025static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
2026{
2027 struct sched_entity *se = &p->se;
2028
2029 if (!se->on_rq)
2030 return false;
2031
2032 /* Tell the scheduler that we'd really like pse to run next. */
2033 set_next_buddy(se);
2034
2035 yield_task_fair(rq);
2036
2037 return true;
2038}
2039
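With compat_yield gone, sched_yield() no longer requeues the caller to the far right of the tree: yield_task_fair() merely marks it as the skip buddy, so pick_next_entity() passes over it once without destroying fairness, and yield_to_task_fair() additionally nominates a specific next buddy. From userspace the call looks the same as before; a trivial sketch:

#include <sched.h>

/* Busy-wait politely: each sched_yield() now just sets the skip buddy
 * for the caller instead of pushing it to the end of the timeline. */
static void polite_spin_wait(volatile int *flag)
{
	while (!*flag)
		sched_yield();
}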
1751#ifdef CONFIG_SMP 2040#ifdef CONFIG_SMP
1752/************************************************** 2041/**************************************************
1753 * Fair scheduling class load-balancing methods: 2042 * Fair scheduling class load-balancing methods:
@@ -1798,7 +2087,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1798 * 2) too many balance attempts have failed. 2087 * 2) too many balance attempts have failed.
1799 */ 2088 */
1800 2089
1801 tsk_cache_hot = task_hot(p, rq->clock, sd); 2090 tsk_cache_hot = task_hot(p, rq->clock_task, sd);
1802 if (!tsk_cache_hot || 2091 if (!tsk_cache_hot ||
1803 sd->nr_balance_failed > sd->cache_nice_tries) { 2092 sd->nr_balance_failed > sd->cache_nice_tries) {
1804#ifdef CONFIG_SCHEDSTATS 2093#ifdef CONFIG_SCHEDSTATS
@@ -1857,23 +2146,22 @@ static unsigned long
1857balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2146balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1858 unsigned long max_load_move, struct sched_domain *sd, 2147 unsigned long max_load_move, struct sched_domain *sd,
1859 enum cpu_idle_type idle, int *all_pinned, 2148 enum cpu_idle_type idle, int *all_pinned,
1860 int *this_best_prio, struct cfs_rq *busiest_cfs_rq) 2149 struct cfs_rq *busiest_cfs_rq)
1861{ 2150{
1862 int loops = 0, pulled = 0, pinned = 0; 2151 int loops = 0, pulled = 0;
1863 long rem_load_move = max_load_move; 2152 long rem_load_move = max_load_move;
1864 struct task_struct *p, *n; 2153 struct task_struct *p, *n;
1865 2154
1866 if (max_load_move == 0) 2155 if (max_load_move == 0)
1867 goto out; 2156 goto out;
1868 2157
1869 pinned = 1;
1870
1871 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 2158 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1872 if (loops++ > sysctl_sched_nr_migrate) 2159 if (loops++ > sysctl_sched_nr_migrate)
1873 break; 2160 break;
1874 2161
1875 if ((p->se.load.weight >> 1) > rem_load_move || 2162 if ((p->se.load.weight >> 1) > rem_load_move ||
1876 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) 2163 !can_migrate_task(p, busiest, this_cpu, sd, idle,
2164 all_pinned))
1877 continue; 2165 continue;
1878 2166
1879 pull_task(busiest, p, this_rq, this_cpu); 2167 pull_task(busiest, p, this_rq, this_cpu);
@@ -1896,9 +2184,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1896 */ 2184 */
1897 if (rem_load_move <= 0) 2185 if (rem_load_move <= 0)
1898 break; 2186 break;
1899
1900 if (p->prio < *this_best_prio)
1901 *this_best_prio = p->prio;
1902 } 2187 }
1903out: 2188out:
1904 /* 2189 /*
@@ -1908,18 +2193,57 @@ out:
1908 */ 2193 */
1909 schedstat_add(sd, lb_gained[idle], pulled); 2194 schedstat_add(sd, lb_gained[idle], pulled);
1910 2195
1911 if (all_pinned)
1912 *all_pinned = pinned;
1913
1914 return max_load_move - rem_load_move; 2196 return max_load_move - rem_load_move;
1915} 2197}
1916 2198
1917#ifdef CONFIG_FAIR_GROUP_SCHED 2199#ifdef CONFIG_FAIR_GROUP_SCHED
2200/*
2201 * update tg->load_weight by folding this cpu's load_avg
2202 */
2203static int update_shares_cpu(struct task_group *tg, int cpu)
2204{
2205 struct cfs_rq *cfs_rq;
2206 unsigned long flags;
2207 struct rq *rq;
2208
2209 if (!tg->se[cpu])
2210 return 0;
2211
2212 rq = cpu_rq(cpu);
2213 cfs_rq = tg->cfs_rq[cpu];
2214
2215 raw_spin_lock_irqsave(&rq->lock, flags);
2216
2217 update_rq_clock(rq);
2218 update_cfs_load(cfs_rq, 1);
2219
2220 /*
2221 * We need to update shares after updating tg->load_weight in
2222 * order to adjust the weight of groups with long running tasks.
2223 */
2224 update_cfs_shares(cfs_rq);
2225
2226 raw_spin_unlock_irqrestore(&rq->lock, flags);
2227
2228 return 0;
2229}
2230
2231static void update_shares(int cpu)
2232{
2233 struct cfs_rq *cfs_rq;
2234 struct rq *rq = cpu_rq(cpu);
2235
2236 rcu_read_lock();
2237 for_each_leaf_cfs_rq(rq, cfs_rq)
2238 update_shares_cpu(cfs_rq->tg, cpu);
2239 rcu_read_unlock();
2240}
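update_shares_cpu() above folds one CPU's load average into tg->load_weight and then recomputes the group entity's weight. A rough stand-alone sketch of the proportional idea, assuming a group's weight on a CPU is its share of the group's total load; the names and the exact formula are illustrative, not the kernel's update_cfs_shares()/calc_cfs_shares().

/* Rough proportional-share sketch; illustrative only. */
#include <stdio.h>

static unsigned long group_se_weight(unsigned long tg_shares,
                                     unsigned long cpu_load,
                                     unsigned long tg_total_load)
{
        if (!tg_total_load)
                return tg_shares;
        return tg_shares * cpu_load / tg_total_load;
}

int main(void)
{
        /* group configured with 1024 shares; this CPU holds 1/4 of its load */
        printf("%lu\n", group_se_weight(1024, 512, 2048));      /* 256 */
        return 0;
}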
2241
1918static unsigned long 2242static unsigned long
1919load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2243load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1920 unsigned long max_load_move, 2244 unsigned long max_load_move,
1921 struct sched_domain *sd, enum cpu_idle_type idle, 2245 struct sched_domain *sd, enum cpu_idle_type idle,
1922 int *all_pinned, int *this_best_prio) 2246 int *all_pinned)
1923{ 2247{
1924 long rem_load_move = max_load_move; 2248 long rem_load_move = max_load_move;
1925 int busiest_cpu = cpu_of(busiest); 2249 int busiest_cpu = cpu_of(busiest);
@@ -1944,7 +2268,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1944 rem_load = div_u64(rem_load, busiest_h_load + 1); 2268 rem_load = div_u64(rem_load, busiest_h_load + 1);
1945 2269
1946 moved_load = balance_tasks(this_rq, this_cpu, busiest, 2270 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1947 rem_load, sd, idle, all_pinned, this_best_prio, 2271 rem_load, sd, idle, all_pinned,
1948 busiest_cfs_rq); 2272 busiest_cfs_rq);
1949 2273
1950 if (!moved_load) 2274 if (!moved_load)
@@ -1962,15 +2286,19 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1962 return max_load_move - rem_load_move; 2286 return max_load_move - rem_load_move;
1963} 2287}
1964#else 2288#else
2289static inline void update_shares(int cpu)
2290{
2291}
2292
1965static unsigned long 2293static unsigned long
1966load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2294load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1967 unsigned long max_load_move, 2295 unsigned long max_load_move,
1968 struct sched_domain *sd, enum cpu_idle_type idle, 2296 struct sched_domain *sd, enum cpu_idle_type idle,
1969 int *all_pinned, int *this_best_prio) 2297 int *all_pinned)
1970{ 2298{
1971 return balance_tasks(this_rq, this_cpu, busiest, 2299 return balance_tasks(this_rq, this_cpu, busiest,
1972 max_load_move, sd, idle, all_pinned, 2300 max_load_move, sd, idle, all_pinned,
1973 this_best_prio, &busiest->cfs); 2301 &busiest->cfs);
1974} 2302}
1975#endif 2303#endif
1976 2304
@@ -1987,12 +2315,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1987 int *all_pinned) 2315 int *all_pinned)
1988{ 2316{
1989 unsigned long total_load_moved = 0, load_moved; 2317 unsigned long total_load_moved = 0, load_moved;
1990 int this_best_prio = this_rq->curr->prio;
1991 2318
1992 do { 2319 do {
1993 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 2320 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
1994 max_load_move - total_load_moved, 2321 max_load_move - total_load_moved,
1995 sd, idle, all_pinned, &this_best_prio); 2322 sd, idle, all_pinned);
1996 2323
1997 total_load_moved += load_moved; 2324 total_load_moved += load_moved;
1998 2325
@@ -2030,12 +2357,17 @@ struct sd_lb_stats {
2030 unsigned long this_load; 2357 unsigned long this_load;
2031 unsigned long this_load_per_task; 2358 unsigned long this_load_per_task;
2032 unsigned long this_nr_running; 2359 unsigned long this_nr_running;
2360 unsigned long this_has_capacity;
2361 unsigned int this_idle_cpus;
2033 2362
2034 /* Statistics of the busiest group */ 2363 /* Statistics of the busiest group */
2364 unsigned int busiest_idle_cpus;
2035 unsigned long max_load; 2365 unsigned long max_load;
2036 unsigned long busiest_load_per_task; 2366 unsigned long busiest_load_per_task;
2037 unsigned long busiest_nr_running; 2367 unsigned long busiest_nr_running;
2038 unsigned long busiest_group_capacity; 2368 unsigned long busiest_group_capacity;
2369 unsigned long busiest_has_capacity;
2370 unsigned int busiest_group_weight;
2039 2371
2040 int group_imb; /* Is there imbalance in this sd */ 2372 int group_imb; /* Is there imbalance in this sd */
2041#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2373#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2057,7 +2389,10 @@ struct sg_lb_stats {
2057 unsigned long sum_nr_running; /* Nr tasks running in the group */ 2389 unsigned long sum_nr_running; /* Nr tasks running in the group */
2058 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2390 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2059 unsigned long group_capacity; 2391 unsigned long group_capacity;
2392 unsigned long idle_cpus;
2393 unsigned long group_weight;
2060 int group_imb; /* Is there an imbalance in the group ? */ 2394 int group_imb; /* Is there an imbalance in the group ? */
2395 int group_has_capacity; /* Is there extra capacity in the group? */
2061}; 2396};
2062 2397
2063/** 2398/**
@@ -2239,7 +2574,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2239 2574
2240unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 2575unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2241{ 2576{
2242 return SCHED_LOAD_SCALE; 2577 return SCHED_POWER_SCALE;
2243} 2578}
2244 2579
2245unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) 2580unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
@@ -2268,12 +2603,18 @@ unsigned long scale_rt_power(int cpu)
2268 u64 total, available; 2603 u64 total, available;
2269 2604
2270 total = sched_avg_period() + (rq->clock - rq->age_stamp); 2605 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2271 available = total - rq->rt_avg;
2272 2606
2273 if (unlikely((s64)total < SCHED_LOAD_SCALE)) 2607 if (unlikely(total < rq->rt_avg)) {
2274 total = SCHED_LOAD_SCALE; 2608 /* Ensures that power won't end up being negative */
2609 available = 0;
2610 } else {
2611 available = total - rq->rt_avg;
2612 }
2613
2614 if (unlikely((s64)total < SCHED_POWER_SCALE))
2615 total = SCHED_POWER_SCALE;
2275 2616
2276 total >>= SCHED_LOAD_SHIFT; 2617 total >>= SCHED_POWER_SHIFT;
2277 2618
2278 return div_u64(available, total); 2619 return div_u64(available, total);
2279} 2620}
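A stand-alone rendering of the scale_rt_power() arithmetic above, with invented nanosecond values: the result is roughly SCHED_POWER_SCALE * available / total, i.e. the fraction of the averaging window left over after RT activity.

/* Stand-alone arithmetic only; the nanosecond values are invented. */
#include <stdio.h>
#include <stdint.h>

#define SCHED_POWER_SHIFT 10
#define SCHED_POWER_SCALE (1UL << SCHED_POWER_SHIFT)

int main(void)
{
        uint64_t total  = 1000000;      /* averaging window, ns         */
        uint64_t rt_avg =  250000;      /* time consumed by RT activity */
        uint64_t available = total > rt_avg ? total - rt_avg : 0;

        if (total < SCHED_POWER_SCALE)
                total = SCHED_POWER_SCALE;

        /* ~1024 * available / total: prints 768, i.e. 75% of full power */
        printf("%llu\n",
               (unsigned long long)(available / (total >> SCHED_POWER_SHIFT)));
        return 0;
}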
@@ -2281,7 +2622,7 @@ unsigned long scale_rt_power(int cpu)
2281static void update_cpu_power(struct sched_domain *sd, int cpu) 2622static void update_cpu_power(struct sched_domain *sd, int cpu)
2282{ 2623{
2283 unsigned long weight = sd->span_weight; 2624 unsigned long weight = sd->span_weight;
2284 unsigned long power = SCHED_LOAD_SCALE; 2625 unsigned long power = SCHED_POWER_SCALE;
2285 struct sched_group *sdg = sd->groups; 2626 struct sched_group *sdg = sd->groups;
2286 2627
2287 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 2628 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
@@ -2290,26 +2631,26 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2290 else 2631 else
2291 power *= default_scale_smt_power(sd, cpu); 2632 power *= default_scale_smt_power(sd, cpu);
2292 2633
2293 power >>= SCHED_LOAD_SHIFT; 2634 power >>= SCHED_POWER_SHIFT;
2294 } 2635 }
2295 2636
2296 sdg->cpu_power_orig = power; 2637 sdg->sgp->power_orig = power;
2297 2638
2298 if (sched_feat(ARCH_POWER)) 2639 if (sched_feat(ARCH_POWER))
2299 power *= arch_scale_freq_power(sd, cpu); 2640 power *= arch_scale_freq_power(sd, cpu);
2300 else 2641 else
2301 power *= default_scale_freq_power(sd, cpu); 2642 power *= default_scale_freq_power(sd, cpu);
2302 2643
2303 power >>= SCHED_LOAD_SHIFT; 2644 power >>= SCHED_POWER_SHIFT;
2304 2645
2305 power *= scale_rt_power(cpu); 2646 power *= scale_rt_power(cpu);
2306 power >>= SCHED_LOAD_SHIFT; 2647 power >>= SCHED_POWER_SHIFT;
2307 2648
2308 if (!power) 2649 if (!power)
2309 power = 1; 2650 power = 1;
2310 2651
2311 cpu_rq(cpu)->cpu_power = power; 2652 cpu_rq(cpu)->cpu_power = power;
2312 sdg->cpu_power = power; 2653 sdg->sgp->power = power;
2313} 2654}
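Illustrative arithmetic for the cascade in update_cpu_power() above: each factor is expressed against SCHED_POWER_SCALE and shifted back down, so the scalings compose multiplicatively. The factor values below are made up (589 is merely a plausible SMT factor, 768 a plausible RT-pressure factor).

/* Illustrative factor values, expressed against SCHED_POWER_SCALE. */
#include <stdio.h>

#define SCHED_POWER_SHIFT 10
#define SCHED_POWER_SCALE (1UL << SCHED_POWER_SHIFT)

int main(void)
{
        unsigned long power = SCHED_POWER_SCALE;   /* 1024                   */
        unsigned long smt   = 589;                 /* ~0.58, SMT sibling     */
        unsigned long rt    = 768;                 /* ~0.75, RT/irq pressure */

        power = power * smt >> SCHED_POWER_SHIFT;  /* 589 */
        power = power * rt  >> SCHED_POWER_SHIFT;  /* 441 */
        if (!power)
                power = 1;
        printf("cpu_power = %lu\n", power);
        return 0;
}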
2314 2655
2315static void update_group_power(struct sched_domain *sd, int cpu) 2656static void update_group_power(struct sched_domain *sd, int cpu)
@@ -2327,11 +2668,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
2327 2668
2328 group = child->groups; 2669 group = child->groups;
2329 do { 2670 do {
2330 power += group->cpu_power; 2671 power += group->sgp->power;
2331 group = group->next; 2672 group = group->next;
2332 } while (group != child->groups); 2673 } while (group != child->groups);
2333 2674
2334 sdg->cpu_power = power; 2675 sdg->sgp->power = power;
2335} 2676}
2336 2677
2337/* 2678/*
@@ -2345,15 +2686,15 @@ static inline int
2345fix_small_capacity(struct sched_domain *sd, struct sched_group *group) 2686fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2346{ 2687{
2347 /* 2688 /*
2348 * Only siblings can have significantly less than SCHED_LOAD_SCALE 2689 * Only siblings can have significantly less than SCHED_POWER_SCALE
2349 */ 2690 */
2350 if (sd->level != SD_LV_SIBLING) 2691 if (!(sd->flags & SD_SHARE_CPUPOWER))
2351 return 0; 2692 return 0;
2352 2693
2353 /* 2694 /*
2354 * If ~90% of the cpu_power is still there, we're good. 2695 * If ~90% of the cpu_power is still there, we're good.
2355 */ 2696 */
2356 if (group->cpu_power * 32 > group->cpu_power_orig * 29) 2697 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
2357 return 1; 2698 return 1;
2358 2699
2359 return 0; 2700 return 0;
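The "~90%" test above is plain integer arithmetic: power * 32 > power_orig * 29 is equivalent to power / power_orig > 29/32, about 0.906. A tiny sketch with invented numbers:

/* Invented numbers; the comparison is the same shape as above. */
#include <stdio.h>

int main(void)
{
        unsigned long power_orig = 589;    /* e.g. an SMT sibling        */
        unsigned long power      = 540;    /* after frequency/RT scaling */

        /* 540 * 32 = 17280 > 589 * 29 = 17081: still counts as one CPU */
        printf("capacity = %d\n", power * 32 > power_orig * 29 ? 1 : 0);
        return 0;
}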
@@ -2366,7 +2707,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2366 * @this_cpu: Cpu for which load balance is currently performed. 2707 * @this_cpu: Cpu for which load balance is currently performed.
2367 * @idle: Idle status of this_cpu 2708 * @idle: Idle status of this_cpu
2368 * @load_idx: Load index of sched_domain of this_cpu for load calc. 2709 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2369 * @sd_idle: Idle status of the sched_domain containing group.
2370 * @local_group: Does group contain this_cpu. 2710 * @local_group: Does group contain this_cpu.
2371 * @cpus: Set of cpus considered for load balancing. 2711 * @cpus: Set of cpus considered for load balancing.
2372 * @balance: Should we balance. 2712 * @balance: Should we balance.
@@ -2374,11 +2714,11 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2374 */ 2714 */
2375static inline void update_sg_lb_stats(struct sched_domain *sd, 2715static inline void update_sg_lb_stats(struct sched_domain *sd,
2376 struct sched_group *group, int this_cpu, 2716 struct sched_group *group, int this_cpu,
2377 enum cpu_idle_type idle, int load_idx, int *sd_idle, 2717 enum cpu_idle_type idle, int load_idx,
2378 int local_group, const struct cpumask *cpus, 2718 int local_group, const struct cpumask *cpus,
2379 int *balance, struct sg_lb_stats *sgs) 2719 int *balance, struct sg_lb_stats *sgs)
2380{ 2720{
2381 unsigned long load, max_cpu_load, min_cpu_load; 2721 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
2382 int i; 2722 int i;
2383 unsigned int balance_cpu = -1, first_idle_cpu = 0; 2723 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2384 unsigned long avg_load_per_task = 0; 2724 unsigned long avg_load_per_task = 0;
@@ -2389,13 +2729,11 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2389 /* Tally up the load of all CPUs in the group */ 2729 /* Tally up the load of all CPUs in the group */
2390 max_cpu_load = 0; 2730 max_cpu_load = 0;
2391 min_cpu_load = ~0UL; 2731 min_cpu_load = ~0UL;
2732 max_nr_running = 0;
2392 2733
2393 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 2734 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2394 struct rq *rq = cpu_rq(i); 2735 struct rq *rq = cpu_rq(i);
2395 2736
2396 if (*sd_idle && rq->nr_running)
2397 *sd_idle = 0;
2398
2399 /* Bias balancing toward cpus of our domain */ 2737 /* Bias balancing toward cpus of our domain */
2400 if (local_group) { 2738 if (local_group) {
2401 if (idle_cpu(i) && !first_idle_cpu) { 2739 if (idle_cpu(i) && !first_idle_cpu) {
@@ -2406,8 +2744,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2406 load = target_load(i, load_idx); 2744 load = target_load(i, load_idx);
2407 } else { 2745 } else {
2408 load = source_load(i, load_idx); 2746 load = source_load(i, load_idx);
2409 if (load > max_cpu_load) 2747 if (load > max_cpu_load) {
2410 max_cpu_load = load; 2748 max_cpu_load = load;
2749 max_nr_running = rq->nr_running;
2750 }
2411 if (min_cpu_load > load) 2751 if (min_cpu_load > load)
2412 min_cpu_load = load; 2752 min_cpu_load = load;
2413 } 2753 }
@@ -2415,7 +2755,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2415 sgs->group_load += load; 2755 sgs->group_load += load;
2416 sgs->sum_nr_running += rq->nr_running; 2756 sgs->sum_nr_running += rq->nr_running;
2417 sgs->sum_weighted_load += weighted_cpuload(i); 2757 sgs->sum_weighted_load += weighted_cpuload(i);
2418 2758 if (idle_cpu(i))
2759 sgs->idle_cpus++;
2419 } 2760 }
2420 2761
2421 /* 2762 /*
@@ -2433,11 +2774,11 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2433 } 2774 }
2434 2775
2435 /* Adjust by relative CPU power of the group */ 2776 /* Adjust by relative CPU power of the group */
2436 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 2777 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
2437 2778
2438 /* 2779 /*
2439 * Consider the group unbalanced when the imbalance is larger 2780 * Consider the group unbalanced when the imbalance is larger
2440 * than the average weight of two tasks. 2781 * than the average weight of a task.
2441 * 2782 *
2442 * APZ: with cgroup the avg task weight can vary wildly and 2783 * APZ: with cgroup the avg task weight can vary wildly and
2443 * might not be a suitable number - should we keep a 2784 * might not be a suitable number - should we keep a
@@ -2447,13 +2788,17 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2447 if (sgs->sum_nr_running) 2788 if (sgs->sum_nr_running)
2448 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 2789 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2449 2790
2450 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 2791 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
2451 sgs->group_imb = 1; 2792 sgs->group_imb = 1;
2452 2793
2453 sgs->group_capacity = 2794 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
2454 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2795 SCHED_POWER_SCALE);
2455 if (!sgs->group_capacity) 2796 if (!sgs->group_capacity)
2456 sgs->group_capacity = fix_small_capacity(sd, group); 2797 sgs->group_capacity = fix_small_capacity(sd, group);
2798 sgs->group_weight = group->group_weight;
2799
2800 if (sgs->group_capacity > sgs->sum_nr_running)
2801 sgs->group_has_capacity = 1;
2457} 2802}
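Worked numbers for the group statistics computed above, assuming SCHED_POWER_SCALE = 1024: a group whose combined power is 2048 rounds to a capacity of two tasks, and it is flagged as having spare capacity while it runs fewer tasks than that.

/* Invented numbers; DIV_ROUND_CLOSEST is re-declared only so the
 * sketch is self-contained. */
#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
        unsigned long group_power = 2048, sum_nr_running = 1;
        unsigned long capacity = DIV_ROUND_CLOSEST(group_power,
                                                   SCHED_POWER_SCALE);

        printf("capacity=%lu has_capacity=%d\n",
               capacity, capacity > sum_nr_running);        /* 2, 1 */
        return 0;
}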
2458 2803
2459/** 2804/**
@@ -2504,15 +2849,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
2504 * @sd: sched_domain whose statistics are to be updated. 2849 * @sd: sched_domain whose statistics are to be updated.
2505 * @this_cpu: Cpu for which load balance is currently performed. 2850 * @this_cpu: Cpu for which load balance is currently performed.
2506 * @idle: Idle status of this_cpu 2851 * @idle: Idle status of this_cpu
2507 * @sd_idle: Idle status of the sched_domain containing sg.
2508 * @cpus: Set of cpus considered for load balancing. 2852 * @cpus: Set of cpus considered for load balancing.
2509 * @balance: Should we balance. 2853 * @balance: Should we balance.
2510 * @sds: variable to hold the statistics for this sched_domain. 2854 * @sds: variable to hold the statistics for this sched_domain.
2511 */ 2855 */
2512static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 2856static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2513 enum cpu_idle_type idle, int *sd_idle, 2857 enum cpu_idle_type idle, const struct cpumask *cpus,
2514 const struct cpumask *cpus, int *balance, 2858 int *balance, struct sd_lb_stats *sds)
2515 struct sd_lb_stats *sds)
2516{ 2859{
2517 struct sched_domain *child = sd->child; 2860 struct sched_domain *child = sd->child;
2518 struct sched_group *sg = sd->groups; 2861 struct sched_group *sg = sd->groups;
@@ -2530,21 +2873,26 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2530 2873
2531 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 2874 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2532 memset(&sgs, 0, sizeof(sgs)); 2875 memset(&sgs, 0, sizeof(sgs));
2533 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, 2876 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
2534 local_group, cpus, balance, &sgs); 2877 local_group, cpus, balance, &sgs);
2535 2878
2536 if (local_group && !(*balance)) 2879 if (local_group && !(*balance))
2537 return; 2880 return;
2538 2881
2539 sds->total_load += sgs.group_load; 2882 sds->total_load += sgs.group_load;
2540 sds->total_pwr += sg->cpu_power; 2883 sds->total_pwr += sg->sgp->power;
2541 2884
2542 /* 2885 /*
2543 * In case the child domain prefers tasks go to siblings 2886 * In case the child domain prefers tasks go to siblings
2544 * first, lower the sg capacity to one so that we'll try 2887 * first, lower the sg capacity to one so that we'll try
2545 * and move all the excess tasks away. 2888 * and move all the excess tasks away. We lower the capacity
2889 * of a group only if the local group has the capacity to fit
2890 * these excess tasks, i.e. nr_running < group_capacity. The
2891 * extra check prevents the case where you always pull from the
2892 * heaviest group when it is already under-utilized (possible
 2893 * when a large weight task outweighs the tasks on the system).
2546 */ 2894 */
2547 if (prefer_sibling) 2895 if (prefer_sibling && !local_group && sds->this_has_capacity)
2548 sgs.group_capacity = min(sgs.group_capacity, 1UL); 2896 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2549 2897
2550 if (local_group) { 2898 if (local_group) {
@@ -2552,12 +2900,17 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2552 sds->this = sg; 2900 sds->this = sg;
2553 sds->this_nr_running = sgs.sum_nr_running; 2901 sds->this_nr_running = sgs.sum_nr_running;
2554 sds->this_load_per_task = sgs.sum_weighted_load; 2902 sds->this_load_per_task = sgs.sum_weighted_load;
2903 sds->this_has_capacity = sgs.group_has_capacity;
2904 sds->this_idle_cpus = sgs.idle_cpus;
2555 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 2905 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2556 sds->max_load = sgs.avg_load; 2906 sds->max_load = sgs.avg_load;
2557 sds->busiest = sg; 2907 sds->busiest = sg;
2558 sds->busiest_nr_running = sgs.sum_nr_running; 2908 sds->busiest_nr_running = sgs.sum_nr_running;
2909 sds->busiest_idle_cpus = sgs.idle_cpus;
2559 sds->busiest_group_capacity = sgs.group_capacity; 2910 sds->busiest_group_capacity = sgs.group_capacity;
2560 sds->busiest_load_per_task = sgs.sum_weighted_load; 2911 sds->busiest_load_per_task = sgs.sum_weighted_load;
2912 sds->busiest_has_capacity = sgs.group_has_capacity;
2913 sds->busiest_group_weight = sgs.group_weight;
2561 sds->group_imb = sgs.group_imb; 2914 sds->group_imb = sgs.group_imb;
2562 } 2915 }
2563 2916
@@ -2612,8 +2965,8 @@ static int check_asym_packing(struct sched_domain *sd,
2612 if (this_cpu > busiest_cpu) 2965 if (this_cpu > busiest_cpu)
2613 return 0; 2966 return 0;
2614 2967
2615 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, 2968 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
2616 SCHED_LOAD_SCALE); 2969 SCHED_POWER_SCALE);
2617 return 1; 2970 return 1;
2618} 2971}
2619 2972
@@ -2642,8 +2995,8 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2642 cpu_avg_load_per_task(this_cpu); 2995 cpu_avg_load_per_task(this_cpu);
2643 2996
2644 scaled_busy_load_per_task = sds->busiest_load_per_task 2997 scaled_busy_load_per_task = sds->busiest_load_per_task
2645 * SCHED_LOAD_SCALE; 2998 * SCHED_POWER_SCALE;
2646 scaled_busy_load_per_task /= sds->busiest->cpu_power; 2999 scaled_busy_load_per_task /= sds->busiest->sgp->power;
2647 3000
2648 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 3001 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2649 (scaled_busy_load_per_task * imbn)) { 3002 (scaled_busy_load_per_task * imbn)) {
@@ -2657,30 +3010,30 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2657 * moving them. 3010 * moving them.
2658 */ 3011 */
2659 3012
2660 pwr_now += sds->busiest->cpu_power * 3013 pwr_now += sds->busiest->sgp->power *
2661 min(sds->busiest_load_per_task, sds->max_load); 3014 min(sds->busiest_load_per_task, sds->max_load);
2662 pwr_now += sds->this->cpu_power * 3015 pwr_now += sds->this->sgp->power *
2663 min(sds->this_load_per_task, sds->this_load); 3016 min(sds->this_load_per_task, sds->this_load);
2664 pwr_now /= SCHED_LOAD_SCALE; 3017 pwr_now /= SCHED_POWER_SCALE;
2665 3018
2666 /* Amount of load we'd subtract */ 3019 /* Amount of load we'd subtract */
2667 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3020 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
2668 sds->busiest->cpu_power; 3021 sds->busiest->sgp->power;
2669 if (sds->max_load > tmp) 3022 if (sds->max_load > tmp)
2670 pwr_move += sds->busiest->cpu_power * 3023 pwr_move += sds->busiest->sgp->power *
2671 min(sds->busiest_load_per_task, sds->max_load - tmp); 3024 min(sds->busiest_load_per_task, sds->max_load - tmp);
2672 3025
2673 /* Amount of load we'd add */ 3026 /* Amount of load we'd add */
2674 if (sds->max_load * sds->busiest->cpu_power < 3027 if (sds->max_load * sds->busiest->sgp->power <
2675 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 3028 sds->busiest_load_per_task * SCHED_POWER_SCALE)
2676 tmp = (sds->max_load * sds->busiest->cpu_power) / 3029 tmp = (sds->max_load * sds->busiest->sgp->power) /
2677 sds->this->cpu_power; 3030 sds->this->sgp->power;
2678 else 3031 else
2679 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3032 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
2680 sds->this->cpu_power; 3033 sds->this->sgp->power;
2681 pwr_move += sds->this->cpu_power * 3034 pwr_move += sds->this->sgp->power *
2682 min(sds->this_load_per_task, sds->this_load + tmp); 3035 min(sds->this_load_per_task, sds->this_load + tmp);
2683 pwr_move /= SCHED_LOAD_SCALE; 3036 pwr_move /= SCHED_POWER_SCALE;
2684 3037
2685 /* Move if we gain throughput */ 3038 /* Move if we gain throughput */
2686 if (pwr_move > pwr_now) 3039 if (pwr_move > pwr_now)
@@ -2722,9 +3075,9 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2722 load_above_capacity = (sds->busiest_nr_running - 3075 load_above_capacity = (sds->busiest_nr_running -
2723 sds->busiest_group_capacity); 3076 sds->busiest_group_capacity);
2724 3077
2725 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); 3078 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
2726 3079
2727 load_above_capacity /= sds->busiest->cpu_power; 3080 load_above_capacity /= sds->busiest->sgp->power;
2728 } 3081 }
2729 3082
2730 /* 3083 /*
@@ -2740,13 +3093,13 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2740 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 3093 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
2741 3094
2742 /* How much load to actually move to equalise the imbalance */ 3095 /* How much load to actually move to equalise the imbalance */
2743 *imbalance = min(max_pull * sds->busiest->cpu_power, 3096 *imbalance = min(max_pull * sds->busiest->sgp->power,
2744 (sds->avg_load - sds->this_load) * sds->this->cpu_power) 3097 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
2745 / SCHED_LOAD_SCALE; 3098 / SCHED_POWER_SCALE;
2746 3099
2747 /* 3100 /*
2748 * if *imbalance is less than the average load per runnable task 3101 * if *imbalance is less than the average load per runnable task
2749 * there is no gaurantee that any tasks will be moved so we'll have 3102 * there is no guarantee that any tasks will be moved so we'll have
2750 * a think about bumping its value to force at least one task to be 3103 * a think about bumping its value to force at least one task to be
2751 * moved 3104 * moved
2752 */ 3105 */
@@ -2754,6 +3107,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2754 return fix_small_imbalance(sds, this_cpu, imbalance); 3107 return fix_small_imbalance(sds, this_cpu, imbalance);
2755 3108
2756} 3109}
3110
2757/******* find_busiest_group() helpers end here *********************/ 3111/******* find_busiest_group() helpers end here *********************/
2758 3112
2759/** 3113/**
@@ -2771,7 +3125,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2771 * @imbalance: Variable which stores amount of weighted load which should 3125 * @imbalance: Variable which stores amount of weighted load which should
2772 * be moved to restore balance/put a group to idle. 3126 * be moved to restore balance/put a group to idle.
2773 * @idle: The idle status of this_cpu. 3127 * @idle: The idle status of this_cpu.
2774 * @sd_idle: The idleness of sd
2775 * @cpus: The set of CPUs under consideration for load-balancing. 3128 * @cpus: The set of CPUs under consideration for load-balancing.
2776 * @balance: Pointer to a variable indicating if this_cpu 3129 * @balance: Pointer to a variable indicating if this_cpu
2777 * is the appropriate cpu to perform load balancing at this_level. 3130 * is the appropriate cpu to perform load balancing at this_level.
@@ -2784,7 +3137,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2784static struct sched_group * 3137static struct sched_group *
2785find_busiest_group(struct sched_domain *sd, int this_cpu, 3138find_busiest_group(struct sched_domain *sd, int this_cpu,
2786 unsigned long *imbalance, enum cpu_idle_type idle, 3139 unsigned long *imbalance, enum cpu_idle_type idle,
2787 int *sd_idle, const struct cpumask *cpus, int *balance) 3140 const struct cpumask *cpus, int *balance)
2788{ 3141{
2789 struct sd_lb_stats sds; 3142 struct sd_lb_stats sds;
2790 3143
@@ -2794,17 +3147,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 2794 * Compute the various statistics relevant for load balancing at 3147
2795 * this level. 3148 * this level.
2796 */ 3149 */
2797 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, 3150 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
2798 balance, &sds);
2799 3151
2800 /* Cases where imbalance does not exist from POV of this_cpu */ 3152 /*
2801 /* 1) this_cpu is not the appropriate cpu to perform load balancing 3153 * this_cpu is not the appropriate cpu to perform load balancing at
2802 * at this level. 3154 * this level.
2803 * 2) There is no busy sibling group to pull from.
2804 * 3) This group is the busiest group.
2805 * 4) This group is more busy than the avg busieness at this
2806 * sched_domain.
2807 * 5) The imbalance is within the specified limit.
2808 */ 3155 */
2809 if (!(*balance)) 3156 if (!(*balance))
2810 goto ret; 3157 goto ret;
@@ -2813,20 +3160,59 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2813 check_asym_packing(sd, &sds, this_cpu, imbalance)) 3160 check_asym_packing(sd, &sds, this_cpu, imbalance))
2814 return sds.busiest; 3161 return sds.busiest;
2815 3162
3163 /* There is no busy sibling group to pull tasks from */
2816 if (!sds.busiest || sds.busiest_nr_running == 0) 3164 if (!sds.busiest || sds.busiest_nr_running == 0)
2817 goto out_balanced; 3165 goto out_balanced;
2818 3166
3167 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
3168
3169 /*
3170 * If the busiest group is imbalanced the below checks don't
 3171 * work because they assume all things are equal, which typically
3172 * isn't true due to cpus_allowed constraints and the like.
3173 */
3174 if (sds.group_imb)
3175 goto force_balance;
3176
3177 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
3178 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
3179 !sds.busiest_has_capacity)
3180 goto force_balance;
3181
3182 /*
3183 * If the local group is more busy than the selected busiest group
3184 * don't try and pull any tasks.
3185 */
2819 if (sds.this_load >= sds.max_load) 3186 if (sds.this_load >= sds.max_load)
2820 goto out_balanced; 3187 goto out_balanced;
2821 3188
2822 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; 3189 /*
2823 3190 * Don't pull any tasks if this group is already above the domain
3191 * average load.
3192 */
2824 if (sds.this_load >= sds.avg_load) 3193 if (sds.this_load >= sds.avg_load)
2825 goto out_balanced; 3194 goto out_balanced;
2826 3195
2827 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 3196 if (idle == CPU_IDLE) {
2828 goto out_balanced; 3197 /*
3198 * This cpu is idle. If the busiest group load doesn't
 3199 * have more tasks than the number of available CPUs and
 3200 * there is no imbalance between this and the busiest group
 3201 * wrt idle CPUs, it is balanced.
3202 */
3203 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
3204 sds.busiest_nr_running <= sds.busiest_group_weight)
3205 goto out_balanced;
3206 } else {
3207 /*
3208 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
3209 * imbalance_pct to be conservative.
3210 */
3211 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3212 goto out_balanced;
3213 }
2829 3214
3215force_balance:
2830 /* Looks like there is an imbalance. Compute it */ 3216 /* Looks like there is an imbalance. Compute it */
2831 calculate_imbalance(&sds, this_cpu, imbalance); 3217 calculate_imbalance(&sds, this_cpu, imbalance);
2832 return sds.busiest; 3218 return sds.busiest;
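A sketch of the conservative check used in the not-idle branch above, assuming the common default imbalance_pct of 125 (the real value is per sched_domain): the busiest group must exceed the local load by more than 25% before a busy CPU pulls from it.

/* Invented loads; 125 is only the usual default imbalance_pct. */
#include <stdio.h>

int main(void)
{
        unsigned long imbalance_pct = 125;
        unsigned long this_load = 1000, max_load = 1200;

        if (100 * max_load <= imbalance_pct * this_load)
                printf("balanced: busiest within 25%% of local load\n");
        else
                printf("imbalanced: pull from the busiest group\n");
        return 0;       /* 120000 <= 125000 -> balanced */
}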
@@ -2857,7 +3243,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2857 3243
2858 for_each_cpu(i, sched_group_cpus(group)) { 3244 for_each_cpu(i, sched_group_cpus(group)) {
2859 unsigned long power = power_of(i); 3245 unsigned long power = power_of(i);
2860 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 3246 unsigned long capacity = DIV_ROUND_CLOSEST(power,
3247 SCHED_POWER_SCALE);
2861 unsigned long wl; 3248 unsigned long wl;
2862 3249
2863 if (!capacity) 3250 if (!capacity)
@@ -2882,7 +3269,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2882 * the load can be moved away from the cpu that is potentially 3269 * the load can be moved away from the cpu that is potentially
2883 * running at a lower capacity. 3270 * running at a lower capacity.
2884 */ 3271 */
2885 wl = (wl * SCHED_LOAD_SCALE) / power; 3272 wl = (wl * SCHED_POWER_SCALE) / power;
2886 3273
2887 if (wl > max_load) { 3274 if (wl > max_load) {
2888 max_load = wl; 3275 max_load = wl;
@@ -2902,7 +3289,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2902/* Working cpumask for load_balance and load_balance_newidle. */ 3289/* Working cpumask for load_balance and load_balance_newidle. */
2903static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 3290static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2904 3291
2905static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, 3292static int need_active_balance(struct sched_domain *sd, int idle,
2906 int busiest_cpu, int this_cpu) 3293 int busiest_cpu, int this_cpu)
2907{ 3294{
2908 if (idle == CPU_NEWLY_IDLE) { 3295 if (idle == CPU_NEWLY_IDLE) {
@@ -2934,10 +3321,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
2934 * move_tasks() will succeed. ld_moved will be true and this 3321 * move_tasks() will succeed. ld_moved will be true and this
2935 * active balance code will not be triggered. 3322 * active balance code will not be triggered.
2936 */ 3323 */
2937 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2938 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2939 return 0;
2940
2941 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) 3324 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
2942 return 0; 3325 return 0;
2943 } 3326 }
@@ -2955,7 +3338,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2955 struct sched_domain *sd, enum cpu_idle_type idle, 3338 struct sched_domain *sd, enum cpu_idle_type idle,
2956 int *balance) 3339 int *balance)
2957{ 3340{
2958 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3341 int ld_moved, all_pinned = 0, active_balance = 0;
2959 struct sched_group *group; 3342 struct sched_group *group;
2960 unsigned long imbalance; 3343 unsigned long imbalance;
2961 struct rq *busiest; 3344 struct rq *busiest;
@@ -2964,21 +3347,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2964 3347
2965 cpumask_copy(cpus, cpu_active_mask); 3348 cpumask_copy(cpus, cpu_active_mask);
2966 3349
2967 /*
2968 * When power savings policy is enabled for the parent domain, idle
2969 * sibling can pick up load irrespective of busy siblings. In this case,
2970 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2971 * portraying it as CPU_NOT_IDLE.
2972 */
2973 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2974 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2975 sd_idle = 1;
2976
2977 schedstat_inc(sd, lb_count[idle]); 3350 schedstat_inc(sd, lb_count[idle]);
2978 3351
2979redo: 3352redo:
2980 update_shares(sd); 3353 group = find_busiest_group(sd, this_cpu, &imbalance, idle,
2981 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2982 cpus, balance); 3354 cpus, balance);
2983 3355
2984 if (*balance == 0) 3356 if (*balance == 0)
@@ -3007,6 +3379,7 @@ redo:
3007 * still unbalanced. ld_moved simply stays zero, so it is 3379 * still unbalanced. ld_moved simply stays zero, so it is
3008 * correctly treated as an imbalance. 3380 * correctly treated as an imbalance.
3009 */ 3381 */
3382 all_pinned = 1;
3010 local_irq_save(flags); 3383 local_irq_save(flags);
3011 double_rq_lock(this_rq, busiest); 3384 double_rq_lock(this_rq, busiest);
3012 ld_moved = move_tasks(this_rq, this_cpu, busiest, 3385 ld_moved = move_tasks(this_rq, this_cpu, busiest,
@@ -3031,10 +3404,16 @@ redo:
3031 3404
3032 if (!ld_moved) { 3405 if (!ld_moved) {
3033 schedstat_inc(sd, lb_failed[idle]); 3406 schedstat_inc(sd, lb_failed[idle]);
3034 sd->nr_balance_failed++; 3407 /*
3408 * Increment the failure counter only on periodic balance.
 3409 * We do not want newidle balance, which can be very
 3410 * frequent, to pollute the failure counter, causing
3411 * excessive cache_hot migrations and active balances.
3412 */
3413 if (idle != CPU_NEWLY_IDLE)
3414 sd->nr_balance_failed++;
3035 3415
3036 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), 3416 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
3037 this_cpu)) {
3038 raw_spin_lock_irqsave(&busiest->lock, flags); 3417 raw_spin_lock_irqsave(&busiest->lock, flags);
3039 3418
3040 /* don't kick the active_load_balance_cpu_stop, 3419 /* don't kick the active_load_balance_cpu_stop,
@@ -3089,10 +3468,6 @@ redo:
3089 sd->balance_interval *= 2; 3468 sd->balance_interval *= 2;
3090 } 3469 }
3091 3470
3092 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3093 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3094 ld_moved = -1;
3095
3096 goto out; 3471 goto out;
3097 3472
3098out_balanced: 3473out_balanced:
@@ -3106,14 +3481,8 @@ out_one_pinned:
3106 (sd->balance_interval < sd->max_interval)) 3481 (sd->balance_interval < sd->max_interval))
3107 sd->balance_interval *= 2; 3482 sd->balance_interval *= 2;
3108 3483
3109 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3484 ld_moved = 0;
3110 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3111 ld_moved = -1;
3112 else
3113 ld_moved = 0;
3114out: 3485out:
3115 if (ld_moved)
3116 update_shares(sd);
3117 return ld_moved; 3486 return ld_moved;
3118} 3487}
3119 3488
@@ -3137,6 +3506,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3137 */ 3506 */
3138 raw_spin_unlock(&this_rq->lock); 3507 raw_spin_unlock(&this_rq->lock);
3139 3508
3509 update_shares(this_cpu);
3510 rcu_read_lock();
3140 for_each_domain(this_cpu, sd) { 3511 for_each_domain(this_cpu, sd) {
3141 unsigned long interval; 3512 unsigned long interval;
3142 int balance = 1; 3513 int balance = 1;
@@ -3158,6 +3529,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3158 break; 3529 break;
3159 } 3530 }
3160 } 3531 }
3532 rcu_read_unlock();
3161 3533
3162 raw_spin_lock(&this_rq->lock); 3534 raw_spin_lock(&this_rq->lock);
3163 3535
@@ -3206,6 +3578,7 @@ static int active_load_balance_cpu_stop(void *data)
3206 double_lock_balance(busiest_rq, target_rq); 3578 double_lock_balance(busiest_rq, target_rq);
3207 3579
3208 /* Search for an sd spanning us and the target CPU. */ 3580 /* Search for an sd spanning us and the target CPU. */
3581 rcu_read_lock();
3209 for_each_domain(target_cpu, sd) { 3582 for_each_domain(target_cpu, sd) {
3210 if ((sd->flags & SD_LOAD_BALANCE) && 3583 if ((sd->flags & SD_LOAD_BALANCE) &&
3211 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) 3584 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3221,6 +3594,7 @@ static int active_load_balance_cpu_stop(void *data)
3221 else 3594 else
3222 schedstat_inc(sd, alb_failed); 3595 schedstat_inc(sd, alb_failed);
3223 } 3596 }
3597 rcu_read_unlock();
3224 double_unlock_balance(busiest_rq, target_rq); 3598 double_unlock_balance(busiest_rq, target_rq);
3225out_unlock: 3599out_unlock:
3226 busiest_rq->active_balance = 0; 3600 busiest_rq->active_balance = 0;
@@ -3347,6 +3721,7 @@ static int find_new_ilb(int cpu)
3347{ 3721{
3348 struct sched_domain *sd; 3722 struct sched_domain *sd;
3349 struct sched_group *ilb_group; 3723 struct sched_group *ilb_group;
3724 int ilb = nr_cpu_ids;
3350 3725
3351 /* 3726 /*
3352 * Have idle load balancer selection from semi-idle packages only 3727 * Have idle load balancer selection from semi-idle packages only
@@ -3362,20 +3737,25 @@ static int find_new_ilb(int cpu)
3362 if (cpumask_weight(nohz.idle_cpus_mask) < 2) 3737 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3363 goto out_done; 3738 goto out_done;
3364 3739
3740 rcu_read_lock();
3365 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3741 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3366 ilb_group = sd->groups; 3742 ilb_group = sd->groups;
3367 3743
3368 do { 3744 do {
3369 if (is_semi_idle_group(ilb_group)) 3745 if (is_semi_idle_group(ilb_group)) {
3370 return cpumask_first(nohz.grp_idle_mask); 3746 ilb = cpumask_first(nohz.grp_idle_mask);
3747 goto unlock;
3748 }
3371 3749
3372 ilb_group = ilb_group->next; 3750 ilb_group = ilb_group->next;
3373 3751
3374 } while (ilb_group != sd->groups); 3752 } while (ilb_group != sd->groups);
3375 } 3753 }
3754unlock:
3755 rcu_read_unlock();
3376 3756
3377out_done: 3757out_done:
3378 return nr_cpu_ids; 3758 return ilb;
3379} 3759}
3380#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3760#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3381static inline int find_new_ilb(int call_cpu) 3761static inline int find_new_ilb(int call_cpu)
@@ -3490,6 +3870,17 @@ void select_nohz_load_balancer(int stop_tick)
3490 3870
3491static DEFINE_SPINLOCK(balancing); 3871static DEFINE_SPINLOCK(balancing);
3492 3872
3873static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3874
3875/*
3876 * Scale the max load_balance interval with the number of CPUs in the system.
3877 * This trades load-balance latency on larger machines for less cross talk.
3878 */
3879static void update_max_interval(void)
3880{
3881 max_load_balance_interval = HZ*num_online_cpus()/10;
3882}
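Worked example of the scaling above, with HZ and the CPU count invented; the per-domain interval computed in rebalance_domains() is then clamped to this bound.

/* HZ and the CPU count are invented for the example. */
#include <stdio.h>

#define HZ 1000

int main(void)
{
        unsigned int online_cpus = 8;
        unsigned long max_interval = (unsigned long)HZ * online_cpus / 10;

        printf("max_load_balance_interval = %lu jiffies\n", max_interval);
        return 0;       /* 800 jiffies, i.e. 0.8s at HZ=1000 */
}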
3883
3493/* 3884/*
3494 * It checks each scheduling domain to see if it is due to be balanced, 3885 * It checks each scheduling domain to see if it is due to be balanced,
3495 * and initiates a balancing operation if so. 3886 * and initiates a balancing operation if so.
@@ -3507,6 +3898,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3507 int update_next_balance = 0; 3898 int update_next_balance = 0;
3508 int need_serialize; 3899 int need_serialize;
3509 3900
3901 update_shares(cpu);
3902
3903 rcu_read_lock();
3510 for_each_domain(cpu, sd) { 3904 for_each_domain(cpu, sd) {
3511 if (!(sd->flags & SD_LOAD_BALANCE)) 3905 if (!(sd->flags & SD_LOAD_BALANCE))
3512 continue; 3906 continue;
@@ -3517,10 +3911,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3517 3911
3518 /* scale ms to jiffies */ 3912 /* scale ms to jiffies */
3519 interval = msecs_to_jiffies(interval); 3913 interval = msecs_to_jiffies(interval);
3520 if (unlikely(!interval)) 3914 interval = clamp(interval, 1UL, max_load_balance_interval);
3521 interval = 1;
3522 if (interval > HZ*NR_CPUS/10)
3523 interval = HZ*NR_CPUS/10;
3524 3915
3525 need_serialize = sd->flags & SD_SERIALIZE; 3916 need_serialize = sd->flags & SD_SERIALIZE;
3526 3917
@@ -3533,8 +3924,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3533 if (load_balance(cpu, rq, sd, idle, &balance)) { 3924 if (load_balance(cpu, rq, sd, idle, &balance)) {
3534 /* 3925 /*
3535 * We've pulled tasks over so either we're no 3926 * We've pulled tasks over so either we're no
3536 * longer idle, or one of our SMT siblings is 3927 * longer idle.
3537 * not idle.
3538 */ 3928 */
3539 idle = CPU_NOT_IDLE; 3929 idle = CPU_NOT_IDLE;
3540 } 3930 }
@@ -3556,6 +3946,7 @@ out:
3556 if (!balance) 3946 if (!balance)
3557 break; 3947 break;
3558 } 3948 }
3949 rcu_read_unlock();
3559 3950
3560 /* 3951 /*
3561 * next_balance will be updated only when there is a need. 3952 * next_balance will be updated only when there is a need.
@@ -3751,8 +4142,11 @@ static void task_fork_fair(struct task_struct *p)
3751 4142
3752 update_rq_clock(rq); 4143 update_rq_clock(rq);
3753 4144
3754 if (unlikely(task_cpu(p) != this_cpu)) 4145 if (unlikely(task_cpu(p) != this_cpu)) {
4146 rcu_read_lock();
3755 __set_task_cpu(p, this_cpu); 4147 __set_task_cpu(p, this_cpu);
4148 rcu_read_unlock();
4149 }
3756 4150
3757 update_curr(cfs_rq); 4151 update_curr(cfs_rq);
3758 4152
@@ -3778,33 +4172,62 @@ static void task_fork_fair(struct task_struct *p)
3778 * Priority of the task has changed. Check to see if we preempt 4172 * Priority of the task has changed. Check to see if we preempt
3779 * the current task. 4173 * the current task.
3780 */ 4174 */
3781static void prio_changed_fair(struct rq *rq, struct task_struct *p, 4175static void
3782 int oldprio, int running) 4176prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
3783{ 4177{
4178 if (!p->se.on_rq)
4179 return;
4180
3784 /* 4181 /*
3785 * Reschedule if we are currently running on this runqueue and 4182 * Reschedule if we are currently running on this runqueue and
3786 * our priority decreased, or if we are not currently running on 4183 * our priority decreased, or if we are not currently running on
3787 * this runqueue and our priority is higher than the current's 4184 * this runqueue and our priority is higher than the current's
3788 */ 4185 */
3789 if (running) { 4186 if (rq->curr == p) {
3790 if (p->prio > oldprio) 4187 if (p->prio > oldprio)
3791 resched_task(rq->curr); 4188 resched_task(rq->curr);
3792 } else 4189 } else
3793 check_preempt_curr(rq, p, 0); 4190 check_preempt_curr(rq, p, 0);
3794} 4191}
3795 4192
4193static void switched_from_fair(struct rq *rq, struct task_struct *p)
4194{
4195 struct sched_entity *se = &p->se;
4196 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4197
4198 /*
4199 * Ensure the task's vruntime is normalized, so that when its
4200 * switched back to the fair class the enqueue_entity(.flags=0) will
4201 * do the right thing.
4202 *
4203 * If it was on_rq, then the dequeue_entity(.flags=0) will already
4204 * have normalized the vruntime, if it was !on_rq, then only when
4205 * the task is sleeping will it still have non-normalized vruntime.
4206 */
4207 if (!se->on_rq && p->state != TASK_RUNNING) {
4208 /*
4209 * Fix up our vruntime so that the current sleep doesn't
4210 * cause 'unlimited' sleep bonus.
4211 */
4212 place_entity(cfs_rq, se, 0);
4213 se->vruntime -= cfs_rq->min_vruntime;
4214 }
4215}
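A toy illustration of the vruntime re-basing above, with invented numbers: a task leaving the fair class while asleep keeps only its lag relative to min_vruntime, so it can be re-based onto whatever queue it is eventually enqueued on.

/* Toy numbers; not kernel types. */
#include <stdio.h>

int main(void)
{
        unsigned long long vruntime = 1005000;  /* absolute on the old cfs_rq */
        unsigned long long old_min  = 1000000;
        unsigned long long new_min  = 2000000;

        vruntime -= old_min;    /* as in switched_from_fair(): keep the lag */
        vruntime += new_min;    /* re-base when enqueued on the new queue   */
        printf("%llu\n", vruntime);             /* 2005000: same 5000 lag   */
        return 0;
}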
4216
3796/* 4217/*
3797 * We switched to the sched_fair class. 4218 * We switched to the sched_fair class.
3798 */ 4219 */
3799static void switched_to_fair(struct rq *rq, struct task_struct *p, 4220static void switched_to_fair(struct rq *rq, struct task_struct *p)
3800 int running)
3801{ 4221{
4222 if (!p->se.on_rq)
4223 return;
4224
3802 /* 4225 /*
3803 * We were most likely switched from sched_rt, so 4226 * We were most likely switched from sched_rt, so
3804 * kick off the schedule if running, otherwise just see 4227 * kick off the schedule if running, otherwise just see
3805 * if we can still preempt the current task. 4228 * if we can still preempt the current task.
3806 */ 4229 */
3807 if (running) 4230 if (rq->curr == p)
3808 resched_task(rq->curr); 4231 resched_task(rq->curr);
3809 else 4232 else
3810 check_preempt_curr(rq, p, 0); 4233 check_preempt_curr(rq, p, 0);
@@ -3824,13 +4247,26 @@ static void set_curr_task_fair(struct rq *rq)
3824} 4247}
3825 4248
3826#ifdef CONFIG_FAIR_GROUP_SCHED 4249#ifdef CONFIG_FAIR_GROUP_SCHED
3827static void moved_group_fair(struct task_struct *p, int on_rq) 4250static void task_move_group_fair(struct task_struct *p, int on_rq)
3828{ 4251{
3829 struct cfs_rq *cfs_rq = task_cfs_rq(p); 4252 /*
3830 4253 * If the task was not on the rq at the time of this cgroup movement
3831 update_curr(cfs_rq); 4254 * it must have been asleep, sleeping tasks keep their ->vruntime
4255 * absolute on their old rq until wakeup (needed for the fair sleeper
4256 * bonus in place_entity()).
4257 *
4258 * If it was on the rq, we've just 'preempted' it, which does convert
4259 * ->vruntime to a relative base.
4260 *
4261 * Make sure both cases convert their relative position when migrating
4262 * to another cgroup's rq. This does somewhat interfere with the
4263 * fair sleeper stuff for the first placement, but who cares.
4264 */
4265 if (!on_rq)
4266 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
4267 set_task_rq(p, task_cpu(p));
3832 if (!on_rq) 4268 if (!on_rq)
3833 place_entity(cfs_rq, &p->se, 1); 4269 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
3834} 4270}
3835#endif 4271#endif
3836 4272
@@ -3857,6 +4293,7 @@ static const struct sched_class fair_sched_class = {
3857 .enqueue_task = enqueue_task_fair, 4293 .enqueue_task = enqueue_task_fair,
3858 .dequeue_task = dequeue_task_fair, 4294 .dequeue_task = dequeue_task_fair,
3859 .yield_task = yield_task_fair, 4295 .yield_task = yield_task_fair,
4296 .yield_to_task = yield_to_task_fair,
3860 4297
3861 .check_preempt_curr = check_preempt_wakeup, 4298 .check_preempt_curr = check_preempt_wakeup,
3862 4299
@@ -3877,12 +4314,13 @@ static const struct sched_class fair_sched_class = {
3877 .task_fork = task_fork_fair, 4314 .task_fork = task_fork_fair,
3878 4315
3879 .prio_changed = prio_changed_fair, 4316 .prio_changed = prio_changed_fair,
4317 .switched_from = switched_from_fair,
3880 .switched_to = switched_to_fair, 4318 .switched_to = switched_to_fair,
3881 4319
3882 .get_rr_interval = get_rr_interval_fair, 4320 .get_rr_interval = get_rr_interval_fair,
3883 4321
3884#ifdef CONFIG_FAIR_GROUP_SCHED 4322#ifdef CONFIG_FAIR_GROUP_SCHED
3885 .moved_group = moved_group_fair, 4323 .task_move_group = task_move_group_fair,
3886#endif 4324#endif
3887}; 4325};
3888 4326
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 83c66e8ad3ee..1e7066d76c26 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
52SCHED_FEAT(HRTICK, 0) 52SCHED_FEAT(HRTICK, 0)
53SCHED_FEAT(DOUBLE_TICK, 0) 53SCHED_FEAT(DOUBLE_TICK, 0)
54SCHED_FEAT(LB_BIAS, 1) 54SCHED_FEAT(LB_BIAS, 1)
55SCHED_FEAT(LB_SHARES_UPDATE, 1)
56SCHED_FEAT(ASYM_EFF_LOAD, 1)
57 55
58/* 56/*
59 * Spin-wait on mutex acquisition when the mutex owner is running on 57 * Spin-wait on mutex acquisition when the mutex owner is running on
@@ -61,3 +59,16 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
61 * release the lock. Decreases scheduling overhead. 59 * release the lock. Decreases scheduling overhead.
62 */ 60 */
63SCHED_FEAT(OWNER_SPIN, 1) 61SCHED_FEAT(OWNER_SPIN, 1)
62
63/*
64 * Decrement CPU power based on irq activity
65 */
66SCHED_FEAT(NONIRQ_POWER, 1)
67
68/*
69 * Queue remote wakeups on the target CPU and process them
70 * using the scheduler IPI. Reduces rq->lock contention/bounces.
71 */
72SCHED_FEAT(TTWU_QUEUE, 1)
73
74SCHED_FEAT(FORCE_SD_OVERLAP, 0)
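The SCHED_FEAT() entries above become runtime-testable bits consulted through sched_feat(). A stand-alone sketch that mirrors the mechanism only; the mask variable and bit names below are simplified stand-ins, not the kernel's implementation.

/* Stand-alone model of the feature-bit mechanism; names simplified. */
#include <stdio.h>

enum { TTWU_QUEUE_BIT, NONIRQ_POWER_BIT };
#define FEAT(x)        (1u << (x##_BIT))
#define sched_feat(x)  (sched_features & FEAT(x))

static unsigned int sched_features = FEAT(TTWU_QUEUE) | FEAT(NONIRQ_POWER);

int main(void)
{
        if (sched_feat(TTWU_QUEUE))
                printf("remote wakeups queued via the scheduler IPI\n");
        return 0;
}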
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f402c87c..0a51882534ea 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -7,7 +7,7 @@
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 10select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
11{ 11{
 12 return task_cpu(p); /* IDLE tasks are never migrated */ 12 return task_cpu(p); /* IDLE tasks are never migrated */
13} 13}
@@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq)
52{ 52{
53} 53}
54 54
55static void switched_to_idle(struct rq *rq, struct task_struct *p, 55static void switched_to_idle(struct rq *rq, struct task_struct *p)
56 int running)
57{ 56{
58 /* Can this actually happen?? */ 57 BUG();
59 if (running)
60 resched_task(rq->curr);
61 else
62 check_preempt_curr(rq, p, 0);
63} 58}
64 59
65static void prio_changed_idle(struct rq *rq, struct task_struct *p, 60static void
66 int oldprio, int running) 61prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
67{ 62{
68 /* This can happen for hot plug CPUS */ 63 BUG();
69
70 /*
71 * Reschedule if we are currently running on this runqueue and
72 * our priority decreased, or if we are not currently running on
73 * this runqueue and our priority is higher than the current's
74 */
75 if (running) {
76 if (p->prio > oldprio)
77 resched_task(rq->curr);
78 } else
79 check_preempt_curr(rq, p, 0);
80} 64}
81 65
82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 66static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
@@ -110,6 +94,4 @@ static const struct sched_class idle_sched_class = {
110 94
111 .prio_changed = prio_changed_idle, 95 .prio_changed = prio_changed_idle,
112 .switched_to = switched_to_idle, 96 .switched_to = switched_to_idle,
113
114 /* no .task_new for idle tasks */
115}; 97};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index e40e7fe43170..58cf5d18dfdc 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,25 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186typedef struct task_group *rt_rq_iter_t;
187
188#define for_each_rt_rq(rt_rq, iter, rq) \
189 for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \
190 (&iter->list != &task_groups) && \
191 (rt_rq = iter->rt_rq[cpu_of(rq)]); \
192 iter = list_entry_rcu(iter->list.next, typeof(*iter), list))
193
194static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
195{
196 list_add_rcu(&rt_rq->leaf_rt_rq_list,
197 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
198}
199
200static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
201{
202 list_del_rcu(&rt_rq->leaf_rt_rq_list);
203}
204
186#define for_each_leaf_rt_rq(rt_rq, rq) \ 205#define for_each_leaf_rt_rq(rt_rq, rq) \
187 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 206 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
188 207
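The for_each_rt_rq() iterator introduced above walks every task group's rt_rq for a given CPU, not just the ones currently linked on the leaf list, which is why __disable_runtime()/__enable_runtime() switch to it further down. A toy model of the iterator shape; the struct layout and field names are simplified stand-ins.

/* Toy model; types and field names are stand-ins, not the kernel's. */
#include <stdio.h>

struct rt_rq { int runtime; };
struct task_group { struct rt_rq *per_cpu_rt[2]; struct task_group *next; };

#define for_each_rt_rq(rt_rq, iter, cpu, head)                          \
        for (iter = (head); iter && ((rt_rq) = iter->per_cpu_rt[cpu]);  \
             iter = iter->next)

int main(void)
{
        struct rt_rq a = { 950 }, b = { 400 };
        struct task_group g2 = { { &b, &b }, NULL };
        struct task_group g1 = { { &a, &a }, &g2 };
        struct task_group *iter;
        struct rt_rq *rt_rq;

        /* visits every group's rt_rq for CPU 0, listed or not */
        for_each_rt_rq(rt_rq, iter, 0, &g1)
                printf("runtime = %d\n", rt_rq->runtime);
        return 0;
}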
@@ -199,11 +218,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
199 218
200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 219static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 220{
202 int this_cpu = smp_processor_id();
203 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 221 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
204 struct sched_rt_entity *rt_se; 222 struct sched_rt_entity *rt_se;
205 223
206 rt_se = rt_rq->tg->rt_se[this_cpu]; 224 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
225
226 rt_se = rt_rq->tg->rt_se[cpu];
207 227
208 if (rt_rq->rt_nr_running) { 228 if (rt_rq->rt_nr_running) {
209 if (rt_se && !on_rt_rq(rt_se)) 229 if (rt_se && !on_rt_rq(rt_se))
@@ -215,10 +235,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
215 235
216static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 236static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
217{ 237{
218 int this_cpu = smp_processor_id();
219 struct sched_rt_entity *rt_se; 238 struct sched_rt_entity *rt_se;
239 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
220 240
221 rt_se = rt_rq->tg->rt_se[this_cpu]; 241 rt_se = rt_rq->tg->rt_se[cpu];
222 242
223 if (rt_se && on_rt_rq(rt_se)) 243 if (rt_se && on_rt_rq(rt_se))
224 dequeue_rt_entity(rt_se); 244 dequeue_rt_entity(rt_se);
@@ -276,6 +296,19 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
276 return ktime_to_ns(def_rt_bandwidth.rt_period); 296 return ktime_to_ns(def_rt_bandwidth.rt_period);
277} 297}
278 298
299typedef struct rt_rq *rt_rq_iter_t;
300
301#define for_each_rt_rq(rt_rq, iter, rq) \
302 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
303
304static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
305{
306}
307
308static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
309{
310}
311
279#define for_each_leaf_rt_rq(rt_rq, rq) \ 312#define for_each_leaf_rt_rq(rt_rq, rq) \
280 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 313 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
281 314
@@ -382,12 +415,13 @@ next:
382static void __disable_runtime(struct rq *rq) 415static void __disable_runtime(struct rq *rq)
383{ 416{
384 struct root_domain *rd = rq->rd; 417 struct root_domain *rd = rq->rd;
418 rt_rq_iter_t iter;
385 struct rt_rq *rt_rq; 419 struct rt_rq *rt_rq;
386 420
387 if (unlikely(!scheduler_running)) 421 if (unlikely(!scheduler_running))
388 return; 422 return;
389 423
390 for_each_leaf_rt_rq(rt_rq, rq) { 424 for_each_rt_rq(rt_rq, iter, rq) {
391 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 425 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
392 s64 want; 426 s64 want;
393 int i; 427 int i;
@@ -467,6 +501,7 @@ static void disable_runtime(struct rq *rq)
467 501
468static void __enable_runtime(struct rq *rq) 502static void __enable_runtime(struct rq *rq)
469{ 503{
504 rt_rq_iter_t iter;
470 struct rt_rq *rt_rq; 505 struct rt_rq *rt_rq;
471 506
472 if (unlikely(!scheduler_running)) 507 if (unlikely(!scheduler_running))
@@ -475,7 +510,7 @@ static void __enable_runtime(struct rq *rq)
475 /* 510 /*
476 * Reset each runqueue's bandwidth settings 511 * Reset each runqueue's bandwidth settings
477 */ 512 */
478 for_each_leaf_rt_rq(rt_rq, rq) { 513 for_each_rt_rq(rt_rq, iter, rq) {
479 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 514 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
480 515
481 raw_spin_lock(&rt_b->rt_runtime_lock); 516 raw_spin_lock(&rt_b->rt_runtime_lock);
@@ -542,12 +577,22 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
542 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { 577 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
543 rt_rq->rt_throttled = 0; 578 rt_rq->rt_throttled = 0;
544 enqueue = 1; 579 enqueue = 1;
580
581 /*
582 * Force a clock update if the CPU was idle,
583 * lest wakeup -> unthrottle time accumulate.
584 */
585 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
586 rq->skip_clock_update = -1;
545 } 587 }
546 if (rt_rq->rt_time || rt_rq->rt_nr_running) 588 if (rt_rq->rt_time || rt_rq->rt_nr_running)
547 idle = 0; 589 idle = 0;
548 raw_spin_unlock(&rt_rq->rt_runtime_lock); 590 raw_spin_unlock(&rt_rq->rt_runtime_lock);
549 } else if (rt_rq->rt_nr_running) 591 } else if (rt_rq->rt_nr_running) {
550 idle = 0; 592 idle = 0;
593 if (!rt_rq_throttled(rt_rq))
594 enqueue = 1;
595 }
551 596
552 if (enqueue) 597 if (enqueue)
553 sched_rt_rq_enqueue(rt_rq); 598 sched_rt_rq_enqueue(rt_rq);
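The hunk above sits in the RT bandwidth replenishment path: once per period the timer refills rt_runtime, unthrottles queues whose debt is paid off, and (new here) forces a clock update when unthrottling on an idle CPU. The knobs this path enforces are visible from user space as sysctls; a small reader, assuming the usual /proc/sys/kernel layout (defaults are typically a 1,000,000 us period with 950,000 us of runtime, and -1 disables throttling):

#include <stdio.h>

/* Print the RT bandwidth knobs enforced by the period timer above. */
static long read_long(const char *path)
{
	long val = -1;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%ld", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

int main(void)
{
	printf("sched_rt_period_us  = %ld\n",
	       read_long("/proc/sys/kernel/sched_rt_period_us"));
	printf("sched_rt_runtime_us = %ld\n",
	       read_long("/proc/sys/kernel/sched_rt_runtime_us"));
	return 0;
}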
@@ -606,10 +651,10 @@ static void update_curr_rt(struct rq *rq)
606 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 651 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
607 u64 delta_exec; 652 u64 delta_exec;
608 653
609 if (!task_has_rt_policy(curr)) 654 if (curr->sched_class != &rt_sched_class)
610 return; 655 return;
611 656
612 delta_exec = rq->clock - curr->se.exec_start; 657 delta_exec = rq->clock_task - curr->se.exec_start;
613 if (unlikely((s64)delta_exec < 0)) 658 if (unlikely((s64)delta_exec < 0))
614 delta_exec = 0; 659 delta_exec = 0;
615 660
@@ -618,7 +663,7 @@ static void update_curr_rt(struct rq *rq)
618 curr->se.sum_exec_runtime += delta_exec; 663 curr->se.sum_exec_runtime += delta_exec;
619 account_group_exec_runtime(curr, delta_exec); 664 account_group_exec_runtime(curr, delta_exec);
620 665
621 curr->se.exec_start = rq->clock; 666 curr->se.exec_start = rq->clock_task;
622 cpuacct_charge(curr, delta_exec); 667 cpuacct_charge(curr, delta_exec);
623 668
624 sched_rt_avg_update(rq, delta_exec); 669 sched_rt_avg_update(rq, delta_exec);
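update_curr_rt() now charges RT tasks against rq->clock_task rather than rq->clock, so time spent in interrupts (and, under virtualization, steal time) is no longer billed as task execution. A loose user-space analogue of the same wall-time versus task-time distinction, using the standard per-thread CPU clock:

#include <stdio.h>
#include <time.h>

/* Compare wall-clock time to CPU time actually consumed by this thread,
 * loosely analogous to rq->clock vs. rq->clock_task. */
static double ts_diff(struct timespec a, struct timespec b)
{
	return (b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) / 1e9;
}

int main(void)
{
	struct timespec w0, w1, c0, c1;
	volatile unsigned long x = 0;

	clock_gettime(CLOCK_MONOTONIC, &w0);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &c0);
	for (unsigned long i = 0; i < 100000000UL; i++)
		x += i;
	clock_gettime(CLOCK_MONOTONIC, &w1);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &c1);

	printf("wall: %.3fs  cpu: %.3fs\n", ts_diff(w0, w1), ts_diff(c0, c1));
	return 0;
}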
@@ -825,6 +870,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 870 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
826 return; 871 return;
827 872
873 if (!rt_rq->rt_nr_running)
874 list_add_leaf_rt_rq(rt_rq);
875
828 if (head) 876 if (head)
829 list_add(&rt_se->run_list, queue); 877 list_add(&rt_se->run_list, queue);
830 else 878 else
@@ -844,6 +892,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
844 __clear_bit(rt_se_prio(rt_se), array->bitmap); 892 __clear_bit(rt_se_prio(rt_se), array->bitmap);
845 893
846 dec_rt_tasks(rt_se, rt_rq); 894 dec_rt_tasks(rt_se, rt_rq);
895 if (!rt_rq->rt_nr_running)
896 list_del_leaf_rt_rq(rt_rq);
847} 897}
848 898
849/* 899/*
@@ -949,40 +999,55 @@ static void yield_task_rt(struct rq *rq)
949static int find_lowest_rq(struct task_struct *task); 999static int find_lowest_rq(struct task_struct *task);
950 1000
951static int 1001static int
952select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 1002select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
953{ 1003{
1004 struct task_struct *curr;
1005 struct rq *rq;
1006 int cpu;
1007
954 if (sd_flag != SD_BALANCE_WAKE) 1008 if (sd_flag != SD_BALANCE_WAKE)
955 return smp_processor_id(); 1009 return smp_processor_id();
956 1010
1011 cpu = task_cpu(p);
1012 rq = cpu_rq(cpu);
1013
1014 rcu_read_lock();
1015 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
1016
957 /* 1017 /*
958 * If the current task is an RT task, then 1018 * If the current task on @p's runqueue is an RT task, then
959 * try to see if we can wake this RT task up on another 1019 * try to see if we can wake this RT task up on another
960 * runqueue. Otherwise simply start this RT task 1020 * runqueue. Otherwise simply start this RT task
961 * on its current runqueue. 1021 * on its current runqueue.
962 * 1022 *
963 * We want to avoid overloading runqueues. Even if 1023 * We want to avoid overloading runqueues. If the woken
964 * the RT task is of higher priority than the current RT task. 1024 * task is a higher priority, then it will stay on this CPU
965 * RT tasks behave differently than other tasks. If 1025 * and the lower prio task should be moved to another CPU.
966 * one gets preempted, we try to push it off to another queue. 1026 * Even though this will probably make the lower prio task
967 * So trying to keep a preempting RT task on the same 1027 * lose its cache, we do not want to bounce a higher task
968 * cache hot CPU will force the running RT task to 1028 * around just because it gave up its CPU, perhaps for a
969 * a cold CPU. So we waste all the cache for the lower 1029 * lock?
970 * RT task in hopes of saving some of a RT task 1030 *
971 * that is just being woken and probably will have 1031 * For equal prio tasks, we just let the scheduler sort it out.
972 * cold cache anyway. 1032 *
1033 * Otherwise, just let it ride on the affined RQ and the
1034 * post-schedule router will push the preempted task away
1035 *
1036 * This test is optimistic, if we get it wrong the load-balancer
1037 * will have to sort it out.
973 */ 1038 */
974 if (unlikely(rt_task(rq->curr)) && 1039 if (curr && unlikely(rt_task(curr)) &&
1040 (curr->rt.nr_cpus_allowed < 2 ||
1041 curr->prio < p->prio) &&
975 (p->rt.nr_cpus_allowed > 1)) { 1042 (p->rt.nr_cpus_allowed > 1)) {
976 int cpu = find_lowest_rq(p); 1043 int target = find_lowest_rq(p);
977 1044
978 return (cpu == -1) ? task_cpu(p) : cpu; 1045 if (target != -1)
1046 cpu = target;
979 } 1047 }
1048 rcu_read_unlock();
980 1049
981 /* 1050 return cpu;
982 * Otherwise, just let it ride on the affined RQ and the
983 * post-schedule router will push the preempted task away
984 */
985 return task_cpu(p);
986} 1051}
987 1052
988static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1053static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
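The rewritten select_task_rq_rt() above only hunts for another CPU when the wakee can actually migrate (nr_cpus_allowed > 1) and the task currently running on its runqueue is an RT task that either cannot move itself (nr_cpus_allowed < 2) or outranks the wakee (for RT, a lower prio value means higher priority); the result is treated as a hint that the load balancer may later correct. A stand-alone model of just that predicate, with field names chosen to mirror the kernel's (this is not the kernel API):

#include <stdbool.h>
#include <stdio.h>

/* Simplified model of the wake-up placement test; not the kernel API. */
struct toy_task {
	bool rt;               /* scheduled by the RT class?       */
	int  prio;             /* lower value means higher priority */
	int  nr_cpus_allowed;  /* size of the task's affinity mask  */
};

static bool should_search_other_cpu(const struct toy_task *curr,
				    const struct toy_task *wakee)
{
	return curr->rt &&
	       (curr->nr_cpus_allowed < 2 || curr->prio < wakee->prio) &&
	       wakee->nr_cpus_allowed > 1;
}

int main(void)
{
	struct toy_task curr  = { .rt = true, .prio = 10, .nr_cpus_allowed = 4 };
	struct toy_task wakee = { .rt = true, .prio = 20, .nr_cpus_allowed = 4 };

	printf("look for another CPU: %s\n",
	       should_search_other_cpu(&curr, &wakee) ? "yes" : "no");
	return 0;
}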
@@ -1031,7 +1096,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
1031 * to move current somewhere else, making room for our non-migratable 1096 * to move current somewhere else, making room for our non-migratable
1032 * task. 1097 * task.
1033 */ 1098 */
1034 if (p->prio == rq->curr->prio && !need_resched()) 1099 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1035 check_preempt_equal_prio(rq, p); 1100 check_preempt_equal_prio(rq, p);
1036#endif 1101#endif
1037} 1102}
@@ -1074,7 +1139,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1074 } while (rt_rq); 1139 } while (rt_rq);
1075 1140
1076 p = rt_task_of(rt_se); 1141 p = rt_task_of(rt_se);
1077 p->se.exec_start = rq->clock; 1142 p->se.exec_start = rq->clock_task;
1078 1143
1079 return p; 1144 return p;
1080} 1145}
@@ -1107,7 +1172,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1107 * The previous task needs to be made eligible for pushing 1172 * The previous task needs to be made eligible for pushing
1108 * if it is still active 1173 * if it is still active
1109 */ 1174 */
1110 if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) 1175 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
1111 enqueue_pushable_task(rq, p); 1176 enqueue_pushable_task(rq, p);
1112} 1177}
1113 1178
@@ -1139,7 +1204,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1139 for_each_leaf_rt_rq(rt_rq, rq) { 1204 for_each_leaf_rt_rq(rt_rq, rq) {
1140 array = &rt_rq->active; 1205 array = &rt_rq->active;
1141 idx = sched_find_first_bit(array->bitmap); 1206 idx = sched_find_first_bit(array->bitmap);
1142 next_idx: 1207next_idx:
1143 if (idx >= MAX_RT_PRIO) 1208 if (idx >= MAX_RT_PRIO)
1144 continue; 1209 continue;
1145 if (next && next->prio < idx) 1210 if (next && next->prio < idx)
@@ -1174,6 +1239,10 @@ static int find_lowest_rq(struct task_struct *task)
1174 int this_cpu = smp_processor_id(); 1239 int this_cpu = smp_processor_id();
1175 int cpu = task_cpu(task); 1240 int cpu = task_cpu(task);
1176 1241
1242 /* Make sure the mask is initialized first */
1243 if (unlikely(!lowest_mask))
1244 return -1;
1245
1177 if (task->rt.nr_cpus_allowed == 1) 1246 if (task->rt.nr_cpus_allowed == 1)
1178 return -1; /* No other targets possible */ 1247 return -1; /* No other targets possible */
1179 1248
@@ -1198,6 +1267,7 @@ static int find_lowest_rq(struct task_struct *task)
1198 if (!cpumask_test_cpu(this_cpu, lowest_mask)) 1267 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1199 this_cpu = -1; /* Skip this_cpu opt if not among lowest */ 1268 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1200 1269
1270 rcu_read_lock();
1201 for_each_domain(cpu, sd) { 1271 for_each_domain(cpu, sd) {
1202 if (sd->flags & SD_WAKE_AFFINE) { 1272 if (sd->flags & SD_WAKE_AFFINE) {
1203 int best_cpu; 1273 int best_cpu;
@@ -1207,15 +1277,20 @@ static int find_lowest_rq(struct task_struct *task)
1207 * remote processor. 1277 * remote processor.
1208 */ 1278 */
1209 if (this_cpu != -1 && 1279 if (this_cpu != -1 &&
1210 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) 1280 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1281 rcu_read_unlock();
1211 return this_cpu; 1282 return this_cpu;
1283 }
1212 1284
1213 best_cpu = cpumask_first_and(lowest_mask, 1285 best_cpu = cpumask_first_and(lowest_mask,
1214 sched_domain_span(sd)); 1286 sched_domain_span(sd));
1215 if (best_cpu < nr_cpu_ids) 1287 if (best_cpu < nr_cpu_ids) {
1288 rcu_read_unlock();
1216 return best_cpu; 1289 return best_cpu;
1290 }
1217 } 1291 }
1218 } 1292 }
1293 rcu_read_unlock();
1219 1294
1220 /* 1295 /*
1221 * And finally, if there were no matches within the domains 1296 * And finally, if there were no matches within the domains
@@ -1258,7 +1333,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1258 !cpumask_test_cpu(lowest_rq->cpu, 1333 !cpumask_test_cpu(lowest_rq->cpu,
1259 &task->cpus_allowed) || 1334 &task->cpus_allowed) ||
1260 task_running(rq, task) || 1335 task_running(rq, task) ||
1261 !task->se.on_rq)) { 1336 !task->on_rq)) {
1262 1337
1263 raw_spin_unlock(&lowest_rq->lock); 1338 raw_spin_unlock(&lowest_rq->lock);
1264 lowest_rq = NULL; 1339 lowest_rq = NULL;
@@ -1292,7 +1367,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1292 BUG_ON(task_current(rq, p)); 1367 BUG_ON(task_current(rq, p));
1293 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1368 BUG_ON(p->rt.nr_cpus_allowed <= 1);
1294 1369
1295 BUG_ON(!p->se.on_rq); 1370 BUG_ON(!p->on_rq);
1296 BUG_ON(!rt_task(p)); 1371 BUG_ON(!rt_task(p));
1297 1372
1298 return p; 1373 return p;
@@ -1315,7 +1390,7 @@ static int push_rt_task(struct rq *rq)
1315 if (!next_task) 1390 if (!next_task)
1316 return 0; 1391 return 0;
1317 1392
1318 retry: 1393retry:
1319 if (unlikely(next_task == rq->curr)) { 1394 if (unlikely(next_task == rq->curr)) {
1320 WARN_ON(1); 1395 WARN_ON(1);
1321 return 0; 1396 return 0;
@@ -1349,7 +1424,7 @@ static int push_rt_task(struct rq *rq)
1349 task = pick_next_pushable_task(rq); 1424 task = pick_next_pushable_task(rq);
1350 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1425 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1351 /* 1426 /*
1352 * If we get here, the task hasnt moved at all, but 1427 * If we get here, the task hasn't moved at all, but
1353 * it has failed to push. We will not try again, 1428 * it has failed to push. We will not try again,
1354 * since the other cpus will pull from us when they 1429 * since the other cpus will pull from us when they
1355 * are ready. 1430 * are ready.
@@ -1438,7 +1513,7 @@ static int pull_rt_task(struct rq *this_rq)
1438 */ 1513 */
1439 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1514 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1440 WARN_ON(p == src_rq->curr); 1515 WARN_ON(p == src_rq->curr);
1441 WARN_ON(!p->se.on_rq); 1516 WARN_ON(!p->on_rq);
1442 1517
1443 /* 1518 /*
1444 * There's a chance that p is higher in priority 1519 * There's a chance that p is higher in priority
@@ -1459,11 +1534,11 @@ static int pull_rt_task(struct rq *this_rq)
1459 /* 1534 /*
1460 * We continue with the search, just in 1535 * We continue with the search, just in
1461 * case there's an even higher prio task 1536 * case there's an even higher prio task
1462 * in another runqueue. (low likelyhood 1537 * in another runqueue. (low likelihood
1463 * but possible) 1538 * but possible)
1464 */ 1539 */
1465 } 1540 }
1466 skip: 1541skip:
1467 double_unlock_balance(this_rq, src_rq); 1542 double_unlock_balance(this_rq, src_rq);
1468 } 1543 }
1469 1544
@@ -1491,7 +1566,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1491 if (!task_running(rq, p) && 1566 if (!task_running(rq, p) &&
1492 !test_tsk_need_resched(rq->curr) && 1567 !test_tsk_need_resched(rq->curr) &&
1493 has_pushable_tasks(rq) && 1568 has_pushable_tasks(rq) &&
1494 p->rt.nr_cpus_allowed > 1) 1569 p->rt.nr_cpus_allowed > 1 &&
1570 rt_task(rq->curr) &&
1571 (rq->curr->rt.nr_cpus_allowed < 2 ||
1572 rq->curr->prio < p->prio))
1495 push_rt_tasks(rq); 1573 push_rt_tasks(rq);
1496} 1574}
1497 1575
@@ -1506,7 +1584,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1506 * Update the migration status of the RQ if we have an RT task 1584 * Update the migration status of the RQ if we have an RT task
1507 * which is running AND changing its weight value. 1585 * which is running AND changing its weight value.
1508 */ 1586 */
1509 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { 1587 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1510 struct rq *rq = task_rq(p); 1588 struct rq *rq = task_rq(p);
1511 1589
1512 if (!task_current(rq, p)) { 1590 if (!task_current(rq, p)) {
@@ -1567,8 +1645,7 @@ static void rq_offline_rt(struct rq *rq)
1567 * When switch from the rt queue, we bring ourselves to a position 1645 * When switch from the rt queue, we bring ourselves to a position
1568 * that we might want to pull RT tasks from other runqueues. 1646 * that we might want to pull RT tasks from other runqueues.
1569 */ 1647 */
1570static void switched_from_rt(struct rq *rq, struct task_struct *p, 1648static void switched_from_rt(struct rq *rq, struct task_struct *p)
1571 int running)
1572{ 1649{
1573 /* 1650 /*
1574 * If there are other RT tasks then we will reschedule 1651 * If there are other RT tasks then we will reschedule
@@ -1577,7 +1654,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
1577 * we may need to handle the pulling of RT tasks 1654 * we may need to handle the pulling of RT tasks
1578 * now. 1655 * now.
1579 */ 1656 */
1580 if (!rq->rt.rt_nr_running) 1657 if (p->on_rq && !rq->rt.rt_nr_running)
1581 pull_rt_task(rq); 1658 pull_rt_task(rq);
1582} 1659}
1583 1660
@@ -1596,8 +1673,7 @@ static inline void init_sched_rt_class(void)
1596 * with RT tasks. In this case we try to push them off to 1673 * with RT tasks. In this case we try to push them off to
1597 * other runqueues. 1674 * other runqueues.
1598 */ 1675 */
1599static void switched_to_rt(struct rq *rq, struct task_struct *p, 1676static void switched_to_rt(struct rq *rq, struct task_struct *p)
1600 int running)
1601{ 1677{
1602 int check_resched = 1; 1678 int check_resched = 1;
1603 1679
@@ -1608,7 +1684,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1608 * If that current running task is also an RT task 1684 * If that current running task is also an RT task
1609 * then see if we can move to another run queue. 1685 * then see if we can move to another run queue.
1610 */ 1686 */
1611 if (!running) { 1687 if (p->on_rq && rq->curr != p) {
1612#ifdef CONFIG_SMP 1688#ifdef CONFIG_SMP
1613 if (rq->rt.overloaded && push_rt_task(rq) && 1689 if (rq->rt.overloaded && push_rt_task(rq) &&
1614 /* Don't resched if we changed runqueues */ 1690 /* Don't resched if we changed runqueues */
@@ -1624,10 +1700,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1624 * Priority of the task has changed. This may cause 1700 * Priority of the task has changed. This may cause
1625 * us to initiate a push or pull. 1701 * us to initiate a push or pull.
1626 */ 1702 */
1627static void prio_changed_rt(struct rq *rq, struct task_struct *p, 1703static void
1628 int oldprio, int running) 1704prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1629{ 1705{
1630 if (running) { 1706 if (!p->on_rq)
1707 return;
1708
1709 if (rq->curr == p) {
1631#ifdef CONFIG_SMP 1710#ifdef CONFIG_SMP
1632 /* 1711 /*
1633 * If our priority decreases while running, we 1712 * If our priority decreases while running, we
@@ -1709,7 +1788,7 @@ static void set_curr_task_rt(struct rq *rq)
1709{ 1788{
1710 struct task_struct *p = rq->curr; 1789 struct task_struct *p = rq->curr;
1711 1790
1712 p->se.exec_start = rq->clock; 1791 p->se.exec_start = rq->clock_task;
1713 1792
1714 /* The running task is never eligible for pushing */ 1793 /* The running task is never eligible for pushing */
1715 dequeue_pushable_task(rq, p); 1794 dequeue_pushable_task(rq, p);
@@ -1763,10 +1842,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1763 1842
1764static void print_rt_stats(struct seq_file *m, int cpu) 1843static void print_rt_stats(struct seq_file *m, int cpu)
1765{ 1844{
1845 rt_rq_iter_t iter;
1766 struct rt_rq *rt_rq; 1846 struct rt_rq *rt_rq;
1767 1847
1768 rcu_read_lock(); 1848 rcu_read_lock();
1769 for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) 1849 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
1770 print_rt_rq(m, cpu, rt_rq); 1850 print_rt_rq(m, cpu, rt_rq);
1771 rcu_read_unlock(); 1851 rcu_read_unlock();
1772} 1852}
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 25c2f962f6fc..331e01bcd026 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
37 37
38#ifdef CONFIG_SMP 38#ifdef CONFIG_SMP
39 /* domain-specific stats */ 39 /* domain-specific stats */
40 preempt_disable(); 40 rcu_read_lock();
41 for_each_domain(cpu, sd) { 41 for_each_domain(cpu, sd) {
42 enum cpu_idle_type itype; 42 enum cpu_idle_type itype;
43 43
@@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
64 sd->ttwu_wake_remote, sd->ttwu_move_affine, 64 sd->ttwu_wake_remote, sd->ttwu_move_affine,
65 sd->ttwu_move_balance); 65 sd->ttwu_move_balance);
66 } 66 }
67 preempt_enable(); 67 rcu_read_unlock();
68#endif 68#endif
69 } 69 }
70 kfree(mask_str); 70 kfree(mask_str);
@@ -157,15 +157,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
157} 157}
158 158
159/* 159/*
160 * Called when a process is dequeued from the active array and given 160 * We are interested in knowing how long it was from the *first* time a
161 * the cpu. We should note that with the exception of interactive
162 * tasks, the expired queue will become the active queue after the active
163 * queue is empty, without explicitly dequeuing and requeuing tasks in the
164 * expired queue. (Interactive tasks may be requeued directly to the
165 * active queue, thus delaying tasks in the expired queue from running;
166 * see scheduler_tick()).
167 *
168 * Though we are interested in knowing how long it was from the *first* time a
169 * task was queued to the time that it finally hit a cpu, we call this routine 161 * task was queued to the time that it finally hit a cpu, we call this routine
170 * from dequeue_task() to account for possible rq->clock skew across cpus. The 162 * from dequeue_task() to account for possible rq->clock skew across cpus. The
171 * delta taken on each cpu would annul the skew. 163 * delta taken on each cpu would annul the skew.
@@ -203,16 +195,6 @@ static void sched_info_arrive(struct task_struct *t)
203} 195}
204 196
205/* 197/*
206 * Called when a process is queued into either the active or expired
207 * array. The time is noted and later used to determine how long we
208 * had to wait for us to reach the cpu. Since the expired queue will
209 * become the active queue after active queue is empty, without dequeuing
210 * and requeuing any tasks, we are interested in queuing to either. It
211 * is unusual but not impossible for tasks to be dequeued and immediately
212 * requeued in the same or another array: this can happen in sched_yield(),
213 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
214 * to runqueue.
215 *
216 * This function is only called from enqueue_task(), but also only updates 198 * This function is only called from enqueue_task(), but also only updates
217 * the timestamp if it is already not set. It's assumed that 199 * the timestamp if it is already not set. It's assumed that
218 * sched_info_dequeued() will clear that stamp when appropriate. 200 * sched_info_dequeued() will clear that stamp when appropriate.
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
new file mode 100644
index 000000000000..6f437632afab
--- /dev/null
+++ b/kernel/sched_stoptask.c
@@ -0,0 +1,104 @@
1/*
2 * stop-task scheduling class.
3 *
4 * The stop task is the highest priority task in the system, it preempts
5 * everything and will be preempted by nothing.
6 *
7 * See kernel/stop_machine.c
8 */
9
10#ifdef CONFIG_SMP
11static int
12select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
13{
 14	return task_cpu(p); /* stop tasks never migrate */
15}
16#endif /* CONFIG_SMP */
17
18static void
19check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
20{
21 /* we're never preempted */
22}
23
24static struct task_struct *pick_next_task_stop(struct rq *rq)
25{
26 struct task_struct *stop = rq->stop;
27
28 if (stop && stop->on_rq)
29 return stop;
30
31 return NULL;
32}
33
34static void
35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
36{
37}
38
39static void
40dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
41{
42}
43
44static void yield_task_stop(struct rq *rq)
45{
 46	BUG(); /* the stop task should never yield, it's pointless. */
47}
48
49static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
50{
51}
52
53static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
54{
55}
56
57static void set_curr_task_stop(struct rq *rq)
58{
59}
60
61static void switched_to_stop(struct rq *rq, struct task_struct *p)
62{
 63	BUG(); /* it's impossible to change to this class */
64}
65
66static void
67prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
68{
69 BUG(); /* how!?, what priority? */
70}
71
72static unsigned int
73get_rr_interval_stop(struct rq *rq, struct task_struct *task)
74{
75 return 0;
76}
77
78/*
79 * Simple, special scheduling class for the per-CPU stop tasks:
80 */
81static const struct sched_class stop_sched_class = {
82 .next = &rt_sched_class,
83
84 .enqueue_task = enqueue_task_stop,
85 .dequeue_task = dequeue_task_stop,
86 .yield_task = yield_task_stop,
87
88 .check_preempt_curr = check_preempt_curr_stop,
89
90 .pick_next_task = pick_next_task_stop,
91 .put_prev_task = put_prev_task_stop,
92
93#ifdef CONFIG_SMP
94 .select_task_rq = select_task_rq_stop,
95#endif
96
97 .set_curr_task = set_curr_task_stop,
98 .task_tick = task_tick_stop,
99
100 .get_rr_interval = get_rr_interval_stop,
101
102 .prio_changed = prio_changed_stop,
103 .switched_to = switched_to_stop,
104};
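stop_sched_class links to rt_sched_class through .next, which is how the core scheduler keeps it above every other class: pick_next_task() walks the chain from the highest class downwards until one class returns a runnable task, and since at most one stop task exists per CPU most of the hooks above can stay empty. A toy user-space model of that chain-of-classes lookup (names and callbacks are illustrative only, not the kernel's):

#include <stddef.h>
#include <stdio.h>

/* Toy model of chained scheduling classes: pick_next() asks each class
 * in priority order until one returns a task name. */
struct sched_class {
	const struct sched_class *next;
	const char *(*pick_next)(void);
};

static const char *pick_stop(void) { return NULL; }       /* usually nothing to run */
static const char *pick_rt(void)   { return "rtprio50"; } /* an RT task is runnable */

static const struct sched_class rt_class   = { .next = NULL,      .pick_next = pick_rt };
static const struct sched_class stop_class = { .next = &rt_class, .pick_next = pick_stop };

int main(void)
{
	for (const struct sched_class *c = &stop_class; c; c = c->next) {
		const char *task = c->pick_next();
		if (task) {
			printf("picked %s\n", task);
			break;
		}
	}
	return 0;
}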
diff --git a/kernel/signal.c b/kernel/signal.c
index 919562c3d6b7..415d85d6f6c6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
124 124
125static int recalc_sigpending_tsk(struct task_struct *t) 125static int recalc_sigpending_tsk(struct task_struct *t)
126{ 126{
127 if (t->signal->group_stop_count > 0 || 127 if ((t->group_stop & GROUP_STOP_PENDING) ||
128 PENDING(&t->pending, &t->blocked) || 128 PENDING(&t->pending, &t->blocked) ||
129 PENDING(&t->signal->shared_pending, &t->blocked)) { 129 PENDING(&t->signal->shared_pending, &t->blocked)) {
130 set_tsk_thread_flag(t, TIF_SIGPENDING); 130 set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -223,10 +223,87 @@ static inline void print_dropped_signal(int sig)
223 current->comm, current->pid, sig); 223 current->comm, current->pid, sig);
224} 224}
225 225
226/**
227 * task_clear_group_stop_trapping - clear group stop trapping bit
228 * @task: target task
229 *
230 * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it
231 * and wake up the ptracer. Note that we don't need any further locking.
232 * @task->siglock guarantees that @task->parent points to the ptracer.
233 *
234 * CONTEXT:
235 * Must be called with @task->sighand->siglock held.
236 */
237static void task_clear_group_stop_trapping(struct task_struct *task)
238{
239 if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) {
240 task->group_stop &= ~GROUP_STOP_TRAPPING;
241 __wake_up_sync_key(&task->parent->signal->wait_chldexit,
242 TASK_UNINTERRUPTIBLE, 1, task);
243 }
244}
245
246/**
247 * task_clear_group_stop_pending - clear pending group stop
248 * @task: target task
249 *
250 * Clear group stop states for @task.
251 *
252 * CONTEXT:
253 * Must be called with @task->sighand->siglock held.
254 */
255void task_clear_group_stop_pending(struct task_struct *task)
256{
257 task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME |
258 GROUP_STOP_DEQUEUED);
259}
260
261/**
262 * task_participate_group_stop - participate in a group stop
263 * @task: task participating in a group stop
264 *
265 * @task has GROUP_STOP_PENDING set and is participating in a group stop.
266 * Group stop states are cleared and the group stop count is consumed if
267 * %GROUP_STOP_CONSUME was set. If the consumption completes the group
268 * stop, the appropriate %SIGNAL_* flags are set.
269 *
270 * CONTEXT:
271 * Must be called with @task->sighand->siglock held.
272 *
273 * RETURNS:
274 * %true if group stop completion should be notified to the parent, %false
275 * otherwise.
276 */
277static bool task_participate_group_stop(struct task_struct *task)
278{
279 struct signal_struct *sig = task->signal;
280 bool consume = task->group_stop & GROUP_STOP_CONSUME;
281
282 WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING));
283
284 task_clear_group_stop_pending(task);
285
286 if (!consume)
287 return false;
288
289 if (!WARN_ON_ONCE(sig->group_stop_count == 0))
290 sig->group_stop_count--;
291
292 /*
293 * Tell the caller to notify completion iff we are entering into a
294 * fresh group stop. Read comment in do_signal_stop() for details.
295 */
296 if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) {
297 sig->flags = SIGNAL_STOP_STOPPED;
298 return true;
299 }
300 return false;
301}
302
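task_participate_group_stop() is the bookkeeping behind what a parent eventually observes through wait(2): when the last thread consumes the stop count, SIGNAL_STOP_STOPPED is set and the parent is notified that the whole process stopped. The user-visible side of that contract, as a minimal demo built only on standard POSIX calls:

#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();
	int status;

	if (pid == 0) {          /* child: idle until killed */
		for (;;)
			pause();
	}

	kill(pid, SIGSTOP);      /* initiate a group stop */
	waitpid(pid, &status, WUNTRACED);
	if (WIFSTOPPED(status))
		printf("child stopped by signal %d\n", WSTOPSIG(status));

	kill(pid, SIGCONT);      /* wake it up again */
	waitpid(pid, &status, WCONTINUED);
	if (WIFCONTINUED(status))
		printf("child continued\n");

	kill(pid, SIGKILL);
	waitpid(pid, &status, 0);
	return 0;
}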
226/* 303/*
227 * allocate a new signal queue record 304 * allocate a new signal queue record
228 * - this may be called without locks if and only if t == current, otherwise an 305 * - this may be called without locks if and only if t == current, otherwise an
229 * appopriate lock must be held to stop the target task from exiting 306 * appropriate lock must be held to stop the target task from exiting
230 */ 307 */
231static struct sigqueue * 308static struct sigqueue *
232__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) 309__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
@@ -375,15 +452,15 @@ int unhandled_signal(struct task_struct *tsk, int sig)
375 return !tracehook_consider_fatal_signal(tsk, sig); 452 return !tracehook_consider_fatal_signal(tsk, sig);
376} 453}
377 454
378 455/*
379/* Notify the system that a driver wants to block all signals for this 456 * Notify the system that a driver wants to block all signals for this
380 * process, and wants to be notified if any signals at all were to be 457 * process, and wants to be notified if any signals at all were to be
381 * sent/acted upon. If the notifier routine returns non-zero, then the 458 * sent/acted upon. If the notifier routine returns non-zero, then the
382 * signal will be acted upon after all. If the notifier routine returns 0, 459 * signal will be acted upon after all. If the notifier routine returns 0,
 383 * then the signal will be blocked. Only one block per process is 460 * then the signal will be blocked. Only one block per process is
384 * allowed. priv is a pointer to private data that the notifier routine 461 * allowed. priv is a pointer to private data that the notifier routine
385 * can use to determine if the signal should be blocked or not. */ 462 * can use to determine if the signal should be blocked or not.
386 463 */
387void 464void
388block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) 465block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
389{ 466{
@@ -434,9 +511,10 @@ still_pending:
434 copy_siginfo(info, &first->info); 511 copy_siginfo(info, &first->info);
435 __sigqueue_free(first); 512 __sigqueue_free(first);
436 } else { 513 } else {
437 /* Ok, it wasn't in the queue. This must be 514 /*
438 a fast-pathed signal or we must have been 515 * Ok, it wasn't in the queue. This must be
439 out of queue space. So zero out the info. 516 * a fast-pathed signal or we must have been
517 * out of queue space. So zero out the info.
440 */ 518 */
441 info->si_signo = sig; 519 info->si_signo = sig;
442 info->si_errno = 0; 520 info->si_errno = 0;
@@ -468,7 +546,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
468} 546}
469 547
470/* 548/*
471 * Dequeue a signal and return the element to the caller, which is 549 * Dequeue a signal and return the element to the caller, which is
472 * expected to free it. 550 * expected to free it.
473 * 551 *
474 * All callers have to hold the siglock. 552 * All callers have to hold the siglock.
@@ -490,7 +568,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
490 * itimers are process shared and we restart periodic 568 * itimers are process shared and we restart periodic
491 * itimers in the signal delivery path to prevent DoS 569 * itimers in the signal delivery path to prevent DoS
492 * attacks in the high resolution timer case. This is 570 * attacks in the high resolution timer case. This is
493 * compliant with the old way of self restarting 571 * compliant with the old way of self-restarting
494 * itimers, as the SIGALRM is a legacy signal and only 572 * itimers, as the SIGALRM is a legacy signal and only
495 * queued once. Changing the restart behaviour to 573 * queued once. Changing the restart behaviour to
496 * restart the timer in the signal dequeue path is 574 * restart the timer in the signal dequeue path is
@@ -526,7 +604,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
526 * is to alert stop-signal processing code when another 604 * is to alert stop-signal processing code when another
527 * processor has come along and cleared the flag. 605 * processor has come along and cleared the flag.
528 */ 606 */
529 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 607 current->group_stop |= GROUP_STOP_DEQUEUED;
530 } 608 }
531 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { 609 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
532 /* 610 /*
@@ -591,7 +669,7 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
591 if (sigisemptyset(&m)) 669 if (sigisemptyset(&m))
592 return 0; 670 return 0;
593 671
594 signandsets(&s->signal, &s->signal, mask); 672 sigandnsets(&s->signal, &s->signal, mask);
595 list_for_each_entry_safe(q, n, &s->list, list) { 673 list_for_each_entry_safe(q, n, &s->list, list) {
596 if (sigismember(mask, q->info.si_signo)) { 674 if (sigismember(mask, q->info.si_signo)) {
597 list_del_init(&q->list); 675 list_del_init(&q->list);
@@ -636,13 +714,33 @@ static inline bool si_fromuser(const struct siginfo *info)
636} 714}
637 715
638/* 716/*
717 * called with RCU read lock from check_kill_permission()
718 */
719static int kill_ok_by_cred(struct task_struct *t)
720{
721 const struct cred *cred = current_cred();
722 const struct cred *tcred = __task_cred(t);
723
724 if (cred->user->user_ns == tcred->user->user_ns &&
725 (cred->euid == tcred->suid ||
726 cred->euid == tcred->uid ||
727 cred->uid == tcred->suid ||
728 cred->uid == tcred->uid))
729 return 1;
730
731 if (ns_capable(tcred->user->user_ns, CAP_KILL))
732 return 1;
733
734 return 0;
735}
736
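kill_ok_by_cred() captures the classic kill(2) permission rule, now scoped to the sender's user namespace: the sender's euid/uid must match the target's uid/suid, or the sender needs CAP_KILL there. From user space the failure mode is simply EPERM; a quick probe using signal 0, which performs only the permission and existence check without delivering anything (PID 1 is normally owned by root, so an unprivileged caller should be refused):

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	if (kill(1, 0) == 0)
		printf("allowed to signal pid 1\n");
	else
		printf("kill(1, 0) failed: %s\n", strerror(errno));
	return 0;
}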
737/*
639 * Bad permissions for sending the signal 738 * Bad permissions for sending the signal
640 * - the caller must hold the RCU read lock 739 * - the caller must hold the RCU read lock
641 */ 740 */
642static int check_kill_permission(int sig, struct siginfo *info, 741static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 742 struct task_struct *t)
644{ 743{
645 const struct cred *cred, *tcred;
646 struct pid *sid; 744 struct pid *sid;
647 int error; 745 int error;
648 746
@@ -656,14 +754,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
656 if (error) 754 if (error)
657 return error; 755 return error;
658 756
659 cred = current_cred();
660 tcred = __task_cred(t);
661 if (!same_thread_group(current, t) && 757 if (!same_thread_group(current, t) &&
662 (cred->euid ^ tcred->suid) && 758 !kill_ok_by_cred(t)) {
663 (cred->euid ^ tcred->uid) &&
664 (cred->uid ^ tcred->suid) &&
665 (cred->uid ^ tcred->uid) &&
666 !capable(CAP_KILL)) {
667 switch (sig) { 759 switch (sig) {
668 case SIGCONT: 760 case SIGCONT:
669 sid = task_session(t); 761 sid = task_session(t);
@@ -712,34 +804,14 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
712 } else if (sig == SIGCONT) { 804 } else if (sig == SIGCONT) {
713 unsigned int why; 805 unsigned int why;
714 /* 806 /*
715 * Remove all stop signals from all queues, 807 * Remove all stop signals from all queues, wake all threads.
716 * and wake all threads.
717 */ 808 */
718 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); 809 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
719 t = p; 810 t = p;
720 do { 811 do {
721 unsigned int state; 812 task_clear_group_stop_pending(t);
722 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); 813 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
723 /* 814 wake_up_state(t, __TASK_STOPPED);
724 * If there is a handler for SIGCONT, we must make
725 * sure that no thread returns to user mode before
726 * we post the signal, in case it was the only
727 * thread eligible to run the signal handler--then
728 * it must not do anything between resuming and
729 * running the handler. With the TIF_SIGPENDING
730 * flag set, the thread will pause and acquire the
731 * siglock that we hold now and until we've queued
732 * the pending signal.
733 *
734 * Wake up the stopped thread _after_ setting
735 * TIF_SIGPENDING
736 */
737 state = __TASK_STOPPED;
738 if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) {
739 set_tsk_thread_flag(t, TIF_SIGPENDING);
740 state |= TASK_INTERRUPTIBLE;
741 }
742 wake_up_state(t, state);
743 } while_each_thread(p, t); 815 } while_each_thread(p, t);
744 816
745 /* 817 /*
@@ -765,13 +837,6 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
765 signal->flags = why | SIGNAL_STOP_CONTINUED; 837 signal->flags = why | SIGNAL_STOP_CONTINUED;
766 signal->group_stop_count = 0; 838 signal->group_stop_count = 0;
767 signal->group_exit_code = 0; 839 signal->group_exit_code = 0;
768 } else {
769 /*
770 * We are not stopped, but there could be a stop
771 * signal in the middle of being processed after
772 * being removed from the queue. Clear that too.
773 */
774 signal->flags &= ~SIGNAL_STOP_DEQUEUED;
775 } 840 }
776 } 841 }
777 842
@@ -860,6 +925,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
860 signal->group_stop_count = 0; 925 signal->group_stop_count = 0;
861 t = p; 926 t = p;
862 do { 927 do {
928 task_clear_group_stop_pending(t);
863 sigaddset(&t->pending.signal, SIGKILL); 929 sigaddset(&t->pending.signal, SIGKILL);
864 signal_wake_up(t, 1); 930 signal_wake_up(t, 1);
865 } while_each_thread(p, t); 931 } while_each_thread(p, t);
@@ -909,14 +975,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
909 if (info == SEND_SIG_FORCED) 975 if (info == SEND_SIG_FORCED)
910 goto out_set; 976 goto out_set;
911 977
912 /* Real-time signals must be queued if sent by sigqueue, or 978 /*
913 some other real-time mechanism. It is implementation 979 * Real-time signals must be queued if sent by sigqueue, or
914 defined whether kill() does so. We attempt to do so, on 980 * some other real-time mechanism. It is implementation
915 the principle of least surprise, but since kill is not 981 * defined whether kill() does so. We attempt to do so, on
916 allowed to fail with EAGAIN when low on memory we just 982 * the principle of least surprise, but since kill is not
917 make sure at least one signal gets delivered and don't 983 * allowed to fail with EAGAIN when low on memory we just
918 pass on the info struct. */ 984 * make sure at least one signal gets delivered and don't
919 985 * pass on the info struct.
986 */
920 if (sig < SIGRTMIN) 987 if (sig < SIGRTMIN)
921 override_rlimit = (is_si_special(info) || info->si_code >= 0); 988 override_rlimit = (is_si_special(info) || info->si_code >= 0);
922 else 989 else
@@ -1093,6 +1160,7 @@ int zap_other_threads(struct task_struct *p)
1093 p->signal->group_stop_count = 0; 1160 p->signal->group_stop_count = 0;
1094 1161
1095 while_each_thread(p, t) { 1162 while_each_thread(p, t) {
1163 task_clear_group_stop_pending(t);
1096 count++; 1164 count++;
1097 1165
1098 /* Don't bother with already dead threads */ 1166 /* Don't bother with already dead threads */
@@ -1105,22 +1173,30 @@ int zap_other_threads(struct task_struct *p)
1105 return count; 1173 return count;
1106} 1174}
1107 1175
1108struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1176struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1177 unsigned long *flags)
1109{ 1178{
1110 struct sighand_struct *sighand; 1179 struct sighand_struct *sighand;
1111 1180
1112 rcu_read_lock();
1113 for (;;) { 1181 for (;;) {
1182 local_irq_save(*flags);
1183 rcu_read_lock();
1114 sighand = rcu_dereference(tsk->sighand); 1184 sighand = rcu_dereference(tsk->sighand);
1115 if (unlikely(sighand == NULL)) 1185 if (unlikely(sighand == NULL)) {
1186 rcu_read_unlock();
1187 local_irq_restore(*flags);
1116 break; 1188 break;
1189 }
1117 1190
1118 spin_lock_irqsave(&sighand->siglock, *flags); 1191 spin_lock(&sighand->siglock);
1119 if (likely(sighand == tsk->sighand)) 1192 if (likely(sighand == tsk->sighand)) {
1193 rcu_read_unlock();
1120 break; 1194 break;
1121 spin_unlock_irqrestore(&sighand->siglock, *flags); 1195 }
1196 spin_unlock(&sighand->siglock);
1197 rcu_read_unlock();
1198 local_irq_restore(*flags);
1122 } 1199 }
1123 rcu_read_unlock();
1124 1200
1125 return sighand; 1201 return sighand;
1126} 1202}
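The reworked __lock_task_sighand() is a lookup/lock/re-check loop: disable interrupts, enter the RCU read side just long enough to dereference ->sighand, take the spinlock, and retry if the pointer changed underneath. A rough pthread analogue of the re-check-under-lock part only (purely illustrative: without RCU or another grace-period scheme the object could be freed right after the load, which is precisely the lifetime problem rcu_read_lock() solves above):

#include <pthread.h>
#include <stddef.h>

struct shared { pthread_mutex_t lock; };

static struct shared *volatile current_obj;   /* may be swapped by other threads */

/* Return the locked object, or NULL if there is none. Retry if the pointer
 * was replaced between reading it and acquiring its lock. */
static struct shared *lock_current(void)
{
	for (;;) {
		struct shared *s = current_obj;

		if (s == NULL)
			return NULL;
		pthread_mutex_lock(&s->lock);
		if (s == current_obj)
			return s;                /* still the live object */
		pthread_mutex_unlock(&s->lock);  /* raced with a swap, retry */
	}
}

int main(void)
{
	static struct shared obj = { .lock = PTHREAD_MUTEX_INITIALIZER };
	struct shared *s;

	current_obj = &obj;
	s = lock_current();
	if (s)
		pthread_mutex_unlock(&s->lock);
	return 0;
}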
@@ -1186,8 +1262,7 @@ retry:
1186 return error; 1262 return error;
1187} 1263}
1188 1264
1189int 1265int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1190kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1191{ 1266{
1192 int error; 1267 int error;
1193 rcu_read_lock(); 1268 rcu_read_lock();
@@ -1284,8 +1359,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1284 * These are for backward compatibility with the rest of the kernel source. 1359 * These are for backward compatibility with the rest of the kernel source.
1285 */ 1360 */
1286 1361
1287int 1362int send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1288send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1289{ 1363{
1290 /* 1364 /*
1291 * Make sure legacy kernel users don't send in bad values 1365 * Make sure legacy kernel users don't send in bad values
@@ -1353,7 +1427,7 @@ EXPORT_SYMBOL(kill_pid);
1353 * These functions support sending signals using preallocated sigqueue 1427 * These functions support sending signals using preallocated sigqueue
1354 * structures. This is needed "because realtime applications cannot 1428 * structures. This is needed "because realtime applications cannot
1355 * afford to lose notifications of asynchronous events, like timer 1429 * afford to lose notifications of asynchronous events, like timer
1356 * expirations or I/O completions". In the case of Posix Timers 1430 * expirations or I/O completions". In the case of POSIX Timers
1357 * we allocate the sigqueue structure from the timer_create. If this 1431 * we allocate the sigqueue structure from the timer_create. If this
1358 * allocation fails we are able to report the failure to the application 1432 * allocation fails we are able to report the failure to the application
1359 * with an EAGAIN error. 1433 * with an EAGAIN error.
@@ -1521,16 +1595,30 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1521 return ret; 1595 return ret;
1522} 1596}
1523 1597
1524static void do_notify_parent_cldstop(struct task_struct *tsk, int why) 1598/**
1599 * do_notify_parent_cldstop - notify parent of stopped/continued state change
1600 * @tsk: task reporting the state change
1601 * @for_ptracer: the notification is for ptracer
1602 * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
1603 *
1604 * Notify @tsk's parent that the stopped/continued state has changed. If
 1605 * @for_ptracer is %false, @tsk's group leader notifies its real parent.
1606 * If %true, @tsk reports to @tsk->parent which should be the ptracer.
1607 *
1608 * CONTEXT:
1609 * Must be called with tasklist_lock at least read locked.
1610 */
1611static void do_notify_parent_cldstop(struct task_struct *tsk,
1612 bool for_ptracer, int why)
1525{ 1613{
1526 struct siginfo info; 1614 struct siginfo info;
1527 unsigned long flags; 1615 unsigned long flags;
1528 struct task_struct *parent; 1616 struct task_struct *parent;
1529 struct sighand_struct *sighand; 1617 struct sighand_struct *sighand;
1530 1618
1531 if (task_ptrace(tsk)) 1619 if (for_ptracer) {
1532 parent = tsk->parent; 1620 parent = tsk->parent;
1533 else { 1621 } else {
1534 tsk = tsk->group_leader; 1622 tsk = tsk->group_leader;
1535 parent = tsk->real_parent; 1623 parent = tsk->real_parent;
1536 } 1624 }
@@ -1538,7 +1626,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1538 info.si_signo = SIGCHLD; 1626 info.si_signo = SIGCHLD;
1539 info.si_errno = 0; 1627 info.si_errno = 0;
1540 /* 1628 /*
1541 * see comment in do_notify_parent() abot the following 3 lines 1629 * see comment in do_notify_parent() about the following 4 lines
1542 */ 1630 */
1543 rcu_read_lock(); 1631 rcu_read_lock();
1544 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1632 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
@@ -1596,7 +1684,7 @@ static inline int may_ptrace_stop(void)
1596} 1684}
1597 1685
1598/* 1686/*
1599 * Return nonzero if there is a SIGKILL that should be waking us up. 1687 * Return non-zero if there is a SIGKILL that should be waking us up.
1600 * Called with the siglock held. 1688 * Called with the siglock held.
1601 */ 1689 */
1602static int sigkill_pending(struct task_struct *tsk) 1690static int sigkill_pending(struct task_struct *tsk)
@@ -1606,6 +1694,15 @@ static int sigkill_pending(struct task_struct *tsk)
1606} 1694}
1607 1695
1608/* 1696/*
1697 * Test whether the target task of the usual cldstop notification - the
1698 * real_parent of @child - is in the same group as the ptracer.
1699 */
1700static bool real_parent_is_ptracer(struct task_struct *child)
1701{
1702 return same_thread_group(child->parent, child->real_parent);
1703}
1704
1705/*
1609 * This must be called with current->sighand->siglock held. 1706 * This must be called with current->sighand->siglock held.
1610 * 1707 *
1611 * This should be the path for all ptrace stops. 1708 * This should be the path for all ptrace stops.
@@ -1616,8 +1713,12 @@ static int sigkill_pending(struct task_struct *tsk)
1616 * If we actually decide not to stop at all because the tracer 1713 * If we actually decide not to stop at all because the tracer
1617 * is gone, we keep current->exit_code unless clear_code. 1714 * is gone, we keep current->exit_code unless clear_code.
1618 */ 1715 */
1619static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) 1716static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1717 __releases(&current->sighand->siglock)
1718 __acquires(&current->sighand->siglock)
1620{ 1719{
1720 bool gstop_done = false;
1721
1621 if (arch_ptrace_stop_needed(exit_code, info)) { 1722 if (arch_ptrace_stop_needed(exit_code, info)) {
1622 /* 1723 /*
1623 * The arch code has something special to do before a 1724 * The arch code has something special to do before a
@@ -1638,21 +1739,49 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1638 } 1739 }
1639 1740
1640 /* 1741 /*
1641 * If there is a group stop in progress, 1742 * If @why is CLD_STOPPED, we're trapping to participate in a group
1642 * we must participate in the bookkeeping. 1743 * stop. Do the bookkeeping. Note that if SIGCONT was delievered
1744 * while siglock was released for the arch hook, PENDING could be
1745 * clear now. We act as if SIGCONT is received after TASK_TRACED
1746 * is entered - ignore it.
1643 */ 1747 */
1644 if (current->signal->group_stop_count > 0) 1748 if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING))
1645 --current->signal->group_stop_count; 1749 gstop_done = task_participate_group_stop(current);
1646 1750
1647 current->last_siginfo = info; 1751 current->last_siginfo = info;
1648 current->exit_code = exit_code; 1752 current->exit_code = exit_code;
1649 1753
1650 /* Let the debugger run. */ 1754 /*
1651 __set_current_state(TASK_TRACED); 1755 * TRACED should be visible before TRAPPING is cleared; otherwise,
1756 * the tracer might fail do_wait().
1757 */
1758 set_current_state(TASK_TRACED);
1759
1760 /*
1761 * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and
1762 * transition to TASK_TRACED should be atomic with respect to
 1763 * siglock. This should be done after the arch hook as siglock is
1764 * released and regrabbed across it.
1765 */
1766 task_clear_group_stop_trapping(current);
1767
1652 spin_unlock_irq(&current->sighand->siglock); 1768 spin_unlock_irq(&current->sighand->siglock);
1653 read_lock(&tasklist_lock); 1769 read_lock(&tasklist_lock);
1654 if (may_ptrace_stop()) { 1770 if (may_ptrace_stop()) {
1655 do_notify_parent_cldstop(current, CLD_TRAPPED); 1771 /*
1772 * Notify parents of the stop.
1773 *
1774 * While ptraced, there are two parents - the ptracer and
1775 * the real_parent of the group_leader. The ptracer should
1776 * know about every stop while the real parent is only
1777 * interested in the completion of group stop. The states
1778 * for the two don't interact with each other. Notify
1779 * separately unless they're gonna be duplicates.
1780 */
1781 do_notify_parent_cldstop(current, true, why);
1782 if (gstop_done && !real_parent_is_ptracer(current))
1783 do_notify_parent_cldstop(current, false, why);
1784
1656 /* 1785 /*
1657 * Don't want to allow preemption here, because 1786 * Don't want to allow preemption here, because
1658 * sys_ptrace() needs this task to be inactive. 1787 * sys_ptrace() needs this task to be inactive.
@@ -1667,7 +1796,16 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1667 /* 1796 /*
1668 * By the time we got the lock, our tracer went away. 1797 * By the time we got the lock, our tracer went away.
1669 * Don't drop the lock yet, another tracer may come. 1798 * Don't drop the lock yet, another tracer may come.
1799 *
1800 * If @gstop_done, the ptracer went away between group stop
1801 * completion and here. During detach, it would have set
1802 * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED
1803 * in do_signal_stop() on return, so notifying the real
1804 * parent of the group stop completion is enough.
1670 */ 1805 */
1806 if (gstop_done)
1807 do_notify_parent_cldstop(current, false, why);
1808
1671 __set_current_state(TASK_RUNNING); 1809 __set_current_state(TASK_RUNNING);
1672 if (clear_code) 1810 if (clear_code)
1673 current->exit_code = 0; 1811 current->exit_code = 0;
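With this rework the two audiences stay separate: the ptracer is told about every trap via do_notify_parent_cldstop(..., true, why), while the real parent only hears about group-stop completion. On the tracer's side the whole exchange is still just ptrace(2) plus waitpid(); a minimal sketch of attach, observe the stop, and detach:

#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();
	int status;

	if (pid == 0) {                 /* tracee: idle until killed */
		for (;;)
			sleep(1);
	}

	ptrace(PTRACE_ATTACH, pid, NULL, NULL);  /* delivers a SIGSTOP to the tracee */
	waitpid(pid, &status, 0);                /* the tracer observes the stop */
	if (WIFSTOPPED(status))
		printf("tracee stopped by signal %d, detaching\n", WSTOPSIG(status));

	ptrace(PTRACE_DETACH, pid, NULL, NULL);  /* let it run again */
	kill(pid, SIGKILL);
	waitpid(pid, &status, 0);
	return 0;
}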
@@ -1711,79 +1849,128 @@ void ptrace_notify(int exit_code)
1711 1849
1712 /* Let the debugger run. */ 1850 /* Let the debugger run. */
1713 spin_lock_irq(&current->sighand->siglock); 1851 spin_lock_irq(&current->sighand->siglock);
1714 ptrace_stop(exit_code, 1, &info); 1852 ptrace_stop(exit_code, CLD_TRAPPED, 1, &info);
1715 spin_unlock_irq(&current->sighand->siglock); 1853 spin_unlock_irq(&current->sighand->siglock);
1716} 1854}
1717 1855
1718/* 1856/*
1719 * This performs the stopping for SIGSTOP and other stop signals. 1857 * This performs the stopping for SIGSTOP and other stop signals.
1720 * We have to stop all threads in the thread group. 1858 * We have to stop all threads in the thread group.
1721 * Returns nonzero if we've actually stopped and released the siglock. 1859 * Returns non-zero if we've actually stopped and released the siglock.
1722 * Returns zero if we didn't stop and still hold the siglock. 1860 * Returns zero if we didn't stop and still hold the siglock.
1723 */ 1861 */
1724static int do_signal_stop(int signr) 1862static int do_signal_stop(int signr)
1725{ 1863{
1726 struct signal_struct *sig = current->signal; 1864 struct signal_struct *sig = current->signal;
1727 int notify;
1728 1865
1729 if (!sig->group_stop_count) { 1866 if (!(current->group_stop & GROUP_STOP_PENDING)) {
1867 unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME;
1730 struct task_struct *t; 1868 struct task_struct *t;
1731 1869
1732 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || 1870 /* signr will be recorded in task->group_stop for retries */
1871 WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK);
1872
1873 if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) ||
1733 unlikely(signal_group_exit(sig))) 1874 unlikely(signal_group_exit(sig)))
1734 return 0; 1875 return 0;
1735 /* 1876 /*
1736 * There is no group stop already in progress. 1877 * There is no group stop already in progress. We must
1737 * We must initiate one now. 1878 * initiate one now.
1879 *
1880 * While ptraced, a task may be resumed while group stop is
1881 * still in effect and then receive a stop signal and
1882 * initiate another group stop. This deviates from the
1883 * usual behavior as two consecutive stop signals can't
1884 * cause two group stops when !ptraced. That is why we
1885 * also check !task_is_stopped(t) below.
1886 *
1887 * The condition can be distinguished by testing whether
1888 * SIGNAL_STOP_STOPPED is already set. Don't generate
1889 * group_exit_code in such case.
1890 *
1891 * This is not necessary for SIGNAL_STOP_CONTINUED because
1892 * an intervening stop signal is required to cause two
1893 * continued events regardless of ptrace.
1738 */ 1894 */
1739 sig->group_exit_code = signr; 1895 if (!(sig->flags & SIGNAL_STOP_STOPPED))
1896 sig->group_exit_code = signr;
1897 else
1898 WARN_ON_ONCE(!task_ptrace(current));
1740 1899
1900 current->group_stop &= ~GROUP_STOP_SIGMASK;
1901 current->group_stop |= signr | gstop;
1741 sig->group_stop_count = 1; 1902 sig->group_stop_count = 1;
1742 for (t = next_thread(current); t != current; t = next_thread(t)) 1903 for (t = next_thread(current); t != current;
1904 t = next_thread(t)) {
1905 t->group_stop &= ~GROUP_STOP_SIGMASK;
1743 /* 1906 /*
1744 * Setting state to TASK_STOPPED for a group 1907 * Setting state to TASK_STOPPED for a group
1745 * stop is always done with the siglock held, 1908 * stop is always done with the siglock held,
1746 * so this check has no races. 1909 * so this check has no races.
1747 */ 1910 */
1748 if (!(t->flags & PF_EXITING) && 1911 if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) {
1749 !task_is_stopped_or_traced(t)) { 1912 t->group_stop |= signr | gstop;
1750 sig->group_stop_count++; 1913 sig->group_stop_count++;
1751 signal_wake_up(t, 0); 1914 signal_wake_up(t, 0);
1752 } 1915 }
1916 }
1753 } 1917 }
1754 /* 1918retry:
1755 * If there are no other threads in the group, or if there is 1919 if (likely(!task_ptrace(current))) {
1756 * a group stop in progress and we are the last to stop, report 1920 int notify = 0;
1757 * to the parent. When ptraced, every thread reports itself. 1921
1758 */ 1922 /*
1759 notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; 1923 * If there are no other threads in the group, or if there
1760 notify = tracehook_notify_jctl(notify, CLD_STOPPED); 1924 * is a group stop in progress and we are the last to stop,
1761 /* 1925 * report to the parent.
1762 * tracehook_notify_jctl() can drop and reacquire siglock, so 1926 */
1763 * we keep ->group_stop_count != 0 before the call. If SIGCONT 1927 if (task_participate_group_stop(current))
1764 * or SIGKILL comes in between ->group_stop_count == 0. 1928 notify = CLD_STOPPED;
1765 */ 1929
1766 if (sig->group_stop_count) {
1767 if (!--sig->group_stop_count)
1768 sig->flags = SIGNAL_STOP_STOPPED;
1769 current->exit_code = sig->group_exit_code;
1770 __set_current_state(TASK_STOPPED); 1930 __set_current_state(TASK_STOPPED);
1931 spin_unlock_irq(&current->sighand->siglock);
1932
1933 /*
1934 * Notify the parent of the group stop completion. Because
1935 * we're not holding either the siglock or tasklist_lock
 1936 * here, ptracer may attach in between; however, this is for
1937 * group stop and should always be delivered to the real
1938 * parent of the group leader. The new ptracer will get
1939 * its notification when this task transitions into
1940 * TASK_TRACED.
1941 */
1942 if (notify) {
1943 read_lock(&tasklist_lock);
1944 do_notify_parent_cldstop(current, false, notify);
1945 read_unlock(&tasklist_lock);
1946 }
1947
1948 /* Now we don't run again until woken by SIGCONT or SIGKILL */
1949 schedule();
1950
1951 spin_lock_irq(&current->sighand->siglock);
1952 } else {
1953 ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK,
1954 CLD_STOPPED, 0, NULL);
1955 current->exit_code = 0;
1771 } 1956 }
1772 spin_unlock_irq(&current->sighand->siglock);
1773 1957
1774 if (notify) { 1958 /*
1775 read_lock(&tasklist_lock); 1959 * GROUP_STOP_PENDING could be set if another group stop has
1776 do_notify_parent_cldstop(current, notify); 1960 * started since being woken up or ptrace wants us to transit
1777 read_unlock(&tasklist_lock); 1961 * between TASK_STOPPED and TRACED. Retry group stop.
1962 */
1963 if (current->group_stop & GROUP_STOP_PENDING) {
1964 WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK));
1965 goto retry;
1778 } 1966 }
1779 1967
1780 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 1968 /* PTRACE_ATTACH might have raced with task killing, clear trapping */
1781 do { 1969 task_clear_group_stop_trapping(current);
1782 schedule(); 1970
1783 } while (try_to_freeze()); 1971 spin_unlock_irq(&current->sighand->siglock);
1784 1972
1785 tracehook_finish_jctl(); 1973 tracehook_finish_jctl();
1786 current->exit_code = 0;
1787 1974
1788 return 1; 1975 return 1;
1789} 1976}
@@ -1797,7 +1984,7 @@ static int ptrace_signal(int signr, siginfo_t *info,
1797 ptrace_signal_deliver(regs, cookie); 1984 ptrace_signal_deliver(regs, cookie);
1798 1985
1799 /* Let the debugger run. */ 1986 /* Let the debugger run. */
1800 ptrace_stop(signr, 0, info); 1987 ptrace_stop(signr, CLD_TRAPPED, 0, info);
1801 1988
1802 /* We're back. Did the debugger cancel the sig? */ 1989 /* We're back. Did the debugger cancel the sig? */
1803 signr = current->exit_code; 1990 signr = current->exit_code;
@@ -1806,10 +1993,12 @@ static int ptrace_signal(int signr, siginfo_t *info,
1806 1993
1807 current->exit_code = 0; 1994 current->exit_code = 0;
1808 1995
1809 /* Update the siginfo structure if the signal has 1996 /*
1810 changed. If the debugger wanted something 1997 * Update the siginfo structure if the signal has
1811 specific in the siginfo structure then it should 1998 * changed. If the debugger wanted something
1812 have updated *info via PTRACE_SETSIGINFO. */ 1999 * specific in the siginfo structure then it should
2000 * have updated *info via PTRACE_SETSIGINFO.
2001 */
1813 if (signr != info->si_signo) { 2002 if (signr != info->si_signo) {
1814 info->si_signo = signr; 2003 info->si_signo = signr;
1815 info->si_errno = 0; 2004 info->si_errno = 0;
@@ -1850,25 +2039,43 @@ relock:
1850 * the CLD_ si_code into SIGNAL_CLD_MASK bits. 2039 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
1851 */ 2040 */
1852 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { 2041 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
1853 int why = (signal->flags & SIGNAL_STOP_CONTINUED) 2042 struct task_struct *leader;
1854 ? CLD_CONTINUED : CLD_STOPPED; 2043 int why;
2044
2045 if (signal->flags & SIGNAL_CLD_CONTINUED)
2046 why = CLD_CONTINUED;
2047 else
2048 why = CLD_STOPPED;
2049
1855 signal->flags &= ~SIGNAL_CLD_MASK; 2050 signal->flags &= ~SIGNAL_CLD_MASK;
1856 2051
1857 why = tracehook_notify_jctl(why, CLD_CONTINUED);
1858 spin_unlock_irq(&sighand->siglock); 2052 spin_unlock_irq(&sighand->siglock);
1859 2053
1860 if (why) { 2054 /*
1861 read_lock(&tasklist_lock); 2055 * Notify the parent that we're continuing. This event is
1862 do_notify_parent_cldstop(current->group_leader, why); 2056 * always per-process and doesn't make whole lot of sense
1863 read_unlock(&tasklist_lock); 2057 * for ptracers, who shouldn't consume the state via
1864 } 2058 * wait(2) either, but, for backward compatibility, notify
2059 * the ptracer of the group leader too unless it's gonna be
2060 * a duplicate.
2061 */
2062 read_lock(&tasklist_lock);
2063
2064 do_notify_parent_cldstop(current, false, why);
2065
2066 leader = current->group_leader;
2067 if (task_ptrace(leader) && !real_parent_is_ptracer(leader))
2068 do_notify_parent_cldstop(leader, true, why);
2069
2070 read_unlock(&tasklist_lock);
2071
1865 goto relock; 2072 goto relock;
1866 } 2073 }
1867 2074
1868 for (;;) { 2075 for (;;) {
1869 struct k_sigaction *ka; 2076 struct k_sigaction *ka;
1870 /* 2077 /*
1871 * Tracing can induce an artifical signal and choose sigaction. 2078 * Tracing can induce an artificial signal and choose sigaction.
1872 * The return value in @signr determines the default action, 2079 * The return value in @signr determines the default action,
1873 * but @info->si_signo is the signal number we will report. 2080 * but @info->si_signo is the signal number we will report.
1874 */ 2081 */
@@ -1878,8 +2085,8 @@ relock:
1878 if (unlikely(signr != 0)) 2085 if (unlikely(signr != 0))
1879 ka = return_ka; 2086 ka = return_ka;
1880 else { 2087 else {
1881 if (unlikely(signal->group_stop_count > 0) && 2088 if (unlikely(current->group_stop &
1882 do_signal_stop(0)) 2089 GROUP_STOP_PENDING) && do_signal_stop(0))
1883 goto relock; 2090 goto relock;
1884 2091
1885 signr = dequeue_signal(current, &current->blocked, 2092 signr = dequeue_signal(current, &current->blocked,
@@ -1998,10 +2205,42 @@ relock:
1998 return signr; 2205 return signr;
1999} 2206}
2000 2207
2208/*
2209 * It could be that complete_signal() picked us to notify about the
2210 * group-wide signal. Other threads should be notified now to take
2211 * the shared signals in @which since we will not.
2212 */
2213static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
2214{
2215 sigset_t retarget;
2216 struct task_struct *t;
2217
2218 sigandsets(&retarget, &tsk->signal->shared_pending.signal, which);
2219 if (sigisemptyset(&retarget))
2220 return;
2221
2222 t = tsk;
2223 while_each_thread(tsk, t) {
2224 if (t->flags & PF_EXITING)
2225 continue;
2226
2227 if (!has_pending_signals(&retarget, &t->blocked))
2228 continue;
2229 /* Remove the signals this thread can handle. */
2230 sigandsets(&retarget, &retarget, &t->blocked);
2231
2232 if (!signal_pending(t))
2233 signal_wake_up(t, 0);
2234
2235 if (sigisemptyset(&retarget))
2236 break;
2237 }
2238}
2239
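
retarget_shared_pending() preserves the invariant that a process-directed signal sitting in shared_pending is eventually taken by some thread that has it unblocked, even if the thread originally picked by complete_signal() blocks it or exits. A hypothetical userspace demonstration of that invariant (pthreads; illustrative sketch, not part of the patch):

#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void handler(int sig)
{
        const char msg[] = "signal taken by an unblocked thread\n";

        /* write() is async-signal-safe, good enough for a demo */
        write(STDOUT_FILENO, msg, sizeof(msg) - 1);
}

static void *blocker(void *arg)
{
        sigset_t set;

        /* This thread blocks SIGUSR1; delivery must go to another thread. */
        sigemptyset(&set);
        sigaddset(&set, SIGUSR1);
        pthread_sigmask(SIG_BLOCK, &set, NULL);
        sleep(3);
        return NULL;
}

int main(void)
{
        pthread_t t;

        signal(SIGUSR1, handler);
        pthread_create(&t, NULL, blocker, NULL);
        sleep(1);

        /* Process-directed signal: handled by a thread with it unblocked. */
        kill(getpid(), SIGUSR1);

        pthread_join(t, NULL);
        return 0;
}
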
2001void exit_signals(struct task_struct *tsk) 2240void exit_signals(struct task_struct *tsk)
2002{ 2241{
2003 int group_stop = 0; 2242 int group_stop = 0;
2004 struct task_struct *t; 2243 sigset_t unblocked;
2005 2244
2006 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { 2245 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
2007 tsk->flags |= PF_EXITING; 2246 tsk->flags |= PF_EXITING;
@@ -2017,25 +2256,23 @@ void exit_signals(struct task_struct *tsk)
2017 if (!signal_pending(tsk)) 2256 if (!signal_pending(tsk))
2018 goto out; 2257 goto out;
2019 2258
2020 /* It could be that __group_complete_signal() choose us to 2259 unblocked = tsk->blocked;
2021 * notify about group-wide signal. Another thread should be 2260 signotset(&unblocked);
2022 * woken now to take the signal since we will not. 2261 retarget_shared_pending(tsk, &unblocked);
2023 */
2024 for (t = tsk; (t = next_thread(t)) != tsk; )
2025 if (!signal_pending(t) && !(t->flags & PF_EXITING))
2026 recalc_sigpending_and_wake(t);
2027 2262
2028 if (unlikely(tsk->signal->group_stop_count) && 2263 if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) &&
2029 !--tsk->signal->group_stop_count) { 2264 task_participate_group_stop(tsk))
2030 tsk->signal->flags = SIGNAL_STOP_STOPPED; 2265 group_stop = CLD_STOPPED;
2031 group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
2032 }
2033out: 2266out:
2034 spin_unlock_irq(&tsk->sighand->siglock); 2267 spin_unlock_irq(&tsk->sighand->siglock);
2035 2268
2269 /*
2270 * If group stop has completed, deliver the notification. This
2271 * should always go to the real parent of the group leader.
2272 */
2036 if (unlikely(group_stop)) { 2273 if (unlikely(group_stop)) {
2037 read_lock(&tasklist_lock); 2274 read_lock(&tasklist_lock);
2038 do_notify_parent_cldstop(tsk, group_stop); 2275 do_notify_parent_cldstop(tsk, false, group_stop);
2039 read_unlock(&tasklist_lock); 2276 read_unlock(&tasklist_lock);
2040 } 2277 }
2041} 2278}
@@ -2055,6 +2292,9 @@ EXPORT_SYMBOL(unblock_all_signals);
2055 * System call entry points. 2292 * System call entry points.
2056 */ 2293 */
2057 2294
2295/**
2296 * sys_restart_syscall - restart a system call
2297 */
2058SYSCALL_DEFINE0(restart_syscall) 2298SYSCALL_DEFINE0(restart_syscall)
2059{ 2299{
2060 struct restart_block *restart = &current_thread_info()->restart_block; 2300 struct restart_block *restart = &current_thread_info()->restart_block;
@@ -2066,11 +2306,33 @@ long do_no_restart_syscall(struct restart_block *param)
2066 return -EINTR; 2306 return -EINTR;
2067} 2307}
2068 2308
2069/* 2309static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
2070 * We don't need to get the kernel lock - this is all local to this 2310{
2071 * particular thread.. (and that's good, because this is _heavily_ 2311 if (signal_pending(tsk) && !thread_group_empty(tsk)) {
2072 * used by various programs) 2312 sigset_t newblocked;
2313 /* A set of now blocked but previously unblocked signals. */
2314 sigandnsets(&newblocked, newset, &current->blocked);
2315 retarget_shared_pending(tsk, &newblocked);
2316 }
2317 tsk->blocked = *newset;
2318 recalc_sigpending();
2319}
2320
2321/**
2322 * set_current_blocked - change current->blocked mask
2323 * @newset: new mask
2324 *
2325 * It is wrong to change ->blocked directly, this helper should be used
2326 * to ensure the process can't miss a shared signal we are going to block.
2073 */ 2327 */
2328void set_current_blocked(const sigset_t *newset)
2329{
2330 struct task_struct *tsk = current;
2331
2332 spin_lock_irq(&tsk->sighand->siglock);
2333 __set_task_blocked(tsk, newset);
2334 spin_unlock_irq(&tsk->sighand->siglock);
2335}
2074 2336
2075/* 2337/*
2076 * This is also useful for kernel threads that want to temporarily 2338 * This is also useful for kernel threads that want to temporarily
@@ -2082,66 +2344,66 @@ long do_no_restart_syscall(struct restart_block *param)
2082 */ 2344 */
2083int sigprocmask(int how, sigset_t *set, sigset_t *oldset) 2345int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2084{ 2346{
2085 int error; 2347 struct task_struct *tsk = current;
2348 sigset_t newset;
2086 2349
2087 spin_lock_irq(&current->sighand->siglock); 2350 /* Lockless, only current can change ->blocked, never from irq */
2088 if (oldset) 2351 if (oldset)
2089 *oldset = current->blocked; 2352 *oldset = tsk->blocked;
2090 2353
2091 error = 0;
2092 switch (how) { 2354 switch (how) {
2093 case SIG_BLOCK: 2355 case SIG_BLOCK:
2094 sigorsets(&current->blocked, &current->blocked, set); 2356 sigorsets(&newset, &tsk->blocked, set);
2095 break; 2357 break;
2096 case SIG_UNBLOCK: 2358 case SIG_UNBLOCK:
2097 signandsets(&current->blocked, &current->blocked, set); 2359 sigandnsets(&newset, &tsk->blocked, set);
2098 break; 2360 break;
2099 case SIG_SETMASK: 2361 case SIG_SETMASK:
2100 current->blocked = *set; 2362 newset = *set;
2101 break; 2363 break;
2102 default: 2364 default:
2103 error = -EINVAL; 2365 return -EINVAL;
2104 } 2366 }
2105 recalc_sigpending();
2106 spin_unlock_irq(&current->sighand->siglock);
2107 2367
2108 return error; 2368 set_current_blocked(&newset);
2369 return 0;
2109} 2370}
2110 2371
2111SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, 2372/**
2373 * sys_rt_sigprocmask - change the list of currently blocked signals
2374 * @how: whether to add, remove, or set signals
2375 * @nset: stores pending signals
2376 * @oset: previous value of signal mask if non-null
2377 * @sigsetsize: size of sigset_t type
2378 */
2379SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
2112 sigset_t __user *, oset, size_t, sigsetsize) 2380 sigset_t __user *, oset, size_t, sigsetsize)
2113{ 2381{
2114 int error = -EINVAL;
2115 sigset_t old_set, new_set; 2382 sigset_t old_set, new_set;
2383 int error;
2116 2384
2117 /* XXX: Don't preclude handling different sized sigset_t's. */ 2385 /* XXX: Don't preclude handling different sized sigset_t's. */
2118 if (sigsetsize != sizeof(sigset_t)) 2386 if (sigsetsize != sizeof(sigset_t))
2119 goto out; 2387 return -EINVAL;
2120 2388
2121 if (set) { 2389 old_set = current->blocked;
2122 error = -EFAULT; 2390
2123 if (copy_from_user(&new_set, set, sizeof(*set))) 2391 if (nset) {
2124 goto out; 2392 if (copy_from_user(&new_set, nset, sizeof(sigset_t)))
2393 return -EFAULT;
2125 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); 2394 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
2126 2395
2127 error = sigprocmask(how, &new_set, &old_set); 2396 error = sigprocmask(how, &new_set, NULL);
2128 if (error) 2397 if (error)
2129 goto out; 2398 return error;
2130 if (oset) 2399 }
2131 goto set_old;
2132 } else if (oset) {
2133 spin_lock_irq(&current->sighand->siglock);
2134 old_set = current->blocked;
2135 spin_unlock_irq(&current->sighand->siglock);
2136 2400
2137 set_old: 2401 if (oset) {
2138 error = -EFAULT; 2402 if (copy_to_user(oset, &old_set, sizeof(sigset_t)))
2139 if (copy_to_user(oset, &old_set, sizeof(*oset))) 2403 return -EFAULT;
2140 goto out;
2141 } 2404 }
2142 error = 0; 2405
2143out: 2406 return 0;
2144 return error;
2145} 2407}
2146 2408
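
From userspace the sigprocmask() rework is invisible; the contract is unchanged: a signal raised while blocked is held pending and delivered once the mask is restored. A minimal check of that behaviour (illustrative sketch, not from the patch):

#include <signal.h>
#include <stdio.h>

static volatile sig_atomic_t got;

static void handler(int sig)
{
        got = sig;
}

int main(void)
{
        sigset_t block, old;

        signal(SIGUSR1, handler);

        sigemptyset(&block);
        sigaddset(&block, SIGUSR1);
        sigprocmask(SIG_BLOCK, &block, &old);

        raise(SIGUSR1);                         /* stays pending, not delivered */
        printf("after raise,   got=%d (expect 0)\n", (int)got);

        sigprocmask(SIG_SETMASK, &old, NULL);   /* delivery happens here */
        printf("after unblock, got=%d (expect %d)\n", (int)got, SIGUSR1);
        return 0;
}
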
2147long do_sigpending(void __user *set, unsigned long sigsetsize) 2409long do_sigpending(void __user *set, unsigned long sigsetsize)
@@ -2166,8 +2428,14 @@ long do_sigpending(void __user *set, unsigned long sigsetsize)
2166 2428
2167out: 2429out:
2168 return error; 2430 return error;
2169} 2431}
2170 2432
2433/**
2434 * sys_rt_sigpending - examine a pending signal that has been raised
2435 * while blocked
2436 * @set: stores pending signals
2437 * @sigsetsize: size of sigset_t type or larger
2438 */
2171SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) 2439SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize)
2172{ 2440{
2173 return do_sigpending(set, sigsetsize); 2441 return do_sigpending(set, sigsetsize);
@@ -2216,9 +2484,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2216 err |= __put_user(from->si_trapno, &to->si_trapno); 2484 err |= __put_user(from->si_trapno, &to->si_trapno);
2217#endif 2485#endif
2218#ifdef BUS_MCEERR_AO 2486#ifdef BUS_MCEERR_AO
2219 /* 2487 /*
2220 * Other callers might not initialize the si_lsb field, 2488 * Other callers might not initialize the si_lsb field,
2221 * so check explicitely for the right codes here. 2489 * so check explicitly for the right codes here.
2222 */ 2490 */
2223 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) 2491 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
2224 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); 2492 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
@@ -2247,15 +2515,82 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2247 2515
2248#endif 2516#endif
2249 2517
2518/**
2519 * do_sigtimedwait - wait for queued signals specified in @which
2520 * @which: queued signals to wait for
2521 * @info: if non-null, the signal's siginfo is returned here
2522 * @ts: upper bound on process time suspension
2523 */
2524int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2525 const struct timespec *ts)
2526{
2527 struct task_struct *tsk = current;
2528 long timeout = MAX_SCHEDULE_TIMEOUT;
2529 sigset_t mask = *which;
2530 int sig;
2531
2532 if (ts) {
2533 if (!timespec_valid(ts))
2534 return -EINVAL;
2535 timeout = timespec_to_jiffies(ts);
2536 /*
2537 * We can be close to the next tick, add another one
2538 * to ensure we will wait at least the time asked for.
2539 */
2540 if (ts->tv_sec || ts->tv_nsec)
2541 timeout++;
2542 }
2543
2544 /*
2545 * Invert the set of allowed signals to get those we want to block.
2546 */
2547 sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
2548 signotset(&mask);
2549
2550 spin_lock_irq(&tsk->sighand->siglock);
2551 sig = dequeue_signal(tsk, &mask, info);
2552 if (!sig && timeout) {
2553 /*
2554 * None ready, temporarily unblock those we're interested
2555 * while we are sleeping in so that we'll be awakened when
2556 * they arrive. Unblocking is always fine, we can avoid
2557 * set_current_blocked().
2558 */
2559 tsk->real_blocked = tsk->blocked;
2560 sigandsets(&tsk->blocked, &tsk->blocked, &mask);
2561 recalc_sigpending();
2562 spin_unlock_irq(&tsk->sighand->siglock);
2563
2564 timeout = schedule_timeout_interruptible(timeout);
2565
2566 spin_lock_irq(&tsk->sighand->siglock);
2567 __set_task_blocked(tsk, &tsk->real_blocked);
2568 siginitset(&tsk->real_blocked, 0);
2569 sig = dequeue_signal(tsk, &mask, info);
2570 }
2571 spin_unlock_irq(&tsk->sighand->siglock);
2572
2573 if (sig)
2574 return sig;
2575 return timeout ? -EINTR : -EAGAIN;
2576}
2577
2578/**
2579 * sys_rt_sigtimedwait - synchronously wait for queued signals specified
2580 * in @uthese
2581 * @uthese: queued signals to wait for
2582 * @uinfo: if non-null, the signal's siginfo is returned here
2583 * @uts: upper bound on process time suspension
2584 * @sigsetsize: size of sigset_t type
2585 */
2250SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, 2586SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2251 siginfo_t __user *, uinfo, const struct timespec __user *, uts, 2587 siginfo_t __user *, uinfo, const struct timespec __user *, uts,
2252 size_t, sigsetsize) 2588 size_t, sigsetsize)
2253{ 2589{
2254 int ret, sig;
2255 sigset_t these; 2590 sigset_t these;
2256 struct timespec ts; 2591 struct timespec ts;
2257 siginfo_t info; 2592 siginfo_t info;
2258 long timeout = 0; 2593 int ret;
2259 2594
2260 /* XXX: Don't preclude handling different sized sigset_t's. */ 2595 /* XXX: Don't preclude handling different sized sigset_t's. */
2261 if (sigsetsize != sizeof(sigset_t)) 2596 if (sigsetsize != sizeof(sigset_t))
@@ -2263,65 +2598,27 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2263 2598
2264 if (copy_from_user(&these, uthese, sizeof(these))) 2599 if (copy_from_user(&these, uthese, sizeof(these)))
2265 return -EFAULT; 2600 return -EFAULT;
2266
2267 /*
2268 * Invert the set of allowed signals to get those we
2269 * want to block.
2270 */
2271 sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP));
2272 signotset(&these);
2273 2601
2274 if (uts) { 2602 if (uts) {
2275 if (copy_from_user(&ts, uts, sizeof(ts))) 2603 if (copy_from_user(&ts, uts, sizeof(ts)))
2276 return -EFAULT; 2604 return -EFAULT;
2277 if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0
2278 || ts.tv_sec < 0)
2279 return -EINVAL;
2280 } 2605 }
2281 2606
2282 spin_lock_irq(&current->sighand->siglock); 2607 ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);
2283 sig = dequeue_signal(current, &these, &info);
2284 if (!sig) {
2285 timeout = MAX_SCHEDULE_TIMEOUT;
2286 if (uts)
2287 timeout = (timespec_to_jiffies(&ts)
2288 + (ts.tv_sec || ts.tv_nsec));
2289
2290 if (timeout) {
2291 /* None ready -- temporarily unblock those we're
2292 * interested while we are sleeping in so that we'll
2293 * be awakened when they arrive. */
2294 current->real_blocked = current->blocked;
2295 sigandsets(&current->blocked, &current->blocked, &these);
2296 recalc_sigpending();
2297 spin_unlock_irq(&current->sighand->siglock);
2298
2299 timeout = schedule_timeout_interruptible(timeout);
2300
2301 spin_lock_irq(&current->sighand->siglock);
2302 sig = dequeue_signal(current, &these, &info);
2303 current->blocked = current->real_blocked;
2304 siginitset(&current->real_blocked, 0);
2305 recalc_sigpending();
2306 }
2307 }
2308 spin_unlock_irq(&current->sighand->siglock);
2309 2608
2310 if (sig) { 2609 if (ret > 0 && uinfo) {
2311 ret = sig; 2610 if (copy_siginfo_to_user(uinfo, &info))
2312 if (uinfo) { 2611 ret = -EFAULT;
2313 if (copy_siginfo_to_user(uinfo, &info))
2314 ret = -EFAULT;
2315 }
2316 } else {
2317 ret = -EAGAIN;
2318 if (timeout)
2319 ret = -EINTR;
2320 } 2612 }
2321 2613
2322 return ret; 2614 return ret;
2323} 2615}
2324 2616
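
The syscall now funnels through do_sigtimedwait(), but the userspace semantics of sigtimedwait(2) are unchanged: block the signal, then dequeue it synchronously with an upper bound on the wait. A small sketch (illustrative only, error handling trimmed):

#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        sigset_t set;
        siginfo_t info;
        struct timespec ts = { .tv_sec = 0, .tv_nsec = 500 * 1000 * 1000 };
        int sig;

        sigemptyset(&set);
        sigaddset(&set, SIGUSR1);
        sigprocmask(SIG_BLOCK, &set, NULL);     /* must be blocked before waiting */

        raise(SIGUSR1);                         /* becomes pending immediately */

        sig = sigtimedwait(&set, &info, &ts);
        if (sig < 0)
                perror("sigtimedwait");         /* EAGAIN on timeout, EINTR otherwise */
        else
                printf("dequeued signal %d from pid %d\n", sig, (int)info.si_pid);
        return 0;
}
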
2617/**
2618 * sys_kill - send a signal to a process
2619 * @pid: the PID of the process
2620 * @sig: signal to be sent
2621 */
2325SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) 2622SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2326{ 2623{
2327 struct siginfo info; 2624 struct siginfo info;
@@ -2397,7 +2694,11 @@ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
2397 return do_tkill(tgid, pid, sig); 2694 return do_tkill(tgid, pid, sig);
2398} 2695}
2399 2696
2400/* 2697/**
2698 * sys_tkill - send signal to one specific task
2699 * @pid: the PID of the task
2700 * @sig: signal to be sent
2701 *
2401 * Send a signal to only one task, even if it's a CLONE_THREAD task. 2702 * Send a signal to only one task, even if it's a CLONE_THREAD task.
2402 */ 2703 */
2403SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) 2704SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
@@ -2409,6 +2710,12 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
2409 return do_tkill(0, pid, sig); 2710 return do_tkill(0, pid, sig);
2410} 2711}
2411 2712
2713/**
2714 * sys_rt_sigqueueinfo - send signal information to a signal
2715 * @pid: the PID of the thread
2716 * @sig: signal to be sent
2717 * @uinfo: signal info to be sent
2718 */
2412SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, 2719SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2413 siginfo_t __user *, uinfo) 2720 siginfo_t __user *, uinfo)
2414{ 2721{
@@ -2418,9 +2725,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2418 return -EFAULT; 2725 return -EFAULT;
2419 2726
2420 /* Not even root can pretend to send signals from the kernel. 2727 /* Not even root can pretend to send signals from the kernel.
2421 Nor can they impersonate a kill(), which adds source info. */ 2728 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2422 if (info.si_code >= 0) 2729 */
2730 if (info.si_code >= 0 || info.si_code == SI_TKILL) {
2731 /* We used to allow any < 0 si_code */
2732 WARN_ON_ONCE(info.si_code < 0);
2423 return -EPERM; 2733 return -EPERM;
2734 }
2424 info.si_signo = sig; 2735 info.si_signo = sig;
2425 2736
2426 /* POSIX.1b doesn't mention process groups. */ 2737 /* POSIX.1b doesn't mention process groups. */
@@ -2434,9 +2745,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2434 return -EINVAL; 2745 return -EINVAL;
2435 2746
2436 /* Not even root can pretend to send signals from the kernel. 2747 /* Not even root can pretend to send signals from the kernel.
2437 Nor can they impersonate a kill(), which adds source info. */ 2748 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2438 if (info->si_code >= 0) 2749 */
2750 if (info->si_code >= 0 || info->si_code == SI_TKILL) {
2751 /* We used to allow any < 0 si_code */
2752 WARN_ON_ONCE(info->si_code < 0);
2439 return -EPERM; 2753 return -EPERM;
2754 }
2440 info->si_signo = sig; 2755 info->si_signo = sig;
2441 2756
2442 return do_send_specific(tgid, pid, sig, info); 2757 return do_send_specific(tgid, pid, sig, info);
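
The tightened si_code check only rejects callers that forge kernel-generated codes (si_code >= 0, or SI_TKILL) through rt_sigqueueinfo()/rt_tgsigqueueinfo(); the ordinary sigqueue(3) path, which passes SI_QUEUE (a negative code), is unaffected. A hypothetical userspace demo of the normal path, not taken from the patch:

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void handler(int sig, siginfo_t *info, void *ctx)
{
        /* printf in a handler is fine for a demo, not for production code */
        printf("sig=%d si_code=%d value=%d\n",
               sig, info->si_code, info->si_value.sival_int);
}

int main(void)
{
        struct sigaction sa = { .sa_sigaction = handler, .sa_flags = SA_SIGINFO };
        union sigval val = { .sival_int = 42 };

        sigemptyset(&sa.sa_mask);
        sigaction(SIGUSR1, &sa, NULL);

        /* si_code will be SI_QUEUE (< 0), so this still passes the check;
         * the signal is delivered to this process before sigqueue() returns. */
        sigqueue(getpid(), SIGUSR1, val);
        return 0;
}
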
@@ -2528,12 +2843,11 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2528 2843
2529 error = -EINVAL; 2844 error = -EINVAL;
2530 /* 2845 /*
2531 * 2846 * Note - this code used to test ss_flags incorrectly:
2532 * Note - this code used to test ss_flags incorrectly
2533 * old code may have been written using ss_flags==0 2847 * old code may have been written using ss_flags==0
2534 * to mean ss_flags==SS_ONSTACK (as this was the only 2848 * to mean ss_flags==SS_ONSTACK (as this was the only
2535 * way that worked) - this fix preserves that older 2849 * way that worked) - this fix preserves that older
2536 * mechanism 2850 * mechanism.
2537 */ 2851 */
2538 if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) 2852 if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
2539 goto out; 2853 goto out;
@@ -2567,6 +2881,10 @@ out:
2567 2881
2568#ifdef __ARCH_WANT_SYS_SIGPENDING 2882#ifdef __ARCH_WANT_SYS_SIGPENDING
2569 2883
2884/**
2885 * sys_sigpending - examine pending signals
2886 * @set: where mask of pending signal is returned
2887 */
2570SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) 2888SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
2571{ 2889{
2572 return do_sigpending(set, sizeof(*set)); 2890 return do_sigpending(set, sizeof(*set));
@@ -2575,60 +2893,65 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
2575#endif 2893#endif
2576 2894
2577#ifdef __ARCH_WANT_SYS_SIGPROCMASK 2895#ifdef __ARCH_WANT_SYS_SIGPROCMASK
2578/* Some platforms have their own version with special arguments others 2896/**
2579 support only sys_rt_sigprocmask. */ 2897 * sys_sigprocmask - examine and change blocked signals
2898 * @how: whether to add, remove, or set signals
2899 * @nset: signals to add or remove (if non-null)
2900 * @oset: previous value of signal mask if non-null
2901 *
2902 * Some platforms have their own version with special arguments;
2903 * others support only sys_rt_sigprocmask.
2904 */
2580 2905
2581SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, 2906SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
2582 old_sigset_t __user *, oset) 2907 old_sigset_t __user *, oset)
2583{ 2908{
2584 int error;
2585 old_sigset_t old_set, new_set; 2909 old_sigset_t old_set, new_set;
2910 sigset_t new_blocked;
2586 2911
2587 if (set) { 2912 old_set = current->blocked.sig[0];
2588 error = -EFAULT; 2913
2589 if (copy_from_user(&new_set, set, sizeof(*set))) 2914 if (nset) {
2590 goto out; 2915 if (copy_from_user(&new_set, nset, sizeof(*nset)))
2916 return -EFAULT;
2591 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); 2917 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
2592 2918
2593 spin_lock_irq(&current->sighand->siglock); 2919 new_blocked = current->blocked;
2594 old_set = current->blocked.sig[0];
2595 2920
2596 error = 0;
2597 switch (how) { 2921 switch (how) {
2598 default:
2599 error = -EINVAL;
2600 break;
2601 case SIG_BLOCK: 2922 case SIG_BLOCK:
2602 sigaddsetmask(&current->blocked, new_set); 2923 sigaddsetmask(&new_blocked, new_set);
2603 break; 2924 break;
2604 case SIG_UNBLOCK: 2925 case SIG_UNBLOCK:
2605 sigdelsetmask(&current->blocked, new_set); 2926 sigdelsetmask(&new_blocked, new_set);
2606 break; 2927 break;
2607 case SIG_SETMASK: 2928 case SIG_SETMASK:
2608 current->blocked.sig[0] = new_set; 2929 new_blocked.sig[0] = new_set;
2609 break; 2930 break;
2931 default:
2932 return -EINVAL;
2610 } 2933 }
2611 2934
2612 recalc_sigpending(); 2935 set_current_blocked(&new_blocked);
2613 spin_unlock_irq(&current->sighand->siglock); 2936 }
2614 if (error) 2937
2615 goto out; 2938 if (oset) {
2616 if (oset)
2617 goto set_old;
2618 } else if (oset) {
2619 old_set = current->blocked.sig[0];
2620 set_old:
2621 error = -EFAULT;
2622 if (copy_to_user(oset, &old_set, sizeof(*oset))) 2939 if (copy_to_user(oset, &old_set, sizeof(*oset)))
2623 goto out; 2940 return -EFAULT;
2624 } 2941 }
2625 error = 0; 2942
2626out: 2943 return 0;
2627 return error;
2628} 2944}
2629#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ 2945#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
2630 2946
2631#ifdef __ARCH_WANT_SYS_RT_SIGACTION 2947#ifdef __ARCH_WANT_SYS_RT_SIGACTION
2948/**
2949 * sys_rt_sigaction - alter an action taken by a process
2950 * @sig: signal to be sent
2951 * @act: new sigaction
2952 * @oact: used to save the previous sigaction
2953 * @sigsetsize: size of sigset_t type
2954 */
2632SYSCALL_DEFINE4(rt_sigaction, int, sig, 2955SYSCALL_DEFINE4(rt_sigaction, int, sig,
2633 const struct sigaction __user *, act, 2956 const struct sigaction __user *, act,
2634 struct sigaction __user *, oact, 2957 struct sigaction __user *, oact,
@@ -2707,14 +3030,22 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
2707 3030
2708SYSCALL_DEFINE0(pause) 3031SYSCALL_DEFINE0(pause)
2709{ 3032{
2710 current->state = TASK_INTERRUPTIBLE; 3033 while (!signal_pending(current)) {
2711 schedule(); 3034 current->state = TASK_INTERRUPTIBLE;
3035 schedule();
3036 }
2712 return -ERESTARTNOHAND; 3037 return -ERESTARTNOHAND;
2713} 3038}
2714 3039
2715#endif 3040#endif
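
The new loop in sys_pause() applies the standard discipline of re-testing the wake condition around schedule(): a wakeup is a hint, not a guarantee. The same rule governs userspace condition waits; a short sketch of the pattern (names are illustrative, not part of the patch):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool event;

static void wait_for_event(void)
{
        pthread_mutex_lock(&lock);
        while (!event)                          /* loop: wakeups can be spurious */
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
}

static void signal_event(void)
{
        pthread_mutex_lock(&lock);
        event = true;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        signal_event();
        wait_for_event();                       /* predicate already true: returns */
        return 0;
}
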
2716 3041
2717#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND 3042#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
3043/**
3044 * sys_rt_sigsuspend - replace the signal mask for a value with the
3045 * @unewset value until a signal is received
3046 * @unewset: new signal mask value
3047 * @sigsetsize: size of sigset_t type
3048 */
2718SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) 3049SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
2719{ 3050{
2720 sigset_t newset; 3051 sigset_t newset;
diff --git a/kernel/smp.c b/kernel/smp.c
index ed6aacfcb7ef..fb67dfa8394e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15 15
16#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
16static struct { 17static struct {
17 struct list_head queue; 18 struct list_head queue;
18 raw_spinlock_t lock; 19 raw_spinlock_t lock;
@@ -73,7 +74,7 @@ static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
73 .notifier_call = hotplug_cfd, 74 .notifier_call = hotplug_cfd,
74}; 75};
75 76
76static int __cpuinit init_call_single_data(void) 77void __init call_function_init(void)
77{ 78{
78 void *cpu = (void *)(long)smp_processor_id(); 79 void *cpu = (void *)(long)smp_processor_id();
79 int i; 80 int i;
@@ -87,10 +88,7 @@ static int __cpuinit init_call_single_data(void)
87 88
88 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); 89 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
89 register_cpu_notifier(&hotplug_cfd_notifier); 90 register_cpu_notifier(&hotplug_cfd_notifier);
90
91 return 0;
92} 91}
93early_initcall(init_call_single_data);
94 92
95/* 93/*
96 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources 94 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
@@ -193,23 +191,52 @@ void generic_smp_call_function_interrupt(void)
193 */ 191 */
194 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 192 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
195 int refs; 193 int refs;
194 smp_call_func_t func;
195
196 /*
197 * Since we walk the list without any locks, we might
198 * see an entry that was completed, removed from the
199 * list and is in the process of being reused.
200 *
201 * We must check that the cpu is in the cpumask before
202 * checking the refs, and both must be set before
203 * executing the callback on this cpu.
204 */
196 205
197 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) 206 if (!cpumask_test_cpu(cpu, data->cpumask))
198 continue; 207 continue;
199 208
200 data->csd.func(data->csd.info); 209 smp_rmb();
210
211 if (atomic_read(&data->refs) == 0)
212 continue;
213
214 func = data->csd.func; /* save for later warn */
215 func(data->csd.info);
216
217 /*
218 * If the cpu mask is not still set then func enabled
219 * interrupts (BUG), and this cpu took another smp call
220 * function interrupt and executed func(info) twice
221 * on this cpu. That nested execution decremented refs.
222 */
223 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
224 WARN(1, "%pf enabled interrupts and double executed\n", func);
225 continue;
226 }
201 227
202 refs = atomic_dec_return(&data->refs); 228 refs = atomic_dec_return(&data->refs);
203 WARN_ON(refs < 0); 229 WARN_ON(refs < 0);
204 if (!refs) {
205 raw_spin_lock(&call_function.lock);
206 list_del_rcu(&data->csd.list);
207 raw_spin_unlock(&call_function.lock);
208 }
209 230
210 if (refs) 231 if (refs)
211 continue; 232 continue;
212 233
234 WARN_ON(!cpumask_empty(data->cpumask));
235
236 raw_spin_lock(&call_function.lock);
237 list_del_rcu(&data->csd.list);
238 raw_spin_unlock(&call_function.lock);
239
213 csd_unlock(&data->csd); 240 csd_unlock(&data->csd);
214 } 241 }
215 242
@@ -267,7 +294,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
267 * 294 *
268 * Returns 0 on success, else a negative status code. 295 * Returns 0 on success, else a negative status code.
269 */ 296 */
270int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 297int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
271 int wait) 298 int wait)
272{ 299{
273 struct call_single_data d = { 300 struct call_single_data d = {
@@ -336,7 +363,7 @@ EXPORT_SYMBOL(smp_call_function_single);
336 * 3) any other online cpu in @mask 363 * 3) any other online cpu in @mask
337 */ 364 */
338int smp_call_function_any(const struct cpumask *mask, 365int smp_call_function_any(const struct cpumask *mask,
339 void (*func)(void *info), void *info, int wait) 366 smp_call_func_t func, void *info, int wait)
340{ 367{
341 unsigned int cpu; 368 unsigned int cpu;
342 const struct cpumask *nodemask; 369 const struct cpumask *nodemask;
@@ -416,11 +443,11 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
416 * must be disabled when calling this function. 443 * must be disabled when calling this function.
417 */ 444 */
418void smp_call_function_many(const struct cpumask *mask, 445void smp_call_function_many(const struct cpumask *mask,
419 void (*func)(void *), void *info, bool wait) 446 smp_call_func_t func, void *info, bool wait)
420{ 447{
421 struct call_function_data *data; 448 struct call_function_data *data;
422 unsigned long flags; 449 unsigned long flags;
423 int cpu, next_cpu, this_cpu = smp_processor_id(); 450 int refs, cpu, next_cpu, this_cpu = smp_processor_id();
424 451
425 /* 452 /*
426 * Can deadlock when called with interrupts disabled. 453 * Can deadlock when called with interrupts disabled.
@@ -429,9 +456,9 @@ void smp_call_function_many(const struct cpumask *mask,
429 * can't happen. 456 * can't happen.
430 */ 457 */
431 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 458 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
432 && !oops_in_progress); 459 && !oops_in_progress && !early_boot_irqs_disabled);
433 460
434 /* So, what's a CPU they want? Ignoring this one. */ 461 /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
435 cpu = cpumask_first_and(mask, cpu_online_mask); 462 cpu = cpumask_first_and(mask, cpu_online_mask);
436 if (cpu == this_cpu) 463 if (cpu == this_cpu)
437 cpu = cpumask_next_and(cpu, mask, cpu_online_mask); 464 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
@@ -454,11 +481,48 @@ void smp_call_function_many(const struct cpumask *mask,
454 data = &__get_cpu_var(cfd_data); 481 data = &__get_cpu_var(cfd_data);
455 csd_lock(&data->csd); 482 csd_lock(&data->csd);
456 483
484 /* This BUG_ON verifies our reuse assertions and can be removed */
485 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
486
487 /*
488 * The global call function queue list add and delete are protected
489 * by a lock, but the list is traversed without any lock, relying
490 * on the rcu list add and delete to allow safe concurrent traversal.
491 * We reuse the call function data without waiting for any grace
492 * period after some other cpu removes it from the global queue.
493 * This means a cpu might find our data block as it is being
494 * filled out.
495 *
496 * We hold off the interrupt handler on the other cpu by
497 * ordering our writes to the cpu mask vs our setting of the
498 * refs counter. We assert only the cpu owning the data block
499 * will set a bit in cpumask, and each bit will only be cleared
500 * by the subject cpu. Each cpu must first find its bit is
501 * set and then check that refs is set indicating the element is
502 * ready to be processed, otherwise it must skip the entry.
503 *
504 * On the previous iteration refs was set to 0 by another cpu.
505 * To avoid the use of transitivity, set the counter to 0 here
506 * so the wmb will pair with the rmb in the interrupt handler.
507 */
508 atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
509
457 data->csd.func = func; 510 data->csd.func = func;
458 data->csd.info = info; 511 data->csd.info = info;
512
513 /* Ensure 0 refs is visible before mask. Also orders func and info */
514 smp_wmb();
515
516 /* We rely on the "and" being processed before the store */
459 cpumask_and(data->cpumask, mask, cpu_online_mask); 517 cpumask_and(data->cpumask, mask, cpu_online_mask);
460 cpumask_clear_cpu(this_cpu, data->cpumask); 518 cpumask_clear_cpu(this_cpu, data->cpumask);
461 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 519 refs = cpumask_weight(data->cpumask);
520
521 /* Some callers race with other cpus changing the passed mask */
522 if (unlikely(!refs)) {
523 csd_unlock(&data->csd);
524 return;
525 }
462 526
463 raw_spin_lock_irqsave(&call_function.lock, flags); 527 raw_spin_lock_irqsave(&call_function.lock, flags);
464 /* 528 /*
@@ -467,6 +531,12 @@ void smp_call_function_many(const struct cpumask *mask,
467 * will not miss any other list entries: 531 * will not miss any other list entries:
468 */ 532 */
469 list_add_rcu(&data->csd.list, &call_function.queue); 533 list_add_rcu(&data->csd.list, &call_function.queue);
534 /*
535 * We rely on the wmb() in list_add_rcu to complete our writes
536 * to the cpumask before this write to refs, which indicates
537 * data is on the list and is ready to be processed.
538 */
539 atomic_set(&data->refs, refs);
470 raw_spin_unlock_irqrestore(&call_function.lock, flags); 540 raw_spin_unlock_irqrestore(&call_function.lock, flags);
471 541
472 /* 542 /*
@@ -500,7 +570,7 @@ EXPORT_SYMBOL(smp_call_function_many);
500 * You must not call this function with disabled interrupts or from a 570 * You must not call this function with disabled interrupts or from a
501 * hardware interrupt handler or from a bottom half handler. 571 * hardware interrupt handler or from a bottom half handler.
502 */ 572 */
503int smp_call_function(void (*func)(void *), void *info, int wait) 573int smp_call_function(smp_call_func_t func, void *info, int wait)
504{ 574{
505 preempt_disable(); 575 preempt_disable();
506 smp_call_function_many(cpu_online_mask, func, info, wait); 576 smp_call_function_many(cpu_online_mask, func, info, wait);
@@ -529,3 +599,105 @@ void ipi_call_unlock_irq(void)
529{ 599{
530 raw_spin_unlock_irq(&call_function.lock); 600 raw_spin_unlock_irq(&call_function.lock);
531} 601}
602#endif /* USE_GENERIC_SMP_HELPERS */
603
604/* Setup configured maximum number of CPUs to activate */
605unsigned int setup_max_cpus = NR_CPUS;
606EXPORT_SYMBOL(setup_max_cpus);
607
608
609/*
610 * Setup routine for controlling SMP activation
611 *
612 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
613 * activation entirely (the MPS table probe still happens, though).
614 *
615 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
616 * greater than 0, limits the maximum number of CPUs activated in
617 * SMP mode to <NUM>.
618 */
619
620void __weak arch_disable_smp_support(void) { }
621
622static int __init nosmp(char *str)
623{
624 setup_max_cpus = 0;
625 arch_disable_smp_support();
626
627 return 0;
628}
629
630early_param("nosmp", nosmp);
631
632/* this is hard limit */
633static int __init nrcpus(char *str)
634{
635 int nr_cpus;
636
637 get_option(&str, &nr_cpus);
638 if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
639 nr_cpu_ids = nr_cpus;
640
641 return 0;
642}
643
644early_param("nr_cpus", nrcpus);
645
646static int __init maxcpus(char *str)
647{
648 get_option(&str, &setup_max_cpus);
649 if (setup_max_cpus == 0)
650 arch_disable_smp_support();
651
652 return 0;
653}
654
655early_param("maxcpus", maxcpus);
656
657/* Setup number of possible processor ids */
658int nr_cpu_ids __read_mostly = NR_CPUS;
659EXPORT_SYMBOL(nr_cpu_ids);
660
661/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
662void __init setup_nr_cpu_ids(void)
663{
664 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
665}
666
667/* Called by boot processor to activate the rest. */
668void __init smp_init(void)
669{
670 unsigned int cpu;
671
672 /* FIXME: This should be done in userspace --RR */
673 for_each_present_cpu(cpu) {
674 if (num_online_cpus() >= setup_max_cpus)
675 break;
676 if (!cpu_online(cpu))
677 cpu_up(cpu);
678 }
679
680 /* Any cleanup work */
681 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
682 smp_cpus_done(setup_max_cpus);
683}
684
685/*
686 * Call a function on all processors. May be used during early boot while
687 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
688 * of local_irq_disable/enable().
689 */
690int on_each_cpu(void (*func) (void *info), void *info, int wait)
691{
692 unsigned long flags;
693 int ret = 0;
694
695 preempt_disable();
696 ret = smp_call_function(func, info, wait);
697 local_irq_save(flags);
698 func(info);
699 local_irq_restore(flags);
700 preempt_enable();
701 return ret;
702}
703EXPORT_SYMBOL(on_each_cpu);
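
The refs/cpumask handshake above is a publish/consume pattern: fill the data block, issue a write barrier, then publish refs; the consumer checks its cpumask bit, issues a read barrier, then reads refs before touching func/info. A rough userspace analogue using C11 fences (illustrative only; this is not kernel code and the names are made up):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int payload;                             /* stands in for func/info/cpumask */
static atomic_int refs;                         /* stands in for data->refs */

static void *producer(void *arg)
{
        payload = 42;                                   /* fill the data block */
        atomic_thread_fence(memory_order_release);      /* roughly smp_wmb() */
        atomic_store_explicit(&refs, 1, memory_order_relaxed);
        return NULL;
}

static void *consumer(void *arg)
{
        while (atomic_load_explicit(&refs, memory_order_relaxed) == 0)
                ;                                       /* wait for publication */
        atomic_thread_fence(memory_order_acquire);      /* roughly smp_rmb() */
        printf("payload=%d (must be 42)\n", payload);
        return NULL;
}

int main(void)
{
        pthread_t p, c;

        pthread_create(&c, NULL, consumer, NULL);
        pthread_create(&p, NULL, producer, NULL);
        pthread_join(p, NULL);
        pthread_join(c, NULL);
        return 0;
}
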
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 07b4f1b1a73a..fca82c32042b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -54,11 +54,11 @@ EXPORT_SYMBOL(irq_stat);
54 54
55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; 55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
56 56
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62}; 62};
63 63
64/* 64/*
@@ -67,21 +67,31 @@ char *softirq_to_name[NR_SOFTIRQS] = {
67 * to the pending events, so lets the scheduler to balance 67 * to the pending events, so lets the scheduler to balance
68 * the softirq load for us. 68 * the softirq load for us.
69 */ 69 */
70void wakeup_softirqd(void) 70static void wakeup_softirqd(void)
71{ 71{
72 /* Interrupts are disabled: no need to stop preemption */ 72 /* Interrupts are disabled: no need to stop preemption */
73 struct task_struct *tsk = __get_cpu_var(ksoftirqd); 73 struct task_struct *tsk = __this_cpu_read(ksoftirqd);
74 74
75 if (tsk && tsk->state != TASK_RUNNING) 75 if (tsk && tsk->state != TASK_RUNNING)
76 wake_up_process(tsk); 76 wake_up_process(tsk);
77} 77}
78 78
79/* 79/*
80 * preempt_count and SOFTIRQ_OFFSET usage:
81 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
82 * softirq processing.
83 * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
84 * on local_bh_disable or local_bh_enable.
85 * This lets us distinguish between whether we are currently processing
86 * softirq and whether we just have bh disabled.
87 */
88
89/*
80 * This one is for softirq.c-internal use, 90 * This one is for softirq.c-internal use,
81 * where hardirqs are disabled legitimately: 91 * where hardirqs are disabled legitimately:
82 */ 92 */
83#ifdef CONFIG_TRACE_IRQFLAGS 93#ifdef CONFIG_TRACE_IRQFLAGS
84static void __local_bh_disable(unsigned long ip) 94static void __local_bh_disable(unsigned long ip, unsigned int cnt)
85{ 95{
86 unsigned long flags; 96 unsigned long flags;
87 97
@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip)
95 * We must manually increment preempt_count here and manually 105 * We must manually increment preempt_count here and manually
96 * call the trace_preempt_off later. 106 * call the trace_preempt_off later.
97 */ 107 */
98 preempt_count() += SOFTIRQ_OFFSET; 108 preempt_count() += cnt;
99 /* 109 /*
100 * Were softirqs turned off above: 110 * Were softirqs turned off above:
101 */ 111 */
102 if (softirq_count() == SOFTIRQ_OFFSET) 112 if (softirq_count() == cnt)
103 trace_softirqs_off(ip); 113 trace_softirqs_off(ip);
104 raw_local_irq_restore(flags); 114 raw_local_irq_restore(flags);
105 115
106 if (preempt_count() == SOFTIRQ_OFFSET) 116 if (preempt_count() == cnt)
107 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
108} 118}
109#else /* !CONFIG_TRACE_IRQFLAGS */ 119#else /* !CONFIG_TRACE_IRQFLAGS */
110static inline void __local_bh_disable(unsigned long ip) 120static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
111{ 121{
112 add_preempt_count(SOFTIRQ_OFFSET); 122 add_preempt_count(cnt);
113 barrier(); 123 barrier();
114} 124}
115#endif /* CONFIG_TRACE_IRQFLAGS */ 125#endif /* CONFIG_TRACE_IRQFLAGS */
116 126
117void local_bh_disable(void) 127void local_bh_disable(void)
118{ 128{
119 __local_bh_disable((unsigned long)__builtin_return_address(0)); 129 __local_bh_disable((unsigned long)__builtin_return_address(0),
130 SOFTIRQ_DISABLE_OFFSET);
120} 131}
121 132
122EXPORT_SYMBOL(local_bh_disable); 133EXPORT_SYMBOL(local_bh_disable);
123 134
135static void __local_bh_enable(unsigned int cnt)
136{
137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled());
139
140 if (softirq_count() == cnt)
141 trace_softirqs_on((unsigned long)__builtin_return_address(0));
142 sub_preempt_count(cnt);
143}
144
124/* 145/*
125 * Special-case - softirqs can safely be enabled in 146 * Special-case - softirqs can safely be enabled in
126 * cond_resched_softirq(), or by __do_softirq(), 147 * cond_resched_softirq(), or by __do_softirq(),
@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable);
128 */ 149 */
129void _local_bh_enable(void) 150void _local_bh_enable(void)
130{ 151{
131 WARN_ON_ONCE(in_irq()); 152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
132 WARN_ON_ONCE(!irqs_disabled());
133
134 if (softirq_count() == SOFTIRQ_OFFSET)
135 trace_softirqs_on((unsigned long)__builtin_return_address(0));
136 sub_preempt_count(SOFTIRQ_OFFSET);
137} 153}
138 154
139EXPORT_SYMBOL(_local_bh_enable); 155EXPORT_SYMBOL(_local_bh_enable);
@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
147 /* 163 /*
148 * Are softirqs going to be turned on now: 164 * Are softirqs going to be turned on now:
149 */ 165 */
150 if (softirq_count() == SOFTIRQ_OFFSET) 166 if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
151 trace_softirqs_on(ip); 167 trace_softirqs_on(ip);
152 /* 168 /*
153 * Keep preemption disabled until we are done with 169 * Keep preemption disabled until we are done with
154 * softirq processing: 170 * softirq processing:
155 */ 171 */
156 sub_preempt_count(SOFTIRQ_OFFSET - 1); 172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
157 173
158 if (unlikely(!in_interrupt() && local_softirq_pending())) 174 if (unlikely(!in_interrupt() && local_softirq_pending()))
159 do_softirq(); 175 do_softirq();
@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void)
198 pending = local_softirq_pending(); 214 pending = local_softirq_pending();
199 account_system_vtime(current); 215 account_system_vtime(current);
200 216
201 __local_bh_disable((unsigned long)__builtin_return_address(0)); 217 __local_bh_disable((unsigned long)__builtin_return_address(0),
218 SOFTIRQ_OFFSET);
202 lockdep_softirq_enter(); 219 lockdep_softirq_enter();
203 220
204 cpu = smp_processor_id(); 221 cpu = smp_processor_id();
@@ -212,18 +229,20 @@ restart:
212 229
213 do { 230 do {
214 if (pending & 1) { 231 if (pending & 1) {
232 unsigned int vec_nr = h - softirq_vec;
215 int prev_count = preempt_count(); 233 int prev_count = preempt_count();
216 kstat_incr_softirqs_this_cpu(h - softirq_vec);
217 234
218 trace_softirq_entry(h, softirq_vec); 235 kstat_incr_softirqs_this_cpu(vec_nr);
236
237 trace_softirq_entry(vec_nr);
219 h->action(h); 238 h->action(h);
220 trace_softirq_exit(h, softirq_vec); 239 trace_softirq_exit(vec_nr);
221 if (unlikely(prev_count != preempt_count())) { 240 if (unlikely(prev_count != preempt_count())) {
222 printk(KERN_ERR "huh, entered softirq %td %s %p" 241 printk(KERN_ERR "huh, entered softirq %u %s %p"
223 "with preempt_count %08x," 242 "with preempt_count %08x,"
224 " exited with %08x?\n", h - softirq_vec, 243 " exited with %08x?\n", vec_nr,
225 softirq_to_name[h - softirq_vec], 244 softirq_to_name[vec_nr], h->action,
226 h->action, prev_count, preempt_count()); 245 prev_count, preempt_count());
227 preempt_count() = prev_count; 246 preempt_count() = prev_count;
228 } 247 }
229 248
@@ -245,7 +264,7 @@ restart:
245 lockdep_softirq_exit(); 264 lockdep_softirq_exit();
246 265
247 account_system_vtime(current); 266 account_system_vtime(current);
248 _local_bh_enable(); 267 __local_bh_enable(SOFTIRQ_OFFSET);
249} 268}
250 269
251#ifndef __ARCH_HAS_DO_SOFTIRQ 270#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -279,16 +298,42 @@ void irq_enter(void)
279 298
280 rcu_irq_enter(); 299 rcu_irq_enter();
281 if (idle_cpu(cpu) && !in_interrupt()) { 300 if (idle_cpu(cpu) && !in_interrupt()) {
282 __irq_enter(); 301 /*
302 * Prevent raise_softirq from needlessly waking up ksoftirqd
303 * here, as softirq will be serviced on return from interrupt.
304 */
305 local_bh_disable();
283 tick_check_idle(cpu); 306 tick_check_idle(cpu);
284 } else 307 _local_bh_enable();
285 __irq_enter(); 308 }
309
310 __irq_enter();
286} 311}
287 312
288#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
289# define invoke_softirq() __do_softirq() 314static inline void invoke_softirq(void)
315{
316 if (!force_irqthreads)
317 __do_softirq();
318 else {
319 __local_bh_disable((unsigned long)__builtin_return_address(0),
320 SOFTIRQ_OFFSET);
321 wakeup_softirqd();
322 __local_bh_enable(SOFTIRQ_OFFSET);
323 }
324}
290#else 325#else
291# define invoke_softirq() do_softirq() 326static inline void invoke_softirq(void)
327{
328 if (!force_irqthreads)
329 do_softirq();
330 else {
331 __local_bh_disable((unsigned long)__builtin_return_address(0),
332 SOFTIRQ_OFFSET);
333 wakeup_softirqd();
334 __local_bh_enable(SOFTIRQ_OFFSET);
335 }
336}
292#endif 337#endif
293 338
294/* 339/*
@@ -363,8 +408,8 @@ void __tasklet_schedule(struct tasklet_struct *t)
363 408
364 local_irq_save(flags); 409 local_irq_save(flags);
365 t->next = NULL; 410 t->next = NULL;
366 *__get_cpu_var(tasklet_vec).tail = t; 411 *__this_cpu_read(tasklet_vec.tail) = t;
367 __get_cpu_var(tasklet_vec).tail = &(t->next); 412 __this_cpu_write(tasklet_vec.tail, &(t->next));
368 raise_softirq_irqoff(TASKLET_SOFTIRQ); 413 raise_softirq_irqoff(TASKLET_SOFTIRQ);
369 local_irq_restore(flags); 414 local_irq_restore(flags);
370} 415}
@@ -377,8 +422,8 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
377 422
378 local_irq_save(flags); 423 local_irq_save(flags);
379 t->next = NULL; 424 t->next = NULL;
380 *__get_cpu_var(tasklet_hi_vec).tail = t; 425 *__this_cpu_read(tasklet_hi_vec.tail) = t;
381 __get_cpu_var(tasklet_hi_vec).tail = &(t->next); 426 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
382 raise_softirq_irqoff(HI_SOFTIRQ); 427 raise_softirq_irqoff(HI_SOFTIRQ);
383 local_irq_restore(flags); 428 local_irq_restore(flags);
384} 429}
@@ -389,8 +434,8 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
389{ 434{
390 BUG_ON(!irqs_disabled()); 435 BUG_ON(!irqs_disabled());
391 436
392 t->next = __get_cpu_var(tasklet_hi_vec).head; 437 t->next = __this_cpu_read(tasklet_hi_vec.head);
393 __get_cpu_var(tasklet_hi_vec).head = t; 438 __this_cpu_write(tasklet_hi_vec.head, t);
394 __raise_softirq_irqoff(HI_SOFTIRQ); 439 __raise_softirq_irqoff(HI_SOFTIRQ);
395} 440}
396 441
@@ -401,9 +446,9 @@ static void tasklet_action(struct softirq_action *a)
401 struct tasklet_struct *list; 446 struct tasklet_struct *list;
402 447
403 local_irq_disable(); 448 local_irq_disable();
404 list = __get_cpu_var(tasklet_vec).head; 449 list = __this_cpu_read(tasklet_vec.head);
405 __get_cpu_var(tasklet_vec).head = NULL; 450 __this_cpu_write(tasklet_vec.head, NULL);
406 __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; 451 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
407 local_irq_enable(); 452 local_irq_enable();
408 453
409 while (list) { 454 while (list) {
@@ -424,8 +469,8 @@ static void tasklet_action(struct softirq_action *a)
424 469
425 local_irq_disable(); 470 local_irq_disable();
426 t->next = NULL; 471 t->next = NULL;
427 *__get_cpu_var(tasklet_vec).tail = t; 472 *__this_cpu_read(tasklet_vec.tail) = t;
428 __get_cpu_var(tasklet_vec).tail = &(t->next); 473 __this_cpu_write(tasklet_vec.tail, &(t->next));
429 __raise_softirq_irqoff(TASKLET_SOFTIRQ); 474 __raise_softirq_irqoff(TASKLET_SOFTIRQ);
430 local_irq_enable(); 475 local_irq_enable();
431 } 476 }
@@ -436,9 +481,9 @@ static void tasklet_hi_action(struct softirq_action *a)
436 struct tasklet_struct *list; 481 struct tasklet_struct *list;
437 482
438 local_irq_disable(); 483 local_irq_disable();
439 list = __get_cpu_var(tasklet_hi_vec).head; 484 list = __this_cpu_read(tasklet_hi_vec.head);
440 __get_cpu_var(tasklet_hi_vec).head = NULL; 485 __this_cpu_write(tasklet_hi_vec.head, NULL);
441 __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; 486 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
442 local_irq_enable(); 487 local_irq_enable();
443 488
444 while (list) { 489 while (list) {
@@ -459,8 +504,8 @@ static void tasklet_hi_action(struct softirq_action *a)
459 504
460 local_irq_disable(); 505 local_irq_disable();
461 t->next = NULL; 506 t->next = NULL;
462 *__get_cpu_var(tasklet_hi_vec).tail = t; 507 *__this_cpu_read(tasklet_hi_vec.tail) = t;
463 __get_cpu_var(tasklet_hi_vec).tail = &(t->next); 508 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
464 __raise_softirq_irqoff(HI_SOFTIRQ); 509 __raise_softirq_irqoff(HI_SOFTIRQ);
465 local_irq_enable(); 510 local_irq_enable();
466 } 511 }
@@ -530,7 +575,7 @@ static void __tasklet_hrtimer_trampoline(unsigned long data)
530/** 575/**
531 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks 576 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
532 * @ttimer: tasklet_hrtimer which is initialized 577 * @ttimer: tasklet_hrtimer which is initialized
533 * @function: hrtimer callback funtion which gets called from softirq context 578 * @function: hrtimer callback function which gets called from softirq context
534 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) 579 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
535 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) 580 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
536 */ 581 */
@@ -712,7 +757,10 @@ static int run_ksoftirqd(void * __bind_cpu)
712 don't process */ 757 don't process */
713 if (cpu_is_offline((long)__bind_cpu)) 758 if (cpu_is_offline((long)__bind_cpu))
714 goto wait_to_die; 759 goto wait_to_die;
715 do_softirq(); 760 local_irq_disable();
761 if (local_softirq_pending())
762 __do_softirq();
763 local_irq_enable();
716 preempt_enable_no_resched(); 764 preempt_enable_no_resched();
717 cond_resched(); 765 cond_resched();
718 preempt_disable(); 766 preempt_disable();
@@ -776,16 +824,16 @@ static void takeover_tasklets(unsigned int cpu)
776 824
777 /* Find end, append list for that CPU. */ 825 /* Find end, append list for that CPU. */
778 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { 826 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
779 *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head; 827 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
780 __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; 828 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
781 per_cpu(tasklet_vec, cpu).head = NULL; 829 per_cpu(tasklet_vec, cpu).head = NULL;
782 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; 830 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
783 } 831 }
784 raise_softirq_irqoff(TASKLET_SOFTIRQ); 832 raise_softirq_irqoff(TASKLET_SOFTIRQ);
785 833
786 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { 834 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
787 *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; 835 *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head;
788 __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; 836 __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail);
789 per_cpu(tasklet_hi_vec, cpu).head = NULL; 837 per_cpu(tasklet_hi_vec, cpu).head = NULL;
790 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; 838 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
791 } 839 }
@@ -805,7 +853,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
805 switch (action) { 853 switch (action) {
806 case CPU_UP_PREPARE: 854 case CPU_UP_PREPARE:
807 case CPU_UP_PREPARE_FROZEN: 855 case CPU_UP_PREPARE_FROZEN:
808 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 856 p = kthread_create_on_node(run_ksoftirqd,
857 hcpu,
858 cpu_to_node(hotcpu),
859 "ksoftirqd/%d", hotcpu);
809 if (IS_ERR(p)) { 860 if (IS_ERR(p)) {
810 printk("ksoftirqd for %i failed\n", hotcpu); 861 printk("ksoftirqd for %i failed\n", hotcpu);
811 return notifier_from_errno(PTR_ERR(p)); 862 return notifier_from_errno(PTR_ERR(p));
@@ -827,7 +878,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
827 cpumask_any(cpu_online_mask)); 878 cpumask_any(cpu_online_mask));
828 case CPU_DEAD: 879 case CPU_DEAD:
829 case CPU_DEAD_FROZEN: { 880 case CPU_DEAD_FROZEN: {
830 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 881 static const struct sched_param param = {
882 .sched_priority = MAX_RT_PRIO-1
883 };
831 884
832 p = per_cpu(ksoftirqd, hotcpu); 885 p = per_cpu(ksoftirqd, hotcpu);
833 per_cpu(ksoftirqd, hotcpu) = NULL; 886 per_cpu(ksoftirqd, hotcpu) = NULL;
@@ -857,25 +910,6 @@ static __init int spawn_ksoftirqd(void)
857} 910}
858early_initcall(spawn_ksoftirqd); 911early_initcall(spawn_ksoftirqd);
859 912
860#ifdef CONFIG_SMP
861/*
862 * Call a function on all processors
863 */
864int on_each_cpu(void (*func) (void *info), void *info, int wait)
865{
866 int ret = 0;
867
868 preempt_disable();
869 ret = smp_call_function(func, info, wait);
870 local_irq_disable();
871 func(info);
872 local_irq_enable();
873 preempt_enable();
874 return ret;
875}
876EXPORT_SYMBOL(on_each_cpu);
877#endif
878
879/* 913/*
880 * [ These __weak aliases are kept in a separate compilation unit, so that 914 * [ These __weak aliases are kept in a separate compilation unit, so that
881 * GCC does not inline them incorrectly. ] 915 * GCC does not inline them incorrectly. ]
@@ -886,17 +920,14 @@ int __init __weak early_irq_init(void)
886 return 0; 920 return 0;
887} 921}
888 922
923#ifdef CONFIG_GENERIC_HARDIRQS
889int __init __weak arch_probe_nr_irqs(void) 924int __init __weak arch_probe_nr_irqs(void)
890{ 925{
891 return 0; 926 return NR_IRQS_LEGACY;
892} 927}
893 928
894int __init __weak arch_early_irq_init(void) 929int __init __weak arch_early_irq_init(void)
895{ 930{
896 return 0; 931 return 0;
897} 932}
898 933#endif
899int __weak arch_init_chip_data(struct irq_desc *desc, int node)
900{
901 return 0;
902}
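
The SOFTIRQ_DISABLE_OFFSET split earlier in this file lets softirq_count() distinguish "bh disabled" from "actually serving a softirq": local_bh_disable() moves the counter in steps of two, softirq entry in steps of one, so the low bit of the softirq field means "in softirq". A toy userspace model of that bookkeeping (constants chosen for the demo and not guaranteed to match the kernel's exact layout):

#include <stdio.h>

#define SOFTIRQ_SHIFT          8
#define SOFTIRQ_OFFSET         (1 << SOFTIRQ_SHIFT)
#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
#define SOFTIRQ_MASK           (0xff << SOFTIRQ_SHIFT)

static unsigned int preempt_count;

static unsigned int softirq_count(void)
{
        return preempt_count & SOFTIRQ_MASK;
}

int main(void)
{
        preempt_count += SOFTIRQ_DISABLE_OFFSET;        /* local_bh_disable() */
        printf("bh disabled: serving softirq? %s\n",
               (softirq_count() & SOFTIRQ_OFFSET) ? "yes" : "no");

        preempt_count += SOFTIRQ_OFFSET;                /* __do_softirq() entry */
        printf("in softirq:  serving softirq? %s\n",
               (softirq_count() & SOFTIRQ_OFFSET) ? "yes" : "no");
        return 0;
}
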
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2980da3fd509..73ce23feaea9 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -31,6 +31,7 @@
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/smp.h> 33#include <linux/smp.h>
34#include <linux/delay.h>
34#include <linux/srcu.h> 35#include <linux/srcu.h>
35 36
36static int init_srcu_struct_fields(struct srcu_struct *sp) 37static int init_srcu_struct_fields(struct srcu_struct *sp)
@@ -46,11 +47,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
46int __init_srcu_struct(struct srcu_struct *sp, const char *name, 47int __init_srcu_struct(struct srcu_struct *sp, const char *name,
47 struct lock_class_key *key) 48 struct lock_class_key *key)
48{ 49{
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50 /* Don't re-initialize a lock while it is held. */ 50 /* Don't re-initialize a lock while it is held. */
51 debug_check_no_locks_freed((void *)sp, sizeof(*sp)); 51 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
52 lockdep_init_map(&sp->dep_map, name, key, 0); 52 lockdep_init_map(&sp->dep_map, name, key, 0);
53#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
54 return init_srcu_struct_fields(sp); 53 return init_srcu_struct_fields(sp);
55} 54}
56EXPORT_SYMBOL_GPL(__init_srcu_struct); 55EXPORT_SYMBOL_GPL(__init_srcu_struct);
@@ -157,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
157EXPORT_SYMBOL_GPL(__srcu_read_unlock); 156EXPORT_SYMBOL_GPL(__srcu_read_unlock);
158 157
159/* 158/*
159 * We use an adaptive strategy for synchronize_srcu() and especially for
160 * synchronize_srcu_expedited(). We spin for a fixed time period
161 * (defined below) to allow SRCU readers to exit their read-side critical
162 * sections. If there are still some readers after 10 microseconds,
163 * we repeatedly block for 1-millisecond time periods. This approach
164 * has done well in testing, so there is no need for a config parameter.
165 */
166#define SYNCHRONIZE_SRCU_READER_DELAY 10
167
168/*
160 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 169 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
161 */ 170 */
162static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 171static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
@@ -205,9 +214,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
205 * all srcu_read_lock() calls using the old counters have completed. 214 * all srcu_read_lock() calls using the old counters have completed.
206 * Their corresponding critical sections might well be still 215 * Their corresponding critical sections might well be still
207 * executing, but the srcu_read_lock() primitives themselves 216 * executing, but the srcu_read_lock() primitives themselves
208 * will have finished executing. 217 * will have finished executing. We initially give readers
218 * an arbitrarily chosen 10 microseconds to get out of their
219 * SRCU read-side critical sections, then loop waiting 1/HZ
220 * seconds per iteration. The 10-microsecond value has done
221 * very well in testing.
209 */ 222 */
210 223
224 if (srcu_readers_active_idx(sp, idx))
225 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
211 while (srcu_readers_active_idx(sp, idx)) 226 while (srcu_readers_active_idx(sp, idx))
212 schedule_timeout_interruptible(1); 227 schedule_timeout_interruptible(1);
213 228
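
The adaptive wait added above (udelay for SYNCHRONIZE_SRCU_READER_DELAY microseconds, then 1-jiffy sleeps) is tuned for the expectation that SRCU read-side critical sections are short. A hedged sketch of the reader/updater pairing that the grace period waits on; the srcu_struct and the protected pointer are made up for illustration:

    #include <linux/srcu.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    static struct srcu_struct my_srcu;      /* init_srcu_struct(&my_srcu) at setup time */
    static void __rcu *my_ptr;

    static void reader(void)
    {
            int idx;
            void *p;

            idx = srcu_read_lock(&my_srcu);           /* enter read-side section */
            p = srcu_dereference(my_ptr, &my_srcu);   /* p stays valid until unlock */
            /* ... short critical section using p ... */
            srcu_read_unlock(&my_srcu, idx);
    }

    static void retire(void *old)
    {
            /*
             * Returns once every reader that could still see 'old' has unlocked:
             * first the 10-microsecond spin, then 1-jiffy sleeps if readers remain.
             */
            synchronize_srcu(&my_srcu);
            kfree(old);
    }
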
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 4372ccb25127..e3516b29076c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -262,7 +262,7 @@ repeat:
262 cpu_stop_fn_t fn = work->fn; 262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg; 263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done; 264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN]; 265 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
266 266
267 __set_current_state(TASK_RUNNING); 267 __set_current_state(TASK_RUNNING);
268 268
@@ -287,11 +287,12 @@ repeat:
287 goto repeat; 287 goto repeat;
288} 288}
289 289
290extern void sched_set_stop_task(int cpu, struct task_struct *stop);
291
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ 292/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, 293static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu) 294 unsigned long action, void *hcpu)
293{ 295{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu; 296 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 297 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct task_struct *p; 298 struct task_struct *p;
@@ -300,17 +301,19 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
300 case CPU_UP_PREPARE: 301 case CPU_UP_PREPARE:
301 BUG_ON(stopper->thread || stopper->enabled || 302 BUG_ON(stopper->thread || stopper->enabled ||
302 !list_empty(&stopper->works)); 303 !list_empty(&stopper->works));
303 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", 304 p = kthread_create_on_node(cpu_stopper_thread,
304 cpu); 305 stopper,
306 cpu_to_node(cpu),
307 "migration/%d", cpu);
305 if (IS_ERR(p)) 308 if (IS_ERR(p))
306 return NOTIFY_BAD; 309 return notifier_from_errno(PTR_ERR(p));
307 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
308 get_task_struct(p); 310 get_task_struct(p);
311 kthread_bind(p, cpu);
312 sched_set_stop_task(cpu, p);
309 stopper->thread = p; 313 stopper->thread = p;
310 break; 314 break;
311 315
312 case CPU_ONLINE: 316 case CPU_ONLINE:
313 kthread_bind(stopper->thread, cpu);
314 /* strictly unnecessary, as first user will wake it */ 317 /* strictly unnecessary, as first user will wake it */
315 wake_up_process(stopper->thread); 318 wake_up_process(stopper->thread);
316 /* mark enabled */ 319 /* mark enabled */
@@ -325,6 +328,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
325 { 328 {
326 struct cpu_stop_work *work; 329 struct cpu_stop_work *work;
327 330
331 sched_set_stop_task(cpu, NULL);
328 /* kill the stopper */ 332 /* kill the stopper */
329 kthread_stop(stopper->thread); 333 kthread_stop(stopper->thread);
330 /* drain remaining works */ 334 /* drain remaining works */
@@ -370,7 +374,7 @@ static int __init cpu_stop_init(void)
370 /* start one for the boot cpu */ 374 /* start one for the boot cpu */
371 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, 375 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
372 bcpu); 376 bcpu);
373 BUG_ON(err == NOTIFY_BAD); 377 BUG_ON(err != NOTIFY_OK);
374 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); 378 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
375 register_cpu_notifier(&cpu_stop_cpu_notifier); 379 register_cpu_notifier(&cpu_stop_cpu_notifier);
376 380
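
The migration/N thread created above is the worker behind the cpu_stop API: it is now registered with the scheduler via sched_set_stop_task(), which gives it the highest effective priority, instead of the SCHED_FIFO setting the notifier used to apply. A rough sketch of a caller of that machinery (the callback and its use are hypothetical):

    #include <linux/stop_machine.h>

    static int quiesce_fn(void *arg)
    {
            /*
             * Runs on the target CPU from its migration/N stopper thread,
             * with nothing else able to preempt it on that CPU.
             */
            return 0;
    }

    static int quiesce_cpu(unsigned int cpu)
    {
            /* Queues the work and sleeps until quiesce_fn() has run on @cpu. */
            return stop_one_cpu(cpu, quiesce_fn, NULL);
    }
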
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f5a0cd296a9..e4128b278f23 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -37,12 +37,15 @@
37#include <linux/ptrace.h> 37#include <linux/ptrace.h>
38#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
39#include <linux/gfp.h> 39#include <linux/gfp.h>
40#include <linux/syscore_ops.h>
40 41
41#include <linux/compat.h> 42#include <linux/compat.h>
42#include <linux/syscalls.h> 43#include <linux/syscalls.h>
43#include <linux/kprobes.h> 44#include <linux/kprobes.h>
44#include <linux/user_namespace.h> 45#include <linux/user_namespace.h>
45 46
47#include <linux/kmsg_dump.h>
48
46#include <asm/uaccess.h> 49#include <asm/uaccess.h>
47#include <asm/io.h> 50#include <asm/io.h>
48#include <asm/unistd.h> 51#include <asm/unistd.h>
@@ -117,16 +120,33 @@ EXPORT_SYMBOL(cad_pid);
117void (*pm_power_off_prepare)(void); 120void (*pm_power_off_prepare)(void);
118 121
119/* 122/*
123 * Returns true if current's euid is same as p's uid or euid,
124 * or has CAP_SYS_NICE to p's user_ns.
125 *
126 * Called with rcu_read_lock, creds are safe
127 */
128static bool set_one_prio_perm(struct task_struct *p)
129{
130 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
131
132 if (pcred->user->user_ns == cred->user->user_ns &&
133 (pcred->uid == cred->euid ||
134 pcred->euid == cred->euid))
135 return true;
136 if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
137 return true;
138 return false;
139}
140
141/*
120 * set the priority of a task 142 * set the priority of a task
121 * - the caller must hold the RCU read lock 143 * - the caller must hold the RCU read lock
122 */ 144 */
123static int set_one_prio(struct task_struct *p, int niceval, int error) 145static int set_one_prio(struct task_struct *p, int niceval, int error)
124{ 146{
125 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
126 int no_nice; 147 int no_nice;
127 148
128 if (pcred->uid != cred->euid && 149 if (!set_one_prio_perm(p)) {
129 pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
130 error = -EPERM; 150 error = -EPERM;
131 goto out; 151 goto out;
132 } 152 }
@@ -285,6 +305,7 @@ out_unlock:
285 */ 305 */
286void emergency_restart(void) 306void emergency_restart(void)
287{ 307{
308 kmsg_dump(KMSG_DUMP_EMERG);
288 machine_emergency_restart(); 309 machine_emergency_restart();
289} 310}
290EXPORT_SYMBOL_GPL(emergency_restart); 311EXPORT_SYMBOL_GPL(emergency_restart);
@@ -293,8 +314,9 @@ void kernel_restart_prepare(char *cmd)
293{ 314{
294 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 315 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
295 system_state = SYSTEM_RESTART; 316 system_state = SYSTEM_RESTART;
317 usermodehelper_disable();
296 device_shutdown(); 318 device_shutdown();
297 sysdev_shutdown(); 319 syscore_shutdown();
298} 320}
299 321
300/** 322/**
@@ -312,6 +334,7 @@ void kernel_restart(char *cmd)
312 printk(KERN_EMERG "Restarting system.\n"); 334 printk(KERN_EMERG "Restarting system.\n");
313 else 335 else
314 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); 336 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
337 kmsg_dump(KMSG_DUMP_RESTART);
315 machine_restart(cmd); 338 machine_restart(cmd);
316} 339}
317EXPORT_SYMBOL_GPL(kernel_restart); 340EXPORT_SYMBOL_GPL(kernel_restart);
@@ -321,6 +344,7 @@ static void kernel_shutdown_prepare(enum system_states state)
321 blocking_notifier_call_chain(&reboot_notifier_list, 344 blocking_notifier_call_chain(&reboot_notifier_list,
322 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); 345 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
323 system_state = state; 346 system_state = state;
347 usermodehelper_disable();
324 device_shutdown(); 348 device_shutdown();
325} 349}
326/** 350/**
@@ -331,8 +355,9 @@ static void kernel_shutdown_prepare(enum system_states state)
331void kernel_halt(void) 355void kernel_halt(void)
332{ 356{
333 kernel_shutdown_prepare(SYSTEM_HALT); 357 kernel_shutdown_prepare(SYSTEM_HALT);
334 sysdev_shutdown(); 358 syscore_shutdown();
335 printk(KERN_EMERG "System halted.\n"); 359 printk(KERN_EMERG "System halted.\n");
360 kmsg_dump(KMSG_DUMP_HALT);
336 machine_halt(); 361 machine_halt();
337} 362}
338 363
@@ -349,8 +374,9 @@ void kernel_power_off(void)
349 if (pm_power_off_prepare) 374 if (pm_power_off_prepare)
350 pm_power_off_prepare(); 375 pm_power_off_prepare();
351 disable_nonboot_cpus(); 376 disable_nonboot_cpus();
352 sysdev_shutdown(); 377 syscore_shutdown();
353 printk(KERN_EMERG "Power down.\n"); 378 printk(KERN_EMERG "Power down.\n");
379 kmsg_dump(KMSG_DUMP_POWEROFF);
354 machine_power_off(); 380 machine_power_off();
355} 381}
356EXPORT_SYMBOL_GPL(kernel_power_off); 382EXPORT_SYMBOL_GPL(kernel_power_off);
@@ -496,7 +522,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
496 if (rgid != (gid_t) -1) { 522 if (rgid != (gid_t) -1) {
497 if (old->gid == rgid || 523 if (old->gid == rgid ||
498 old->egid == rgid || 524 old->egid == rgid ||
499 capable(CAP_SETGID)) 525 nsown_capable(CAP_SETGID))
500 new->gid = rgid; 526 new->gid = rgid;
501 else 527 else
502 goto error; 528 goto error;
@@ -505,7 +531,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
505 if (old->gid == egid || 531 if (old->gid == egid ||
506 old->egid == egid || 532 old->egid == egid ||
507 old->sgid == egid || 533 old->sgid == egid ||
508 capable(CAP_SETGID)) 534 nsown_capable(CAP_SETGID))
509 new->egid = egid; 535 new->egid = egid;
510 else 536 else
511 goto error; 537 goto error;
@@ -540,7 +566,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
540 old = current_cred(); 566 old = current_cred();
541 567
542 retval = -EPERM; 568 retval = -EPERM;
543 if (capable(CAP_SETGID)) 569 if (nsown_capable(CAP_SETGID))
544 new->gid = new->egid = new->sgid = new->fsgid = gid; 570 new->gid = new->egid = new->sgid = new->fsgid = gid;
545 else if (gid == old->gid || gid == old->sgid) 571 else if (gid == old->gid || gid == old->sgid)
546 new->egid = new->fsgid = gid; 572 new->egid = new->fsgid = gid;
@@ -607,7 +633,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
607 new->uid = ruid; 633 new->uid = ruid;
608 if (old->uid != ruid && 634 if (old->uid != ruid &&
609 old->euid != ruid && 635 old->euid != ruid &&
610 !capable(CAP_SETUID)) 636 !nsown_capable(CAP_SETUID))
611 goto error; 637 goto error;
612 } 638 }
613 639
@@ -616,7 +642,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
616 if (old->uid != euid && 642 if (old->uid != euid &&
617 old->euid != euid && 643 old->euid != euid &&
618 old->suid != euid && 644 old->suid != euid &&
619 !capable(CAP_SETUID)) 645 !nsown_capable(CAP_SETUID))
620 goto error; 646 goto error;
621 } 647 }
622 648
@@ -664,7 +690,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
664 old = current_cred(); 690 old = current_cred();
665 691
666 retval = -EPERM; 692 retval = -EPERM;
667 if (capable(CAP_SETUID)) { 693 if (nsown_capable(CAP_SETUID)) {
668 new->suid = new->uid = uid; 694 new->suid = new->uid = uid;
669 if (uid != old->uid) { 695 if (uid != old->uid) {
670 retval = set_user(new); 696 retval = set_user(new);
@@ -706,7 +732,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
706 old = current_cred(); 732 old = current_cred();
707 733
708 retval = -EPERM; 734 retval = -EPERM;
709 if (!capable(CAP_SETUID)) { 735 if (!nsown_capable(CAP_SETUID)) {
710 if (ruid != (uid_t) -1 && ruid != old->uid && 736 if (ruid != (uid_t) -1 && ruid != old->uid &&
711 ruid != old->euid && ruid != old->suid) 737 ruid != old->euid && ruid != old->suid)
712 goto error; 738 goto error;
@@ -770,7 +796,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
770 old = current_cred(); 796 old = current_cred();
771 797
772 retval = -EPERM; 798 retval = -EPERM;
773 if (!capable(CAP_SETGID)) { 799 if (!nsown_capable(CAP_SETGID)) {
774 if (rgid != (gid_t) -1 && rgid != old->gid && 800 if (rgid != (gid_t) -1 && rgid != old->gid &&
775 rgid != old->egid && rgid != old->sgid) 801 rgid != old->egid && rgid != old->sgid)
776 goto error; 802 goto error;
@@ -830,7 +856,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
830 856
831 if (uid == old->uid || uid == old->euid || 857 if (uid == old->uid || uid == old->euid ||
832 uid == old->suid || uid == old->fsuid || 858 uid == old->suid || uid == old->fsuid ||
833 capable(CAP_SETUID)) { 859 nsown_capable(CAP_SETUID)) {
834 if (uid != old_fsuid) { 860 if (uid != old_fsuid) {
835 new->fsuid = uid; 861 new->fsuid = uid;
836 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 862 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -863,7 +889,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
863 889
864 if (gid == old->gid || gid == old->egid || 890 if (gid == old->gid || gid == old->egid ||
865 gid == old->sgid || gid == old->fsgid || 891 gid == old->sgid || gid == old->fsgid ||
866 capable(CAP_SETGID)) { 892 nsown_capable(CAP_SETGID)) {
867 if (gid != old_fsgid) { 893 if (gid != old_fsgid) {
868 new->fsgid = gid; 894 new->fsgid = gid;
869 goto change_okay; 895 goto change_okay;
@@ -1080,8 +1106,10 @@ SYSCALL_DEFINE0(setsid)
1080 err = session; 1106 err = session;
1081out: 1107out:
1082 write_unlock_irq(&tasklist_lock); 1108 write_unlock_irq(&tasklist_lock);
1083 if (err > 0) 1109 if (err > 0) {
1084 proc_sid_connector(group_leader); 1110 proc_sid_connector(group_leader);
1111 sched_autogroup_create_attach(group_leader);
1112 }
1085 return err; 1113 return err;
1086} 1114}
1087 1115
@@ -1169,8 +1197,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1169 int errno; 1197 int errno;
1170 char tmp[__NEW_UTS_LEN]; 1198 char tmp[__NEW_UTS_LEN];
1171 1199
1172 if (!capable(CAP_SYS_ADMIN)) 1200 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1173 return -EPERM; 1201 return -EPERM;
1202
1174 if (len < 0 || len > __NEW_UTS_LEN) 1203 if (len < 0 || len > __NEW_UTS_LEN)
1175 return -EINVAL; 1204 return -EINVAL;
1176 down_write(&uts_sem); 1205 down_write(&uts_sem);
@@ -1218,7 +1247,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1218 int errno; 1247 int errno;
1219 char tmp[__NEW_UTS_LEN]; 1248 char tmp[__NEW_UTS_LEN];
1220 1249
1221 if (!capable(CAP_SYS_ADMIN)) 1250 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1222 return -EPERM; 1251 return -EPERM;
1223 if (len < 0 || len > __NEW_UTS_LEN) 1252 if (len < 0 || len > __NEW_UTS_LEN)
1224 return -EINVAL; 1253 return -EINVAL;
@@ -1333,6 +1362,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
1333 rlim = tsk->signal->rlim + resource; 1362 rlim = tsk->signal->rlim + resource;
1334 task_lock(tsk->group_leader); 1363 task_lock(tsk->group_leader);
1335 if (new_rlim) { 1364 if (new_rlim) {
1365 /* Keep the capable check against init_user_ns until
1366 cgroups can contain all limits */
1336 if (new_rlim->rlim_max > rlim->rlim_max && 1367 if (new_rlim->rlim_max > rlim->rlim_max &&
1337 !capable(CAP_SYS_RESOURCE)) 1368 !capable(CAP_SYS_RESOURCE))
1338 retval = -EPERM; 1369 retval = -EPERM;
@@ -1376,18 +1407,22 @@ static int check_prlimit_permission(struct task_struct *task)
1376{ 1407{
1377 const struct cred *cred = current_cred(), *tcred; 1408 const struct cred *cred = current_cred(), *tcred;
1378 1409
1379 tcred = __task_cred(task); 1410 if (current == task)
1380 if ((cred->uid != tcred->euid || 1411 return 0;
1381 cred->uid != tcred->suid ||
1382 cred->uid != tcred->uid ||
1383 cred->gid != tcred->egid ||
1384 cred->gid != tcred->sgid ||
1385 cred->gid != tcred->gid) &&
1386 !capable(CAP_SYS_RESOURCE)) {
1387 return -EPERM;
1388 }
1389 1412
1390 return 0; 1413 tcred = __task_cred(task);
1414 if (cred->user->user_ns == tcred->user->user_ns &&
1415 (cred->uid == tcred->euid &&
1416 cred->uid == tcred->suid &&
1417 cred->uid == tcred->uid &&
1418 cred->gid == tcred->egid &&
1419 cred->gid == tcred->sgid &&
1420 cred->gid == tcred->gid))
1421 return 0;
1422 if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
1423 return 0;
1424
1425 return -EPERM;
1391} 1426}
1392 1427
1393SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, 1428SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
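
check_prlimit_permission() above now short-circuits for a task acting on itself, otherwise requires a full uid/gid match within the same user namespace, and finally falls back to CAP_SYS_RESOURCE checked against the target's namespace. From user space the gated operation is the prlimit(2) call; a trimmed, illustrative example (limit values arbitrary, requires a glibc that provides the prlimit() wrapper):

    #define _GNU_SOURCE
    #include <sys/resource.h>
    #include <sys/types.h>
    #include <stdio.h>

    /* Raise another process's RLIMIT_NOFILE; this lands in prlimit64 above. */
    static int raise_nofile(pid_t pid)
    {
            struct rlimit newlim = { .rlim_cur = 4096, .rlim_max = 4096 };
            struct rlimit oldlim;

            if (prlimit(pid, RLIMIT_NOFILE, &newlim, &oldlim) != 0) {
                    perror("prlimit");  /* EPERM when check_prlimit_permission() refuses */
                    return -1;
            }
            printf("previous soft limit: %llu\n",
                   (unsigned long long)oldlim.rlim_cur);
            return 0;
    }
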
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bad369ec5403..62cbc8877fef 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -46,10 +46,13 @@ cond_syscall(sys_getsockopt);
46cond_syscall(compat_sys_getsockopt); 46cond_syscall(compat_sys_getsockopt);
47cond_syscall(sys_shutdown); 47cond_syscall(sys_shutdown);
48cond_syscall(sys_sendmsg); 48cond_syscall(sys_sendmsg);
49cond_syscall(sys_sendmmsg);
49cond_syscall(compat_sys_sendmsg); 50cond_syscall(compat_sys_sendmsg);
51cond_syscall(compat_sys_sendmmsg);
50cond_syscall(sys_recvmsg); 52cond_syscall(sys_recvmsg);
51cond_syscall(sys_recvmmsg); 53cond_syscall(sys_recvmmsg);
52cond_syscall(compat_sys_recvmsg); 54cond_syscall(compat_sys_recvmsg);
55cond_syscall(compat_sys_recv);
53cond_syscall(compat_sys_recvfrom); 56cond_syscall(compat_sys_recvfrom);
54cond_syscall(compat_sys_recvmmsg); 57cond_syscall(compat_sys_recvmmsg);
55cond_syscall(sys_socketcall); 58cond_syscall(sys_socketcall);
@@ -68,15 +71,22 @@ cond_syscall(compat_sys_epoll_pwait);
68cond_syscall(sys_semget); 71cond_syscall(sys_semget);
69cond_syscall(sys_semop); 72cond_syscall(sys_semop);
70cond_syscall(sys_semtimedop); 73cond_syscall(sys_semtimedop);
74cond_syscall(compat_sys_semtimedop);
71cond_syscall(sys_semctl); 75cond_syscall(sys_semctl);
76cond_syscall(compat_sys_semctl);
72cond_syscall(sys_msgget); 77cond_syscall(sys_msgget);
73cond_syscall(sys_msgsnd); 78cond_syscall(sys_msgsnd);
79cond_syscall(compat_sys_msgsnd);
74cond_syscall(sys_msgrcv); 80cond_syscall(sys_msgrcv);
81cond_syscall(compat_sys_msgrcv);
75cond_syscall(sys_msgctl); 82cond_syscall(sys_msgctl);
83cond_syscall(compat_sys_msgctl);
76cond_syscall(sys_shmget); 84cond_syscall(sys_shmget);
77cond_syscall(sys_shmat); 85cond_syscall(sys_shmat);
86cond_syscall(compat_sys_shmat);
78cond_syscall(sys_shmdt); 87cond_syscall(sys_shmdt);
79cond_syscall(sys_shmctl); 88cond_syscall(sys_shmctl);
89cond_syscall(compat_sys_shmctl);
80cond_syscall(sys_mq_open); 90cond_syscall(sys_mq_open);
81cond_syscall(sys_mq_unlink); 91cond_syscall(sys_mq_unlink);
82cond_syscall(sys_mq_timedsend); 92cond_syscall(sys_mq_timedsend);
@@ -185,3 +195,8 @@ cond_syscall(sys_perf_event_open);
185/* fanotify! */ 195/* fanotify! */
186cond_syscall(sys_fanotify_init); 196cond_syscall(sys_fanotify_init);
187cond_syscall(sys_fanotify_mark); 197cond_syscall(sys_fanotify_mark);
198
199/* open by handle */
200cond_syscall(sys_name_to_handle_at);
201cond_syscall(sys_open_by_handle_at);
202cond_syscall(compat_sys_open_by_handle_at);
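
The cond_syscall() entries added here mark syscalls as optional: when the code implementing one is configured out, the symbol resolves to sys_ni_syscall(), which simply returns -ENOSYS. The weak-alias idea behind it, sketched in plain GCC C rather than the kernel's actual macro:

    #include <errno.h>

    long sys_ni_syscall(void)
    {
            return -ENOSYS;         /* generic "not implemented" return */
    }

    /*
     * Weak alias: if no strong definition of sys_name_to_handle_at is linked in,
     * calls to it quietly resolve to sys_ni_syscall instead.
     */
    long sys_name_to_handle_at(void)
            __attribute__((weak, alias("sys_ni_syscall")));
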
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3a45c224770f..f175d98bd355 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -24,6 +24,7 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/signal.h> 26#include <linux/signal.h>
27#include <linux/printk.h>
27#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
28#include <linux/security.h> 29#include <linux/security.h>
29#include <linux/ctype.h> 30#include <linux/ctype.h>
@@ -55,6 +56,7 @@
55#include <linux/kprobes.h> 56#include <linux/kprobes.h>
56#include <linux/pipe_fs_i.h> 57#include <linux/pipe_fs_i.h>
57#include <linux/oom.h> 58#include <linux/oom.h>
59#include <linux/kmod.h>
58 60
59#include <asm/uaccess.h> 61#include <asm/uaccess.h>
60#include <asm/processor.h> 62#include <asm/processor.h>
@@ -116,6 +118,7 @@ static int neg_one = -1;
116static int zero; 118static int zero;
117static int __maybe_unused one = 1; 119static int __maybe_unused one = 1;
118static int __maybe_unused two = 2; 120static int __maybe_unused two = 2;
121static int __maybe_unused three = 3;
119static unsigned long one_ul = 1; 122static unsigned long one_ul = 1;
120static int one_hundred = 100; 123static int one_hundred = 100;
121#ifdef CONFIG_PRINTK 124#ifdef CONFIG_PRINTK
@@ -161,8 +164,6 @@ extern int no_unaligned_warning;
161extern int unaligned_dump_stack; 164extern int unaligned_dump_stack;
162#endif 165#endif
163 166
164extern struct ratelimit_state printk_ratelimit_state;
165
166#ifdef CONFIG_PROC_SYSCTL 167#ifdef CONFIG_PROC_SYSCTL
167static int proc_do_cad_pid(struct ctl_table *table, int write, 168static int proc_do_cad_pid(struct ctl_table *table, int write,
168 void __user *buffer, size_t *lenp, loff_t *ppos); 169 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -170,8 +171,14 @@ static int proc_taint(struct ctl_table *table, int write,
170 void __user *buffer, size_t *lenp, loff_t *ppos); 171 void __user *buffer, size_t *lenp, loff_t *ppos);
171#endif 172#endif
172 173
174#ifdef CONFIG_PRINTK
175static int proc_dmesg_restrict(struct ctl_table *table, int write,
176 void __user *buffer, size_t *lenp, loff_t *ppos);
177#endif
178
173#ifdef CONFIG_MAGIC_SYSRQ 179#ifdef CONFIG_MAGIC_SYSRQ
174static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */ 180/* Note: sysrq code uses its own private copy */
181static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
175 182
176static int sysrq_sysctl_handler(ctl_table *table, int write, 183static int sysrq_sysctl_handler(ctl_table *table, int write,
177 void __user *buffer, size_t *lenp, 184 void __user *buffer, size_t *lenp,
@@ -194,9 +201,9 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
194static struct ctl_table root_table[]; 201static struct ctl_table root_table[];
195static struct ctl_table_root sysctl_table_root; 202static struct ctl_table_root sysctl_table_root;
196static struct ctl_table_header root_table_header = { 203static struct ctl_table_header root_table_header = {
197 .count = 1, 204 {{.count = 1,
198 .ctl_table = root_table, 205 .ctl_table = root_table,
199 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), 206 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
200 .root = &sysctl_table_root, 207 .root = &sysctl_table_root,
201 .set = &sysctl_table_root.default_set, 208 .set = &sysctl_table_root.default_set,
202}; 209};
@@ -247,10 +254,6 @@ static struct ctl_table root_table[] = {
247 .mode = 0555, 254 .mode = 0555,
248 .child = dev_table, 255 .child = dev_table,
249 }, 256 },
250/*
251 * NOTE: do not add new entries to this table unless you have read
252 * Documentation/sysctl/ctl_unnumbered.txt
253 */
254 { } 257 { }
255}; 258};
256 259
@@ -261,8 +264,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */
261static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 264static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
262static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 265static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
263static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 266static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
264static int min_sched_shares_ratelimit = 100000; /* 100 usec */
265static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
266#endif 267#endif
267 268
268#ifdef CONFIG_COMPACTION 269#ifdef CONFIG_COMPACTION
@@ -307,15 +308,6 @@ static struct ctl_table kern_table[] = {
307 .extra2 = &max_wakeup_granularity_ns, 308 .extra2 = &max_wakeup_granularity_ns,
308 }, 309 },
309 { 310 {
310 .procname = "sched_shares_ratelimit",
311 .data = &sysctl_sched_shares_ratelimit,
312 .maxlen = sizeof(unsigned int),
313 .mode = 0644,
314 .proc_handler = sched_proc_update_handler,
315 .extra1 = &min_sched_shares_ratelimit,
316 .extra2 = &max_sched_shares_ratelimit,
317 },
318 {
319 .procname = "sched_tunable_scaling", 311 .procname = "sched_tunable_scaling",
320 .data = &sysctl_sched_tunable_scaling, 312 .data = &sysctl_sched_tunable_scaling,
321 .maxlen = sizeof(enum sched_tunable_scaling), 313 .maxlen = sizeof(enum sched_tunable_scaling),
@@ -325,14 +317,6 @@ static struct ctl_table kern_table[] = {
325 .extra2 = &max_sched_tunable_scaling, 317 .extra2 = &max_sched_tunable_scaling,
326 }, 318 },
327 { 319 {
328 .procname = "sched_shares_thresh",
329 .data = &sysctl_sched_shares_thresh,
330 .maxlen = sizeof(unsigned int),
331 .mode = 0644,
332 .proc_handler = proc_dointvec_minmax,
333 .extra1 = &zero,
334 },
335 {
336 .procname = "sched_migration_cost", 320 .procname = "sched_migration_cost",
337 .data = &sysctl_sched_migration_cost, 321 .data = &sysctl_sched_migration_cost,
338 .maxlen = sizeof(unsigned int), 322 .maxlen = sizeof(unsigned int),
@@ -354,6 +338,13 @@ static struct ctl_table kern_table[] = {
354 .proc_handler = proc_dointvec, 338 .proc_handler = proc_dointvec,
355 }, 339 },
356 { 340 {
341 .procname = "sched_shares_window",
342 .data = &sysctl_sched_shares_window,
343 .maxlen = sizeof(unsigned int),
344 .mode = 0644,
345 .proc_handler = proc_dointvec,
346 },
347 {
357 .procname = "timer_migration", 348 .procname = "timer_migration",
358 .data = &sysctl_timer_migration, 349 .data = &sysctl_timer_migration,
359 .maxlen = sizeof(unsigned int), 350 .maxlen = sizeof(unsigned int),
@@ -377,13 +368,17 @@ static struct ctl_table kern_table[] = {
377 .mode = 0644, 368 .mode = 0644,
378 .proc_handler = sched_rt_handler, 369 .proc_handler = sched_rt_handler,
379 }, 370 },
371#ifdef CONFIG_SCHED_AUTOGROUP
380 { 372 {
381 .procname = "sched_compat_yield", 373 .procname = "sched_autogroup_enabled",
382 .data = &sysctl_sched_compat_yield, 374 .data = &sysctl_sched_autogroup_enabled,
383 .maxlen = sizeof(unsigned int), 375 .maxlen = sizeof(unsigned int),
384 .mode = 0644, 376 .mode = 0644,
385 .proc_handler = proc_dointvec, 377 .proc_handler = proc_dointvec_minmax,
378 .extra1 = &zero,
379 .extra2 = &one,
386 }, 380 },
381#endif
387#ifdef CONFIG_PROVE_LOCKING 382#ifdef CONFIG_PROVE_LOCKING
388 { 383 {
389 .procname = "prove_locking", 384 .procname = "prove_locking",
@@ -622,6 +617,11 @@ static struct ctl_table kern_table[] = {
622 .child = random_table, 617 .child = random_table,
623 }, 618 },
624 { 619 {
620 .procname = "usermodehelper",
621 .mode = 0555,
622 .child = usermodehelper_table,
623 },
624 {
625 .procname = "overflowuid", 625 .procname = "overflowuid",
626 .data = &overflowuid, 626 .data = &overflowuid,
627 .maxlen = sizeof(int), 627 .maxlen = sizeof(int),
@@ -704,6 +704,24 @@ static struct ctl_table kern_table[] = {
704 .extra1 = &zero, 704 .extra1 = &zero,
705 .extra2 = &ten_thousand, 705 .extra2 = &ten_thousand,
706 }, 706 },
707 {
708 .procname = "dmesg_restrict",
709 .data = &dmesg_restrict,
710 .maxlen = sizeof(int),
711 .mode = 0644,
712 .proc_handler = proc_dointvec_minmax,
713 .extra1 = &zero,
714 .extra2 = &one,
715 },
716 {
717 .procname = "kptr_restrict",
718 .data = &kptr_restrict,
719 .maxlen = sizeof(int),
720 .mode = 0644,
721 .proc_handler = proc_dmesg_restrict,
722 .extra1 = &zero,
723 .extra2 = &two,
724 },
707#endif 725#endif
708 { 726 {
709 .procname = "ngroups_max", 727 .procname = "ngroups_max",
@@ -718,14 +736,16 @@ static struct ctl_table kern_table[] = {
718 .data = &watchdog_enabled, 736 .data = &watchdog_enabled,
719 .maxlen = sizeof (int), 737 .maxlen = sizeof (int),
720 .mode = 0644, 738 .mode = 0644,
721 .proc_handler = proc_dowatchdog_enabled, 739 .proc_handler = proc_dowatchdog,
740 .extra1 = &zero,
741 .extra2 = &one,
722 }, 742 },
723 { 743 {
724 .procname = "watchdog_thresh", 744 .procname = "watchdog_thresh",
725 .data = &softlockup_thresh, 745 .data = &watchdog_thresh,
726 .maxlen = sizeof(int), 746 .maxlen = sizeof(int),
727 .mode = 0644, 747 .mode = 0644,
728 .proc_handler = proc_dowatchdog_thresh, 748 .proc_handler = proc_dowatchdog,
729 .extra1 = &neg_one, 749 .extra1 = &neg_one,
730 .extra2 = &sixty, 750 .extra2 = &sixty,
731 }, 751 },
@@ -738,21 +758,23 @@ static struct ctl_table kern_table[] = {
738 .extra1 = &zero, 758 .extra1 = &zero,
739 .extra2 = &one, 759 .extra2 = &one,
740 }, 760 },
741#endif
742#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
743 { 761 {
744 .procname = "unknown_nmi_panic", 762 .procname = "nmi_watchdog",
745 .data = &unknown_nmi_panic, 763 .data = &watchdog_enabled,
746 .maxlen = sizeof (int), 764 .maxlen = sizeof (int),
747 .mode = 0644, 765 .mode = 0644,
748 .proc_handler = proc_dointvec, 766 .proc_handler = proc_dowatchdog,
767 .extra1 = &zero,
768 .extra2 = &one,
749 }, 769 },
770#endif
771#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
750 { 772 {
751 .procname = "nmi_watchdog", 773 .procname = "unknown_nmi_panic",
752 .data = &nmi_watchdog_enabled, 774 .data = &unknown_nmi_panic,
753 .maxlen = sizeof (int), 775 .maxlen = sizeof (int),
754 .mode = 0644, 776 .mode = 0644,
755 .proc_handler = proc_nmi_enabled, 777 .proc_handler = proc_dointvec,
756 }, 778 },
757#endif 779#endif
758#if defined(CONFIG_X86) 780#if defined(CONFIG_X86)
@@ -916,6 +938,12 @@ static struct ctl_table kern_table[] = {
916 }, 938 },
917#endif 939#endif
918#ifdef CONFIG_PERF_EVENTS 940#ifdef CONFIG_PERF_EVENTS
941 /*
942 * User-space scripts rely on the existence of this file
943 * as a feature check for perf_events being enabled.
944 *
945 * So it's an ABI, do not remove!
946 */
919 { 947 {
920 .procname = "perf_event_paranoid", 948 .procname = "perf_event_paranoid",
921 .data = &sysctl_perf_event_paranoid, 949 .data = &sysctl_perf_event_paranoid,
@@ -935,7 +963,7 @@ static struct ctl_table kern_table[] = {
935 .data = &sysctl_perf_event_sample_rate, 963 .data = &sysctl_perf_event_sample_rate,
936 .maxlen = sizeof(sysctl_perf_event_sample_rate), 964 .maxlen = sizeof(sysctl_perf_event_sample_rate),
937 .mode = 0644, 965 .mode = 0644,
938 .proc_handler = proc_dointvec, 966 .proc_handler = perf_proc_update_handler,
939 }, 967 },
940#endif 968#endif
941#ifdef CONFIG_KMEMCHECK 969#ifdef CONFIG_KMEMCHECK
@@ -956,10 +984,6 @@ static struct ctl_table kern_table[] = {
956 .proc_handler = proc_dointvec, 984 .proc_handler = proc_dointvec,
957 }, 985 },
958#endif 986#endif
959/*
960 * NOTE: do not add new entries to this table unless you have read
961 * Documentation/sysctl/ctl_unnumbered.txt
962 */
963 { } 987 { }
964}; 988};
965 989
@@ -969,14 +993,18 @@ static struct ctl_table vm_table[] = {
969 .data = &sysctl_overcommit_memory, 993 .data = &sysctl_overcommit_memory,
970 .maxlen = sizeof(sysctl_overcommit_memory), 994 .maxlen = sizeof(sysctl_overcommit_memory),
971 .mode = 0644, 995 .mode = 0644,
972 .proc_handler = proc_dointvec, 996 .proc_handler = proc_dointvec_minmax,
997 .extra1 = &zero,
998 .extra2 = &two,
973 }, 999 },
974 { 1000 {
975 .procname = "panic_on_oom", 1001 .procname = "panic_on_oom",
976 .data = &sysctl_panic_on_oom, 1002 .data = &sysctl_panic_on_oom,
977 .maxlen = sizeof(sysctl_panic_on_oom), 1003 .maxlen = sizeof(sysctl_panic_on_oom),
978 .mode = 0644, 1004 .mode = 0644,
979 .proc_handler = proc_dointvec, 1005 .proc_handler = proc_dointvec_minmax,
1006 .extra1 = &zero,
1007 .extra2 = &two,
980 }, 1008 },
981 { 1009 {
982 .procname = "oom_kill_allocating_task", 1010 .procname = "oom_kill_allocating_task",
@@ -1004,7 +1032,8 @@ static struct ctl_table vm_table[] = {
1004 .data = &page_cluster, 1032 .data = &page_cluster,
1005 .maxlen = sizeof(int), 1033 .maxlen = sizeof(int),
1006 .mode = 0644, 1034 .mode = 0644,
1007 .proc_handler = proc_dointvec, 1035 .proc_handler = proc_dointvec_minmax,
1036 .extra1 = &zero,
1008 }, 1037 },
1009 { 1038 {
1010 .procname = "dirty_background_ratio", 1039 .procname = "dirty_background_ratio",
@@ -1052,7 +1081,8 @@ static struct ctl_table vm_table[] = {
1052 .data = &dirty_expire_interval, 1081 .data = &dirty_expire_interval,
1053 .maxlen = sizeof(dirty_expire_interval), 1082 .maxlen = sizeof(dirty_expire_interval),
1054 .mode = 0644, 1083 .mode = 0644,
1055 .proc_handler = proc_dointvec, 1084 .proc_handler = proc_dointvec_minmax,
1085 .extra1 = &zero,
1056 }, 1086 },
1057 { 1087 {
1058 .procname = "nr_pdflush_threads", 1088 .procname = "nr_pdflush_threads",
@@ -1128,6 +1158,8 @@ static struct ctl_table vm_table[] = {
1128 .maxlen = sizeof(int), 1158 .maxlen = sizeof(int),
1129 .mode = 0644, 1159 .mode = 0644,
1130 .proc_handler = drop_caches_sysctl_handler, 1160 .proc_handler = drop_caches_sysctl_handler,
1161 .extra1 = &one,
1162 .extra2 = &three,
1131 }, 1163 },
1132#ifdef CONFIG_COMPACTION 1164#ifdef CONFIG_COMPACTION
1133 { 1165 {
@@ -1320,11 +1352,6 @@ static struct ctl_table vm_table[] = {
1320 .extra2 = &one, 1352 .extra2 = &one,
1321 }, 1353 },
1322#endif 1354#endif
1323
1324/*
1325 * NOTE: do not add new entries to this table unless you have read
1326 * Documentation/sysctl/ctl_unnumbered.txt
1327 */
1328 { } 1355 { }
1329}; 1356};
1330 1357
@@ -1340,28 +1367,28 @@ static struct ctl_table fs_table[] = {
1340 .data = &inodes_stat, 1367 .data = &inodes_stat,
1341 .maxlen = 2*sizeof(int), 1368 .maxlen = 2*sizeof(int),
1342 .mode = 0444, 1369 .mode = 0444,
1343 .proc_handler = proc_dointvec, 1370 .proc_handler = proc_nr_inodes,
1344 }, 1371 },
1345 { 1372 {
1346 .procname = "inode-state", 1373 .procname = "inode-state",
1347 .data = &inodes_stat, 1374 .data = &inodes_stat,
1348 .maxlen = 7*sizeof(int), 1375 .maxlen = 7*sizeof(int),
1349 .mode = 0444, 1376 .mode = 0444,
1350 .proc_handler = proc_dointvec, 1377 .proc_handler = proc_nr_inodes,
1351 }, 1378 },
1352 { 1379 {
1353 .procname = "file-nr", 1380 .procname = "file-nr",
1354 .data = &files_stat, 1381 .data = &files_stat,
1355 .maxlen = 3*sizeof(int), 1382 .maxlen = sizeof(files_stat),
1356 .mode = 0444, 1383 .mode = 0444,
1357 .proc_handler = proc_nr_files, 1384 .proc_handler = proc_nr_files,
1358 }, 1385 },
1359 { 1386 {
1360 .procname = "file-max", 1387 .procname = "file-max",
1361 .data = &files_stat.max_files, 1388 .data = &files_stat.max_files,
1362 .maxlen = sizeof(int), 1389 .maxlen = sizeof(files_stat.max_files),
1363 .mode = 0644, 1390 .mode = 0644,
1364 .proc_handler = proc_dointvec, 1391 .proc_handler = proc_doulongvec_minmax,
1365 }, 1392 },
1366 { 1393 {
1367 .procname = "nr_open", 1394 .procname = "nr_open",
@@ -1377,7 +1404,7 @@ static struct ctl_table fs_table[] = {
1377 .data = &dentry_stat, 1404 .data = &dentry_stat,
1378 .maxlen = 6*sizeof(int), 1405 .maxlen = 6*sizeof(int),
1379 .mode = 0444, 1406 .mode = 0444,
1380 .proc_handler = proc_dointvec, 1407 .proc_handler = proc_nr_dentry,
1381 }, 1408 },
1382 { 1409 {
1383 .procname = "overflowuid", 1410 .procname = "overflowuid",
@@ -1480,16 +1507,12 @@ static struct ctl_table fs_table[] = {
1480 .proc_handler = &pipe_proc_fn, 1507 .proc_handler = &pipe_proc_fn,
1481 .extra1 = &pipe_min_size, 1508 .extra1 = &pipe_min_size,
1482 }, 1509 },
1483/*
1484 * NOTE: do not add new entries to this table unless you have read
1485 * Documentation/sysctl/ctl_unnumbered.txt
1486 */
1487 { } 1510 { }
1488}; 1511};
1489 1512
1490static struct ctl_table debug_table[] = { 1513static struct ctl_table debug_table[] = {
1491#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ 1514#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
1492 defined(CONFIG_S390) 1515 defined(CONFIG_S390) || defined(CONFIG_TILE)
1493 { 1516 {
1494 .procname = "exception-trace", 1517 .procname = "exception-trace",
1495 .data = &show_unhandled_signals, 1518 .data = &show_unhandled_signals,
@@ -1567,11 +1590,16 @@ void sysctl_head_get(struct ctl_table_header *head)
1567 spin_unlock(&sysctl_lock); 1590 spin_unlock(&sysctl_lock);
1568} 1591}
1569 1592
1593static void free_head(struct rcu_head *rcu)
1594{
1595 kfree(container_of(rcu, struct ctl_table_header, rcu));
1596}
1597
1570void sysctl_head_put(struct ctl_table_header *head) 1598void sysctl_head_put(struct ctl_table_header *head)
1571{ 1599{
1572 spin_lock(&sysctl_lock); 1600 spin_lock(&sysctl_lock);
1573 if (!--head->count) 1601 if (!--head->count)
1574 kfree(head); 1602 call_rcu(&head->rcu, free_head);
1575 spin_unlock(&sysctl_lock); 1603 spin_unlock(&sysctl_lock);
1576} 1604}
1577 1605
@@ -1685,13 +1713,8 @@ static int test_perm(int mode, int op)
1685 1713
1686int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) 1714int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1687{ 1715{
1688 int error;
1689 int mode; 1716 int mode;
1690 1717
1691 error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
1692 if (error)
1693 return error;
1694
1695 if (root->permissions) 1718 if (root->permissions)
1696 mode = root->permissions(root, current->nsproxy, table); 1719 mode = root->permissions(root, current->nsproxy, table);
1697 else 1720 else
@@ -1948,10 +1971,10 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1948 start_unregistering(header); 1971 start_unregistering(header);
1949 if (!--header->parent->count) { 1972 if (!--header->parent->count) {
1950 WARN_ON(1); 1973 WARN_ON(1);
1951 kfree(header->parent); 1974 call_rcu(&header->parent->rcu, free_head);
1952 } 1975 }
1953 if (!--header->count) 1976 if (!--header->count)
1954 kfree(header); 1977 call_rcu(&header->rcu, free_head);
1955 spin_unlock(&sysctl_lock); 1978 spin_unlock(&sysctl_lock);
1956} 1979}
1957 1980
@@ -2392,6 +2415,17 @@ static int proc_taint(struct ctl_table *table, int write,
2392 return err; 2415 return err;
2393} 2416}
2394 2417
2418#ifdef CONFIG_PRINTK
2419static int proc_dmesg_restrict(struct ctl_table *table, int write,
2420 void __user *buffer, size_t *lenp, loff_t *ppos)
2421{
2422 if (write && !capable(CAP_SYS_ADMIN))
2423 return -EPERM;
2424
2425 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2426}
2427#endif
2428
2395struct do_proc_dointvec_minmax_conv_param { 2429struct do_proc_dointvec_minmax_conv_param {
2396 int *min; 2430 int *min;
2397 int *max; 2431 int *max;
@@ -2893,7 +2927,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2893 } 2927 }
2894} 2928}
2895 2929
2896#else /* CONFIG_PROC_FS */ 2930#else /* CONFIG_PROC_SYSCTL */
2897 2931
2898int proc_dostring(struct ctl_table *table, int write, 2932int proc_dostring(struct ctl_table *table, int write,
2899 void __user *buffer, size_t *lenp, loff_t *ppos) 2933 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2945,7 +2979,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2945} 2979}
2946 2980
2947 2981
2948#endif /* CONFIG_PROC_FS */ 2982#endif /* CONFIG_PROC_SYSCTL */
2949 2983
2950/* 2984/*
2951 * No sense putting this after each symbol definition, twice, 2985 * No sense putting this after each symbol definition, twice,
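
sysctl_head_put() and unregister_sysctl_table() above stop calling kfree() directly and instead defer it through call_rcu(), so lockless readers still walking a ctl_table_header under rcu_read_lock() can never touch freed memory. The general pattern, sketched with a made-up structure:

    #include <linux/kernel.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct item {
            int value;
            struct rcu_head rcu;            /* embedded deferred-free handle */
    };

    static void item_free_rcu(struct rcu_head *rcu)
    {
            kfree(container_of(rcu, struct item, rcu));
    }

    static void item_release(struct item *it)
    {
            /* kfree() runs only after all pre-existing RCU readers have finished. */
            call_rcu(&it->rcu, item_free_rcu);
    }
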
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 1357c5786064..3b8e028b9601 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -136,7 +136,6 @@ static const struct bin_table bin_kern_table[] = {
136 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, 136 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, 137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, 138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
139 { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" },
140 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, 139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
141 {} 140 {}
142}; 141};
@@ -1193,7 +1192,7 @@ static ssize_t bin_dn_node_address(struct file *file,
1193 1192
1194 buf[result] = '\0'; 1193 buf[result] = '\0';
1195 1194
1196 /* Convert the decnet addresss to binary */ 1195 /* Convert the decnet address to binary */
1197 result = -EIO; 1196 result = -EIO;
1198 nodep = strchr(buf, '.') + 1; 1197 nodep = strchr(buf, '.') + 1;
1199 if (!nodep) 1198 if (!nodep)
@@ -1322,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1322 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1321 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1323{ 1322{
1324 const struct bin_table *table = NULL; 1323 const struct bin_table *table = NULL;
1325 struct nameidata nd;
1326 struct vfsmount *mnt; 1324 struct vfsmount *mnt;
1327 struct file *file; 1325 struct file *file;
1328 ssize_t result; 1326 ssize_t result;
1329 char *pathname; 1327 char *pathname;
1330 int flags; 1328 int flags;
1331 int acc_mode;
1332 1329
1333 pathname = sysctl_getname(name, nlen, &table); 1330 pathname = sysctl_getname(name, nlen, &table);
1334 result = PTR_ERR(pathname); 1331 result = PTR_ERR(pathname);
@@ -1338,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1338 /* How should the sysctl be accessed? */ 1335 /* How should the sysctl be accessed? */
1339 if (oldval && oldlen && newval && newlen) { 1336 if (oldval && oldlen && newval && newlen) {
1340 flags = O_RDWR; 1337 flags = O_RDWR;
1341 acc_mode = MAY_READ | MAY_WRITE;
1342 } else if (newval && newlen) { 1338 } else if (newval && newlen) {
1343 flags = O_WRONLY; 1339 flags = O_WRONLY;
1344 acc_mode = MAY_WRITE;
1345 } else if (oldval && oldlen) { 1340 } else if (oldval && oldlen) {
1346 flags = O_RDONLY; 1341 flags = O_RDONLY;
1347 acc_mode = MAY_READ;
1348 } else { 1342 } else {
1349 result = 0; 1343 result = 0;
1350 goto out_putname; 1344 goto out_putname;
1351 } 1345 }
1352 1346
1353 mnt = current->nsproxy->pid_ns->proc_mnt; 1347 mnt = current->nsproxy->pid_ns->proc_mnt;
1354 result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); 1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
1355 if (result)
1356 goto out_putname;
1357
1358 result = may_open(&nd.path, acc_mode, flags);
1359 if (result)
1360 goto out_putpath;
1361
1362 file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
1363 result = PTR_ERR(file); 1349 result = PTR_ERR(file);
1364 if (IS_ERR(file)) 1350 if (IS_ERR(file))
1365 goto out_putname; 1351 goto out_putname;
@@ -1371,10 +1357,6 @@ out_putname:
1371 putname(pathname); 1357 putname(pathname);
1372out: 1358out:
1373 return result; 1359 return result;
1374
1375out_putpath:
1376 path_put(&nd.path);
1377 goto out_putname;
1378} 1360}
1379 1361
1380 1362
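
The binary_sysctl() rewrite above folds the manual vfs_path_lookup() + may_open() + dentry_open() sequence into a single file_open_root() call, which resolves the path relative to the given root dentry/vfsmount and applies the access checks itself. A condensed sketch of the resulting control flow, based only on the call as it appears in this hunk:

    #include <linux/fs.h>
    #include <linux/file.h>
    #include <linux/err.h>

    static ssize_t open_under_mount(struct vfsmount *mnt, const char *pathname,
                                    int flags)
    {
            struct file *file;

            /* One call replaces the lookup, the permission check and the open. */
            file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
            if (IS_ERR(file))
                    return PTR_ERR(file);

            /* ... read from or write to @file here ... */

            fput(file);
            return 0;
    }
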
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 10b90d8a03c4..4e4932a7b360 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
111 const char *fail = NULL; 111 const char *fail = NULL;
112 112
113 if (table->parent) { 113 if (table->parent) {
114 if (table->procname && !table->parent->procname) 114 if (!table->parent->procname)
115 set_fail(&fail, table, "Parent without procname"); 115 set_fail(&fail, table, "Parent without procname");
116 } 116 }
117 if (!table->procname)
118 set_fail(&fail, table, "No procname");
119 if (table->child) { 117 if (table->child) {
120 if (table->data) 118 if (table->data)
121 set_fail(&fail, table, "Directory with data?"); 119 set_fail(&fail, table, "Directory with data?");
@@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
144 set_fail(&fail, table, "No maxlen"); 142 set_fail(&fail, table, "No maxlen");
145 } 143 }
146#ifdef CONFIG_PROC_SYSCTL 144#ifdef CONFIG_PROC_SYSCTL
147 if (table->procname && !table->proc_handler) 145 if (!table->proc_handler)
148 set_fail(&fail, table, "No proc_handler"); 146 set_fail(&fail, table, "No proc_handler");
149#endif 147#endif
150#if 0
151 if (!table->procname && table->proc_handler)
152 set_fail(&fail, table, "proc_handler without procname");
153#endif
154 sysctl_check_leaf(namespaces, table, &fail); 148 sysctl_check_leaf(namespaces, table, &fail);
155 } 149 }
156 if (table->mode > 0777) 150 if (table->mode > 0777)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 11281d5792bd..fc0f22005417 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -89,8 +89,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
89 return -ENOMEM; 89 return -ENOMEM;
90 90
91 if (!info) { 91 if (!info) {
92 int seq = get_cpu_var(taskstats_seqnum)++; 92 int seq = this_cpu_inc_return(taskstats_seqnum) - 1;
93 put_cpu_var(taskstats_seqnum);
94 93
95 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 94 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
96 } else 95 } else
@@ -175,22 +174,8 @@ static void send_cpu_listeners(struct sk_buff *skb,
175 up_write(&listeners->sem); 174 up_write(&listeners->sem);
176} 175}
177 176
178static int fill_pid(pid_t pid, struct task_struct *tsk, 177static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
179 struct taskstats *stats)
180{ 178{
181 int rc = 0;
182
183 if (!tsk) {
184 rcu_read_lock();
185 tsk = find_task_by_vpid(pid);
186 if (tsk)
187 get_task_struct(tsk);
188 rcu_read_unlock();
189 if (!tsk)
190 return -ESRCH;
191 } else
192 get_task_struct(tsk);
193
194 memset(stats, 0, sizeof(*stats)); 179 memset(stats, 0, sizeof(*stats));
195 /* 180 /*
196 * Each accounting subsystem adds calls to its functions to 181 * Each accounting subsystem adds calls to its functions to
@@ -209,17 +194,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
209 194
210 /* fill in extended acct fields */ 195 /* fill in extended acct fields */
211 xacct_add_tsk(stats, tsk); 196 xacct_add_tsk(stats, tsk);
197}
212 198
213 /* Define err: label here if needed */ 199static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
214 put_task_struct(tsk); 200{
215 return rc; 201 struct task_struct *tsk;
216 202
203 rcu_read_lock();
204 tsk = find_task_by_vpid(pid);
205 if (tsk)
206 get_task_struct(tsk);
207 rcu_read_unlock();
208 if (!tsk)
209 return -ESRCH;
210 fill_stats(tsk, stats);
211 put_task_struct(tsk);
212 return 0;
217} 213}
218 214
219static int fill_tgid(pid_t tgid, struct task_struct *first, 215static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
220 struct taskstats *stats)
221{ 216{
222 struct task_struct *tsk; 217 struct task_struct *tsk, *first;
223 unsigned long flags; 218 unsigned long flags;
224 int rc = -ESRCH; 219 int rc = -ESRCH;
225 220
@@ -228,8 +223,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
228 * leaders who are already counted with the dead tasks 223 * leaders who are already counted with the dead tasks
229 */ 224 */
230 rcu_read_lock(); 225 rcu_read_lock();
231 if (!first) 226 first = find_task_by_vpid(tgid);
232 first = find_task_by_vpid(tgid);
233 227
234 if (!first || !lock_task_sighand(first, &flags)) 228 if (!first || !lock_task_sighand(first, &flags))
235 goto out; 229 goto out;
@@ -268,7 +262,6 @@ out:
268 return rc; 262 return rc;
269} 263}
270 264
271
272static void fill_tgid_exit(struct task_struct *tsk) 265static void fill_tgid_exit(struct task_struct *tsk)
273{ 266{
274 unsigned long flags; 267 unsigned long flags;
@@ -292,16 +285,18 @@ ret:
292static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 285static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
293{ 286{
294 struct listener_list *listeners; 287 struct listener_list *listeners;
295 struct listener *s, *tmp; 288 struct listener *s, *tmp, *s2;
296 unsigned int cpu; 289 unsigned int cpu;
297 290
298 if (!cpumask_subset(mask, cpu_possible_mask)) 291 if (!cpumask_subset(mask, cpu_possible_mask))
299 return -EINVAL; 292 return -EINVAL;
300 293
294 s = NULL;
301 if (isadd == REGISTER) { 295 if (isadd == REGISTER) {
302 for_each_cpu(cpu, mask) { 296 for_each_cpu(cpu, mask) {
303 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 297 if (!s)
304 cpu_to_node(cpu)); 298 s = kmalloc_node(sizeof(struct listener),
299 GFP_KERNEL, cpu_to_node(cpu));
305 if (!s) 300 if (!s)
306 goto cleanup; 301 goto cleanup;
307 s->pid = pid; 302 s->pid = pid;
@@ -310,9 +305,16 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
310 305
311 listeners = &per_cpu(listener_array, cpu); 306 listeners = &per_cpu(listener_array, cpu);
312 down_write(&listeners->sem); 307 down_write(&listeners->sem);
308 list_for_each_entry_safe(s2, tmp, &listeners->list, list) {
309 if (s2->pid == pid)
310 goto next_cpu;
311 }
313 list_add(&s->list, &listeners->list); 312 list_add(&s->list, &listeners->list);
313 s = NULL;
314next_cpu:
314 up_write(&listeners->sem); 315 up_write(&listeners->sem);
315 } 316 }
317 kfree(s);
316 return 0; 318 return 0;
317 } 319 }
318 320
@@ -355,6 +357,10 @@ static int parse(struct nlattr *na, struct cpumask *mask)
355 return ret; 357 return ret;
356} 358}
357 359
360#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
361#define TASKSTATS_NEEDS_PADDING 1
362#endif
363
358static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 364static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
359{ 365{
360 struct nlattr *na, *ret; 366 struct nlattr *na, *ret;
@@ -364,9 +370,33 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
364 ? TASKSTATS_TYPE_AGGR_PID 370 ? TASKSTATS_TYPE_AGGR_PID
365 : TASKSTATS_TYPE_AGGR_TGID; 371 : TASKSTATS_TYPE_AGGR_TGID;
366 372
373 /*
374 * The taskstats structure is internally aligned on 8 byte
 375 * boundaries but the layout of the aggregate reply, with
376 * two NLA headers and the pid (each 4 bytes), actually
377 * force the entire structure to be unaligned. This causes
378 * the kernel to issue unaligned access warnings on some
379 * architectures like ia64. Unfortunately, some software out there
380 * doesn't properly unroll the NLA packet and assumes that the start
381 * of the taskstats structure will always be 20 bytes from the start
382 * of the netlink payload. Aligning the start of the taskstats
383 * structure breaks this software, which we don't want. So, for now
384 * the alignment only happens on architectures that require it
385 * and those users will have to update to fixed versions of those
386 * packages. Space is reserved in the packet only when needed.
387 * This ifdef should be removed in several years e.g. 2012 once
388 * we can be confident that fixed versions are installed on most
389 * systems. We add the padding before the aggregate since the
390 * aggregate is already a defined type.
391 */
392#ifdef TASKSTATS_NEEDS_PADDING
393 if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0)
394 goto err;
395#endif
367 na = nla_nest_start(skb, aggr); 396 na = nla_nest_start(skb, aggr);
368 if (!na) 397 if (!na)
369 goto err; 398 goto err;
399
370 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 400 if (nla_put(skb, type, sizeof(pid), &pid) < 0)
371 goto err; 401 goto err;
372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 402 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
@@ -424,74 +454,122 @@ err:
424 return rc; 454 return rc;
425} 455}
426 456
427static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 457static int cmd_attr_register_cpumask(struct genl_info *info)
428{ 458{
429 int rc;
430 struct sk_buff *rep_skb;
431 struct taskstats *stats;
432 size_t size;
433 cpumask_var_t mask; 459 cpumask_var_t mask;
460 int rc;
434 461
435 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 462 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
436 return -ENOMEM; 463 return -ENOMEM;
437
438 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 464 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
439 if (rc < 0) 465 if (rc < 0)
440 goto free_return_rc; 466 goto out;
441 if (rc == 0) { 467 rc = add_del_listener(info->snd_pid, mask, REGISTER);
442 rc = add_del_listener(info->snd_pid, mask, REGISTER); 468out:
443 goto free_return_rc; 469 free_cpumask_var(mask);
444 } 470 return rc;
471}
445 472
473static int cmd_attr_deregister_cpumask(struct genl_info *info)
474{
475 cpumask_var_t mask;
476 int rc;
477
478 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
479 return -ENOMEM;
446 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 480 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
447 if (rc < 0) 481 if (rc < 0)
448 goto free_return_rc; 482 goto out;
449 if (rc == 0) { 483 rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
450 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 484out:
451free_return_rc:
452 free_cpumask_var(mask);
453 return rc;
454 }
455 free_cpumask_var(mask); 485 free_cpumask_var(mask);
486 return rc;
487}
488
489static size_t taskstats_packet_size(void)
490{
491 size_t size;
456 492
457 /*
458 * Size includes space for nested attributes
459 */
460 size = nla_total_size(sizeof(u32)) + 493 size = nla_total_size(sizeof(u32)) +
461 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 494 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
495#ifdef TASKSTATS_NEEDS_PADDING
496 size += nla_total_size(0); /* Padding for alignment */
497#endif
498 return size;
499}
500
501static int cmd_attr_pid(struct genl_info *info)
502{
503 struct taskstats *stats;
504 struct sk_buff *rep_skb;
505 size_t size;
506 u32 pid;
507 int rc;
508
509 size = taskstats_packet_size();
462 510
463 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 511 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
464 if (rc < 0) 512 if (rc < 0)
465 return rc; 513 return rc;
466 514
467 rc = -EINVAL; 515 rc = -EINVAL;
468 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 516 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
469 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 517 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
470 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 518 if (!stats)
471 if (!stats) 519 goto err;
472 goto err; 520
473 521 rc = fill_stats_for_pid(pid, stats);
474 rc = fill_pid(pid, NULL, stats); 522 if (rc < 0)
475 if (rc < 0) 523 goto err;
476 goto err; 524 return send_reply(rep_skb, info);
477 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 525err:
478 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 526 nlmsg_free(rep_skb);
479 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 527 return rc;
480 if (!stats) 528}
481 goto err; 529
482 530static int cmd_attr_tgid(struct genl_info *info)
483 rc = fill_tgid(tgid, NULL, stats); 531{
484 if (rc < 0) 532 struct taskstats *stats;
485 goto err; 533 struct sk_buff *rep_skb;
486 } else 534 size_t size;
535 u32 tgid;
536 int rc;
537
538 size = taskstats_packet_size();
539
540 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
541 if (rc < 0)
542 return rc;
543
544 rc = -EINVAL;
545 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
546 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
547 if (!stats)
487 goto err; 548 goto err;
488 549
550 rc = fill_stats_for_tgid(tgid, stats);
551 if (rc < 0)
552 goto err;
489 return send_reply(rep_skb, info); 553 return send_reply(rep_skb, info);
490err: 554err:
491 nlmsg_free(rep_skb); 555 nlmsg_free(rep_skb);
492 return rc; 556 return rc;
493} 557}
494 558
559static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
560{
561 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK])
562 return cmd_attr_register_cpumask(info);
563 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK])
564 return cmd_attr_deregister_cpumask(info);
565 else if (info->attrs[TASKSTATS_CMD_ATTR_PID])
566 return cmd_attr_pid(info);
567 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID])
568 return cmd_attr_tgid(info);
569 else
570 return -EINVAL;
571}
572
495static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 573static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
496{ 574{
497 struct signal_struct *sig = tsk->signal; 575 struct signal_struct *sig = tsk->signal;
@@ -532,8 +610,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
532 /* 610 /*
533 * Size includes space for nested attributes 611 * Size includes space for nested attributes
534 */ 612 */
535 size = nla_total_size(sizeof(u32)) + 613 size = taskstats_packet_size();
536 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
537 614
538 is_thread_group = !!taskstats_tgid_alloc(tsk); 615 is_thread_group = !!taskstats_tgid_alloc(tsk);
539 if (is_thread_group) { 616 if (is_thread_group) {
@@ -543,7 +620,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
543 fill_tgid_exit(tsk); 620 fill_tgid_exit(tsk);
544 } 621 }
545 622
546 listeners = &__raw_get_cpu_var(listener_array); 623 listeners = __this_cpu_ptr(&listener_array);
547 if (list_empty(&listeners->list)) 624 if (list_empty(&listeners->list))
548 return; 625 return;
549 626
@@ -555,9 +632,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
555 if (!stats) 632 if (!stats)
556 goto err; 633 goto err;
557 634
558 rc = fill_pid(-1, tsk, stats); 635 fill_stats(tsk, stats);
559 if (rc < 0)
560 goto err;
561 636
562 /* 637 /*
563 * Doesn't matter if tsk is the leader or the last group member leaving 638 * Doesn't matter if tsk is the leader or the last group member leaving
@@ -619,7 +694,7 @@ static int __init taskstats_init(void)
619 goto err_cgroup_ops; 694 goto err_cgroup_ops;
620 695
621 family_registered = 1; 696 family_registered = 1;
622 printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 697 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
623 return 0; 698 return 0;
624err_cgroup_ops: 699err_cgroup_ops:
625 genl_unregister_ops(&family, &taskstats_ops); 700 genl_unregister_ops(&family, &taskstats_ops);
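For reference, the new taskstats_packet_size() helper above simply sums the aligned netlink attribute sizes for the pid/tgid attribute, the taskstats payload and the nested container. A minimal userspace sketch of that arithmetic, assuming the sanitized kernel headers <linux/netlink.h> and <linux/taskstats.h> and omitting the TASKSTATS_NEEDS_PADDING case:

/*
 * Mirror of the kernel's nla_total_size(): attribute header plus payload,
 * rounded up to the 4-byte netlink attribute alignment.
 */
#include <stdio.h>
#include <stdint.h>
#include <linux/netlink.h>	/* NLA_HDRLEN, NLA_ALIGN */
#include <linux/taskstats.h>	/* struct taskstats */

static size_t nla_total_size(size_t payload)
{
	return NLA_ALIGN(NLA_HDRLEN + payload);
}

int main(void)
{
	/* u32 pid/tgid attribute + taskstats payload + nested container */
	size_t size = nla_total_size(sizeof(uint32_t)) +
		      nla_total_size(sizeof(struct taskstats)) +
		      nla_total_size(0);

	printf("taskstats reply needs %zu bytes of attribute space\n", size);
	return 0;
}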
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 4f104515a19b..f8b11a283171 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -115,7 +115,9 @@ static int test_kprobes(void)
115 int ret; 115 int ret;
116 struct kprobe *kps[2] = {&kp, &kp2}; 116 struct kprobe *kps[2] = {&kp, &kp2};
117 117
118 kp.addr = 0; /* addr should be cleared for reusing kprobe. */ 118 /* addr and flags should be cleared for reusing kprobe. */
119 kp.addr = NULL;
120 kp.flags = 0;
119 ret = register_kprobes(kps, 2); 121 ret = register_kprobes(kps, 2);
120 if (ret < 0) { 122 if (ret < 0) {
121 printk(KERN_ERR "Kprobe smoke test failed: " 123 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -210,7 +212,9 @@ static int test_jprobes(void)
210 int ret; 212 int ret;
211 struct jprobe *jps[2] = {&jp, &jp2}; 213 struct jprobe *jps[2] = {&jp, &jp2};
212 214
213 jp.kp.addr = 0; /* addr should be cleared for reusing kprobe. */ 215 /* addr and flags should be cleared for reusing kprobe. */
216 jp.kp.addr = NULL;
217 jp.kp.flags = 0;
214 ret = register_jprobes(jps, 2); 218 ret = register_jprobes(jps, 2);
215 if (ret < 0) { 219 if (ret < 0) {
216 printk(KERN_ERR "Kprobe smoke test failed: " 220 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -323,7 +327,9 @@ static int test_kretprobes(void)
323 int ret; 327 int ret;
324 struct kretprobe *rps[2] = {&rp, &rp2}; 328 struct kretprobe *rps[2] = {&rp, &rp2};
325 329
326 rp.kp.addr = 0; /* addr should be cleared for reusing kprobe. */ 330 /* addr and flags should be cleared for reusing kprobe. */
331 rp.kp.addr = NULL;
332 rp.kp.flags = 0;
327 ret = register_kretprobes(rps, 2); 333 ret = register_kretprobes(rps, 2);
328 if (ret < 0) { 334 if (ret < 0) {
329 printk(KERN_ERR "Kprobe smoke test failed: " 335 printk(KERN_ERR "Kprobe smoke test failed: "
diff --git a/kernel/time.c b/kernel/time.c
index ba9b338d1835..8e8dc6d705c9 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -150,7 +150,7 @@ static inline void warp_clock(void)
150 * various programs will get confused when the clock gets warped. 150 * various programs will get confused when the clock gets warped.
151 */ 151 */
152 152
153int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) 153int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
154{ 154{
155 static int firsttime = 1; 155 static int firsttime = 1;
156 int error = 0; 156 int error = 0;
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time);
238 * Avoid unnecessary multiplications/divisions in the 238 * Avoid unnecessary multiplications/divisions in the
239 * two most common HZ cases: 239 * two most common HZ cases:
240 */ 240 */
241unsigned int inline jiffies_to_msecs(const unsigned long j) 241inline unsigned int jiffies_to_msecs(const unsigned long j)
242{ 242{
243#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) 243#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
244 return (MSEC_PER_SEC / HZ) * j; 244 return (MSEC_PER_SEC / HZ) * j;
@@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
254} 254}
255EXPORT_SYMBOL(jiffies_to_msecs); 255EXPORT_SYMBOL(jiffies_to_msecs);
256 256
257unsigned int inline jiffies_to_usecs(const unsigned long j) 257inline unsigned int jiffies_to_usecs(const unsigned long j)
258{ 258{
259#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) 259#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
260 return (USEC_PER_SEC / HZ) * j; 260 return (USEC_PER_SEC / HZ) * j;
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x)
645} 645}
646 646
647/** 647/**
648 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies 648 * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
649 * 649 *
650 * @n: nsecs in u64 650 * @n: nsecs in u64
651 * 651 *
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x)
657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) 657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years 658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
659 */ 659 */
660unsigned long nsecs_to_jiffies(u64 n) 660u64 nsecs_to_jiffies64(u64 n)
661{ 661{
662#if (NSEC_PER_SEC % HZ) == 0 662#if (NSEC_PER_SEC % HZ) == 0
663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ 663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
@@ -674,22 +674,23 @@ unsigned long nsecs_to_jiffies(u64 n)
674#endif 674#endif
675} 675}
676 676
677#if (BITS_PER_LONG < 64) 677/**
678u64 get_jiffies_64(void) 678 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
679 *
680 * @n: nsecs in u64
681 *
682 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
683 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
684 * for scheduler, not for use in device drivers to calculate timeout value.
685 *
686 * note:
687 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
688 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
689 */
690unsigned long nsecs_to_jiffies(u64 n)
679{ 691{
680 unsigned long seq; 692 return (unsigned long)nsecs_to_jiffies64(n);
681 u64 ret;
682
683 do {
684 seq = read_seqbegin(&xtime_lock);
685 ret = jiffies_64;
686 } while (read_seqretry(&xtime_lock, seq));
687 return ret;
688} 693}
689EXPORT_SYMBOL(get_jiffies_64);
690#endif
691
692EXPORT_SYMBOL(jiffies);
693 694
694/* 695/*
695 * Add two timespec values and do a safety check for overflow. 696 * Add two timespec values and do a safety check for overflow.
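The kernel/time.c hunk above introduces nsecs_to_jiffies64() and turns nsecs_to_jiffies() into a truncating wrapper around it. A small standalone sketch of the common case where NSEC_PER_SEC is a multiple of HZ (HZ = 250 is an assumed example value):

/*
 * Sketch of the nsecs_to_jiffies()/nsecs_to_jiffies64() split for the
 * common case where (NSEC_PER_SEC % HZ) == 0.
 */
#include <stdio.h>
#include <stdint.h>

#define HZ		250ULL
#define NSEC_PER_SEC	1000000000ULL

static uint64_t nsecs_to_jiffies64(uint64_t n)
{
	return n / (NSEC_PER_SEC / HZ);	/* exact division of the tick length */
}

static unsigned long nsecs_to_jiffies(uint64_t n)
{
	/* the 32bit variant is now just a truncating wrapper */
	return (unsigned long)nsecs_to_jiffies64(n);
}

int main(void)
{
	uint64_t n = 3 * NSEC_PER_SEC + 4000000;	/* 3.004 s */

	printf("%llu ns -> %llu jiffies64, %lu jiffies\n",
	       (unsigned long long)n,
	       (unsigned long long)nsecs_to_jiffies64(n),
	       nsecs_to_jiffies(n));
	return 0;
}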
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ee266620b06c..e2fd74b8e8c2 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,5 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2obj-y += timeconv.o posix-clock.o alarmtimer.o
2 3
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
new file mode 100644
index 000000000000..59f369f98a04
--- /dev/null
+++ b/kernel/time/alarmtimer.c
@@ -0,0 +1,720 @@
1/*
2 * Alarmtimer interface
3 *
4 * This interface provides a timer which is similar to hrtimers,
5 * but triggers an RTC alarm if the box is suspended.
6 *
7 * This interface is influenced by the Android RTC Alarm timer
8 * interface.
9 *
10 * Copyright (C) 2010 IBM Corporation
11 *
12 * Author: John Stultz <john.stultz@linaro.org>
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License version 2 as
16 * published by the Free Software Foundation.
17 */
18#include <linux/time.h>
19#include <linux/hrtimer.h>
20#include <linux/timerqueue.h>
21#include <linux/rtc.h>
22#include <linux/alarmtimer.h>
23#include <linux/mutex.h>
24#include <linux/platform_device.h>
25#include <linux/posix-timers.h>
26#include <linux/workqueue.h>
27#include <linux/freezer.h>
28
29/**
30 * struct alarm_base - Alarm timer bases
31 * @lock: Lock for synchronized access to the base
32 * @timerqueue: Timerqueue head managing the list of events
33 * @timer: hrtimer used to schedule events while running
34 * @gettime: Function to read the time correlating to the base
35 * @base_clockid: clockid for the base
36 */
37static struct alarm_base {
38 spinlock_t lock;
39 struct timerqueue_head timerqueue;
40 struct hrtimer timer;
41 ktime_t (*gettime)(void);
42 clockid_t base_clockid;
43} alarm_bases[ALARM_NUMTYPE];
44
45/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */
46static ktime_t freezer_delta;
47static DEFINE_SPINLOCK(freezer_delta_lock);
48
49#ifdef CONFIG_RTC_CLASS
50/* rtc timer and device for setting alarm wakeups at suspend */
51static struct rtc_timer rtctimer;
52static struct rtc_device *rtcdev;
53static DEFINE_SPINLOCK(rtcdev_lock);
54
55/**
56 * has_wakealarm - check whether the rtc device has wakealarm ability
57 * @dev: current device
58 * @name_ptr: name to be returned
59 *
60 * This helper function checks to see if the rtc device can wake
61 * from suspend.
62 */
63static int has_wakealarm(struct device *dev, void *name_ptr)
64{
65 struct rtc_device *candidate = to_rtc_device(dev);
66
67 if (!candidate->ops->set_alarm)
68 return 0;
69 if (!device_may_wakeup(candidate->dev.parent))
70 return 0;
71
72 *(const char **)name_ptr = dev_name(dev);
73 return 1;
74}
75
76/**
77 * alarmtimer_get_rtcdev - Return selected rtcdevice
78 *
79 * This function returns the rtc device to use for wakealarms.
80 * If one has not already been chosen, it checks to see if a
81 * functional rtc device is available.
82 */
83static struct rtc_device *alarmtimer_get_rtcdev(void)
84{
85 struct device *dev;
86 char *str;
87 unsigned long flags;
88 struct rtc_device *ret;
89
90 spin_lock_irqsave(&rtcdev_lock, flags);
91 if (!rtcdev) {
92 /* Find an rtc device and init the rtc_timer */
93 dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
94 /* If we have a device then str is valid. See has_wakealarm() */
95 if (dev) {
96 rtcdev = rtc_class_open(str);
97 /*
98 * Drop the reference we got in class_find_device,
99 * rtc_open takes its own.
100 */
101 put_device(dev);
102 rtc_timer_init(&rtctimer, NULL, NULL);
103 }
104 }
105 ret = rtcdev;
106 spin_unlock_irqrestore(&rtcdev_lock, flags);
107
108 return ret;
109}
110#else
111#define alarmtimer_get_rtcdev() (0)
112#define rtcdev (0)
113#endif
114
115
116/**
117 * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
118 * @base: pointer to the base where the timer is being run
119 * @alarm: pointer to alarm being enqueued.
120 *
121 * Adds the alarm to an alarm_base timerqueue and if necessary sets
122 * an hrtimer to run.
123 *
124 * Must hold base->lock when calling.
125 */
126static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
127{
128 timerqueue_add(&base->timerqueue, &alarm->node);
129 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
130 hrtimer_try_to_cancel(&base->timer);
131 hrtimer_start(&base->timer, alarm->node.expires,
132 HRTIMER_MODE_ABS);
133 }
134}
135
136/**
137 * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue
138 * @base: pointer to the base where the timer is running
139 * @alarm: pointer to alarm being removed
140 *
141 * Removes the alarm from an alarm_base timerqueue and if necessary sets
142 * a new timer to run.
143 *
144 * Must hold base->lock when calling.
145 */
146static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
147{
148 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
149
150 timerqueue_del(&base->timerqueue, &alarm->node);
151 if (next == &alarm->node) {
152 hrtimer_try_to_cancel(&base->timer);
153 next = timerqueue_getnext(&base->timerqueue);
154 if (!next)
155 return;
156 hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
157 }
158}
159
160
161/**
162 * alarmtimer_fired - Handles alarm hrtimer being fired.
163 * @timer: pointer to hrtimer being run
164 *
165 * When an alarm timer fires, this runs through the timerqueue to
166 * see which alarms expired, and runs those. If there are more alarm
167 * timers queued for the future, we set the hrtimer to fire
168 * when the next future alarm timer expires.
169 */
170static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
171{
172 struct alarm_base *base = container_of(timer, struct alarm_base, timer);
173 struct timerqueue_node *next;
174 unsigned long flags;
175 ktime_t now;
176 int ret = HRTIMER_NORESTART;
177
178 spin_lock_irqsave(&base->lock, flags);
179 now = base->gettime();
180 while ((next = timerqueue_getnext(&base->timerqueue))) {
181 struct alarm *alarm;
182 ktime_t expired = next->expires;
183
184 if (expired.tv64 >= now.tv64)
185 break;
186
187 alarm = container_of(next, struct alarm, node);
188
189 timerqueue_del(&base->timerqueue, &alarm->node);
190 alarm->enabled = 0;
191 /* Re-add periodic timers */
192 if (alarm->period.tv64) {
193 alarm->node.expires = ktime_add(expired, alarm->period);
194 timerqueue_add(&base->timerqueue, &alarm->node);
195 alarm->enabled = 1;
196 }
197 spin_unlock_irqrestore(&base->lock, flags);
198 if (alarm->function)
199 alarm->function(alarm);
200 spin_lock_irqsave(&base->lock, flags);
201 }
202
203 if (next) {
204 hrtimer_set_expires(&base->timer, next->expires);
205 ret = HRTIMER_RESTART;
206 }
207 spin_unlock_irqrestore(&base->lock, flags);
208
209 return ret;
210
211}
212
213#ifdef CONFIG_RTC_CLASS
214/**
215 * alarmtimer_suspend - Suspend time callback
216 * @dev: unused
217 * @state: unused
218 *
219 * When we are going into suspend, we look through the bases
220 * to see which is the soonest timer to expire. We then
221 * set an rtc timer to fire that far into the future, which
222 * will wake us from suspend.
223 */
224static int alarmtimer_suspend(struct device *dev)
225{
226 struct rtc_time tm;
227 ktime_t min, now;
228 unsigned long flags;
229 struct rtc_device *rtc;
230 int i;
231
232 spin_lock_irqsave(&freezer_delta_lock, flags);
233 min = freezer_delta;
234 freezer_delta = ktime_set(0, 0);
235 spin_unlock_irqrestore(&freezer_delta_lock, flags);
236
237 rtc = rtcdev;
238 /* If we have no rtcdev, just return */
239 if (!rtc)
240 return 0;
241
242 /* Find the soonest timer to expire*/
243 for (i = 0; i < ALARM_NUMTYPE; i++) {
244 struct alarm_base *base = &alarm_bases[i];
245 struct timerqueue_node *next;
246 ktime_t delta;
247
248 spin_lock_irqsave(&base->lock, flags);
249 next = timerqueue_getnext(&base->timerqueue);
250 spin_unlock_irqrestore(&base->lock, flags);
251 if (!next)
252 continue;
253 delta = ktime_sub(next->expires, base->gettime());
254 if (!min.tv64 || (delta.tv64 < min.tv64))
255 min = delta;
256 }
257 if (min.tv64 == 0)
258 return 0;
259
260 /* XXX - Should we enforce a minimum sleep time? */
261 WARN_ON(min.tv64 < NSEC_PER_SEC);
262
263 /* Setup an rtc timer to fire that far in the future */
264 rtc_timer_cancel(rtc, &rtctimer);
265 rtc_read_time(rtc, &tm);
266 now = rtc_tm_to_ktime(tm);
267 now = ktime_add(now, min);
268
269 rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
270
271 return 0;
272}
273#else
274static int alarmtimer_suspend(struct device *dev)
275{
276 return 0;
277}
278#endif
279
280static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
281{
282 ktime_t delta;
283 unsigned long flags;
284 struct alarm_base *base = &alarm_bases[type];
285
286 delta = ktime_sub(absexp, base->gettime());
287
288 spin_lock_irqsave(&freezer_delta_lock, flags);
289 if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64))
290 freezer_delta = delta;
291 spin_unlock_irqrestore(&freezer_delta_lock, flags);
292}
293
294
295/**
296 * alarm_init - Initialize an alarm structure
297 * @alarm: ptr to alarm to be initialized
298 * @type: the type of the alarm
299 * @function: callback that is run when the alarm fires
300 */
301void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
302 void (*function)(struct alarm *))
303{
304 timerqueue_init(&alarm->node);
305 alarm->period = ktime_set(0, 0);
306 alarm->function = function;
307 alarm->type = type;
308 alarm->enabled = 0;
309}
310
311/**
312 * alarm_start - Sets an alarm to fire
313 * @alarm: ptr to alarm to set
314 * @start: time to run the alarm
315 * @period: period at which the alarm will recur
316 */
317void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period)
318{
319 struct alarm_base *base = &alarm_bases[alarm->type];
320 unsigned long flags;
321
322 spin_lock_irqsave(&base->lock, flags);
323 if (alarm->enabled)
324 alarmtimer_remove(base, alarm);
325 alarm->node.expires = start;
326 alarm->period = period;
327 alarmtimer_enqueue(base, alarm);
328 alarm->enabled = 1;
329 spin_unlock_irqrestore(&base->lock, flags);
330}
331
332/**
333 * alarm_cancel - Tries to cancel an alarm timer
334 * @alarm: ptr to alarm to be canceled
335 */
336void alarm_cancel(struct alarm *alarm)
337{
338 struct alarm_base *base = &alarm_bases[alarm->type];
339 unsigned long flags;
340
341 spin_lock_irqsave(&base->lock, flags);
342 if (alarm->enabled)
343 alarmtimer_remove(base, alarm);
344 alarm->enabled = 0;
345 spin_unlock_irqrestore(&base->lock, flags);
346}
347
348
349/**
350 * clock2alarm - helper that converts from clockid to alarmtypes
351 * @clockid: clockid.
352 */
353static enum alarmtimer_type clock2alarm(clockid_t clockid)
354{
355 if (clockid == CLOCK_REALTIME_ALARM)
356 return ALARM_REALTIME;
357 if (clockid == CLOCK_BOOTTIME_ALARM)
358 return ALARM_BOOTTIME;
359 return -1;
360}
361
362/**
363 * alarm_handle_timer - Callback for posix timers
364 * @alarm: alarm that fired
365 *
366 * Posix timer callback for expired alarm timers.
367 */
368static void alarm_handle_timer(struct alarm *alarm)
369{
370 struct k_itimer *ptr = container_of(alarm, struct k_itimer,
371 it.alarmtimer);
372 if (posix_timer_event(ptr, 0) != 0)
373 ptr->it_overrun++;
374}
375
376/**
377 * alarm_clock_getres - posix getres interface
378 * @which_clock: clockid
379 * @tp: timespec to fill
380 *
381 * Returns the granularity of underlying alarm base clock
382 */
383static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
384{
385 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
386
387 if (!alarmtimer_get_rtcdev())
388 return -ENOTSUPP;
389
390 return hrtimer_get_res(baseid, tp);
391}
392
393/**
394 * alarm_clock_get - posix clock_get interface
395 * @which_clock: clockid
396 * @tp: timespec to fill.
397 *
398 * Provides the underlying alarm base time.
399 */
400static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
401{
402 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
403
404 if (!alarmtimer_get_rtcdev())
405 return -ENOTSUPP;
406
407 *tp = ktime_to_timespec(base->gettime());
408 return 0;
409}
410
411/**
412 * alarm_timer_create - posix timer_create interface
413 * @new_timer: k_itimer pointer to manage
414 *
415 * Initializes the k_itimer structure.
416 */
417static int alarm_timer_create(struct k_itimer *new_timer)
418{
419 enum alarmtimer_type type;
420 struct alarm_base *base;
421
422 if (!alarmtimer_get_rtcdev())
423 return -ENOTSUPP;
424
425 if (!capable(CAP_WAKE_ALARM))
426 return -EPERM;
427
428 type = clock2alarm(new_timer->it_clock);
429 base = &alarm_bases[type];
430 alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer);
431 return 0;
432}
433
434/**
435 * alarm_timer_get - posix timer_get interface
436 * @new_timer: k_itimer pointer
437 * @cur_setting: itimerspec data to fill
438 *
439 * Copies the itimerspec data out from the k_itimer
440 */
441static void alarm_timer_get(struct k_itimer *timr,
442 struct itimerspec *cur_setting)
443{
444 cur_setting->it_interval =
445 ktime_to_timespec(timr->it.alarmtimer.period);
446 cur_setting->it_value =
447 ktime_to_timespec(timr->it.alarmtimer.node.expires);
448 return;
449}
450
451/**
452 * alarm_timer_del - posix timer_del interface
453 * @timr: k_itimer pointer to be deleted
454 *
455 * Cancels any programmed alarms for the given timer.
456 */
457static int alarm_timer_del(struct k_itimer *timr)
458{
459 if (!rtcdev)
460 return -ENOTSUPP;
461
462 alarm_cancel(&timr->it.alarmtimer);
463 return 0;
464}
465
466/**
467 * alarm_timer_set - posix timer_set interface
468 * @timr: k_itimer pointer to be set
469 * @flags: timer flags
470 * @new_setting: itimerspec to be used
471 * @old_setting: itimerspec being replaced
472 *
473 * Sets the timer to new_setting, and starts the timer.
474 */
475static int alarm_timer_set(struct k_itimer *timr, int flags,
476 struct itimerspec *new_setting,
477 struct itimerspec *old_setting)
478{
479 if (!rtcdev)
480 return -ENOTSUPP;
481
482 /* Save old values */
483 old_setting->it_interval =
484 ktime_to_timespec(timr->it.alarmtimer.period);
485 old_setting->it_value =
486 ktime_to_timespec(timr->it.alarmtimer.node.expires);
487
488 /* If the timer was already set, cancel it */
489 alarm_cancel(&timr->it.alarmtimer);
490
491 /* start the timer */
492 alarm_start(&timr->it.alarmtimer,
493 timespec_to_ktime(new_setting->it_value),
494 timespec_to_ktime(new_setting->it_interval));
495 return 0;
496}
497
498/**
499 * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep
500 * @alarm: ptr to alarm that fired
501 *
502 * Wakes up the task that set the alarmtimer
503 */
504static void alarmtimer_nsleep_wakeup(struct alarm *alarm)
505{
506 struct task_struct *task = (struct task_struct *)alarm->data;
507
508 alarm->data = NULL;
509 if (task)
510 wake_up_process(task);
511}
512
513/**
514 * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation
515 * @alarm: ptr to alarmtimer
516 * @absexp: absolute expiration time
517 *
518 * Sets the alarm timer and sleeps until it is fired or interrupted.
519 */
520static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
521{
522 alarm->data = (void *)current;
523 do {
524 set_current_state(TASK_INTERRUPTIBLE);
525 alarm_start(alarm, absexp, ktime_set(0, 0));
526 if (likely(alarm->data))
527 schedule();
528
529 alarm_cancel(alarm);
530 } while (alarm->data && !signal_pending(current));
531
532 __set_current_state(TASK_RUNNING);
533
534 return (alarm->data == NULL);
535}
536
537
538/**
539 * update_rmtp - Update remaining timespec value
540 * @exp: expiration time
541 * @type: timer type
542 * @rmtp: user pointer to remaining timespec value
543 *
544 * Helper function that fills in rmtp value with time between
545 * now and the exp value
546 */
547static int update_rmtp(ktime_t exp, enum alarmtimer_type type,
548 struct timespec __user *rmtp)
549{
550 struct timespec rmt;
551 ktime_t rem;
552
553 rem = ktime_sub(exp, alarm_bases[type].gettime());
554
555 if (rem.tv64 <= 0)
556 return 0;
557 rmt = ktime_to_timespec(rem);
558
559 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
560 return -EFAULT;
561
562 return 1;
563
564}
565
566/**
567 * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep
568 * @restart: ptr to restart block
569 *
570 * Handles restarted clock_nanosleep calls
571 */
572static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
573{
574 enum alarmtimer_type type = restart->nanosleep.clockid;
575 ktime_t exp;
576 struct timespec __user *rmtp;
577 struct alarm alarm;
578 int ret = 0;
579
580 exp.tv64 = restart->nanosleep.expires;
581 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
582
583 if (alarmtimer_do_nsleep(&alarm, exp))
584 goto out;
585
586 if (freezing(current))
587 alarmtimer_freezerset(exp, type);
588
589 rmtp = restart->nanosleep.rmtp;
590 if (rmtp) {
591 ret = update_rmtp(exp, type, rmtp);
592 if (ret <= 0)
593 goto out;
594 }
595
596
597 /* The other values in restart are already filled in */
598 ret = -ERESTART_RESTARTBLOCK;
599out:
600 return ret;
601}
602
603/**
604 * alarm_timer_nsleep - alarmtimer nanosleep
605 * @which_clock: clockid
606 * @flags: determines abstime or relative
607 * @tsreq: requested sleep time (abs or rel)
608 * @rmtp: remaining sleep time saved
609 *
610 * Handles clock_nanosleep calls against _ALARM clockids
611 */
612static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
613 struct timespec *tsreq, struct timespec __user *rmtp)
614{
615 enum alarmtimer_type type = clock2alarm(which_clock);
616 struct alarm alarm;
617 ktime_t exp;
618 int ret = 0;
619 struct restart_block *restart;
620
621 if (!alarmtimer_get_rtcdev())
622 return -ENOTSUPP;
623
624 if (!capable(CAP_WAKE_ALARM))
625 return -EPERM;
626
627 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
628
629 exp = timespec_to_ktime(*tsreq);
630 /* Convert (if necessary) to absolute time */
631 if (flags != TIMER_ABSTIME) {
632 ktime_t now = alarm_bases[type].gettime();
633 exp = ktime_add(now, exp);
634 }
635
636 if (alarmtimer_do_nsleep(&alarm, exp))
637 goto out;
638
639 if (freezing(current))
640 alarmtimer_freezerset(exp, type);
641
642 /* abs timers don't set remaining time or restart */
643 if (flags == TIMER_ABSTIME) {
644 ret = -ERESTARTNOHAND;
645 goto out;
646 }
647
648 if (rmtp) {
649 ret = update_rmtp(exp, type, rmtp);
650 if (ret <= 0)
651 goto out;
652 }
653
654 restart = &current_thread_info()->restart_block;
655 restart->fn = alarm_timer_nsleep_restart;
656 restart->nanosleep.clockid = type;
657 restart->nanosleep.expires = exp.tv64;
658 restart->nanosleep.rmtp = rmtp;
659 ret = -ERESTART_RESTARTBLOCK;
660
661out:
662 return ret;
663}
664
665
666/* Suspend hook structures */
667static const struct dev_pm_ops alarmtimer_pm_ops = {
668 .suspend = alarmtimer_suspend,
669};
670
671static struct platform_driver alarmtimer_driver = {
672 .driver = {
673 .name = "alarmtimer",
674 .pm = &alarmtimer_pm_ops,
675 }
676};
677
678/**
679 * alarmtimer_init - Initialize alarm timer code
680 *
681 * This function initializes the alarm bases and registers
682 * the posix clock ids.
683 */
684static int __init alarmtimer_init(void)
685{
686 int error = 0;
687 int i;
688 struct k_clock alarm_clock = {
689 .clock_getres = alarm_clock_getres,
690 .clock_get = alarm_clock_get,
691 .timer_create = alarm_timer_create,
692 .timer_set = alarm_timer_set,
693 .timer_del = alarm_timer_del,
694 .timer_get = alarm_timer_get,
695 .nsleep = alarm_timer_nsleep,
696 };
697
698 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
699 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
700
701 /* Initialize alarm bases */
702 alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
703 alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
704 alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
705 alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime;
706 for (i = 0; i < ALARM_NUMTYPE; i++) {
707 timerqueue_init_head(&alarm_bases[i].timerqueue);
708 spin_lock_init(&alarm_bases[i].lock);
709 hrtimer_init(&alarm_bases[i].timer,
710 alarm_bases[i].base_clockid,
711 HRTIMER_MODE_ABS);
712 alarm_bases[i].timer.function = alarmtimer_fired;
713 }
714 error = platform_driver_register(&alarmtimer_driver);
715 platform_device_register_simple("alarmtimer", -1, NULL, 0);
716
717 return error;
718}
719device_initcall(alarmtimer_init);
720
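The new alarmtimer code registers CLOCK_REALTIME_ALARM and CLOCK_BOOTTIME_ALARM as posix clocks, so userspace can arm wakeup-capable timers through the ordinary clock and timer syscalls. A hedged usage sketch, assuming a kernel with this code, a wakeup-capable RTC, the CAP_WAKE_ALARM capability, and the clockid value 8 from the kernel headers when libc does not yet define it (link with -lrt on older glibc):

/*
 * Relative sleep on CLOCK_REALTIME_ALARM; the alarmtimer code should wake
 * the machine from suspend when the timer expires.
 */
#include <stdio.h>
#include <string.h>
#include <time.h>

#ifndef CLOCK_REALTIME_ALARM
#define CLOCK_REALTIME_ALARM 8	/* assumed to match the kernel definition */
#endif

int main(void)
{
	struct timespec req = { .tv_sec = 30, .tv_nsec = 0 };
	int err;

	/* flags == 0: relative 30 s sleep against the alarm clock */
	err = clock_nanosleep(CLOCK_REALTIME_ALARM, 0, &req, NULL);
	if (err)
		fprintf(stderr, "clock_nanosleep: %s\n", strerror(err));
	else
		printf("woke up after alarm sleep\n");
	return 0;
}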
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d7395fdfb9f3..e4c699dfa4e8 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,7 +18,6 @@
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
@@ -183,7 +182,10 @@ void clockevents_register_device(struct clock_event_device *dev)
183 unsigned long flags; 182 unsigned long flags;
184 183
185 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
186 BUG_ON(!dev->cpumask); 185 if (!dev->cpumask) {
186 WARN_ON(num_possible_cpus() > 1);
187 dev->cpumask = cpumask_of(smp_processor_id());
188 }
187 189
188 raw_spin_lock_irqsave(&clockevents_lock, flags); 190 raw_spin_lock_irqsave(&clockevents_lock, flags);
189 191
@@ -195,6 +197,70 @@ void clockevents_register_device(struct clock_event_device *dev)
195} 197}
196EXPORT_SYMBOL_GPL(clockevents_register_device); 198EXPORT_SYMBOL_GPL(clockevents_register_device);
197 199
200static void clockevents_config(struct clock_event_device *dev,
201 u32 freq)
202{
203 u64 sec;
204
205 if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
206 return;
207
208 /*
209 * Calculate the maximum number of seconds we can sleep. Limit
210 * to 10 minutes for hardware which can program more than
211 * 32bit ticks so we still get reasonable conversion values.
212 */
213 sec = dev->max_delta_ticks;
214 do_div(sec, freq);
215 if (!sec)
216 sec = 1;
217 else if (sec > 600 && dev->max_delta_ticks > UINT_MAX)
218 sec = 600;
219
220 clockevents_calc_mult_shift(dev, freq, sec);
221 dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev);
222 dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev);
223}
224
225/**
226 * clockevents_config_and_register - Configure and register a clock event device
227 * @dev: device to register
228 * @freq: The clock frequency
229 * @min_delta: The minimum clock ticks to program in oneshot mode
230 * @max_delta: The maximum clock ticks to program in oneshot mode
231 *
232 * min/max_delta can be 0 for devices which do not support oneshot mode.
233 */
234void clockevents_config_and_register(struct clock_event_device *dev,
235 u32 freq, unsigned long min_delta,
236 unsigned long max_delta)
237{
238 dev->min_delta_ticks = min_delta;
239 dev->max_delta_ticks = max_delta;
240 clockevents_config(dev, freq);
241 clockevents_register_device(dev);
242}
243
244/**
245 * clockevents_update_freq - Update frequency and reprogram a clock event device.
246 * @dev: device to modify
247 * @freq: new device frequency
248 *
249 * Reconfigure and reprogram a clock event device in oneshot
250 * mode. Must be called on the cpu for which the device delivers per
251 * cpu timer events with interrupts disabled! Returns 0 on success,
252 * -ETIME when the event is in the past.
253 */
254int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
255{
256 clockevents_config(dev, freq);
257
258 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
259 return 0;
260
261 return clockevents_program_event(dev, dev->next_event, ktime_get());
262}
263
198/* 264/*
199 * Noop handler when we shut down an event device 265 * Noop handler when we shut down an event device
200 */ 266 */
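clockevents_config() above derives the longest programmable sleep from max_delta_ticks and the device frequency, capping it at 10 minutes for hardware that can program more than 32 bits worth of ticks. A small sketch of just that clamp, with made-up device parameters:

/*
 * Maximum sleep length used to size the mult/shift conversion of a
 * clock event device, as done by clockevents_config().
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t max_sleep_seconds(uint64_t max_delta_ticks, uint32_t freq)
{
	uint64_t sec = max_delta_ticks / freq;

	if (!sec)				/* at least one second */
		sec = 1;
	else if (sec > 600 && max_delta_ticks > UINT32_MAX)
		sec = 600;			/* >32bit counter: cap for precision */
	return sec;
}

int main(void)
{
	/* e.g. a 64bit comparator at 19.2 MHz: capped at 600 s */
	printf("cap: %llu s\n",
	       (unsigned long long)max_sleep_seconds(UINT64_MAX, 19200000));
	/* e.g. a 32bit decrementer at 1 GHz: ~4 s, no cap needed */
	printf("cap: %llu s\n",
	       (unsigned long long)max_sleep_seconds(UINT32_MAX, 1000000000));
	return 0;
}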
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c18d7efa1b4b..e0980f0d9a0a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
113 * @shift: pointer to shift variable 113 * @shift: pointer to shift variable
114 * @from: frequency to convert from 114 * @from: frequency to convert from
115 * @to: frequency to convert to 115 * @to: frequency to convert to
116 * @minsec: guaranteed runtime conversion range in seconds 116 * @maxsec: guaranteed runtime conversion range in seconds
117 * 117 *
118 * The function evaluates the shift/mult pair for the scaled math 118 * The function evaluates the shift/mult pair for the scaled math
119 * operations of clocksources and clockevents. 119 * operations of clocksources and clockevents.
@@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock 122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
123 * event @to is the counter frequency and @from is NSEC_PER_SEC. 123 * event @to is the counter frequency and @from is NSEC_PER_SEC.
124 * 124 *
125 * The @minsec conversion range argument controls the time frame in 125 * The @maxsec conversion range argument controls the time frame in
126 * seconds which must be covered by the runtime conversion with the 126 * seconds which must be covered by the runtime conversion with the
127 * calculated mult and shift factors. This guarantees that no 64bit 127 * calculated mult and shift factors. This guarantees that no 64bit
128 * overflow happens when the input value of the conversion is 128 * overflow happens when the input value of the conversion is
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
131 * factors. 131 * factors.
132 */ 132 */
133void 133void
134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) 134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
135{ 135{
136 u64 tmp; 136 u64 tmp;
137 u32 sft, sftacc= 32; 137 u32 sft, sftacc= 32;
@@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
140 * Calculate the shift factor which is limiting the conversion 140 * Calculate the shift factor which is limiting the conversion
141 * range: 141 * range:
142 */ 142 */
143 tmp = ((u64)minsec * from) >> 32; 143 tmp = ((u64)maxsec * from) >> 32;
144 while (tmp) { 144 while (tmp) {
145 tmp >>=1; 145 tmp >>=1;
146 sftacc--; 146 sftacc--;
@@ -152,6 +152,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
152 */ 152 */
153 for (sft = 32; sft > 0; sft--) { 153 for (sft = 32; sft > 0; sft--) {
154 tmp = (u64) to << sft; 154 tmp = (u64) to << sft;
155 tmp += from / 2;
155 do_div(tmp, from); 156 do_div(tmp, from);
156 if ((tmp >> sftacc) == 0) 157 if ((tmp >> sftacc) == 0)
157 break; 158 break;
@@ -184,7 +185,6 @@ static struct clocksource *watchdog;
184static struct timer_list watchdog_timer; 185static struct timer_list watchdog_timer;
185static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); 186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
186static DEFINE_SPINLOCK(watchdog_lock); 187static DEFINE_SPINLOCK(watchdog_lock);
187static cycle_t watchdog_last;
188static int watchdog_running; 188static int watchdog_running;
189 189
190static int clocksource_watchdog_kthread(void *data); 190static int clocksource_watchdog_kthread(void *data);
@@ -253,11 +253,6 @@ static void clocksource_watchdog(unsigned long data)
253 if (!watchdog_running) 253 if (!watchdog_running)
254 goto out; 254 goto out;
255 255
256 wdnow = watchdog->read(watchdog);
257 wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
258 watchdog->mult, watchdog->shift);
259 watchdog_last = wdnow;
260
261 list_for_each_entry(cs, &watchdog_list, wd_list) { 256 list_for_each_entry(cs, &watchdog_list, wd_list) {
262 257
263 /* Clocksource already marked unstable? */ 258 /* Clocksource already marked unstable? */
@@ -267,19 +262,28 @@ static void clocksource_watchdog(unsigned long data)
267 continue; 262 continue;
268 } 263 }
269 264
265 local_irq_disable();
270 csnow = cs->read(cs); 266 csnow = cs->read(cs);
267 wdnow = watchdog->read(watchdog);
268 local_irq_enable();
271 269
272 /* Clocksource initialized ? */ 270 /* Clocksource initialized ? */
273 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 271 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
274 cs->flags |= CLOCK_SOURCE_WATCHDOG; 272 cs->flags |= CLOCK_SOURCE_WATCHDOG;
275 cs->wd_last = csnow; 273 cs->wd_last = wdnow;
274 cs->cs_last = csnow;
276 continue; 275 continue;
277 } 276 }
278 277
279 /* Check the deviation from the watchdog clocksource. */ 278 wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask,
280 cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & 279 watchdog->mult, watchdog->shift);
280
281 cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &
281 cs->mask, cs->mult, cs->shift); 282 cs->mask, cs->mult, cs->shift);
282 cs->wd_last = csnow; 283 cs->cs_last = csnow;
284 cs->wd_last = wdnow;
285
286 /* Check the deviation from the watchdog clocksource. */
283 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { 287 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
284 clocksource_unstable(cs, cs_nsec - wd_nsec); 288 clocksource_unstable(cs, cs_nsec - wd_nsec);
285 continue; 289 continue;
@@ -317,7 +321,6 @@ static inline void clocksource_start_watchdog(void)
317 return; 321 return;
318 init_timer(&watchdog_timer); 322 init_timer(&watchdog_timer);
319 watchdog_timer.function = clocksource_watchdog; 323 watchdog_timer.function = clocksource_watchdog;
320 watchdog_last = watchdog->read(watchdog);
321 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 324 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
322 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); 325 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
323 watchdog_running = 1; 326 watchdog_running = 1;
@@ -625,19 +628,6 @@ static void clocksource_enqueue(struct clocksource *cs)
625 list_add(&cs->list, entry); 628 list_add(&cs->list, entry);
626} 629}
627 630
628
629/*
630 * Maximum time we expect to go between ticks. This includes idle
631 * tickless time. It provides the trade off between selecting a
632 * mult/shift pair that is very precise but can only handle a short
633 * period of time, vs. a mult/shift pair that can handle long periods
634 * of time but isn't as precise.
635 *
636 * This is a subsystem constant, and actual hardware limitations
637 * may override it (ie: clocksources that wrap every 3 seconds).
638 */
639#define MAX_UPDATE_LENGTH 5 /* Seconds */
640
641/** 631/**
642 * __clocksource_updatefreq_scale - Used update clocksource with new freq 632 * __clocksource_updatefreq_scale - Used update clocksource with new freq
643 * @t: clocksource to be registered 633 * @t: clocksource to be registered
@@ -651,15 +641,28 @@ static void clocksource_enqueue(struct clocksource *cs)
651 */ 641 */
652void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) 642void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
653{ 643{
644 u64 sec;
645
654 /* 646 /*
655 * Ideally we want to use some of the limits used in 647 * Calc the maximum number of seconds which we can run before
656 * clocksource_max_deferment, to provide a more informed 648 * wrapping around. For clocksources which have a mask > 32bit
657 * MAX_UPDATE_LENGTH. But for now this just gets the 649 * we need to limit the max sleep time to have a good
658 * register interface working properly. 650 * conversion precision. 10 minutes is still a reasonable
651 * amount. That results in a shift value of 24 for a
652 * clocksource with mask >= 40bit and f >= 4GHz. That maps to
653 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
654 * margin as we do in clocksource_max_deferment()
659 */ 655 */
656 sec = (cs->mask - (cs->mask >> 5));
657 do_div(sec, freq);
658 do_div(sec, scale);
659 if (!sec)
660 sec = 1;
661 else if (sec > 600 && cs->mask > UINT_MAX)
662 sec = 600;
663
660 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, 664 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
661 NSEC_PER_SEC/scale, 665 NSEC_PER_SEC / scale, sec * scale);
662 MAX_UPDATE_LENGTH*scale);
663 cs->max_idle_ns = clocksource_max_deferment(cs); 666 cs->max_idle_ns = clocksource_max_deferment(cs);
664} 667}
665EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); 668EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
@@ -678,14 +681,14 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
678int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) 681int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
679{ 682{
680 683
681 /* Intialize mult/shift and max_idle_ns */ 684 /* Initialize mult/shift and max_idle_ns */
682 __clocksource_updatefreq_scale(cs, scale, freq); 685 __clocksource_updatefreq_scale(cs, scale, freq);
683 686
684 /* Add clocksource to the clocksource list */ 687 /* Add clocksource to the clocksource list */
685 mutex_lock(&clocksource_mutex); 688 mutex_lock(&clocksource_mutex);
686 clocksource_enqueue(cs); 689 clocksource_enqueue(cs);
687 clocksource_select();
688 clocksource_enqueue_watchdog(cs); 690 clocksource_enqueue_watchdog(cs);
691 clocksource_select();
689 mutex_unlock(&clocksource_mutex); 692 mutex_unlock(&clocksource_mutex);
690 return 0; 693 return 0;
691} 694}
@@ -705,8 +708,8 @@ int clocksource_register(struct clocksource *cs)
705 708
706 mutex_lock(&clocksource_mutex); 709 mutex_lock(&clocksource_mutex);
707 clocksource_enqueue(cs); 710 clocksource_enqueue(cs);
708 clocksource_select();
709 clocksource_enqueue_watchdog(cs); 711 clocksource_enqueue_watchdog(cs);
712 clocksource_select();
710 mutex_unlock(&clocksource_mutex); 713 mutex_unlock(&clocksource_mutex);
711 return 0; 714 return 0;
712} 715}
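The clocks_calc_mult_shift() change above adds round-to-nearest behaviour (tmp += from / 2) when deriving the mult factor. A userspace re-implementation of the routine, converting an assumed 24 MHz counter to nanoseconds:

/*
 * clocks_calc_mult_shift() rewritten for userspace: pick the largest shift
 * whose mult keeps maxsec * from * mult inside 64 bits, rounding the mult
 * to nearest as in the patched kernel version.
 */
#include <stdio.h>
#include <stdint.h>

static void calc_mult_shift(uint32_t *mult, uint32_t *shift,
			    uint32_t from, uint32_t to, uint32_t maxsec)
{
	uint64_t tmp;
	uint32_t sft, sftacc = 32;

	/* limit the shift accuracy so the runtime conversion cannot overflow */
	tmp = ((uint64_t)maxsec * from) >> 32;
	while (tmp) {
		tmp >>= 1;
		sftacc--;
	}

	/* largest shift whose mult still fits the accuracy limit */
	for (sft = 32; sft > 0; sft--) {
		tmp = (uint64_t)to << sft;
		tmp += from / 2;		/* round to nearest */
		tmp /= from;
		if ((tmp >> sftacc) == 0)
			break;
	}
	*mult = (uint32_t)tmp;
	*shift = sft;
}

int main(void)
{
	uint32_t mult, shift;

	/* cycles at 24 MHz -> nanoseconds, valid for sleeps up to 600 s */
	calc_mult_shift(&mult, &shift, 24000000, 1000000000, 600);
	printf("mult=%u shift=%u: 24000000 cyc -> %llu ns\n", mult, shift,
	       (unsigned long long)(((uint64_t)24000000 * mult) >> shift));
	return 0;
}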
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 5404a8456909..a470154e0408 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -22,8 +22,11 @@
22************************************************************************/ 22************************************************************************/
23#include <linux/clocksource.h> 23#include <linux/clocksource.h>
24#include <linux/jiffies.h> 24#include <linux/jiffies.h>
25#include <linux/module.h>
25#include <linux/init.h> 26#include <linux/init.h>
26 27
28#include "tick-internal.h"
29
27/* The Jiffies based clocksource is the lowest common 30/* The Jiffies based clocksource is the lowest common
28 * denominator clock source which should function on 31 * denominator clock source which should function on
29 * all systems. It has the same coarse resolution as 32 * all systems. It has the same coarse resolution as
@@ -31,7 +34,7 @@
31 * inaccuracies caused by missed or lost timer 34 * inaccuracies caused by missed or lost timer
32 * interrupts and the inability for the timer 35 * interrupts and the inability for the timer
33 * interrupt hardware to accurately tick at the 36 * interrupt hardware to accurately tick at the
34 * requested HZ value. It is also not reccomended 37 * requested HZ value. It is also not recommended
35 * for "tick-less" systems. 38 * for "tick-less" systems.
36 */ 39 */
37#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) 40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
@@ -64,6 +67,23 @@ struct clocksource clocksource_jiffies = {
64 .shift = JIFFIES_SHIFT, 67 .shift = JIFFIES_SHIFT,
65}; 68};
66 69
70#if (BITS_PER_LONG < 64)
71u64 get_jiffies_64(void)
72{
73 unsigned long seq;
74 u64 ret;
75
76 do {
77 seq = read_seqbegin(&xtime_lock);
78 ret = jiffies_64;
79 } while (read_seqretry(&xtime_lock, seq));
80 return ret;
81}
82EXPORT_SYMBOL(get_jiffies_64);
83#endif
84
85EXPORT_SYMBOL(jiffies);
86
67static int __init init_jiffies_clocksource(void) 87static int __init init_jiffies_clocksource(void)
68{ 88{
69 return clocksource_register(&clocksource_jiffies); 89 return clocksource_register(&clocksource_jiffies);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c63116863a80..f6117a4c7cb8 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -14,6 +14,9 @@
14#include <linux/timex.h> 14#include <linux/timex.h>
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h>
18
19#include "tick-internal.h"
17 20
18/* 21/*
19 * NTP timekeeping variables: 22 * NTP timekeeping variables:
@@ -74,6 +77,162 @@ static long time_adjust;
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ 77/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj; 78static s64 ntp_tick_adj;
76 79
80#ifdef CONFIG_NTP_PPS
81
82/*
83 * The following variables are used when a pulse-per-second (PPS) signal
84 * is available. They establish the engineering parameters of the clock
85 * discipline loop when controlled by the PPS signal.
86 */
87#define PPS_VALID 10 /* PPS signal watchdog max (s) */
88#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */
89#define PPS_INTMIN 2 /* min freq interval (s) (shift) */
90#define PPS_INTMAX 8 /* max freq interval (s) (shift) */
91#define PPS_INTCOUNT 4 /* number of consecutive good intervals to
92 increase pps_shift or consecutive bad
93 intervals to decrease it */
94#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */
95
96static int pps_valid; /* signal watchdog counter */
97static long pps_tf[3]; /* phase median filter */
98static long pps_jitter; /* current jitter (ns) */
99static struct timespec pps_fbase; /* beginning of the last freq interval */
100static int pps_shift; /* current interval duration (s) (shift) */
101static int pps_intcnt; /* interval counter */
102static s64 pps_freq; /* frequency offset (scaled ns/s) */
103static long pps_stabil; /* current stability (scaled ns/s) */
104
105/*
106 * PPS signal quality monitors
107 */
108static long pps_calcnt; /* calibration intervals */
109static long pps_jitcnt; /* jitter limit exceeded */
110static long pps_stbcnt; /* stability limit exceeded */
111static long pps_errcnt; /* calibration errors */
112
113
114/* PPS kernel consumer compensates the whole phase error immediately.
115 * Otherwise, reduce the offset by a fixed factor times the time constant.
116 */
117static inline s64 ntp_offset_chunk(s64 offset)
118{
119 if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
120 return offset;
121 else
122 return shift_right(offset, SHIFT_PLL + time_constant);
123}
124
125static inline void pps_reset_freq_interval(void)
126{
127 /* the PPS calibration interval may end
128 surprisingly early */
129 pps_shift = PPS_INTMIN;
130 pps_intcnt = 0;
131}
132
133/**
134 * pps_clear - Clears the PPS state variables
135 *
136 * Must be called while holding a write on the xtime_lock
137 */
138static inline void pps_clear(void)
139{
140 pps_reset_freq_interval();
141 pps_tf[0] = 0;
142 pps_tf[1] = 0;
143 pps_tf[2] = 0;
144 pps_fbase.tv_sec = pps_fbase.tv_nsec = 0;
145 pps_freq = 0;
146}
147
148/* Decrease pps_valid to indicate that another second has passed since
149 * the last PPS signal. When it reaches 0, indicate that PPS signal is
150 * missing.
151 *
152 * Must be called while holding a write on the xtime_lock
153 */
154static inline void pps_dec_valid(void)
155{
156 if (pps_valid > 0)
157 pps_valid--;
158 else {
159 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
160 STA_PPSWANDER | STA_PPSERROR);
161 pps_clear();
162 }
163}
164
165static inline void pps_set_freq(s64 freq)
166{
167 pps_freq = freq;
168}
169
170static inline int is_error_status(int status)
171{
172 return (time_status & (STA_UNSYNC|STA_CLOCKERR))
173 /* PPS signal lost when either PPS time or
174 * PPS frequency synchronization requested
175 */
176 || ((time_status & (STA_PPSFREQ|STA_PPSTIME))
177 && !(time_status & STA_PPSSIGNAL))
178 /* PPS jitter exceeded when
179 * PPS time synchronization requested */
180 || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
181 == (STA_PPSTIME|STA_PPSJITTER))
182 /* PPS wander exceeded or calibration error when
183 * PPS frequency synchronization requested
184 */
185 || ((time_status & STA_PPSFREQ)
186 && (time_status & (STA_PPSWANDER|STA_PPSERROR)));
187}
188
189static inline void pps_fill_timex(struct timex *txc)
190{
191 txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
192 PPM_SCALE_INV, NTP_SCALE_SHIFT);
193 txc->jitter = pps_jitter;
194 if (!(time_status & STA_NANO))
195 txc->jitter /= NSEC_PER_USEC;
196 txc->shift = pps_shift;
197 txc->stabil = pps_stabil;
198 txc->jitcnt = pps_jitcnt;
199 txc->calcnt = pps_calcnt;
200 txc->errcnt = pps_errcnt;
201 txc->stbcnt = pps_stbcnt;
202}
203
204#else /* !CONFIG_NTP_PPS */
205
206static inline s64 ntp_offset_chunk(s64 offset)
207{
208 return shift_right(offset, SHIFT_PLL + time_constant);
209}
210
211static inline void pps_reset_freq_interval(void) {}
212static inline void pps_clear(void) {}
213static inline void pps_dec_valid(void) {}
214static inline void pps_set_freq(s64 freq) {}
215
216static inline int is_error_status(int status)
217{
218 return status & (STA_UNSYNC|STA_CLOCKERR);
219}
220
221static inline void pps_fill_timex(struct timex *txc)
222{
223 /* PPS is not implemented, so these are zero */
224 txc->ppsfreq = 0;
225 txc->jitter = 0;
226 txc->shift = 0;
227 txc->stabil = 0;
228 txc->jitcnt = 0;
229 txc->calcnt = 0;
230 txc->errcnt = 0;
231 txc->stbcnt = 0;
232}
233
234#endif /* CONFIG_NTP_PPS */
235
77/* 236/*
78 * NTP methods: 237 * NTP methods:
79 */ 238 */
@@ -149,10 +308,18 @@ static void ntp_update_offset(long offset)
149 time_reftime = get_seconds(); 308 time_reftime = get_seconds();
150 309
151 offset64 = offset; 310 offset64 = offset;
152 freq_adj = (offset64 * secs) << 311 freq_adj = ntp_update_offset_fll(offset64, secs);
153 (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
154 312
155 freq_adj += ntp_update_offset_fll(offset64, secs); 313 /*
314 * Clamp update interval to reduce PLL gain with low
315 * sampling rate (e.g. intermittent network connection)
316 * to avoid instability.
317 */
318 if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant)))
319 secs = 1 << (SHIFT_PLL + 1 + time_constant);
320
321 freq_adj += (offset64 * secs) <<
322 (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
156 323
157 freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); 324 freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED);
158 325
@@ -177,6 +344,9 @@ void ntp_clear(void)
177 344
178 tick_length = tick_length_base; 345 tick_length = tick_length_base;
179 time_offset = 0; 346 time_offset = 0;
347
348 /* Clear PPS state variables */
349 pps_clear();
180} 350}
181 351
182/* 352/*
@@ -242,16 +412,16 @@ void second_overflow(void)
242 time_status |= STA_UNSYNC; 412 time_status |= STA_UNSYNC;
243 } 413 }
244 414
245 /* 415 /* Compute the phase adjustment for the next second */
246 * Compute the phase adjustment for the next second. The offset is
247 * reduced by a fixed factor times the time constant.
248 */
249 tick_length = tick_length_base; 416 tick_length = tick_length_base;
250 417
251 delta = shift_right(time_offset, SHIFT_PLL + time_constant); 418 delta = ntp_offset_chunk(time_offset);
252 time_offset -= delta; 419 time_offset -= delta;
253 tick_length += delta; 420 tick_length += delta;
254 421
422 /* Check PPS signal */
423 pps_dec_valid();
424
255 if (!time_adjust) 425 if (!time_adjust)
256 return; 426 return;
257 427
@@ -361,6 +531,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
361 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { 531 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
362 time_state = TIME_OK; 532 time_state = TIME_OK;
363 time_status = STA_UNSYNC; 533 time_status = STA_UNSYNC;
534 /* restart PPS frequency calibration */
535 pps_reset_freq_interval();
364 } 536 }
365 537
366 /* 538 /*
@@ -410,6 +582,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
410 time_freq = txc->freq * PPM_SCALE; 582 time_freq = txc->freq * PPM_SCALE;
411 time_freq = min(time_freq, MAXFREQ_SCALED); 583 time_freq = min(time_freq, MAXFREQ_SCALED);
412 time_freq = max(time_freq, -MAXFREQ_SCALED); 584 time_freq = max(time_freq, -MAXFREQ_SCALED);
585 /* update pps_freq */
586 pps_set_freq(time_freq);
413 } 587 }
414 588
415 if (txc->modes & ADJ_MAXERROR) 589 if (txc->modes & ADJ_MAXERROR)
@@ -474,6 +648,19 @@ int do_adjtimex(struct timex *txc)
474 hrtimer_cancel(&leap_timer); 648 hrtimer_cancel(&leap_timer);
475 } 649 }
476 650
651 if (txc->modes & ADJ_SETOFFSET) {
652 struct timespec delta;
653 delta.tv_sec = txc->time.tv_sec;
654 delta.tv_nsec = txc->time.tv_usec;
655 if (!capable(CAP_SYS_TIME))
656 return -EPERM;
657 if (!(txc->modes & ADJ_NANO))
658 delta.tv_nsec *= 1000;
659 result = timekeeping_inject_offset(&delta);
660 if (result)
661 return result;
662 }
663
477 getnstimeofday(&ts); 664 getnstimeofday(&ts);
478 665
479 write_seqlock_irq(&xtime_lock); 666 write_seqlock_irq(&xtime_lock);
@@ -500,7 +687,8 @@ int do_adjtimex(struct timex *txc)
500 } 687 }
501 688
502 result = time_state; /* mostly `TIME_OK' */ 689 result = time_state; /* mostly `TIME_OK' */
503 if (time_status & (STA_UNSYNC|STA_CLOCKERR)) 690 /* check for errors */
691 if (is_error_status(time_status))
504 result = TIME_ERROR; 692 result = TIME_ERROR;
505 693
506 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * 694 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
@@ -514,15 +702,8 @@ int do_adjtimex(struct timex *txc)
514 txc->tick = tick_usec; 702 txc->tick = tick_usec;
515 txc->tai = time_tai; 703 txc->tai = time_tai;
516 704
517 /* PPS is not implemented, so these are zero */ 705 /* fill PPS status fields */
518 txc->ppsfreq = 0; 706 pps_fill_timex(txc);
519 txc->jitter = 0;
520 txc->shift = 0;
521 txc->stabil = 0;
522 txc->jitcnt = 0;
523 txc->calcnt = 0;
524 txc->errcnt = 0;
525 txc->stbcnt = 0;
526 707
527 write_sequnlock_irq(&xtime_lock); 708 write_sequnlock_irq(&xtime_lock);
528 709
@@ -536,6 +717,243 @@ int do_adjtimex(struct timex *txc)
536 return result; 717 return result;
537} 718}
538 719
720#ifdef CONFIG_NTP_PPS
721
722/* actually struct pps_normtime is good old struct timespec, but it is
723 * semantically different (and it is the reason why it was invented):
724 * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
725 * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
726struct pps_normtime {
727 __kernel_time_t sec; /* seconds */
728 long nsec; /* nanoseconds */
729};
730
731/* normalize the timestamp so that nsec is in the
732 ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
733static inline struct pps_normtime pps_normalize_ts(struct timespec ts)
734{
735 struct pps_normtime norm = {
736 .sec = ts.tv_sec,
737 .nsec = ts.tv_nsec
738 };
739
740 if (norm.nsec > (NSEC_PER_SEC >> 1)) {
741 norm.nsec -= NSEC_PER_SEC;
742 norm.sec++;
743 }
744
745 return norm;
746}
747
748/* get current phase correction and jitter */
749static inline long pps_phase_filter_get(long *jitter)
750{
751 *jitter = pps_tf[0] - pps_tf[1];
752 if (*jitter < 0)
753 *jitter = -*jitter;
754
755 /* TODO: test various filters */
756 return pps_tf[0];
757}
758
759/* add the sample to the phase filter */
760static inline void pps_phase_filter_add(long err)
761{
762 pps_tf[2] = pps_tf[1];
763 pps_tf[1] = pps_tf[0];
764 pps_tf[0] = err;
765}
766
767/* decrease frequency calibration interval length.
768 * It is halved after four consecutive unstable intervals.
769 */
770static inline void pps_dec_freq_interval(void)
771{
772 if (--pps_intcnt <= -PPS_INTCOUNT) {
773 pps_intcnt = -PPS_INTCOUNT;
774 if (pps_shift > PPS_INTMIN) {
775 pps_shift--;
776 pps_intcnt = 0;
777 }
778 }
779}
780
781/* increase frequency calibration interval length.
782 * It is doubled after four consecutive stable intervals.
783 */
784static inline void pps_inc_freq_interval(void)
785{
786 if (++pps_intcnt >= PPS_INTCOUNT) {
787 pps_intcnt = PPS_INTCOUNT;
788 if (pps_shift < PPS_INTMAX) {
789 pps_shift++;
790 pps_intcnt = 0;
791 }
792 }
793}
794
795/* update clock frequency based on MONOTONIC_RAW clock PPS signal
796 * timestamps
797 *
798 * At the end of the calibration interval the difference between the
799 * first and last MONOTONIC_RAW clock timestamps divided by the length
800 * of the interval becomes the frequency update. If the interval was
801 * too long, the data are discarded.
802 * Returns the difference between old and new frequency values.
803 */
804static long hardpps_update_freq(struct pps_normtime freq_norm)
805{
806 long delta, delta_mod;
807 s64 ftemp;
808
809 /* check if the frequency interval was too long */
810 if (freq_norm.sec > (2 << pps_shift)) {
811 time_status |= STA_PPSERROR;
812 pps_errcnt++;
813 pps_dec_freq_interval();
814 pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
815 freq_norm.sec);
816 return 0;
817 }
818
 819 /* here the raw frequency offset and wander (stability) are
820 * calculated. If the wander is less than the wander threshold
821 * the interval is increased; otherwise it is decreased.
822 */
823 ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT,
824 freq_norm.sec);
825 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
826 pps_freq = ftemp;
827 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
828 pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
829 time_status |= STA_PPSWANDER;
830 pps_stbcnt++;
831 pps_dec_freq_interval();
832 } else { /* good sample */
833 pps_inc_freq_interval();
834 }
835
836 /* the stability metric is calculated as the average of recent
837 * frequency changes, but is used only for performance
838 * monitoring
839 */
840 delta_mod = delta;
841 if (delta_mod < 0)
842 delta_mod = -delta_mod;
843 pps_stabil += (div_s64(((s64)delta_mod) <<
844 (NTP_SCALE_SHIFT - SHIFT_USEC),
845 NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN;
846
847 /* if enabled, the system clock frequency is updated */
848 if ((time_status & STA_PPSFREQ) != 0 &&
849 (time_status & STA_FREQHOLD) == 0) {
850 time_freq = pps_freq;
851 ntp_update_frequency();
852 }
853
854 return delta;
855}
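
In plain numbers the update above is just the residual phase drift divided by the interval length, scaled into the NTP fixed-point format; a small sketch of that arithmetic (NTP_SCALE_SHIFT assumed to be 32, as in <linux/timex.h>):

#include <stdio.h>
#include <stdint.h>

#define NTP_SCALE_SHIFT 32   /* assumed, from <linux/timex.h> */

int main(void)
{
	/* the raw clock accumulated an extra 500 ns over a 16 s interval */
	int64_t nsec = 500, sec = 16;

	/* cf. hardpps_update_freq(): ftemp = (-nsec << NTP_SCALE_SHIFT) / sec */
	int64_t ftemp = -(nsec << NTP_SCALE_SHIFT) / sec;

	/* back to ns per second (= parts per billion) for readability */
	printf("frequency correction = %lld (scaled), ~%.2f ns/s\n",
	       (long long)ftemp, (double)ftemp / (1LL << NTP_SCALE_SHIFT));
	return 0;
}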
856
857/* correct REALTIME clock phase error against PPS signal */
858static void hardpps_update_phase(long error)
859{
860 long correction = -error;
861 long jitter;
862
863 /* add the sample to the median filter */
864 pps_phase_filter_add(correction);
865 correction = pps_phase_filter_get(&jitter);
866
867 /* Nominal jitter is due to PPS signal noise. If it exceeds the
868 * threshold, the sample is discarded; otherwise, if so enabled,
869 * the time offset is updated.
870 */
871 if (jitter > (pps_jitter << PPS_POPCORN)) {
872 pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
873 jitter, (pps_jitter << PPS_POPCORN));
874 time_status |= STA_PPSJITTER;
875 pps_jitcnt++;
876 } else if (time_status & STA_PPSTIME) {
877 /* correct the time using the phase offset */
878 time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
879 NTP_INTERVAL_FREQ);
880 /* cancel running adjtime() */
881 time_adjust = 0;
882 }
883 /* update jitter */
884 pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
885}
886
887/*
888 * hardpps() - discipline CPU clock oscillator to external PPS signal
889 *
890 * This routine is called at each PPS signal arrival in order to
891 * discipline the CPU clock oscillator to the PPS signal. It takes two
892 * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former
893 * is used to correct clock phase error and the latter is used to
894 * correct the frequency.
895 *
896 * This code is based on David Mills's reference nanokernel
897 * implementation. It was mostly rewritten but keeps the same idea.
898 */
899void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
900{
901 struct pps_normtime pts_norm, freq_norm;
902 unsigned long flags;
903
904 pts_norm = pps_normalize_ts(*phase_ts);
905
906 write_seqlock_irqsave(&xtime_lock, flags);
907
908 /* clear the error bits, they will be set again if needed */
909 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
910
911 /* indicate signal presence */
912 time_status |= STA_PPSSIGNAL;
913 pps_valid = PPS_VALID;
914
915 /* when called for the first time,
916 * just start the frequency interval */
917 if (unlikely(pps_fbase.tv_sec == 0)) {
918 pps_fbase = *raw_ts;
919 write_sequnlock_irqrestore(&xtime_lock, flags);
920 return;
921 }
922
923 /* ok, now we have a base for frequency calculation */
924 freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase));
925
926 /* check that the signal is in the range
927 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
928 if ((freq_norm.sec == 0) ||
929 (freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
930 (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
931 time_status |= STA_PPSJITTER;
932 /* restart the frequency calibration interval */
933 pps_fbase = *raw_ts;
934 write_sequnlock_irqrestore(&xtime_lock, flags);
935 pr_err("hardpps: PPSJITTER: bad pulse\n");
936 return;
937 }
938
939 /* signal is ok */
940
941 /* check if the current frequency interval is finished */
942 if (freq_norm.sec >= (1 << pps_shift)) {
943 pps_calcnt++;
944 /* restart the frequency calibration interval */
945 pps_fbase = *raw_ts;
946 hardpps_update_freq(freq_norm);
947 }
948
949 hardpps_update_phase(pts_norm.nsec);
950
951 write_sequnlock_irqrestore(&xtime_lock, flags);
952}
953EXPORT_SYMBOL(hardpps);
954
955#endif /* CONFIG_NTP_PPS */
956
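None of the machinery above steers the clock until user space asks for it: hardpps() only applies frequency and phase corrections while STA_PPSFREQ/STA_PPSTIME are set via adjtimex(2). A hedged sketch of switching hard-PPS discipline on (requires CAP_SYS_TIME, CONFIG_NTP_PPS, and a PPS source already bound to the kernel consumer):

#include <stdio.h>
#include <string.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx;

	memset(&tx, 0, sizeof(tx));
	tx.modes = ADJ_STATUS;
	/* let the kernel discipline both frequency and phase from the pulse */
	tx.status = STA_PLL | STA_PPSFREQ | STA_PPSTIME;

	if (adjtimex(&tx) == -1) {
		perror("adjtimex");
		return 1;
	}

	/* pps_fill_timex() reports these fields; print a couple of them */
	printf("status=0x%x jitter=%ld calcnt=%ld errcnt=%ld\n",
	       tx.status, tx.jitter, tx.calcnt, tx.errcnt);
	return 0;
}
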
539static int __init ntp_tick_adj_setup(char *str) 957static int __init ntp_tick_adj_setup(char *str)
540{ 958{
541 ntp_tick_adj = simple_strtol(str, NULL, 0); 959 ntp_tick_adj = simple_strtol(str, NULL, 0);
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
new file mode 100644
index 000000000000..c340ca658f37
--- /dev/null
+++ b/kernel/time/posix-clock.c
@@ -0,0 +1,445 @@
1/*
2 * posix-clock.c - support for dynamic clock devices
3 *
4 * Copyright (C) 2010 OMICRON electronics GmbH
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20#include <linux/device.h>
21#include <linux/file.h>
22#include <linux/posix-clock.h>
23#include <linux/slab.h>
24#include <linux/syscalls.h>
25#include <linux/uaccess.h>
26
27static void delete_clock(struct kref *kref);
28
29/*
30 * Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
31 */
32static struct posix_clock *get_posix_clock(struct file *fp)
33{
34 struct posix_clock *clk = fp->private_data;
35
36 down_read(&clk->rwsem);
37
38 if (!clk->zombie)
39 return clk;
40
41 up_read(&clk->rwsem);
42
43 return NULL;
44}
45
46static void put_posix_clock(struct posix_clock *clk)
47{
48 up_read(&clk->rwsem);
49}
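
These two helpers are the entire read-side lifetime rule: take the rwsem shared and bail out if the clock was already zombied by posix_clock_unregister() further down. The same pattern, reduced to a hypothetical user-space object guarded by a pthread rwlock (illustration only, not kernel API):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct clock_obj {
	pthread_rwlock_t rwsem;
	bool zombie;              /* set once the backing device is gone */
};

/* reader side: cf. get_posix_clock()/put_posix_clock() */
static struct clock_obj *obj_get(struct clock_obj *c)
{
	pthread_rwlock_rdlock(&c->rwsem);
	if (!c->zombie)
		return c;         /* caller must call obj_put() */
	pthread_rwlock_unlock(&c->rwsem);
	return NULL;
}

static void obj_put(struct clock_obj *c)
{
	pthread_rwlock_unlock(&c->rwsem);
}

/* teardown side: cf. posix_clock_unregister() */
static void obj_kill(struct clock_obj *c)
{
	pthread_rwlock_wrlock(&c->rwsem);
	c->zombie = true;
	pthread_rwlock_unlock(&c->rwsem);
}

int main(void)
{
	struct clock_obj c = { .rwsem = PTHREAD_RWLOCK_INITIALIZER };

	if (obj_get(&c)) {
		printf("before kill: usable\n");
		obj_put(&c);
	}
	obj_kill(&c);
	printf("after kill:  %s\n", obj_get(&c) ? "usable" : "gone");
	return 0;
}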
50
51static ssize_t posix_clock_read(struct file *fp, char __user *buf,
52 size_t count, loff_t *ppos)
53{
54 struct posix_clock *clk = get_posix_clock(fp);
55 int err = -EINVAL;
56
57 if (!clk)
58 return -ENODEV;
59
60 if (clk->ops.read)
61 err = clk->ops.read(clk, fp->f_flags, buf, count);
62
63 put_posix_clock(clk);
64
65 return err;
66}
67
68static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
69{
70 struct posix_clock *clk = get_posix_clock(fp);
71 int result = 0;
72
73 if (!clk)
74 return -ENODEV;
75
76 if (clk->ops.poll)
77 result = clk->ops.poll(clk, fp, wait);
78
79 put_posix_clock(clk);
80
81 return result;
82}
83
84static int posix_clock_fasync(int fd, struct file *fp, int on)
85{
86 struct posix_clock *clk = get_posix_clock(fp);
87 int err = 0;
88
89 if (!clk)
90 return -ENODEV;
91
92 if (clk->ops.fasync)
93 err = clk->ops.fasync(clk, fd, fp, on);
94
95 put_posix_clock(clk);
96
97 return err;
98}
99
100static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma)
101{
102 struct posix_clock *clk = get_posix_clock(fp);
103 int err = -ENODEV;
104
105 if (!clk)
106 return -ENODEV;
107
108 if (clk->ops.mmap)
109 err = clk->ops.mmap(clk, vma);
110
111 put_posix_clock(clk);
112
113 return err;
114}
115
116static long posix_clock_ioctl(struct file *fp,
117 unsigned int cmd, unsigned long arg)
118{
119 struct posix_clock *clk = get_posix_clock(fp);
120 int err = -ENOTTY;
121
122 if (!clk)
123 return -ENODEV;
124
125 if (clk->ops.ioctl)
126 err = clk->ops.ioctl(clk, cmd, arg);
127
128 put_posix_clock(clk);
129
130 return err;
131}
132
133#ifdef CONFIG_COMPAT
134static long posix_clock_compat_ioctl(struct file *fp,
135 unsigned int cmd, unsigned long arg)
136{
137 struct posix_clock *clk = get_posix_clock(fp);
138 int err = -ENOTTY;
139
140 if (!clk)
141 return -ENODEV;
142
143 if (clk->ops.ioctl)
144 err = clk->ops.ioctl(clk, cmd, arg);
145
146 put_posix_clock(clk);
147
148 return err;
149}
150#endif
151
152static int posix_clock_open(struct inode *inode, struct file *fp)
153{
154 int err;
155 struct posix_clock *clk =
156 container_of(inode->i_cdev, struct posix_clock, cdev);
157
158 down_read(&clk->rwsem);
159
160 if (clk->zombie) {
161 err = -ENODEV;
162 goto out;
163 }
164 if (clk->ops.open)
165 err = clk->ops.open(clk, fp->f_mode);
166 else
167 err = 0;
168
169 if (!err) {
170 kref_get(&clk->kref);
171 fp->private_data = clk;
172 }
173out:
174 up_read(&clk->rwsem);
175 return err;
176}
177
178static int posix_clock_release(struct inode *inode, struct file *fp)
179{
180 struct posix_clock *clk = fp->private_data;
181 int err = 0;
182
183 if (clk->ops.release)
184 err = clk->ops.release(clk);
185
186 kref_put(&clk->kref, delete_clock);
187
188 fp->private_data = NULL;
189
190 return err;
191}
192
193static const struct file_operations posix_clock_file_operations = {
194 .owner = THIS_MODULE,
195 .llseek = no_llseek,
196 .read = posix_clock_read,
197 .poll = posix_clock_poll,
198 .unlocked_ioctl = posix_clock_ioctl,
199 .open = posix_clock_open,
200 .release = posix_clock_release,
201 .fasync = posix_clock_fasync,
202 .mmap = posix_clock_mmap,
203#ifdef CONFIG_COMPAT
204 .compat_ioctl = posix_clock_compat_ioctl,
205#endif
206};
207
208int posix_clock_register(struct posix_clock *clk, dev_t devid)
209{
210 int err;
211
212 kref_init(&clk->kref);
213 init_rwsem(&clk->rwsem);
214
215 cdev_init(&clk->cdev, &posix_clock_file_operations);
216 clk->cdev.owner = clk->ops.owner;
217 err = cdev_add(&clk->cdev, devid, 1);
218
219 return err;
220}
221EXPORT_SYMBOL_GPL(posix_clock_register);
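A driver exposes such a clock by embedding a struct posix_clock, filling in its ops and handing a char-device number to posix_clock_register(); the PTP clock infrastructure is the in-tree user of this interface. A hedged driver-side sketch, with hypothetical names and error handling trimmed:

/* hypothetical driver-side sketch, not part of this patch */
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/posix-clock.h>
#include <linux/time.h>

static struct posix_clock my_clock;
static dev_t my_devid;

static int my_gettime(struct posix_clock *pc, struct timespec *ts)
{
	*ts = current_kernel_time();   /* stand-in for reading real hardware */
	return 0;
}

static struct posix_clock_operations my_ops = {
	.owner         = THIS_MODULE,
	.clock_gettime = my_gettime,
};

static int __init my_init(void)
{
	int err = alloc_chrdev_region(&my_devid, 0, 1, "myclock");
	if (err)
		return err;

	my_clock.ops = my_ops;
	return posix_clock_register(&my_clock, my_devid);  /* cdev appears at my_devid */
}

static void __exit my_exit(void)
{
	posix_clock_unregister(&my_clock);
	unregister_chrdev_region(my_devid, 1);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
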
222
223static void delete_clock(struct kref *kref)
224{
225 struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
226
227 if (clk->release)
228 clk->release(clk);
229}
230
231void posix_clock_unregister(struct posix_clock *clk)
232{
233 cdev_del(&clk->cdev);
234
235 down_write(&clk->rwsem);
236 clk->zombie = true;
237 up_write(&clk->rwsem);
238
239 kref_put(&clk->kref, delete_clock);
240}
241EXPORT_SYMBOL_GPL(posix_clock_unregister);
242
243struct posix_clock_desc {
244 struct file *fp;
245 struct posix_clock *clk;
246};
247
248static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
249{
250 struct file *fp = fget(CLOCKID_TO_FD(id));
251 int err = -EINVAL;
252
253 if (!fp)
254 return err;
255
256 if (fp->f_op->open != posix_clock_open || !fp->private_data)
257 goto out;
258
259 cd->fp = fp;
260 cd->clk = get_posix_clock(fp);
261
262 err = cd->clk ? 0 : -ENODEV;
263out:
264 if (err)
265 fput(fp);
266 return err;
267}
268
269static void put_clock_desc(struct posix_clock_desc *cd)
270{
271 put_posix_clock(cd->clk);
272 fput(cd->fp);
273}
274
275static int pc_clock_adjtime(clockid_t id, struct timex *tx)
276{
277 struct posix_clock_desc cd;
278 int err;
279
280 err = get_clock_desc(id, &cd);
281 if (err)
282 return err;
283
284 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
285 err = -EACCES;
286 goto out;
287 }
288
289 if (cd.clk->ops.clock_adjtime)
290 err = cd.clk->ops.clock_adjtime(cd.clk, tx);
291 else
292 err = -EOPNOTSUPP;
293out:
294 put_clock_desc(&cd);
295
296 return err;
297}
298
299static int pc_clock_gettime(clockid_t id, struct timespec *ts)
300{
301 struct posix_clock_desc cd;
302 int err;
303
304 err = get_clock_desc(id, &cd);
305 if (err)
306 return err;
307
308 if (cd.clk->ops.clock_gettime)
309 err = cd.clk->ops.clock_gettime(cd.clk, ts);
310 else
311 err = -EOPNOTSUPP;
312
313 put_clock_desc(&cd);
314
315 return err;
316}
317
318static int pc_clock_getres(clockid_t id, struct timespec *ts)
319{
320 struct posix_clock_desc cd;
321 int err;
322
323 err = get_clock_desc(id, &cd);
324 if (err)
325 return err;
326
327 if (cd.clk->ops.clock_getres)
328 err = cd.clk->ops.clock_getres(cd.clk, ts);
329 else
330 err = -EOPNOTSUPP;
331
332 put_clock_desc(&cd);
333
334 return err;
335}
336
337static int pc_clock_settime(clockid_t id, const struct timespec *ts)
338{
339 struct posix_clock_desc cd;
340 int err;
341
342 err = get_clock_desc(id, &cd);
343 if (err)
344 return err;
345
346 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
347 err = -EACCES;
348 goto out;
349 }
350
351 if (cd.clk->ops.clock_settime)
352 err = cd.clk->ops.clock_settime(cd.clk, ts);
353 else
354 err = -EOPNOTSUPP;
355out:
356 put_clock_desc(&cd);
357
358 return err;
359}
360
361static int pc_timer_create(struct k_itimer *kit)
362{
363 clockid_t id = kit->it_clock;
364 struct posix_clock_desc cd;
365 int err;
366
367 err = get_clock_desc(id, &cd);
368 if (err)
369 return err;
370
371 if (cd.clk->ops.timer_create)
372 err = cd.clk->ops.timer_create(cd.clk, kit);
373 else
374 err = -EOPNOTSUPP;
375
376 put_clock_desc(&cd);
377
378 return err;
379}
380
381static int pc_timer_delete(struct k_itimer *kit)
382{
383 clockid_t id = kit->it_clock;
384 struct posix_clock_desc cd;
385 int err;
386
387 err = get_clock_desc(id, &cd);
388 if (err)
389 return err;
390
391 if (cd.clk->ops.timer_delete)
392 err = cd.clk->ops.timer_delete(cd.clk, kit);
393 else
394 err = -EOPNOTSUPP;
395
396 put_clock_desc(&cd);
397
398 return err;
399}
400
401static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
402{
403 clockid_t id = kit->it_clock;
404 struct posix_clock_desc cd;
405
406 if (get_clock_desc(id, &cd))
407 return;
408
409 if (cd.clk->ops.timer_gettime)
410 cd.clk->ops.timer_gettime(cd.clk, kit, ts);
411
412 put_clock_desc(&cd);
413}
414
415static int pc_timer_settime(struct k_itimer *kit, int flags,
416 struct itimerspec *ts, struct itimerspec *old)
417{
418 clockid_t id = kit->it_clock;
419 struct posix_clock_desc cd;
420 int err;
421
422 err = get_clock_desc(id, &cd);
423 if (err)
424 return err;
425
426 if (cd.clk->ops.timer_settime)
427 err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
428 else
429 err = -EOPNOTSUPP;
430
431 put_clock_desc(&cd);
432
433 return err;
434}
435
436struct k_clock clock_posix_dynamic = {
437 .clock_getres = pc_clock_getres,
438 .clock_set = pc_clock_settime,
439 .clock_get = pc_clock_gettime,
440 .clock_adj = pc_clock_adjtime,
441 .timer_create = pc_timer_create,
442 .timer_set = pc_timer_settime,
443 .timer_del = pc_timer_delete,
444 .timer_get = pc_timer_gettime,
445};
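The clockid reaching these pc_* handlers is not a static CLOCK_* constant; it encodes an open file descriptor of the clock's character device (CLOCKID_TO_FD above). User space builds such an id with the inverse encoding, conventionally called FD_TO_CLOCKID; a sketch, with /dev/ptp0 as a stand-in device path:

#include <fcntl.h>
#include <stdio.h>
#include <time.h>

/* dynamic clock ids: fd packed into the clockid, low bits = CLOCKFD (3) */
#define CLOCKFD 3
#define FD_TO_CLOCKID(fd) ((~(clockid_t)(fd) << 3) | CLOCKFD)

int main(void)
{
	int fd = open("/dev/ptp0", O_RDWR);    /* any posix_clock chardev */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	struct timespec ts;
	clockid_t clkid = FD_TO_CLOCKID(fd);

	/* ends up in pc_clock_gettime() via clock_posix_dynamic */
	if (clock_gettime(clkid, &ts) == 0)
		printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	else
		perror("clock_gettime");
	return 0;
}

On older glibc, link with -lrt for clock_gettime().
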
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 48b2761b5668..c7218d132738 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
@@ -457,23 +456,27 @@ void tick_broadcast_oneshot_control(unsigned long reason)
457 unsigned long flags; 456 unsigned long flags;
458 int cpu; 457 int cpu;
459 458
460 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
461
462 /* 459 /*
463 * Periodic mode does not care about the enter/exit of power 460 * Periodic mode does not care about the enter/exit of power
464 * states 461 * states
465 */ 462 */
466 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 463 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
467 goto out; 464 return;
468 465
469 bc = tick_broadcast_device.evtdev; 466 /*
467 * We are called with preemtion disabled from the depth of the
468 * idle code, so we can't be moved away.
469 */
470 cpu = smp_processor_id(); 470 cpu = smp_processor_id();
471 td = &per_cpu(tick_cpu_device, cpu); 471 td = &per_cpu(tick_cpu_device, cpu);
472 dev = td->evtdev; 472 dev = td->evtdev;
473 473
474 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) 474 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
475 goto out; 475 return;
476
477 bc = tick_broadcast_device.evtdev;
476 478
479 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
477 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 480 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
478 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { 481 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
479 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); 482 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
@@ -490,8 +493,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)
490 tick_program_event(dev->next_event, 1); 493 tick_program_event(dev->next_event, 1);
491 } 494 }
492 } 495 }
493
494out:
495 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 496 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
496} 497}
497 498
@@ -523,10 +524,11 @@ static void tick_broadcast_init_next_event(struct cpumask *mask,
523 */ 524 */
524void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 525void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
525{ 526{
527 int cpu = smp_processor_id();
528
526 /* Set it up only once ! */ 529 /* Set it up only once ! */
527 if (bc->event_handler != tick_handle_oneshot_broadcast) { 530 if (bc->event_handler != tick_handle_oneshot_broadcast) {
528 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 531 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
529 int cpu = smp_processor_id();
530 532
531 bc->event_handler = tick_handle_oneshot_broadcast; 533 bc->event_handler = tick_handle_oneshot_broadcast;
532 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 534 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -552,6 +554,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
552 tick_broadcast_set_event(tick_next_period, 1); 554 tick_broadcast_set_event(tick_next_period, 1);
553 } else 555 } else
554 bc->next_event.tv64 = KTIME_MAX; 556 bc->next_event.tv64 = KTIME_MAX;
557 } else {
558 /*
559 * The first cpu which switches to oneshot mode sets
560 * the bit for all other cpus which are in the general
561 * (periodic) broadcast mask. So the bit is set and
 562 * would prevent the first broadcast-enter event after this
 563 * from programming the bc device.
564 */
565 tick_broadcast_clear_oneshot(cpu);
555 } 566 }
556} 567}
557 568
@@ -600,4 +611,14 @@ int tick_broadcast_oneshot_active(void)
600 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; 611 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
601} 612}
602 613
614/*
615 * Check whether the broadcast device supports oneshot.
616 */
617bool tick_broadcast_oneshot_available(void)
618{
619 struct clock_event_device *bc = tick_broadcast_device.evtdev;
620
621 return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
622}
623
603#endif 624#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b6b898d2eeef..119528de8235 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include <asm/irq_regs.h> 22#include <asm/irq_regs.h>
24 23
@@ -49,9 +48,13 @@ struct tick_device *tick_get_device(int cpu)
49 */ 48 */
50int tick_is_oneshot_available(void) 49int tick_is_oneshot_available(void)
51{ 50{
52 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 51 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
53 52
54 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); 53 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT))
54 return 0;
55 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
56 return 1;
57 return tick_broadcast_oneshot_available();
55} 58}
56 59
57/* 60/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 290eefbc1f60..1009b06d6f89 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
1/* 1/*
2 * tick internal variable and functions used by low/high res code 2 * tick internal variable and functions used by low/high res code
3 */ 3 */
4#include <linux/hrtimer.h>
5#include <linux/tick.h>
6
7#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
4 8
5#define TICK_DO_TIMER_NONE -1 9#define TICK_DO_TIMER_NONE -1
6#define TICK_DO_TIMER_BOOT -2 10#define TICK_DO_TIMER_BOOT -2
@@ -36,6 +40,7 @@ extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
36extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 40extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
37extern int tick_broadcast_oneshot_active(void); 41extern int tick_broadcast_oneshot_active(void);
38extern void tick_check_oneshot_broadcast(int cpu); 42extern void tick_check_oneshot_broadcast(int cpu);
43bool tick_broadcast_oneshot_available(void);
39# else /* BROADCAST */ 44# else /* BROADCAST */
40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 45static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
41{ 46{
@@ -46,6 +51,7 @@ static inline void tick_broadcast_switch_to_oneshot(void) { }
46static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 51static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
47static inline int tick_broadcast_oneshot_active(void) { return 0; } 52static inline int tick_broadcast_oneshot_active(void) { return 0; }
48static inline void tick_check_oneshot_broadcast(int cpu) { } 53static inline void tick_check_oneshot_broadcast(int cpu) { }
54static inline bool tick_broadcast_oneshot_available(void) { return true; }
49# endif /* !BROADCAST */ 55# endif /* !BROADCAST */
50 56
51#else /* !ONESHOT */ 57#else /* !ONESHOT */
@@ -76,6 +82,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
76 return 0; 82 return 0;
77} 83}
78static inline int tick_broadcast_oneshot_active(void) { return 0; } 84static inline int tick_broadcast_oneshot_active(void) { return 0; }
85static inline bool tick_broadcast_oneshot_available(void) { return false; }
79#endif /* !TICK_ONESHOT */ 86#endif /* !TICK_ONESHOT */
80 87
81/* 88/*
@@ -132,3 +139,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
132{ 139{
133 return !(dev->features & CLOCK_EVT_FEAT_DUMMY); 140 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
134} 141}
142
143#endif
144
145extern void do_timer(unsigned long ticks);
146extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index aada0e52680a..2d04411a5f05 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
@@ -95,7 +94,7 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
95 */ 94 */
96int tick_program_event(ktime_t expires, int force) 95int tick_program_event(ktime_t expires, int force)
97{ 96{
98 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 97 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
99 98
100 return tick_dev_program_event(dev, expires, force); 99 return tick_dev_program_event(dev, expires, force);
101} 100}
@@ -167,7 +166,7 @@ int tick_oneshot_mode_active(void)
167 int ret; 166 int ret;
168 167
169 local_irq_save(flags); 168 local_irq_save(flags);
170 ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT; 169 ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT;
171 local_irq_restore(flags); 170 local_irq_restore(flags);
172 171
173 return ret; 172 return ret;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index bb2d8b7850a3..0c0e02f1b819 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -19,7 +19,6 @@
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/tick.h>
23#include <linux/module.h> 22#include <linux/module.h>
24 23
25#include <asm/irq_regs.h> 24#include <asm/irq_regs.h>
@@ -642,8 +641,7 @@ static void tick_nohz_switch_to_nohz(void)
642 } 641 }
643 local_irq_enable(); 642 local_irq_enable();
644 643
645 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", 644 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
646 smp_processor_id());
647} 645}
648 646
649/* 647/*
@@ -842,8 +840,10 @@ void tick_setup_sched_timer(void)
842 } 840 }
843 841
844#ifdef CONFIG_NO_HZ 842#ifdef CONFIG_NO_HZ
845 if (tick_nohz_enabled) 843 if (tick_nohz_enabled) {
846 ts->nohz_mode = NOHZ_MODE_HIGHRES; 844 ts->nohz_mode = NOHZ_MODE_HIGHRES;
845 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
846 }
847#endif 847#endif
848} 848}
849#endif /* HIGH_RES_TIMERS */ 849#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index ac38fbb176cc..a9ae369925ce 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -21,6 +21,7 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/math64.h> 23#include <linux/math64.h>
24#include <linux/kernel.h>
24 25
25/* 26/*
26 * fixed point arithmetic scale factor for skew 27 * fixed point arithmetic scale factor for skew
@@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync,
57 int index; 58 int index;
58 int num_samples = sync->num_samples; 59 int num_samples = sync->num_samples;
59 60
60 if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { 61 if (num_samples > ARRAY_SIZE(buffer)) {
61 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); 62 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
62 if (!samples) { 63 if (!samples) {
63 samples = buffer; 64 samples = buffer;
64 num_samples = sizeof(buffer)/sizeof(buffer[0]); 65 num_samples = ARRAY_SIZE(buffer);
65 } 66 }
66 } else { 67 } else {
67 samples = buffer; 68 samples = buffer;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 49010d822f72..342408cf68dd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -14,7 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/sysdev.h> 17#include <linux/syscore_ops.h>
18#include <linux/clocksource.h> 18#include <linux/clocksource.h>
19#include <linux/jiffies.h> 19#include <linux/jiffies.h>
20#include <linux/time.h> 20#include <linux/time.h>
@@ -32,6 +32,8 @@ struct timekeeper {
32 cycle_t cycle_interval; 32 cycle_t cycle_interval;
33 /* Number of clock shifted nano seconds in one NTP interval. */ 33 /* Number of clock shifted nano seconds in one NTP interval. */
34 u64 xtime_interval; 34 u64 xtime_interval;
35 /* shifted nano seconds left over when rounding cycle_interval */
36 s64 xtime_remainder;
35 /* Raw nano seconds accumulated per NTP interval. */ 37 /* Raw nano seconds accumulated per NTP interval. */
36 u32 raw_interval; 38 u32 raw_interval;
37 39
@@ -47,7 +49,7 @@ struct timekeeper {
47 u32 mult; 49 u32 mult;
48}; 50};
49 51
50struct timekeeper timekeeper; 52static struct timekeeper timekeeper;
51 53
52/** 54/**
53 * timekeeper_setup_internals - Set up internals to use clocksource clock. 55 * timekeeper_setup_internals - Set up internals to use clocksource clock.
@@ -62,7 +64,7 @@ struct timekeeper timekeeper;
62static void timekeeper_setup_internals(struct clocksource *clock) 64static void timekeeper_setup_internals(struct clocksource *clock)
63{ 65{
64 cycle_t interval; 66 cycle_t interval;
65 u64 tmp; 67 u64 tmp, ntpinterval;
66 68
67 timekeeper.clock = clock; 69 timekeeper.clock = clock;
68 clock->cycle_last = clock->read(clock); 70 clock->cycle_last = clock->read(clock);
@@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
70 /* Do the ns -> cycle conversion first, using original mult */ 72 /* Do the ns -> cycle conversion first, using original mult */
71 tmp = NTP_INTERVAL_LENGTH; 73 tmp = NTP_INTERVAL_LENGTH;
72 tmp <<= clock->shift; 74 tmp <<= clock->shift;
75 ntpinterval = tmp;
73 tmp += clock->mult/2; 76 tmp += clock->mult/2;
74 do_div(tmp, clock->mult); 77 do_div(tmp, clock->mult);
75 if (tmp == 0) 78 if (tmp == 0)
@@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
80 83
81 /* Go back from cycles -> shifted ns */ 84 /* Go back from cycles -> shifted ns */
82 timekeeper.xtime_interval = (u64) interval * clock->mult; 85 timekeeper.xtime_interval = (u64) interval * clock->mult;
86 timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
83 timekeeper.raw_interval = 87 timekeeper.raw_interval =
84 ((u64) interval * clock->mult) >> clock->shift; 88 ((u64) interval * clock->mult) >> clock->shift;
85 89
@@ -160,7 +164,7 @@ static struct timespec total_sleep_time;
160/* 164/*
161 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. 165 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
162 */ 166 */
163struct timespec raw_time; 167static struct timespec raw_time;
164 168
165/* flag for if timekeeping is suspended */ 169/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 170int __read_mostly timekeeping_suspended;
@@ -284,6 +288,49 @@ void ktime_get_ts(struct timespec *ts)
284} 288}
285EXPORT_SYMBOL_GPL(ktime_get_ts); 289EXPORT_SYMBOL_GPL(ktime_get_ts);
286 290
291#ifdef CONFIG_NTP_PPS
292
293/**
294 * getnstime_raw_and_real - get day and raw monotonic time in timespec format
295 * @ts_raw: pointer to the timespec to be set to raw monotonic time
296 * @ts_real: pointer to the timespec to be set to the time of day
297 *
298 * This function reads both the time of day and raw monotonic time at the
299 * same time atomically and stores the resulting timestamps in timespec
300 * format.
301 */
302void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
303{
304 unsigned long seq;
305 s64 nsecs_raw, nsecs_real;
306
307 WARN_ON_ONCE(timekeeping_suspended);
308
309 do {
310 u32 arch_offset;
311
312 seq = read_seqbegin(&xtime_lock);
313
314 *ts_raw = raw_time;
315 *ts_real = xtime;
316
317 nsecs_raw = timekeeping_get_ns_raw();
318 nsecs_real = timekeeping_get_ns();
319
320 /* If arch requires, add in gettimeoffset() */
321 arch_offset = arch_gettimeoffset();
322 nsecs_raw += arch_offset;
323 nsecs_real += arch_offset;
324
325 } while (read_seqretry(&xtime_lock, seq));
326
327 timespec_add_ns(ts_raw, nsecs_raw);
328 timespec_add_ns(ts_real, nsecs_real);
329}
330EXPORT_SYMBOL(getnstime_raw_and_real);
331
332#endif /* CONFIG_NTP_PPS */
333
287/** 334/**
288 * do_gettimeofday - Returns the time of day in a timeval 335 * do_gettimeofday - Returns the time of day in a timeval
289 * @tv: pointer to the timeval to be set 336 * @tv: pointer to the timeval to be set
@@ -306,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday);
306 * 353 *
307 * Sets the time of day to the new time and update NTP and notify hrtimers 354 * Sets the time of day to the new time and update NTP and notify hrtimers
308 */ 355 */
309int do_settimeofday(struct timespec *tv) 356int do_settimeofday(const struct timespec *tv)
310{ 357{
311 struct timespec ts_delta; 358 struct timespec ts_delta;
312 unsigned long flags; 359 unsigned long flags;
@@ -340,6 +387,42 @@ int do_settimeofday(struct timespec *tv)
340 387
341EXPORT_SYMBOL(do_settimeofday); 388EXPORT_SYMBOL(do_settimeofday);
342 389
390
391/**
392 * timekeeping_inject_offset - Adds or subtracts from the current time.
393 * @tv: pointer to the timespec variable containing the offset
394 *
395 * Adds or subtracts an offset value from the current time.
396 */
397int timekeeping_inject_offset(struct timespec *ts)
398{
399 unsigned long flags;
400
401 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
402 return -EINVAL;
403
404 write_seqlock_irqsave(&xtime_lock, flags);
405
406 timekeeping_forward_now();
407
408 xtime = timespec_add(xtime, *ts);
409 wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
410
411 timekeeper.ntp_error = 0;
412 ntp_clear();
413
414 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
415 timekeeper.mult);
416
417 write_sequnlock_irqrestore(&xtime_lock, flags);
418
419 /* signal hrtimers about time change */
420 clock_was_set();
421
422 return 0;
423}
424EXPORT_SYMBOL(timekeeping_inject_offset);
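timekeeping_inject_offset() is the backend for the ADJ_SETOFFSET mode of adjtimex(2)/clock_adjtime(2), which steps the clock by a signed delta instead of slewing it. A hedged user-space sketch (requires CAP_SYS_TIME; ADJ_SETOFFSET/ADJ_NANO assumed present in the installed headers):

#include <stdio.h>
#include <string.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx;

	memset(&tx, 0, sizeof(tx));
	tx.modes = ADJ_SETOFFSET | ADJ_NANO;  /* tx.time.tv_usec carries nanoseconds */
	tx.time.tv_sec  = 0;
	tx.time.tv_usec = 500000000;          /* step the clock forward by 0.5 s */

	if (adjtimex(&tx) == -1) {
		perror("adjtimex(ADJ_SETOFFSET)");
		return 1;
	}
	printf("clock stepped\n");
	return 0;
}
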
425
343/** 426/**
344 * change_clocksource - Swaps clocksources if a new one is available 427 * change_clocksource - Swaps clocksources if a new one is available
345 * 428 *
@@ -513,14 +596,65 @@ void __init timekeeping_init(void)
513static struct timespec timekeeping_suspend_time; 596static struct timespec timekeeping_suspend_time;
514 597
515/** 598/**
599 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
600 * @delta: pointer to a timespec delta value
601 *
602 * Takes a timespec offset measuring a suspend interval and properly
603 * adds the sleep offset to the timekeeping variables.
604 */
605static void __timekeeping_inject_sleeptime(struct timespec *delta)
606{
607 xtime = timespec_add(xtime, *delta);
608 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
609 total_sleep_time = timespec_add(total_sleep_time, *delta);
610}
611
612
613/**
 614 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values
615 * @delta: pointer to a timespec delta value
616 *
617 * This hook is for architectures that cannot support read_persistent_clock
618 * because their RTC/persistent clock is only accessible when irqs are enabled.
619 *
620 * This function should only be called by rtc_resume(), and allows
621 * a suspend offset to be injected into the timekeeping values.
622 */
623void timekeeping_inject_sleeptime(struct timespec *delta)
624{
625 unsigned long flags;
626 struct timespec ts;
627
628 /* Make sure we don't set the clock twice */
629 read_persistent_clock(&ts);
630 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
631 return;
632
633 write_seqlock_irqsave(&xtime_lock, flags);
634 timekeeping_forward_now();
635
636 __timekeeping_inject_sleeptime(delta);
637
638 timekeeper.ntp_error = 0;
639 ntp_clear();
640 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
641 timekeeper.mult);
642
643 write_sequnlock_irqrestore(&xtime_lock, flags);
644
645 /* signal hrtimers about time change */
646 clock_was_set();
647}
648
649
650/**
516 * timekeeping_resume - Resumes the generic timekeeping subsystem. 651 * timekeeping_resume - Resumes the generic timekeeping subsystem.
517 * @dev: unused
518 * 652 *
519 * This is for the generic clocksource timekeeping. 653 * This is for the generic clocksource timekeeping.
520 * xtime/wall_to_monotonic/jiffies/etc are 654 * xtime/wall_to_monotonic/jiffies/etc are
521 * still managed by arch specific suspend/resume code. 655 * still managed by arch specific suspend/resume code.
522 */ 656 */
523static int timekeeping_resume(struct sys_device *dev) 657static void timekeeping_resume(void)
524{ 658{
525 unsigned long flags; 659 unsigned long flags;
526 struct timespec ts; 660 struct timespec ts;
@@ -533,9 +667,7 @@ static int timekeeping_resume(struct sys_device *dev)
533 667
534 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 668 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
535 ts = timespec_sub(ts, timekeeping_suspend_time); 669 ts = timespec_sub(ts, timekeeping_suspend_time);
536 xtime = timespec_add(xtime, ts); 670 __timekeeping_inject_sleeptime(&ts);
537 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
538 total_sleep_time = timespec_add(total_sleep_time, ts);
539 } 671 }
540 /* re-base the last cycle value */ 672 /* re-base the last cycle value */
541 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 673 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
@@ -548,12 +680,10 @@ static int timekeeping_resume(struct sys_device *dev)
548 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); 680 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
549 681
550 /* Resume hrtimers */ 682 /* Resume hrtimers */
551 hres_timers_resume(); 683 hrtimers_resume();
552
553 return 0;
554} 684}
555 685
556static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) 686static int timekeeping_suspend(void)
557{ 687{
558 unsigned long flags; 688 unsigned long flags;
559 689
@@ -571,26 +701,18 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
571} 701}
572 702
573/* sysfs resume/suspend bits for timekeeping */ 703/* sysfs resume/suspend bits for timekeeping */
574static struct sysdev_class timekeeping_sysclass = { 704static struct syscore_ops timekeeping_syscore_ops = {
575 .name = "timekeeping",
576 .resume = timekeeping_resume, 705 .resume = timekeeping_resume,
577 .suspend = timekeeping_suspend, 706 .suspend = timekeeping_suspend,
578}; 707};
579 708
580static struct sys_device device_timer = { 709static int __init timekeeping_init_ops(void)
581 .id = 0,
582 .cls = &timekeeping_sysclass,
583};
584
585static int __init timekeeping_init_device(void)
586{ 710{
587 int error = sysdev_class_register(&timekeeping_sysclass); 711 register_syscore_ops(&timekeeping_syscore_ops);
588 if (!error) 712 return 0;
589 error = sysdev_register(&device_timer);
590 return error;
591} 713}
592 714
593device_initcall(timekeeping_init_device); 715device_initcall(timekeeping_init_ops);
594 716
595/* 717/*
596 * If the error is already larger, we look ahead even further 718 * If the error is already larger, we look ahead even further
@@ -719,7 +841,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
719 841
720 /* Accumulate error between NTP and clock interval */ 842 /* Accumulate error between NTP and clock interval */
721 timekeeper.ntp_error += tick_length << shift; 843 timekeeper.ntp_error += tick_length << shift;
722 timekeeper.ntp_error -= timekeeper.xtime_interval << 844 timekeeper.ntp_error -=
845 (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
723 (timekeeper.ntp_error_shift + shift); 846 (timekeeper.ntp_error_shift + shift);
724 847
725 return offset; 848 return offset;
@@ -731,7 +854,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
731 * 854 *
732 * Called from the timer interrupt, must hold a write on xtime_lock. 855 * Called from the timer interrupt, must hold a write on xtime_lock.
733 */ 856 */
734void update_wall_time(void) 857static void update_wall_time(void)
735{ 858{
736 struct clocksource *clock; 859 struct clocksource *clock;
737 cycle_t offset; 860 cycle_t offset;
@@ -823,7 +946,7 @@ void update_wall_time(void)
823 * getboottime - Return the real time of system boot. 946 * getboottime - Return the real time of system boot.
824 * @ts: pointer to the timespec to be set 947 * @ts: pointer to the timespec to be set
825 * 948 *
826 * Returns the time of day in a timespec. 949 * Returns the wall-time of boot in a timespec.
827 * 950 *
828 * This is based on the wall_to_monotonic offset and the total suspend 951 * This is based on the wall_to_monotonic offset and the total suspend
829 * time. Calls to settimeofday will affect the value returned (which 952 * time. Calls to settimeofday will affect the value returned (which
@@ -841,6 +964,55 @@ void getboottime(struct timespec *ts)
841} 964}
842EXPORT_SYMBOL_GPL(getboottime); 965EXPORT_SYMBOL_GPL(getboottime);
843 966
967
968/**
969 * get_monotonic_boottime - Returns monotonic time since boot
970 * @ts: pointer to the timespec to be set
971 *
972 * Returns the monotonic time since boot in a timespec.
973 *
 974 * This is similar to CLOCK_MONOTONIC/ktime_get_ts, but also
975 * includes the time spent in suspend.
976 */
977void get_monotonic_boottime(struct timespec *ts)
978{
979 struct timespec tomono, sleep;
980 unsigned int seq;
981 s64 nsecs;
982
983 WARN_ON(timekeeping_suspended);
984
985 do {
986 seq = read_seqbegin(&xtime_lock);
987 *ts = xtime;
988 tomono = wall_to_monotonic;
989 sleep = total_sleep_time;
990 nsecs = timekeeping_get_ns();
991
992 } while (read_seqretry(&xtime_lock, seq));
993
994 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
995 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
996}
997EXPORT_SYMBOL_GPL(get_monotonic_boottime);
998
999/**
1000 * ktime_get_boottime - Returns monotonic time since boot in a ktime
1001 *
1002 * Returns the monotonic time since boot in a ktime
1003 *
 1004 * This is similar to CLOCK_MONOTONIC/ktime_get, but also
1005 * includes the time spent in suspend.
1006 */
1007ktime_t ktime_get_boottime(void)
1008{
1009 struct timespec ts;
1010
1011 get_monotonic_boottime(&ts);
1012 return timespec_to_ktime(ts);
1013}
1014EXPORT_SYMBOL_GPL(ktime_get_boottime);
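This pair is what backs the CLOCK_BOOTTIME posix clock: the same monotonic timeline, but including time spent in suspend. A small user-space comparison (CLOCK_BOOTTIME is assumed to be exposed by the installed headers; it appeared alongside this code):

#include <stdio.h>
#include <time.h>

#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7   /* assumed value if the libc headers predate it */
#endif

int main(void)
{
	struct timespec mono, boot;

	clock_gettime(CLOCK_MONOTONIC, &mono);
	clock_gettime(CLOCK_BOOTTIME, &boot);

	/* the difference is (roughly) the total time spent in suspend */
	printf("monotonic: %lld.%09ld\n", (long long)mono.tv_sec, mono.tv_nsec);
	printf("boottime:  %lld.%09ld\n", (long long)boot.tv_sec, boot.tv_nsec);
	return 0;
}
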
1015
844/** 1016/**
845 * monotonic_to_bootbased - Convert the monotonic time to boot based. 1017 * monotonic_to_bootbased - Convert the monotonic time to boot based.
846 * @ts: pointer to the timespec to be converted 1018 * @ts: pointer to the timespec to be converted
@@ -862,11 +1034,6 @@ struct timespec __current_kernel_time(void)
862 return xtime; 1034 return xtime;
863} 1035}
864 1036
865struct timespec __get_wall_to_monotonic(void)
866{
867 return wall_to_monotonic;
868}
869
870struct timespec current_kernel_time(void) 1037struct timespec current_kernel_time(void)
871{ 1038{
872 struct timespec now; 1039 struct timespec now;
@@ -898,3 +1065,63 @@ struct timespec get_monotonic_coarse(void)
898 now.tv_nsec + mono.tv_nsec); 1065 now.tv_nsec + mono.tv_nsec);
899 return now; 1066 return now;
900} 1067}
1068
1069/*
1070 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1071 * without sampling the sequence number in xtime_lock.
1072 * jiffies is defined in the linker script...
1073 */
1074void do_timer(unsigned long ticks)
1075{
1076 jiffies_64 += ticks;
1077 update_wall_time();
1078 calc_global_load(ticks);
1079}
1080
1081/**
1082 * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
1083 * and sleep offsets.
1084 * @xtim: pointer to timespec to be set with xtime
1085 * @wtom: pointer to timespec to be set with wall_to_monotonic
1086 * @sleep: pointer to timespec to be set with time in suspend
1087 */
1088void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1089 struct timespec *wtom, struct timespec *sleep)
1090{
1091 unsigned long seq;
1092
1093 do {
1094 seq = read_seqbegin(&xtime_lock);
1095 *xtim = xtime;
1096 *wtom = wall_to_monotonic;
1097 *sleep = total_sleep_time;
1098 } while (read_seqretry(&xtime_lock, seq));
1099}
1100
1101/**
1102 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1103 */
1104ktime_t ktime_get_monotonic_offset(void)
1105{
1106 unsigned long seq;
1107 struct timespec wtom;
1108
1109 do {
1110 seq = read_seqbegin(&xtime_lock);
1111 wtom = wall_to_monotonic;
1112 } while (read_seqretry(&xtime_lock, seq));
1113 return timespec_to_ktime(wtom);
1114}
1115
1116/**
1117 * xtime_update() - advances the timekeeping infrastructure
1118 * @ticks: number of ticks, that have elapsed since the last call.
1119 *
1120 * Must be called with interrupts disabled.
1121 */
1122void xtime_update(unsigned long ticks)
1123{
1124 write_seqlock(&xtime_lock);
1125 do_timer(ticks);
1126 write_sequnlock(&xtime_lock);
1127}
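
The reading rule spelled out above do_timer() is what get_jiffies_64() implements for jiffies_64 consumers; roughly (sketched from memory rather than quoted from kernel/time/jiffies.c):

u64 get_jiffies_64(void)
{
	unsigned long seq;
	u64 ret;

	do {
		/* retry if a writer (xtime_update() above) raced with us */
		seq = read_seqbegin(&xtime_lock);
		ret = jiffies_64;
	} while (read_seqretry(&xtime_lock, seq));
	return ret;
}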
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ab8f5e33fa92..3258455549f4 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -41,7 +41,7 @@ static void print_name_offset(struct seq_file *m, void *sym)
41 char symname[KSYM_NAME_LEN]; 41 char symname[KSYM_NAME_LEN];
42 42
43 if (lookup_symbol_name((unsigned long)sym, symname) < 0) 43 if (lookup_symbol_name((unsigned long)sym, symname) < 0)
44 SEQ_printf(m, "<%p>", sym); 44 SEQ_printf(m, "<%pK>", sym);
45 else 45 else
46 SEQ_printf(m, "%s", symname); 46 SEQ_printf(m, "%s", symname);
47} 47}
@@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
79{ 79{
80 struct hrtimer *timer, tmp; 80 struct hrtimer *timer, tmp;
81 unsigned long next = 0, i; 81 unsigned long next = 0, i;
82 struct rb_node *curr; 82 struct timerqueue_node *curr;
83 unsigned long flags; 83 unsigned long flags;
84 84
85next_one: 85next_one:
86 i = 0; 86 i = 0;
87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags); 87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
88 88
89 curr = base->first; 89 curr = timerqueue_getnext(&base->active);
90 /* 90 /*
91 * Crude but we have to do this O(N*N) thing, because 91 * Crude but we have to do this O(N*N) thing, because
92 * we have to unlock the base when printing: 92 * we have to unlock the base when printing:
93 */ 93 */
94 while (curr && i < next) { 94 while (curr && i < next) {
95 curr = rb_next(curr); 95 curr = timerqueue_iterate_next(curr);
96 i++; 96 i++;
97 } 97 }
98 98
99 if (curr) { 99 if (curr) {
100 100
101 timer = rb_entry(curr, struct hrtimer, node); 101 timer = container_of(curr, struct hrtimer, node);
102 tmp = *timer; 102 tmp = *timer;
103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); 103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
104 104
@@ -112,7 +112,7 @@ next_one:
112static void 112static void
113print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) 113print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
114{ 114{
115 SEQ_printf(m, " .base: %p\n", base); 115 SEQ_printf(m, " .base: %pK\n", base);
116 SEQ_printf(m, " .index: %d\n", 116 SEQ_printf(m, " .index: %d\n",
117 base->index); 117 base->index);
118 SEQ_printf(m, " .resolution: %Lu nsecs\n", 118 SEQ_printf(m, " .resolution: %Lu nsecs\n",
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 2f3b585b8d7d..a5d0a3a85dd8 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
236 unsigned int timer_flag) 236 unsigned int timer_flag)
237{ 237{
238 /* 238 /*
239 * It doesnt matter which lock we take: 239 * It doesn't matter which lock we take:
240 */ 240 */
241 raw_spinlock_t *lock; 241 raw_spinlock_t *lock;
242 struct entry *entry, input; 242 struct entry *entry, input;
diff --git a/kernel/timer.c b/kernel/timer.c
index 97bf05baade7..8cff36119e4d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_event.h> 40#include <linux/irq_work.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43 43
@@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases;
88EXPORT_SYMBOL(boot_tvec_bases); 88EXPORT_SYMBOL(boot_tvec_bases);
89static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; 89static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
90 90
91/*
92 * Note that all tvec_bases are 2 byte aligned and lower bit of
93 * base in timer_list is guaranteed to be zero. Use the LSB to
94 * indicate whether the timer is deferrable.
95 *
96 * A deferrable timer will work normally when the system is busy, but
97 * will not cause a CPU to come out of idle just to service it; instead,
98 * the timer will be serviced when the CPU eventually wakes up with a
99 * subsequent non-deferrable timer.
100 */
101#define TBASE_DEFERRABLE_FLAG (0x1)
102
103/* Functions below help us manage 'deferrable' flag */ 91/* Functions below help us manage 'deferrable' flag */
104static inline unsigned int tbase_get_deferrable(struct tvec_base *base) 92static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
105{ 93{
@@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
113 101
114static inline void timer_set_deferrable(struct timer_list *timer) 102static inline void timer_set_deferrable(struct timer_list *timer)
115{ 103{
116 timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | 104 timer->base = TBASE_MAKE_DEFERRED(timer->base);
117 TBASE_DEFERRABLE_FLAG));
118} 105}
119 106
120static inline void 107static inline void
@@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
343} 330}
344EXPORT_SYMBOL_GPL(set_timer_slack); 331EXPORT_SYMBOL_GPL(set_timer_slack);
345 332
346
347static inline void set_running_timer(struct tvec_base *base,
348 struct timer_list *timer)
349{
350#ifdef CONFIG_SMP
351 base->running_timer = timer;
352#endif
353}
354
355static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) 333static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
356{ 334{
357 unsigned long expires = timer->expires; 335 unsigned long expires = timer->expires;
@@ -426,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {}
426 404
427static struct debug_obj_descr timer_debug_descr; 405static struct debug_obj_descr timer_debug_descr;
428 406
407static void *timer_debug_hint(void *addr)
408{
409 return ((struct timer_list *) addr)->function;
410}
411
429/* 412/*
430 * fixup_init is called when: 413 * fixup_init is called when:
431 * - an active object is initialized 414 * - an active object is initialized
@@ -499,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
499 482
500static struct debug_obj_descr timer_debug_descr = { 483static struct debug_obj_descr timer_debug_descr = {
501 .name = "timer_list", 484 .name = "timer_list",
485 .debug_hint = timer_debug_hint,
502 .fixup_init = timer_fixup_init, 486 .fixup_init = timer_fixup_init,
503 .fixup_activate = timer_fixup_activate, 487 .fixup_activate = timer_fixup_activate,
504 .fixup_free = timer_fixup_free, 488 .fixup_free = timer_fixup_free,
@@ -765,16 +749,15 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
765 unsigned long expires_limit, mask; 749 unsigned long expires_limit, mask;
766 int bit; 750 int bit;
767 751
768 expires_limit = expires;
769
770 if (timer->slack >= 0) { 752 if (timer->slack >= 0) {
771 expires_limit = expires + timer->slack; 753 expires_limit = expires + timer->slack;
772 } else { 754 } else {
773 unsigned long now = jiffies; 755 long delta = expires - jiffies;
756
757 if (delta < 256)
758 return expires;
774 759
775 /* No slack, if already expired else auto slack 0.4% */ 760 expires_limit = expires + delta / 256;
776 if (time_after(expires, now))
777 expires_limit = expires + (expires - now)/256;
778 } 761 }
779 mask = expires ^ expires_limit; 762 mask = expires ^ expires_limit;
780 if (mask == 0) 763 if (mask == 0)
@@ -811,6 +794,8 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
811 */ 794 */
812int mod_timer(struct timer_list *timer, unsigned long expires) 795int mod_timer(struct timer_list *timer, unsigned long expires)
813{ 796{
797 expires = apply_slack(timer, expires);
798
814 /* 799 /*
815 * This is a common optimization triggered by the 800 * This is a common optimization triggered by the
816 * networking code - if the timer is re-modified 801 * networking code - if the timer is re-modified
@@ -819,8 +804,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
819 if (timer_pending(timer) && timer->expires == expires) 804 if (timer_pending(timer) && timer->expires == expires)
820 return 1; 805 return 1;
821 806
822 expires = apply_slack(timer, expires);
823
824 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 807 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
825} 808}
826EXPORT_SYMBOL(mod_timer); 809EXPORT_SYMBOL(mod_timer);
@@ -936,15 +919,12 @@ int del_timer(struct timer_list *timer)
936} 919}
937EXPORT_SYMBOL(del_timer); 920EXPORT_SYMBOL(del_timer);
938 921
939#ifdef CONFIG_SMP
940/** 922/**
941 * try_to_del_timer_sync - Try to deactivate a timer 923 * try_to_del_timer_sync - Try to deactivate a timer
942 * @timer: timer do del 924 * @timer: timer do del
943 * 925 *
944 * This function tries to deactivate a timer. Upon successful (ret >= 0) 926 * This function tries to deactivate a timer. Upon successful (ret >= 0)
945 * exit the timer is not queued and the handler is not running on any CPU. 927 * exit the timer is not queued and the handler is not running on any CPU.
946 *
947 * It must not be called from interrupt contexts.
948 */ 928 */
949int try_to_del_timer_sync(struct timer_list *timer) 929int try_to_del_timer_sync(struct timer_list *timer)
950{ 930{
@@ -973,6 +953,7 @@ out:
973} 953}
974EXPORT_SYMBOL(try_to_del_timer_sync); 954EXPORT_SYMBOL(try_to_del_timer_sync);
975 955
956#ifdef CONFIG_SMP
976/** 957/**
977 * del_timer_sync - deactivate a timer and wait for the handler to finish. 958 * del_timer_sync - deactivate a timer and wait for the handler to finish.
978 * @timer: the timer to be deactivated 959 * @timer: the timer to be deactivated
@@ -988,6 +969,25 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
988 * add_timer_on(). Upon exit the timer is not queued and the handler is 969 * add_timer_on(). Upon exit the timer is not queued and the handler is
989 * not running on any CPU. 970 * not running on any CPU.
990 * 971 *
972 * Note: You must not hold locks that are held in interrupt context
973 * while calling this function. Even if the lock has nothing to do
974 * with the timer in question. Here's why:
975 *
976 * CPU0 CPU1
977 * ---- ----
978 * <SOFTIRQ>
979 * call_timer_fn();
980 * base->running_timer = mytimer;
981 * spin_lock_irq(somelock);
982 * <IRQ>
983 * spin_lock(somelock);
984 * del_timer_sync(mytimer);
985 * while (base->running_timer == mytimer);
986 *
987 * Now del_timer_sync() will never return and never release somelock.
988 * The interrupt on the other CPU is waiting to grab somelock but
989 * it has interrupted the softirq that CPU0 is waiting to finish.
990 *
991 * The function returns whether it has deactivated a pending timer or not. 991 * The function returns whether it has deactivated a pending timer or not.
992 */ 992 */
993int del_timer_sync(struct timer_list *timer) 993int del_timer_sync(struct timer_list *timer)
@@ -995,12 +995,20 @@ int del_timer_sync(struct timer_list *timer)
995#ifdef CONFIG_LOCKDEP 995#ifdef CONFIG_LOCKDEP
996 unsigned long flags; 996 unsigned long flags;
997 997
998 /*
999 * If lockdep gives a backtrace here, please reference
1000 * the synchronization rules above.
1001 */
998 local_irq_save(flags); 1002 local_irq_save(flags);
999 lock_map_acquire(&timer->lockdep_map); 1003 lock_map_acquire(&timer->lockdep_map);
1000 lock_map_release(&timer->lockdep_map); 1004 lock_map_release(&timer->lockdep_map);
1001 local_irq_restore(flags); 1005 local_irq_restore(flags);
1002#endif 1006#endif
1003 1007 /*
1008 * don't use it in hardirq context, because it
1009 * could lead to deadlock.
1010 */
1011 WARN_ON(in_irq());
1004 for (;;) { 1012 for (;;) {
1005 int ret = try_to_del_timer_sync(timer); 1013 int ret = try_to_del_timer_sync(timer);
1006 if (ret >= 0) 1014 if (ret >= 0)
@@ -1111,7 +1119,7 @@ static inline void __run_timers(struct tvec_base *base)
1111 1119
1112 timer_stats_account_timer(timer); 1120 timer_stats_account_timer(timer);
1113 1121
1114 set_running_timer(base, timer); 1122 base->running_timer = timer;
1115 detach_timer(timer, 1); 1123 detach_timer(timer, 1);
1116 1124
1117 spin_unlock_irq(&base->lock); 1125 spin_unlock_irq(&base->lock);
@@ -1119,7 +1127,7 @@ static inline void __run_timers(struct tvec_base *base)
1119 spin_lock_irq(&base->lock); 1127 spin_lock_irq(&base->lock);
1120 } 1128 }
1121 } 1129 }
1122 set_running_timer(base, NULL); 1130 base->running_timer = NULL;
1123 spin_unlock_irq(&base->lock); 1131 spin_unlock_irq(&base->lock);
1124} 1132}
1125 1133
@@ -1249,9 +1257,15 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
1249 */ 1257 */
1250unsigned long get_next_timer_interrupt(unsigned long now) 1258unsigned long get_next_timer_interrupt(unsigned long now)
1251{ 1259{
1252 struct tvec_base *base = __get_cpu_var(tvec_bases); 1260 struct tvec_base *base = __this_cpu_read(tvec_bases);
1253 unsigned long expires; 1261 unsigned long expires;
1254 1262
1263 /*
1264 * Pretend that there is no timer pending if the cpu is offline.
1265 * Possible pending timers will be migrated later to an active cpu.
1266 */
1267 if (cpu_is_offline(smp_processor_id()))
1268 return now + NEXT_TIMER_MAX_DELTA;
1255 spin_lock(&base->lock); 1269 spin_lock(&base->lock);
1256 if (time_before_eq(base->next_timer, base->timer_jiffies)) 1270 if (time_before_eq(base->next_timer, base->timer_jiffies))
1257 base->next_timer = __next_timer_interrupt(base); 1271 base->next_timer = __next_timer_interrupt(base);
@@ -1279,7 +1293,10 @@ void update_process_times(int user_tick)
1279 run_local_timers(); 1293 run_local_timers();
1280 rcu_check_callbacks(cpu, user_tick); 1294 rcu_check_callbacks(cpu, user_tick);
1281 printk_tick(); 1295 printk_tick();
1282 perf_event_do_pending(); 1296#ifdef CONFIG_IRQ_WORK
1297 if (in_irq())
1298 irq_work_run();
1299#endif
1283 scheduler_tick(); 1300 scheduler_tick();
1284 run_posix_cpu_timers(p); 1301 run_posix_cpu_timers(p);
1285} 1302}
@@ -1289,7 +1306,7 @@ void update_process_times(int user_tick)
1289 */ 1306 */
1290static void run_timer_softirq(struct softirq_action *h) 1307static void run_timer_softirq(struct softirq_action *h)
1291{ 1308{
1292 struct tvec_base *base = __get_cpu_var(tvec_bases); 1309 struct tvec_base *base = __this_cpu_read(tvec_bases);
1293 1310
1294 hrtimer_run_pending(); 1311 hrtimer_run_pending();
1295 1312
@@ -1306,19 +1323,6 @@ void run_local_timers(void)
1306 raise_softirq(TIMER_SOFTIRQ); 1323 raise_softirq(TIMER_SOFTIRQ);
1307} 1324}
1308 1325
1309/*
1310 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1311 * without sampling the sequence number in xtime_lock.
1312 * jiffies is defined in the linker script...
1313 */
1314
1315void do_timer(unsigned long ticks)
1316{
1317 jiffies_64 += ticks;
1318 update_wall_time();
1319 calc_global_load();
1320}
1321
1322#ifdef __ARCH_WANT_SYS_ALARM 1326#ifdef __ARCH_WANT_SYS_ALARM
1323 1327
1324/* 1328/*
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 538501c6ea50..2ad39e556cb4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS
49 help 49 help
50 See Documentation/trace/ftrace-design.txt 50 See Documentation/trace/ftrace-design.txt
51 51
52config HAVE_C_RECORDMCOUNT
53 bool
54 help
55 C version of recordmcount available?
56
52config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
53 bool 58 bool
54 59
@@ -64,6 +69,21 @@ config EVENT_TRACING
64 select CONTEXT_SWITCH_TRACER 69 select CONTEXT_SWITCH_TRACER
65 bool 70 bool
66 71
72config EVENT_POWER_TRACING_DEPRECATED
73 depends on EVENT_TRACING
74 bool "Deprecated power event trace API, to be removed"
75 default y
76 help
77 Provides old power event types:
78 C-state/idle accounting events:
79 power:power_start
80 power:power_end
81 and old cpufreq accounting event:
82 power:power_frequency
83 This is for userspace compatibility
84 and will vanish after 5 kernel iterations,
85 namely 2.6.41.
86
67config CONTEXT_SWITCH_TRACER 87config CONTEXT_SWITCH_TRACER
68 bool 88 bool
69 89
@@ -121,7 +141,7 @@ if FTRACE
121config FUNCTION_TRACER 141config FUNCTION_TRACER
122 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
123 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
124 select FRAME_POINTER 144 select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE
125 select KALLSYMS 145 select KALLSYMS
126 select GENERIC_TRACER 146 select GENERIC_TRACER
127 select CONTEXT_SWITCH_TRACER 147 select CONTEXT_SWITCH_TRACER
@@ -255,7 +275,7 @@ config PROFILE_ANNOTATED_BRANCHES
255 This tracer profiles all the likely and unlikely macros 275
256 in the kernel. It will display the results in: 276 in the kernel. It will display the results in:
257 277
258 /sys/kernel/debug/tracing/profile_annotated_branch 278 /sys/kernel/debug/tracing/trace_stat/branch_annotated
259 279
260 Note: this will add a significant overhead; only turn this 280 Note: this will add a significant overhead; only turn this
261 on if you need to profile the system's use of these macros. 281 on if you need to profile the system's use of these macros.
@@ -268,7 +288,7 @@ config PROFILE_ALL_BRANCHES
268 taken in the kernel is recorded whether it hit or miss. 288 taken in the kernel is recorded whether it hit or miss.
269 The results will be displayed in: 289 The results will be displayed in:
270 290
271 /sys/kernel/debug/tracing/profile_branch 291 /sys/kernel/debug/tracing/trace_stat/branch_all
272 292
273 This option also enables the likely/unlikely profiler. 293 This option also enables the likely/unlikely profiler.
274 294
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 53f338190b26..761c510a06c5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
52endif 52endif
53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
55obj-$(CONFIG_EVENT_TRACING) += power-traces.o 55obj-$(CONFIG_TRACEPOINTS) += power-traces.o
56ifeq ($(CONFIG_TRACING),y) 56ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif 58endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 959f8d6c8cc1..6957aa298dfa 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,7 +23,6 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/smp_lock.h>
27#include <linux/time.h> 26#include <linux/time.h>
28#include <linux/uaccess.h> 27#include <linux/uaccess.h>
29 28
@@ -139,6 +138,13 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
139 !blk_tracer_enabled)) 138 !blk_tracer_enabled))
140 return; 139 return;
141 140
141 /*
142 * If the BLK_TC_NOTIFY action mask isn't set, don't send any note
143 * message to the trace.
144 */
145 if (!(bt->act_mask & BLK_TC_NOTIFY))
146 return;
147
142 local_irq_save(flags); 148 local_irq_save(flags);
143 buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); 149 buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
144 va_start(args, fmt); 150 va_start(args, fmt);
@@ -169,7 +175,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
169static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), 175static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
170 BLK_TC_ACT(BLK_TC_WRITE) }; 176 BLK_TC_ACT(BLK_TC_WRITE) };
171 177
172#define BLK_TC_HARDBARRIER BLK_TC_BARRIER
173#define BLK_TC_RAHEAD BLK_TC_AHEAD 178#define BLK_TC_RAHEAD BLK_TC_AHEAD
174 179
175/* The ilog2() calls fall out because they're constant */ 180/* The ilog2() calls fall out because they're constant */
@@ -197,7 +202,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
197 return; 202 return;
198 203
199 what |= ddir_act[rw & WRITE]; 204 what |= ddir_act[rw & WRITE];
200 what |= MASK_TC_BIT(rw, HARDBARRIER);
201 what |= MASK_TC_BIT(rw, SYNC); 205 what |= MASK_TC_BIT(rw, SYNC);
202 what |= MASK_TC_BIT(rw, RAHEAD); 206 what |= MASK_TC_BIT(rw, RAHEAD);
203 what |= MASK_TC_BIT(rw, META); 207 what |= MASK_TC_BIT(rw, META);
@@ -326,6 +330,7 @@ static const struct file_operations blk_dropped_fops = {
326 .owner = THIS_MODULE, 330 .owner = THIS_MODULE,
327 .open = blk_dropped_open, 331 .open = blk_dropped_open,
328 .read = blk_dropped_read, 332 .read = blk_dropped_read,
333 .llseek = default_llseek,
329}; 334};
330 335
331static int blk_msg_open(struct inode *inode, struct file *filp) 336static int blk_msg_open(struct inode *inode, struct file *filp)
@@ -365,6 +370,7 @@ static const struct file_operations blk_msg_fops = {
365 .owner = THIS_MODULE, 370 .owner = THIS_MODULE,
366 .open = blk_msg_open, 371 .open = blk_msg_open,
367 .write = blk_msg_write, 372 .write = blk_msg_write,
373 .llseek = noop_llseek,
368}; 374};
369 375
370/* 376/*
@@ -639,7 +645,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
639 if (!q) 645 if (!q)
640 return -ENXIO; 646 return -ENXIO;
641 647
642 lock_kernel();
643 mutex_lock(&bdev->bd_mutex); 648 mutex_lock(&bdev->bd_mutex);
644 649
645 switch (cmd) { 650 switch (cmd) {
@@ -667,7 +672,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
667 } 672 }
668 673
669 mutex_unlock(&bdev->bd_mutex); 674 mutex_unlock(&bdev->bd_mutex);
670 unlock_kernel();
671 return ret; 675 return ret;
672} 676}
673 677
@@ -699,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q)
699 * 703 *
700 **/ 704 **/
701static void blk_add_trace_rq(struct request_queue *q, struct request *rq, 705static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
702 u32 what) 706 u32 what)
703{ 707{
704 struct blk_trace *bt = q->blk_trace; 708 struct blk_trace *bt = q->blk_trace;
705 int rw = rq->cmd_flags & 0x03;
706 709
707 if (likely(!bt)) 710 if (likely(!bt))
708 return; 711 return;
709 712
710 if (rq->cmd_flags & REQ_DISCARD)
711 rw |= REQ_DISCARD;
712
713 if (rq->cmd_flags & REQ_SECURE)
714 rw |= REQ_SECURE;
715
716 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 713 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
717 what |= BLK_TC_ACT(BLK_TC_PC); 714 what |= BLK_TC_ACT(BLK_TC_PC);
718 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, 715 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags,
719 what, rq->errors, rq->cmd_len, rq->cmd); 716 what, rq->errors, rq->cmd_len, rq->cmd);
720 } else { 717 } else {
721 what |= BLK_TC_ACT(BLK_TC_FS); 718 what |= BLK_TC_ACT(BLK_TC_FS);
722 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, 719 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
723 what, rq->errors, 0, NULL); 720 rq->cmd_flags, what, rq->errors, 0, NULL);
724 } 721 }
725} 722}
726 723
@@ -761,53 +758,58 @@ static void blk_add_trace_rq_complete(void *ignore,
761 * @q: queue the io is for 758 * @q: queue the io is for
762 * @bio: the source bio 759 * @bio: the source bio
763 * @what: the action 760 * @what: the action
761 * @error: error, if any
764 * 762 *
765 * Description: 763 * Description:
766 * Records an action against a bio. Will log the bio offset + size. 764 * Records an action against a bio. Will log the bio offset + size.
767 * 765 *
768 **/ 766 **/
769static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, 767static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
770 u32 what) 768 u32 what, int error)
771{ 769{
772 struct blk_trace *bt = q->blk_trace; 770 struct blk_trace *bt = q->blk_trace;
773 771
774 if (likely(!bt)) 772 if (likely(!bt))
775 return; 773 return;
776 774
775 if (!error && !bio_flagged(bio, BIO_UPTODATE))
776 error = EIO;
777
777 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, 778 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
778 !bio_flagged(bio, BIO_UPTODATE), 0, NULL); 779 error, 0, NULL);
779} 780}
780 781
781static void blk_add_trace_bio_bounce(void *ignore, 782static void blk_add_trace_bio_bounce(void *ignore,
782 struct request_queue *q, struct bio *bio) 783 struct request_queue *q, struct bio *bio)
783{ 784{
784 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); 785 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
785} 786}
786 787
787static void blk_add_trace_bio_complete(void *ignore, 788static void blk_add_trace_bio_complete(void *ignore,
788 struct request_queue *q, struct bio *bio) 789 struct request_queue *q, struct bio *bio,
790 int error)
789{ 791{
790 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); 792 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
791} 793}
792 794
793static void blk_add_trace_bio_backmerge(void *ignore, 795static void blk_add_trace_bio_backmerge(void *ignore,
794 struct request_queue *q, 796 struct request_queue *q,
795 struct bio *bio) 797 struct bio *bio)
796{ 798{
797 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 799 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
798} 800}
799 801
800static void blk_add_trace_bio_frontmerge(void *ignore, 802static void blk_add_trace_bio_frontmerge(void *ignore,
801 struct request_queue *q, 803 struct request_queue *q,
802 struct bio *bio) 804 struct bio *bio)
803{ 805{
804 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 806 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
805} 807}
806 808
807static void blk_add_trace_bio_queue(void *ignore, 809static void blk_add_trace_bio_queue(void *ignore,
808 struct request_queue *q, struct bio *bio) 810 struct request_queue *q, struct bio *bio)
809{ 811{
810 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 812 blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
811} 813}
812 814
813static void blk_add_trace_getrq(void *ignore, 815static void blk_add_trace_getrq(void *ignore,
@@ -815,7 +817,7 @@ static void blk_add_trace_getrq(void *ignore,
815 struct bio *bio, int rw) 817 struct bio *bio, int rw)
816{ 818{
817 if (bio) 819 if (bio)
818 blk_add_trace_bio(q, bio, BLK_TA_GETRQ); 820 blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
819 else { 821 else {
820 struct blk_trace *bt = q->blk_trace; 822 struct blk_trace *bt = q->blk_trace;
821 823
@@ -830,7 +832,7 @@ static void blk_add_trace_sleeprq(void *ignore,
830 struct bio *bio, int rw) 832 struct bio *bio, int rw)
831{ 833{
832 if (bio) 834 if (bio)
833 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); 835 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
834 else { 836 else {
835 struct blk_trace *bt = q->blk_trace; 837 struct blk_trace *bt = q->blk_trace;
836 838
@@ -848,29 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
848 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 850 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
849} 851}
850 852
851static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) 853static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
854 unsigned int depth, bool explicit)
852{ 855{
853 struct blk_trace *bt = q->blk_trace; 856 struct blk_trace *bt = q->blk_trace;
854 857
855 if (bt) { 858 if (bt) {
856 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; 859 __be64 rpdu = cpu_to_be64(depth);
857 __be64 rpdu = cpu_to_be64(pdu); 860 u32 what;
858 861
859 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, 862 if (explicit)
860 sizeof(rpdu), &rpdu); 863 what = BLK_TA_UNPLUG_IO;
861 } 864 else
862} 865 what = BLK_TA_UNPLUG_TIMER;
863
864static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
865{
866 struct blk_trace *bt = q->blk_trace;
867
868 if (bt) {
869 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
870 __be64 rpdu = cpu_to_be64(pdu);
871 866
872 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, 867 __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
873 sizeof(rpdu), &rpdu);
874 } 868 }
875} 869}
876 870
@@ -890,7 +884,7 @@ static void blk_add_trace_split(void *ignore,
890} 884}
891 885
892/** 886/**
893 * blk_add_trace_remap - Add a trace for a remap operation 887 * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
894 * @ignore: trace callback data parameter (not used) 888 * @ignore: trace callback data parameter (not used)
895 * @q: queue the io is for 889 * @q: queue the io is for
896 * @bio: the source bio 890 * @bio: the source bio
@@ -902,9 +896,9 @@ static void blk_add_trace_split(void *ignore,
902 * it spans a stripe (or similar). Add a trace for that action. 896 * it spans a stripe (or similar). Add a trace for that action.
903 * 897 *
904 **/ 898 **/
905static void blk_add_trace_remap(void *ignore, 899static void blk_add_trace_bio_remap(void *ignore,
906 struct request_queue *q, struct bio *bio, 900 struct request_queue *q, struct bio *bio,
907 dev_t dev, sector_t from) 901 dev_t dev, sector_t from)
908{ 902{
909 struct blk_trace *bt = q->blk_trace; 903 struct blk_trace *bt = q->blk_trace;
910 struct blk_io_trace_remap r; 904 struct blk_io_trace_remap r;
@@ -1013,13 +1007,11 @@ static void blk_register_tracepoints(void)
1013 WARN_ON(ret); 1007 WARN_ON(ret);
1014 ret = register_trace_block_plug(blk_add_trace_plug, NULL); 1008 ret = register_trace_block_plug(blk_add_trace_plug, NULL);
1015 WARN_ON(ret); 1009 WARN_ON(ret);
1016 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); 1010 ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
1017 WARN_ON(ret);
1018 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
1019 WARN_ON(ret); 1011 WARN_ON(ret);
1020 ret = register_trace_block_split(blk_add_trace_split, NULL); 1012 ret = register_trace_block_split(blk_add_trace_split, NULL);
1021 WARN_ON(ret); 1013 WARN_ON(ret);
1022 ret = register_trace_block_remap(blk_add_trace_remap, NULL); 1014 ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1023 WARN_ON(ret); 1015 WARN_ON(ret);
1024 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1016 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1025 WARN_ON(ret); 1017 WARN_ON(ret);
@@ -1028,10 +1020,9 @@ static void blk_register_tracepoints(void)
1028static void blk_unregister_tracepoints(void) 1020static void blk_unregister_tracepoints(void)
1029{ 1021{
1030 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1022 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1031 unregister_trace_block_remap(blk_add_trace_remap, NULL); 1023 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1032 unregister_trace_block_split(blk_add_trace_split, NULL); 1024 unregister_trace_block_split(blk_add_trace_split, NULL);
1033 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); 1025 unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
1034 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
1035 unregister_trace_block_plug(blk_add_trace_plug, NULL); 1026 unregister_trace_block_plug(blk_add_trace_plug, NULL);
1036 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); 1027 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
1037 unregister_trace_block_getrq(blk_add_trace_getrq, NULL); 1028 unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
@@ -1652,10 +1643,9 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1652 struct block_device *bdev; 1643 struct block_device *bdev;
1653 ssize_t ret = -ENXIO; 1644 ssize_t ret = -ENXIO;
1654 1645
1655 lock_kernel();
1656 bdev = bdget(part_devt(p)); 1646 bdev = bdget(part_devt(p));
1657 if (bdev == NULL) 1647 if (bdev == NULL)
1658 goto out_unlock_kernel; 1648 goto out;
1659 1649
1660 q = blk_trace_get_queue(bdev); 1650 q = blk_trace_get_queue(bdev);
1661 if (q == NULL) 1651 if (q == NULL)
@@ -1683,8 +1673,7 @@ out_unlock_bdev:
1683 mutex_unlock(&bdev->bd_mutex); 1673 mutex_unlock(&bdev->bd_mutex);
1684out_bdput: 1674out_bdput:
1685 bdput(bdev); 1675 bdput(bdev);
1686out_unlock_kernel: 1676out:
1687 unlock_kernel();
1688 return ret; 1677 return ret;
1689} 1678}
1690 1679
@@ -1714,11 +1703,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1714 1703
1715 ret = -ENXIO; 1704 ret = -ENXIO;
1716 1705
1717 lock_kernel();
1718 p = dev_to_part(dev); 1706 p = dev_to_part(dev);
1719 bdev = bdget(part_devt(p)); 1707 bdev = bdget(part_devt(p));
1720 if (bdev == NULL) 1708 if (bdev == NULL)
1721 goto out_unlock_kernel; 1709 goto out;
1722 1710
1723 q = blk_trace_get_queue(bdev); 1711 q = blk_trace_get_queue(bdev);
1724 if (q == NULL) 1712 if (q == NULL)
@@ -1753,8 +1741,6 @@ out_unlock_bdev:
1753 mutex_unlock(&bdev->bd_mutex); 1741 mutex_unlock(&bdev->bd_mutex);
1754out_bdput: 1742out_bdput:
1755 bdput(bdev); 1743 bdput(bdev);
1756out_unlock_kernel:
1757 unlock_kernel();
1758out: 1744out:
1759 return ret ? ret : count; 1745 return ret ? ret : count;
1760} 1746}
@@ -1813,8 +1799,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1813 1799
1814 if (rw & REQ_RAHEAD) 1800 if (rw & REQ_RAHEAD)
1815 rwbs[i++] = 'A'; 1801 rwbs[i++] = 'A';
1816 if (rw & REQ_HARDBARRIER)
1817 rwbs[i++] = 'B';
1818 if (rw & REQ_SYNC) 1802 if (rw & REQ_SYNC)
1819 rwbs[i++] = 'S'; 1803 rwbs[i++] = 'S';
1820 if (rw & REQ_META) 1804 if (rw & REQ_META)
@@ -1825,21 +1809,5 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1825 rwbs[i] = '\0'; 1809 rwbs[i] = '\0';
1826} 1810}
1827 1811
1828void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
1829{
1830 int rw = rq->cmd_flags & 0x03;
1831 int bytes;
1832
1833 if (rq->cmd_flags & REQ_DISCARD)
1834 rw |= REQ_DISCARD;
1835
1836 if (rq->cmd_flags & REQ_SECURE)
1837 rw |= REQ_SECURE;
1838
1839 bytes = blk_rq_bytes(rq);
1840
1841 blk_fill_rwbs(rwbs, rw, bytes);
1842}
1843
1844#endif /* CONFIG_EVENT_TRACING */ 1812#endif /* CONFIG_EVENT_TRACING */
1845 1813
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fa7ece649fe1..908038f57440 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -39,20 +39,26 @@
39#include "trace_stat.h" 39#include "trace_stat.h"
40 40
41#define FTRACE_WARN_ON(cond) \ 41#define FTRACE_WARN_ON(cond) \
42 do { \ 42 ({ \
43 if (WARN_ON(cond)) \ 43 int ___r = cond; \
44 if (WARN_ON(___r)) \
44 ftrace_kill(); \ 45 ftrace_kill(); \
45 } while (0) 46 ___r; \
47 })
46 48
47#define FTRACE_WARN_ON_ONCE(cond) \ 49#define FTRACE_WARN_ON_ONCE(cond) \
48 do { \ 50 ({ \
49 if (WARN_ON_ONCE(cond)) \ 51 int ___r = cond; \
52 if (WARN_ON_ONCE(___r)) \
50 ftrace_kill(); \ 53 ftrace_kill(); \
51 } while (0) 54 ___r; \
55 })
52 56
53/* hash bits for specific function selection */ 57/* hash bits for specific function selection */
54#define FTRACE_HASH_BITS 7 58#define FTRACE_HASH_BITS 7
55#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) 59#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
60#define FTRACE_HASH_DEFAULT_BITS 10
61#define FTRACE_HASH_MAX_BITS 12
56 62
57/* ftrace_enabled is a method to turn ftrace on or off */ 63/* ftrace_enabled is a method to turn ftrace on or off */
58int ftrace_enabled __read_mostly; 64int ftrace_enabled __read_mostly;
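The FTRACE_WARN_ON()/FTRACE_WARN_ON_ONCE() change above converts the macros from do/while statements into GCC statement expressions, so they evaluate to the tested condition and can be used directly inside an if (). A stand-alone sketch of the same pattern in plain C (WARN_AND_FLAG and its caller are illustrative names only):

#include <stdio.h>

/* Statement expression (a GCC extension): the whole ({ ... }) evaluates
 * to ___r, so the macro works as a condition as well as a statement. */
#define WARN_AND_FLAG(cond)                                     \
        ({                                                      \
                int ___r = !!(cond);                            \
                if (___r)                                       \
                        fprintf(stderr, "warning: %s\n", #cond);\
                ___r;                                           \
        })

int main(void)
{
        int broken = 1;

        if (WARN_AND_FLAG(broken))      /* warns and takes the error path */
                return 1;
        return 0;
}
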
@@ -81,28 +87,40 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
81 .func = ftrace_stub, 87 .func = ftrace_stub,
82}; 88};
83 89
84static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 90static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
91static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
85ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 92ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
86ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 93ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
87ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 94ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
95static struct ftrace_ops global_ops;
96
97static void
98ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
88 99
89/* 100/*
90 * Traverse the ftrace_list, invoking all entries. The reason that we 101 * Traverse the ftrace_global_list, invoking all entries. The reason that we
91 * can use rcu_dereference_raw() is that elements removed from this list 102 * can use rcu_dereference_raw() is that elements removed from this list
92 * are simply leaked, so there is no need to interact with a grace-period 103 * are simply leaked, so there is no need to interact with a grace-period
93 * mechanism. The rcu_dereference_raw() calls are needed to handle 104 * mechanism. The rcu_dereference_raw() calls are needed to handle
94 * concurrent insertions into the ftrace_list. 105 * concurrent insertions into the ftrace_global_list.
95 * 106 *
96 * Silly Alpha and silly pointer-speculation compiler optimizations! 107 * Silly Alpha and silly pointer-speculation compiler optimizations!
97 */ 108 */
98static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 109static void ftrace_global_list_func(unsigned long ip,
110 unsigned long parent_ip)
99{ 111{
100 struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ 112 struct ftrace_ops *op;
101 113
114 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
115 return;
116
117 trace_recursion_set(TRACE_GLOBAL_BIT);
118 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
102 while (op != &ftrace_list_end) { 119 while (op != &ftrace_list_end) {
103 op->func(ip, parent_ip); 120 op->func(ip, parent_ip);
104 op = rcu_dereference_raw(op->next); /*see above*/ 121 op = rcu_dereference_raw(op->next); /*see above*/
105 }; 122 };
123 trace_recursion_clear(TRACE_GLOBAL_BIT);
106} 124}
107 125
108static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) 126static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
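The trace_recursion_set()/trace_recursion_clear() pair added to ftrace_global_list_func() above is a re-entrancy guard: if one of the callbacks ends up triggering the list walker again, the nested call bails out instead of recursing. A stand-alone sketch of the idea, assuming a simple per-thread flag (the kernel itself uses per-task recursion bits, not this variable):

static __thread int in_tracer;          /* illustrative guard, one per thread */

static void do_tracing_work(unsigned long ip, unsigned long parent_ip)
{
        /* ... work that might itself hit a traced function ... */
        (void)ip;
        (void)parent_ip;
}

static void guarded_list_func(unsigned long ip, unsigned long parent_ip)
{
        if (in_tracer)                  /* nested invocation: give up early */
                return;

        in_tracer = 1;
        do_tracing_work(ip, parent_ip);
        in_tracer = 0;
}

int main(void)
{
        guarded_list_func(0x1000, 0x2000);
        return 0;
}
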
@@ -147,46 +165,69 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
147} 165}
148#endif 166#endif
149 167
150static int __register_ftrace_function(struct ftrace_ops *ops) 168static void update_global_ops(void)
151{ 169{
152 ops->next = ftrace_list; 170 ftrace_func_t func;
171
153 /* 172 /*
154 * We are entering ops into the ftrace_list but another 173 * If there's only one function registered, then call that
155 * CPU might be walking that list. We need to make sure 174 * function directly. Otherwise, we need to iterate over the
156 * the ops->next pointer is valid before another CPU sees 175 * registered callers.
157 * the ops pointer included into the ftrace_list.
158 */ 176 */
159 rcu_assign_pointer(ftrace_list, ops); 177 if (ftrace_global_list == &ftrace_list_end ||
178 ftrace_global_list->next == &ftrace_list_end)
179 func = ftrace_global_list->func;
180 else
181 func = ftrace_global_list_func;
160 182
161 if (ftrace_enabled) { 183 /* If we filter on pids, update to use the pid function */
162 ftrace_func_t func; 184 if (!list_empty(&ftrace_pids)) {
185 set_ftrace_pid_function(func);
186 func = ftrace_pid_func;
187 }
163 188
164 if (ops->next == &ftrace_list_end) 189 global_ops.func = func;
165 func = ops->func; 190}
166 else
167 func = ftrace_list_func;
168 191
169 if (!list_empty(&ftrace_pids)) { 192static void update_ftrace_function(void)
170 set_ftrace_pid_function(func); 193{
171 func = ftrace_pid_func; 194 ftrace_func_t func;
172 } 195
196 update_global_ops();
197
198 /*
199 * If we are at the end of the list and this ops is
200 * not dynamic, then have the mcount trampoline call
201 * the function directly
202 */
203 if (ftrace_ops_list == &ftrace_list_end ||
204 (ftrace_ops_list->next == &ftrace_list_end &&
205 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC)))
206 func = ftrace_ops_list->func;
207 else
208 func = ftrace_ops_list_func;
173 209
174 /*
175 * For one func, simply call it directly.
176 * For more than one func, call the chain.
177 */
178#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 210#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
179 ftrace_trace_function = func; 211 ftrace_trace_function = func;
180#else 212#else
181 __ftrace_trace_function = func; 213 __ftrace_trace_function = func;
182 ftrace_trace_function = ftrace_test_stop_func; 214 ftrace_trace_function = ftrace_test_stop_func;
183#endif 215#endif
184 } 216}
185 217
186 return 0; 218static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
219{
220 ops->next = *list;
221 /*
222 * We are entering ops into the list but another
223 * CPU might be walking that list. We need to make sure
224 * the ops->next pointer is valid before another CPU sees
225 * the ops pointer included into the list.
226 */
227 rcu_assign_pointer(*list, ops);
187} 228}
188 229
189static int __unregister_ftrace_function(struct ftrace_ops *ops) 230static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
190{ 231{
191 struct ftrace_ops **p; 232 struct ftrace_ops **p;
192 233
@@ -194,13 +235,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
194 * If we are removing the last function, then simply point 235 * If we are removing the last function, then simply point
195 * to the ftrace_stub. 236 * to the ftrace_stub.
196 */ 237 */
197 if (ftrace_list == ops && ops->next == &ftrace_list_end) { 238 if (*list == ops && ops->next == &ftrace_list_end) {
198 ftrace_trace_function = ftrace_stub; 239 *list = &ftrace_list_end;
199 ftrace_list = &ftrace_list_end;
200 return 0; 240 return 0;
201 } 241 }
202 242
203 for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) 243 for (p = list; *p != &ftrace_list_end; p = &(*p)->next)
204 if (*p == ops) 244 if (*p == ops)
205 break; 245 break;
206 246
@@ -208,53 +248,83 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
208 return -1; 248 return -1;
209 249
210 *p = (*p)->next; 250 *p = (*p)->next;
251 return 0;
252}
211 253
212 if (ftrace_enabled) { 254static int __register_ftrace_function(struct ftrace_ops *ops)
213 /* If we only have one func left, then call that directly */ 255{
214 if (ftrace_list->next == &ftrace_list_end) { 256 if (ftrace_disabled)
215 ftrace_func_t func = ftrace_list->func; 257 return -ENODEV;
216 258
217 if (!list_empty(&ftrace_pids)) { 259 if (FTRACE_WARN_ON(ops == &global_ops))
218 set_ftrace_pid_function(func); 260 return -EINVAL;
219 func = ftrace_pid_func; 261
220 } 262 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
221#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 263 return -EBUSY;
222 ftrace_trace_function = func; 264
223#else 265 if (!core_kernel_data((unsigned long)ops))
224 __ftrace_trace_function = func; 266 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
225#endif 267
226 } 268 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
227 } 269 int first = ftrace_global_list == &ftrace_list_end;
270 add_ftrace_ops(&ftrace_global_list, ops);
271 ops->flags |= FTRACE_OPS_FL_ENABLED;
272 if (first)
273 add_ftrace_ops(&ftrace_ops_list, &global_ops);
274 } else
275 add_ftrace_ops(&ftrace_ops_list, ops);
276
277 if (ftrace_enabled)
278 update_ftrace_function();
228 279
229 return 0; 280 return 0;
230} 281}
231 282
232static void ftrace_update_pid_func(void) 283static int __unregister_ftrace_function(struct ftrace_ops *ops)
233{ 284{
234 ftrace_func_t func; 285 int ret;
235 286
236 if (ftrace_trace_function == ftrace_stub) 287 if (ftrace_disabled)
237 return; 288 return -ENODEV;
238 289
239#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 290 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
240 func = ftrace_trace_function; 291 return -EBUSY;
241#else
242 func = __ftrace_trace_function;
243#endif
244 292
245 if (!list_empty(&ftrace_pids)) { 293 if (FTRACE_WARN_ON(ops == &global_ops))
246 set_ftrace_pid_function(func); 294 return -EINVAL;
247 func = ftrace_pid_func;
248 } else {
249 if (func == ftrace_pid_func)
250 func = ftrace_pid_function;
251 }
252 295
253#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 296 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
254 ftrace_trace_function = func; 297 ret = remove_ftrace_ops(&ftrace_global_list, ops);
255#else 298 if (!ret && ftrace_global_list == &ftrace_list_end)
256 __ftrace_trace_function = func; 299 ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops);
257#endif 300 if (!ret)
301 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
302 } else
303 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
304
305 if (ret < 0)
306 return ret;
307
308 if (ftrace_enabled)
309 update_ftrace_function();
310
311 /*
312 * Dynamic ops may be freed, we must make sure that all
313 * callers are done before leaving this function.
314 */
315 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
316 synchronize_sched();
317
318 return 0;
319}
320
321static void ftrace_update_pid_func(void)
322{
323 /* Only do something if we are tracing something */
324 if (ftrace_trace_function == ftrace_stub)
325 return;
326
327 update_ftrace_function();
258} 328}
259 329
260#ifdef CONFIG_FUNCTION_PROFILER 330#ifdef CONFIG_FUNCTION_PROFILER
@@ -800,6 +870,7 @@ static const struct file_operations ftrace_profile_fops = {
800 .open = tracing_open_generic, 870 .open = tracing_open_generic,
801 .read = ftrace_profile_read, 871 .read = ftrace_profile_read,
802 .write = ftrace_profile_write, 872 .write = ftrace_profile_write,
873 .llseek = default_llseek,
803}; 874};
804 875
805/* used to initialize the real stat files */ 876/* used to initialize the real stat files */
@@ -884,13 +955,38 @@ enum {
884 FTRACE_ENABLE_CALLS = (1 << 0), 955 FTRACE_ENABLE_CALLS = (1 << 0),
885 FTRACE_DISABLE_CALLS = (1 << 1), 956 FTRACE_DISABLE_CALLS = (1 << 1),
886 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 957 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
887 FTRACE_ENABLE_MCOUNT = (1 << 3), 958 FTRACE_START_FUNC_RET = (1 << 3),
888 FTRACE_DISABLE_MCOUNT = (1 << 4), 959 FTRACE_STOP_FUNC_RET = (1 << 4),
889 FTRACE_START_FUNC_RET = (1 << 5), 960};
890 FTRACE_STOP_FUNC_RET = (1 << 6), 961struct ftrace_func_entry {
962 struct hlist_node hlist;
963 unsigned long ip;
964};
965
966struct ftrace_hash {
967 unsigned long size_bits;
968 struct hlist_head *buckets;
969 unsigned long count;
970 struct rcu_head rcu;
971};
972
973/*
974 * We make these constant because no one should touch them,
975 * but they are used as the default "empty hash", to avoid allocating
976 * it all the time. These are in a read only section such that if
977 * anyone does try to modify it, it will cause an exception.
978 */
979static const struct hlist_head empty_buckets[1];
980static const struct ftrace_hash empty_hash = {
981 .buckets = (struct hlist_head *)empty_buckets,
891}; 982};
983#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
892 984
893static int ftrace_filtered; 985static struct ftrace_ops global_ops = {
986 .func = ftrace_stub,
987 .notrace_hash = EMPTY_HASH,
988 .filter_hash = EMPTY_HASH,
989};
894 990
895static struct dyn_ftrace *ftrace_new_addrs; 991static struct dyn_ftrace *ftrace_new_addrs;
896 992
@@ -913,6 +1009,269 @@ static struct ftrace_page *ftrace_pages;
913 1009
914static struct dyn_ftrace *ftrace_free_records; 1010static struct dyn_ftrace *ftrace_free_records;
915 1011
1012static struct ftrace_func_entry *
1013ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1014{
1015 unsigned long key;
1016 struct ftrace_func_entry *entry;
1017 struct hlist_head *hhd;
1018 struct hlist_node *n;
1019
1020 if (!hash->count)
1021 return NULL;
1022
1023 if (hash->size_bits > 0)
1024 key = hash_long(ip, hash->size_bits);
1025 else
1026 key = 0;
1027
1028 hhd = &hash->buckets[key];
1029
1030 hlist_for_each_entry_rcu(entry, n, hhd, hlist) {
1031 if (entry->ip == ip)
1032 return entry;
1033 }
1034 return NULL;
1035}
1036
1037static void __add_hash_entry(struct ftrace_hash *hash,
1038 struct ftrace_func_entry *entry)
1039{
1040 struct hlist_head *hhd;
1041 unsigned long key;
1042
1043 if (hash->size_bits)
1044 key = hash_long(entry->ip, hash->size_bits);
1045 else
1046 key = 0;
1047
1048 hhd = &hash->buckets[key];
1049 hlist_add_head(&entry->hlist, hhd);
1050 hash->count++;
1051}
1052
1053static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
1054{
1055 struct ftrace_func_entry *entry;
1056
1057 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
1058 if (!entry)
1059 return -ENOMEM;
1060
1061 entry->ip = ip;
1062 __add_hash_entry(hash, entry);
1063
1064 return 0;
1065}
1066
1067static void
1068free_hash_entry(struct ftrace_hash *hash,
1069 struct ftrace_func_entry *entry)
1070{
1071 hlist_del(&entry->hlist);
1072 kfree(entry);
1073 hash->count--;
1074}
1075
1076static void
1077remove_hash_entry(struct ftrace_hash *hash,
1078 struct ftrace_func_entry *entry)
1079{
1080 hlist_del(&entry->hlist);
1081 hash->count--;
1082}
1083
1084static void ftrace_hash_clear(struct ftrace_hash *hash)
1085{
1086 struct hlist_head *hhd;
1087 struct hlist_node *tp, *tn;
1088 struct ftrace_func_entry *entry;
1089 int size = 1 << hash->size_bits;
1090 int i;
1091
1092 if (!hash->count)
1093 return;
1094
1095 for (i = 0; i < size; i++) {
1096 hhd = &hash->buckets[i];
1097 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist)
1098 free_hash_entry(hash, entry);
1099 }
1100 FTRACE_WARN_ON(hash->count);
1101}
1102
1103static void free_ftrace_hash(struct ftrace_hash *hash)
1104{
1105 if (!hash || hash == EMPTY_HASH)
1106 return;
1107 ftrace_hash_clear(hash);
1108 kfree(hash->buckets);
1109 kfree(hash);
1110}
1111
1112static void __free_ftrace_hash_rcu(struct rcu_head *rcu)
1113{
1114 struct ftrace_hash *hash;
1115
1116 hash = container_of(rcu, struct ftrace_hash, rcu);
1117 free_ftrace_hash(hash);
1118}
1119
1120static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1121{
1122 if (!hash || hash == EMPTY_HASH)
1123 return;
1124 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
1125}
1126
1127static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1128{
1129 struct ftrace_hash *hash;
1130 int size;
1131
1132 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
1133 if (!hash)
1134 return NULL;
1135
1136 size = 1 << size_bits;
1137 hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL);
1138
1139 if (!hash->buckets) {
1140 kfree(hash);
1141 return NULL;
1142 }
1143
1144 hash->size_bits = size_bits;
1145
1146 return hash;
1147}
1148
1149static struct ftrace_hash *
1150alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1151{
1152 struct ftrace_func_entry *entry;
1153 struct ftrace_hash *new_hash;
1154 struct hlist_node *tp;
1155 int size;
1156 int ret;
1157 int i;
1158
1159 new_hash = alloc_ftrace_hash(size_bits);
1160 if (!new_hash)
1161 return NULL;
1162
1163 /* Empty hash? */
1164 if (!hash || !hash->count)
1165 return new_hash;
1166
1167 size = 1 << hash->size_bits;
1168 for (i = 0; i < size; i++) {
1169 hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) {
1170 ret = add_hash_entry(new_hash, entry->ip);
1171 if (ret < 0)
1172 goto free_hash;
1173 }
1174 }
1175
1176 FTRACE_WARN_ON(new_hash->count != hash->count);
1177
1178 return new_hash;
1179
1180 free_hash:
1181 free_ftrace_hash(new_hash);
1182 return NULL;
1183}
1184
1185static int
1186ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1187{
1188 struct ftrace_func_entry *entry;
1189 struct hlist_node *tp, *tn;
1190 struct hlist_head *hhd;
1191 struct ftrace_hash *old_hash;
1192 struct ftrace_hash *new_hash;
1193 unsigned long key;
1194 int size = src->count;
1195 int bits = 0;
1196 int i;
1197
1198 /*
1199 * If the new source is empty, just free dst and assign it
1200 * the empty_hash.
1201 */
1202 if (!src->count) {
1203 free_ftrace_hash_rcu(*dst);
1204 rcu_assign_pointer(*dst, EMPTY_HASH);
1205 return 0;
1206 }
1207
1208 /*
1209 * Make the hash size about 1/2 the # found
1210 */
1211 for (size /= 2; size; size >>= 1)
1212 bits++;
1213
1214 /* Don't allocate too much */
1215 if (bits > FTRACE_HASH_MAX_BITS)
1216 bits = FTRACE_HASH_MAX_BITS;
1217
1218 new_hash = alloc_ftrace_hash(bits);
1219 if (!new_hash)
1220 return -ENOMEM;
1221
1222 size = 1 << src->size_bits;
1223 for (i = 0; i < size; i++) {
1224 hhd = &src->buckets[i];
1225 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) {
1226 if (bits > 0)
1227 key = hash_long(entry->ip, bits);
1228 else
1229 key = 0;
1230 remove_hash_entry(src, entry);
1231 __add_hash_entry(new_hash, entry);
1232 }
1233 }
1234
1235 old_hash = *dst;
1236 rcu_assign_pointer(*dst, new_hash);
1237 free_ftrace_hash_rcu(old_hash);
1238
1239 return 0;
1240}
1241
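The sizing loop in ftrace_hash_move() above aims for roughly half as many buckets as entries, capped at FTRACE_HASH_MAX_BITS. A small stand-alone check of that arithmetic, with an arbitrary example count of 100:

#include <stdio.h>

int main(void)
{
        int count = 100;                /* entries being moved into the new hash */
        int size = count;
        int bits = 0;

        for (size /= 2; size; size >>= 1)
                bits++;                 /* 50, 25, 12, 6, 3, 1  ->  bits = 6 */

        if (bits > 12)                  /* same cap as FTRACE_HASH_MAX_BITS */
                bits = 12;

        printf("%d entries -> %d bits -> %d buckets\n", count, bits, 1 << bits);
        return 0;                       /* prints: 100 entries -> 6 bits -> 64 buckets */
}
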
1242/*
1243 * Test the hashes for this ops to see if we want to call
1244 * the ops->func or not.
1245 *
1246 * It's a match if the ip is in the ops->filter_hash or
1247 * the filter_hash does not exist or is empty,
1248 * AND
1249 * the ip is not in the ops->notrace_hash.
1250 *
1251 * This needs to be called with preemption disabled as
1252 * the hashes are freed with call_rcu_sched().
1253 */
1254static int
1255ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1256{
1257 struct ftrace_hash *filter_hash;
1258 struct ftrace_hash *notrace_hash;
1259 int ret;
1260
1261 filter_hash = rcu_dereference_raw(ops->filter_hash);
1262 notrace_hash = rcu_dereference_raw(ops->notrace_hash);
1263
1264 if ((!filter_hash || !filter_hash->count ||
1265 ftrace_lookup_ip(filter_hash, ip)) &&
1266 (!notrace_hash || !notrace_hash->count ||
1267 !ftrace_lookup_ip(notrace_hash, ip)))
1268 ret = 1;
1269 else
1270 ret = 0;
1271
1272 return ret;
1273}
1274
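The comment above ftrace_ops_test() describes two independent gates. A stand-alone restatement with plain booleans, useful as a truth-table check (should_trace() is a hypothetical helper, not kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the rule: trace the ip when the filter hash is missing/empty or
 * contains it, AND the notrace hash is missing/empty or does not contain it. */
static bool should_trace(bool filter_empty, bool in_filter,
                         bool notrace_empty, bool in_notrace)
{
        return (filter_empty || in_filter) &&
               (notrace_empty || !in_notrace);
}

int main(void)
{
        /* empty filter, ip listed in notrace: not traced */
        printf("%d\n", should_trace(true, false, false, true));        /* 0 */
        /* ip in filter, notrace empty: traced */
        printf("%d\n", should_trace(false, true, true, false));        /* 1 */
        return 0;
}
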
916/* 1275/*
917 * This is a double for. Do not use 'break' to break out of the loop, 1276 * This is a double for. Do not use 'break' to break out of the loop,
918 * you must use a goto. 1277 * you must use a goto.
@@ -927,6 +1286,105 @@ static struct dyn_ftrace *ftrace_free_records;
927 } \ 1286 } \
928 } 1287 }
929 1288
1289static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1290 int filter_hash,
1291 bool inc)
1292{
1293 struct ftrace_hash *hash;
1294 struct ftrace_hash *other_hash;
1295 struct ftrace_page *pg;
1296 struct dyn_ftrace *rec;
1297 int count = 0;
1298 int all = 0;
1299
1300 /* Only update if the ops has been registered */
1301 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
1302 return;
1303
1304 /*
1305 * In the filter_hash case:
1306 * If the count is zero, we update all records.
1307 * Otherwise we just update the items in the hash.
1308 *
1309 * In the notrace_hash case:
1310 * We enable the update in the hash.
1311 * As disabling notrace means enabling the tracing,
1312 * and enabling notrace means disabling, the inc variable
1313 * gets inverted.
1314 */
1315 if (filter_hash) {
1316 hash = ops->filter_hash;
1317 other_hash = ops->notrace_hash;
1318 if (!hash || !hash->count)
1319 all = 1;
1320 } else {
1321 inc = !inc;
1322 hash = ops->notrace_hash;
1323 other_hash = ops->filter_hash;
1324 /*
1325 * If the notrace hash has no items,
1326 * then there's nothing to do.
1327 */
1328 if (hash && !hash->count)
1329 return;
1330 }
1331
1332 do_for_each_ftrace_rec(pg, rec) {
1333 int in_other_hash = 0;
1334 int in_hash = 0;
1335 int match = 0;
1336
1337 if (all) {
1338 /*
1339 * Only the filter_hash affects all records.
1340 * Update if the record is not in the notrace hash.
1341 */
1342 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
1343 match = 1;
1344 } else {
1345 in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip);
1346 in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip);
1347
1348 /*
1349 * Apply the filter_hash/notrace_hash rules described above.
1350 */
1351 if (filter_hash && in_hash && !in_other_hash)
1352 match = 1;
1353 else if (!filter_hash && in_hash &&
1354 (in_other_hash || !other_hash->count))
1355 match = 1;
1356 }
1357 if (!match)
1358 continue;
1359
1360 if (inc) {
1361 rec->flags++;
1362 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
1363 return;
1364 } else {
1365 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
1366 return;
1367 rec->flags--;
1368 }
1369 count++;
1370 /* Shortcut, if we handled all records, we are done. */
1371 if (!all && count == hash->count)
1372 return;
1373 } while_for_each_ftrace_rec();
1374}
1375
1376static void ftrace_hash_rec_disable(struct ftrace_ops *ops,
1377 int filter_hash)
1378{
1379 __ftrace_hash_rec_update(ops, filter_hash, 0);
1380}
1381
1382static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1383 int filter_hash)
1384{
1385 __ftrace_hash_rec_update(ops, filter_hash, 1);
1386}
1387
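A concrete reading of the inc inversion above, using a hypothetical setup: an enabled ops has an empty filter_hash and a notrace_hash containing a single function F. The filter-side update (filter_hash = 1, inc = true) bumps the ref count of every record except F's, because F is found in the other (notrace) hash. On the notrace side the sign is flipped: listing a function in notrace while tracing means one fewer function to trace, so the corresponding update decrements rather than increments. Only records someone actually wants traced end up with a non-zero ref count, which is what the reworked __ftrace_replace_code() checks.
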
930static void ftrace_free_rec(struct dyn_ftrace *rec) 1388static void ftrace_free_rec(struct dyn_ftrace *rec)
931{ 1389{
932 rec->freelist = ftrace_free_records; 1390 rec->freelist = ftrace_free_records;
@@ -1048,18 +1506,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1048 ftrace_addr = (unsigned long)FTRACE_ADDR; 1506 ftrace_addr = (unsigned long)FTRACE_ADDR;
1049 1507
1050 /* 1508 /*
1051 * If this record is not to be traced or we want to disable it, 1509 * If we are enabling tracing:
1052 * then disable it.
1053 * 1510 *
1054 * If we want to enable it and filtering is off, then enable it. 1511 * If the record has a ref count, then we need to enable it
1512 * because someone is using it.
1055 * 1513 *
1056 * If we want to enable it and filtering is on, enable it only if 1514 * Otherwise we make sure its disabled.
1057 * it's filtered 1515 *
1516 * If we are disabling tracing, then disable all records that
1517 * are enabled.
1058 */ 1518 */
1059 if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { 1519 if (enable && (rec->flags & ~FTRACE_FL_MASK))
1060 if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) 1520 flag = FTRACE_FL_ENABLED;
1061 flag = FTRACE_FL_ENABLED;
1062 }
1063 1521
1064 /* If the state of this record hasn't changed, then do nothing */ 1522 /* If the state of this record hasn't changed, then do nothing */
1065 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1523 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
@@ -1080,19 +1538,16 @@ static void ftrace_replace_code(int enable)
1080 struct ftrace_page *pg; 1538 struct ftrace_page *pg;
1081 int failed; 1539 int failed;
1082 1540
1541 if (unlikely(ftrace_disabled))
1542 return;
1543
1083 do_for_each_ftrace_rec(pg, rec) { 1544 do_for_each_ftrace_rec(pg, rec) {
1084 /* 1545 /* Skip over free records */
1085 * Skip over free records, records that have 1546 if (rec->flags & FTRACE_FL_FREE)
1086 * failed and not converted.
1087 */
1088 if (rec->flags & FTRACE_FL_FREE ||
1089 rec->flags & FTRACE_FL_FAILED ||
1090 !(rec->flags & FTRACE_FL_CONVERTED))
1091 continue; 1547 continue;
1092 1548
1093 failed = __ftrace_replace_code(rec, enable); 1549 failed = __ftrace_replace_code(rec, enable);
1094 if (failed) { 1550 if (failed) {
1095 rec->flags |= FTRACE_FL_FAILED;
1096 ftrace_bug(failed, rec->ip); 1551 ftrace_bug(failed, rec->ip);
1097 /* Stop processing */ 1552 /* Stop processing */
1098 return; 1553 return;
@@ -1108,10 +1563,12 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
1108 1563
1109 ip = rec->ip; 1564 ip = rec->ip;
1110 1565
1566 if (unlikely(ftrace_disabled))
1567 return 0;
1568
1111 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); 1569 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
1112 if (ret) { 1570 if (ret) {
1113 ftrace_bug(ret, ip); 1571 ftrace_bug(ret, ip);
1114 rec->flags |= FTRACE_FL_FAILED;
1115 return 0; 1572 return 0;
1116 } 1573 }
1117 return 1; 1574 return 1;
@@ -1172,6 +1629,7 @@ static void ftrace_run_update_code(int command)
1172 1629
1173static ftrace_func_t saved_ftrace_func; 1630static ftrace_func_t saved_ftrace_func;
1174static int ftrace_start_up; 1631static int ftrace_start_up;
1632static int global_start_up;
1175 1633
1176static void ftrace_startup_enable(int command) 1634static void ftrace_startup_enable(int command)
1177{ 1635{
@@ -1186,19 +1644,38 @@ static void ftrace_startup_enable(int command)
1186 ftrace_run_update_code(command); 1644 ftrace_run_update_code(command);
1187} 1645}
1188 1646
1189static void ftrace_startup(int command) 1647static int ftrace_startup(struct ftrace_ops *ops, int command)
1190{ 1648{
1649 bool hash_enable = true;
1650
1191 if (unlikely(ftrace_disabled)) 1651 if (unlikely(ftrace_disabled))
1192 return; 1652 return -ENODEV;
1193 1653
1194 ftrace_start_up++; 1654 ftrace_start_up++;
1195 command |= FTRACE_ENABLE_CALLS; 1655 command |= FTRACE_ENABLE_CALLS;
1196 1656
1657 /* ops marked global share the filter hashes */
1658 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
1659 ops = &global_ops;
1660 /* Don't update hash if global is already set */
1661 if (global_start_up)
1662 hash_enable = false;
1663 global_start_up++;
1664 }
1665
1666 ops->flags |= FTRACE_OPS_FL_ENABLED;
1667 if (hash_enable)
1668 ftrace_hash_rec_enable(ops, 1);
1669
1197 ftrace_startup_enable(command); 1670 ftrace_startup_enable(command);
1671
1672 return 0;
1198} 1673}
1199 1674
1200static void ftrace_shutdown(int command) 1675static void ftrace_shutdown(struct ftrace_ops *ops, int command)
1201{ 1676{
1677 bool hash_disable = true;
1678
1202 if (unlikely(ftrace_disabled)) 1679 if (unlikely(ftrace_disabled))
1203 return; 1680 return;
1204 1681
@@ -1210,6 +1687,23 @@ static void ftrace_shutdown(int command)
1210 */ 1687 */
1211 WARN_ON_ONCE(ftrace_start_up < 0); 1688 WARN_ON_ONCE(ftrace_start_up < 0);
1212 1689
1690 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
1691 ops = &global_ops;
1692 global_start_up--;
1693 WARN_ON_ONCE(global_start_up < 0);
1694 /* Don't update hash if global still has users */
1695 if (global_start_up) {
1696 WARN_ON_ONCE(!ftrace_start_up);
1697 hash_disable = false;
1698 }
1699 }
1700
1701 if (hash_disable)
1702 ftrace_hash_rec_disable(ops, 1);
1703
1704 if (ops != &global_ops || !global_start_up)
1705 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
1706
1213 if (!ftrace_start_up) 1707 if (!ftrace_start_up)
1214 command |= FTRACE_DISABLE_CALLS; 1708 command |= FTRACE_DISABLE_CALLS;
1215 1709
@@ -1226,8 +1720,6 @@ static void ftrace_shutdown(int command)
1226 1720
1227static void ftrace_startup_sysctl(void) 1721static void ftrace_startup_sysctl(void)
1228{ 1722{
1229 int command = FTRACE_ENABLE_MCOUNT;
1230
1231 if (unlikely(ftrace_disabled)) 1723 if (unlikely(ftrace_disabled))
1232 return; 1724 return;
1233 1725
@@ -1235,23 +1727,17 @@ static void ftrace_startup_sysctl(void)
1235 saved_ftrace_func = NULL; 1727 saved_ftrace_func = NULL;
1236 /* ftrace_start_up is true if we want ftrace running */ 1728 /* ftrace_start_up is true if we want ftrace running */
1237 if (ftrace_start_up) 1729 if (ftrace_start_up)
1238 command |= FTRACE_ENABLE_CALLS; 1730 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1239
1240 ftrace_run_update_code(command);
1241} 1731}
1242 1732
1243static void ftrace_shutdown_sysctl(void) 1733static void ftrace_shutdown_sysctl(void)
1244{ 1734{
1245 int command = FTRACE_DISABLE_MCOUNT;
1246
1247 if (unlikely(ftrace_disabled)) 1735 if (unlikely(ftrace_disabled))
1248 return; 1736 return;
1249 1737
1250 /* ftrace_start_up is true if ftrace is running */ 1738 /* ftrace_start_up is true if ftrace is running */
1251 if (ftrace_start_up) 1739 if (ftrace_start_up)
1252 command |= FTRACE_DISABLE_CALLS; 1740 ftrace_run_update_code(FTRACE_DISABLE_CALLS);
1253
1254 ftrace_run_update_code(command);
1255} 1741}
1256 1742
1257static cycle_t ftrace_update_time; 1743static cycle_t ftrace_update_time;
@@ -1277,15 +1763,15 @@ static int ftrace_update_code(struct module *mod)
1277 p->flags = 0L; 1763 p->flags = 0L;
1278 1764
1279 /* 1765 /*
1280 * Do the initial record convertion from mcount jump 1766 * Do the initial record conversion from mcount jump
1281 * to the NOP instructions. 1767 * to the NOP instructions.
1282 */ 1768 */
1283 if (!ftrace_code_disable(mod, p)) { 1769 if (!ftrace_code_disable(mod, p)) {
1284 ftrace_free_rec(p); 1770 ftrace_free_rec(p);
1285 continue; 1771 /* Game over */
1772 break;
1286 } 1773 }
1287 1774
1288 p->flags |= FTRACE_FL_CONVERTED;
1289 ftrace_update_cnt++; 1775 ftrace_update_cnt++;
1290 1776
1291 /* 1777 /*
@@ -1360,32 +1846,39 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1360enum { 1846enum {
1361 FTRACE_ITER_FILTER = (1 << 0), 1847 FTRACE_ITER_FILTER = (1 << 0),
1362 FTRACE_ITER_NOTRACE = (1 << 1), 1848 FTRACE_ITER_NOTRACE = (1 << 1),
1363 FTRACE_ITER_FAILURES = (1 << 2), 1849 FTRACE_ITER_PRINTALL = (1 << 2),
1364 FTRACE_ITER_PRINTALL = (1 << 3), 1850 FTRACE_ITER_HASH = (1 << 3),
1365 FTRACE_ITER_HASH = (1 << 4), 1851 FTRACE_ITER_ENABLED = (1 << 4),
1366}; 1852};
1367 1853
1368#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1854#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
1369 1855
1370struct ftrace_iterator { 1856struct ftrace_iterator {
1371 struct ftrace_page *pg; 1857 loff_t pos;
1372 int hidx; 1858 loff_t func_pos;
1373 int idx; 1859 struct ftrace_page *pg;
1374 unsigned flags; 1860 struct dyn_ftrace *func;
1375 struct trace_parser parser; 1861 struct ftrace_func_probe *probe;
1862 struct trace_parser parser;
1863 struct ftrace_hash *hash;
1864 struct ftrace_ops *ops;
1865 int hidx;
1866 int idx;
1867 unsigned flags;
1376}; 1868};
1377 1869
1378static void * 1870static void *
1379t_hash_next(struct seq_file *m, void *v, loff_t *pos) 1871t_hash_next(struct seq_file *m, loff_t *pos)
1380{ 1872{
1381 struct ftrace_iterator *iter = m->private; 1873 struct ftrace_iterator *iter = m->private;
1382 struct hlist_node *hnd = v; 1874 struct hlist_node *hnd = NULL;
1383 struct hlist_head *hhd; 1875 struct hlist_head *hhd;
1384 1876
1385 WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
1386
1387 (*pos)++; 1877 (*pos)++;
1878 iter->pos = *pos;
1388 1879
1880 if (iter->probe)
1881 hnd = &iter->probe->node;
1389 retry: 1882 retry:
1390 if (iter->hidx >= FTRACE_FUNC_HASHSIZE) 1883 if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
1391 return NULL; 1884 return NULL;
@@ -1408,7 +1901,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos)
1408 } 1901 }
1409 } 1902 }
1410 1903
1411 return hnd; 1904 if (WARN_ON_ONCE(!hnd))
1905 return NULL;
1906
1907 iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
1908
1909 return iter;
1412} 1910}
1413 1911
1414static void *t_hash_start(struct seq_file *m, loff_t *pos) 1912static void *t_hash_start(struct seq_file *m, loff_t *pos)
@@ -1417,26 +1915,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1417 void *p = NULL; 1915 void *p = NULL;
1418 loff_t l; 1916 loff_t l;
1419 1917
1420 if (!(iter->flags & FTRACE_ITER_HASH)) 1918 if (iter->func_pos > *pos)
1421 *pos = 0; 1919 return NULL;
1422
1423 iter->flags |= FTRACE_ITER_HASH;
1424 1920
1425 iter->hidx = 0; 1921 iter->hidx = 0;
1426 for (l = 0; l <= *pos; ) { 1922 for (l = 0; l <= (*pos - iter->func_pos); ) {
1427 p = t_hash_next(m, p, &l); 1923 p = t_hash_next(m, &l);
1428 if (!p) 1924 if (!p)
1429 break; 1925 break;
1430 } 1926 }
1431 return p; 1927 if (!p)
1928 return NULL;
1929
1930 /* Only set this if we have an item */
1931 iter->flags |= FTRACE_ITER_HASH;
1932
1933 return iter;
1432} 1934}
1433 1935
1434static int t_hash_show(struct seq_file *m, void *v) 1936static int
1937t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
1435{ 1938{
1436 struct ftrace_func_probe *rec; 1939 struct ftrace_func_probe *rec;
1437 struct hlist_node *hnd = v;
1438 1940
1439 rec = hlist_entry(hnd, struct ftrace_func_probe, node); 1941 rec = iter->probe;
1942 if (WARN_ON_ONCE(!rec))
1943 return -EIO;
1440 1944
1441 if (rec->ops->print) 1945 if (rec->ops->print)
1442 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1946 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
@@ -1454,15 +1958,20 @@ static void *
1454t_next(struct seq_file *m, void *v, loff_t *pos) 1958t_next(struct seq_file *m, void *v, loff_t *pos)
1455{ 1959{
1456 struct ftrace_iterator *iter = m->private; 1960 struct ftrace_iterator *iter = m->private;
1961 struct ftrace_ops *ops = &global_ops;
1457 struct dyn_ftrace *rec = NULL; 1962 struct dyn_ftrace *rec = NULL;
1458 1963
1964 if (unlikely(ftrace_disabled))
1965 return NULL;
1966
1459 if (iter->flags & FTRACE_ITER_HASH) 1967 if (iter->flags & FTRACE_ITER_HASH)
1460 return t_hash_next(m, v, pos); 1968 return t_hash_next(m, pos);
1461 1969
1462 (*pos)++; 1970 (*pos)++;
1971 iter->pos = iter->func_pos = *pos;
1463 1972
1464 if (iter->flags & FTRACE_ITER_PRINTALL) 1973 if (iter->flags & FTRACE_ITER_PRINTALL)
1465 return NULL; 1974 return t_hash_start(m, pos);
1466 1975
1467 retry: 1976 retry:
1468 if (iter->idx >= iter->pg->index) { 1977 if (iter->idx >= iter->pg->index) {
@@ -1475,38 +1984,59 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1475 rec = &iter->pg->records[iter->idx++]; 1984 rec = &iter->pg->records[iter->idx++];
1476 if ((rec->flags & FTRACE_FL_FREE) || 1985 if ((rec->flags & FTRACE_FL_FREE) ||
1477 1986
1478 (!(iter->flags & FTRACE_ITER_FAILURES) &&
1479 (rec->flags & FTRACE_FL_FAILED)) ||
1480
1481 ((iter->flags & FTRACE_ITER_FAILURES) &&
1482 !(rec->flags & FTRACE_FL_FAILED)) ||
1483
1484 ((iter->flags & FTRACE_ITER_FILTER) && 1987 ((iter->flags & FTRACE_ITER_FILTER) &&
1485 !(rec->flags & FTRACE_FL_FILTER)) || 1988 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
1486 1989
1487 ((iter->flags & FTRACE_ITER_NOTRACE) && 1990 ((iter->flags & FTRACE_ITER_NOTRACE) &&
1488 !(rec->flags & FTRACE_FL_NOTRACE))) { 1991 !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||
1992
1993 ((iter->flags & FTRACE_ITER_ENABLED) &&
1994 !(rec->flags & ~FTRACE_FL_MASK))) {
1995
1489 rec = NULL; 1996 rec = NULL;
1490 goto retry; 1997 goto retry;
1491 } 1998 }
1492 } 1999 }
1493 2000
1494 return rec; 2001 if (!rec)
2002 return t_hash_start(m, pos);
2003
2004 iter->func = rec;
2005
2006 return iter;
2007}
2008
2009static void reset_iter_read(struct ftrace_iterator *iter)
2010{
2011 iter->pos = 0;
2012 iter->func_pos = 0;
2013 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH);
1495} 2014}
1496 2015
1497static void *t_start(struct seq_file *m, loff_t *pos) 2016static void *t_start(struct seq_file *m, loff_t *pos)
1498{ 2017{
1499 struct ftrace_iterator *iter = m->private; 2018 struct ftrace_iterator *iter = m->private;
2019 struct ftrace_ops *ops = &global_ops;
1500 void *p = NULL; 2020 void *p = NULL;
1501 loff_t l; 2021 loff_t l;
1502 2022
1503 mutex_lock(&ftrace_lock); 2023 mutex_lock(&ftrace_lock);
2024
2025 if (unlikely(ftrace_disabled))
2026 return NULL;
2027
2028 /*
2029 * If an lseek was done, then reset and start from beginning.
2030 */
2031 if (*pos < iter->pos)
2032 reset_iter_read(iter);
2033
1504 /* 2034 /*
1505 * For set_ftrace_filter reading, if we have the filter 2035 * For set_ftrace_filter reading, if we have the filter
1506 * off, we can short cut and just print out that all 2036 * off, we can short cut and just print out that all
1507 * functions are enabled. 2037 * functions are enabled.
1508 */ 2038 */
1509 if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { 2039 if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) {
1510 if (*pos > 0) 2040 if (*pos > 0)
1511 return t_hash_start(m, pos); 2041 return t_hash_start(m, pos);
1512 iter->flags |= FTRACE_ITER_PRINTALL; 2042 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -1518,6 +2048,11 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1518 if (iter->flags & FTRACE_ITER_HASH) 2048 if (iter->flags & FTRACE_ITER_HASH)
1519 return t_hash_start(m, pos); 2049 return t_hash_start(m, pos);
1520 2050
2051 /*
2052 * Unfortunately, we need to restart at ftrace_pages_start
2053 * every time we let go of the ftrace_mutex. This is because
2054 * those pointers can change without the lock.
2055 */
1521 iter->pg = ftrace_pages_start; 2056 iter->pg = ftrace_pages_start;
1522 iter->idx = 0; 2057 iter->idx = 0;
1523 for (l = 0; l <= *pos; ) { 2058 for (l = 0; l <= *pos; ) {
@@ -1526,10 +2061,14 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1526 break; 2061 break;
1527 } 2062 }
1528 2063
1529 if (!p && iter->flags & FTRACE_ITER_FILTER) 2064 if (!p) {
1530 return t_hash_start(m, pos); 2065 if (iter->flags & FTRACE_ITER_FILTER)
2066 return t_hash_start(m, pos);
1531 2067
1532 return p; 2068 return NULL;
2069 }
2070
2071 return iter;
1533} 2072}
1534 2073
1535static void t_stop(struct seq_file *m, void *p) 2074static void t_stop(struct seq_file *m, void *p)
@@ -1540,20 +2079,26 @@ static void t_stop(struct seq_file *m, void *p)
1540static int t_show(struct seq_file *m, void *v) 2079static int t_show(struct seq_file *m, void *v)
1541{ 2080{
1542 struct ftrace_iterator *iter = m->private; 2081 struct ftrace_iterator *iter = m->private;
1543 struct dyn_ftrace *rec = v; 2082 struct dyn_ftrace *rec;
1544 2083
1545 if (iter->flags & FTRACE_ITER_HASH) 2084 if (iter->flags & FTRACE_ITER_HASH)
1546 return t_hash_show(m, v); 2085 return t_hash_show(m, iter);
1547 2086
1548 if (iter->flags & FTRACE_ITER_PRINTALL) { 2087 if (iter->flags & FTRACE_ITER_PRINTALL) {
1549 seq_printf(m, "#### all functions enabled ####\n"); 2088 seq_printf(m, "#### all functions enabled ####\n");
1550 return 0; 2089 return 0;
1551 } 2090 }
1552 2091
2092 rec = iter->func;
2093
1553 if (!rec) 2094 if (!rec)
1554 return 0; 2095 return 0;
1555 2096
1556 seq_printf(m, "%ps\n", (void *)rec->ip); 2097 seq_printf(m, "%ps", (void *)rec->ip);
2098 if (iter->flags & FTRACE_ITER_ENABLED)
2099 seq_printf(m, " (%ld)",
2100 rec->flags & ~FTRACE_FL_MASK);
2101 seq_printf(m, "\n");
1557 2102
1558 return 0; 2103 return 0;
1559} 2104}
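
The t_start()/t_next()/t_stop()/t_show() callbacks reworked above form a standard seq_file iterator behind these debugfs files. As a minimal sketch of how such an iterator is wired up in general (hypothetical demo_* names, not part of this patch):

#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/seq_file.h>

/* Iterate over three dummy items; *pos is the only iterator state here. */
static void *demo_start(struct seq_file *m, loff_t *pos)
{
	return (*pos < 3) ? pos : NULL;
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return (*pos < 3) ? pos : NULL;
}

static void demo_stop(struct seq_file *m, void *v)
{
	/* drop any lock taken in demo_start() */
}

static int demo_show(struct seq_file *m, void *v)
{
	seq_printf(m, "item %lld\n", (long long)*(loff_t *)v);
	return 0;
}

static const struct seq_operations demo_seq_ops = {
	.start	= demo_start,
	.next	= demo_next,
	.stop	= demo_stop,
	.show	= demo_show,
};

static int demo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &demo_seq_ops);
}

static const struct file_operations demo_fops = {
	.owner		= THIS_MODULE,
	.open		= demo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
/* e.g. debugfs_create_file("demo", 0444, NULL, NULL, &demo_fops); */
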
@@ -1593,44 +2138,46 @@ ftrace_avail_open(struct inode *inode, struct file *file)
1593} 2138}
1594 2139
1595static int 2140static int
1596ftrace_failures_open(struct inode *inode, struct file *file) 2141ftrace_enabled_open(struct inode *inode, struct file *file)
1597{ 2142{
1598 int ret;
1599 struct seq_file *m;
1600 struct ftrace_iterator *iter; 2143 struct ftrace_iterator *iter;
2144 int ret;
2145
2146 if (unlikely(ftrace_disabled))
2147 return -ENODEV;
2148
2149 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2150 if (!iter)
2151 return -ENOMEM;
2152
2153 iter->pg = ftrace_pages_start;
2154 iter->flags = FTRACE_ITER_ENABLED;
1601 2155
1602 ret = ftrace_avail_open(inode, file); 2156 ret = seq_open(file, &show_ftrace_seq_ops);
1603 if (!ret) { 2157 if (!ret) {
1604 m = (struct seq_file *)file->private_data; 2158 struct seq_file *m = file->private_data;
1605 iter = (struct ftrace_iterator *)m->private; 2159
1606 iter->flags = FTRACE_ITER_FAILURES; 2160 m->private = iter;
2161 } else {
2162 kfree(iter);
1607 } 2163 }
1608 2164
1609 return ret; 2165 return ret;
1610} 2166}
1611 2167
1612 2168static void ftrace_filter_reset(struct ftrace_hash *hash)
1613static void ftrace_filter_reset(int enable)
1614{ 2169{
1615 struct ftrace_page *pg;
1616 struct dyn_ftrace *rec;
1617 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1618
1619 mutex_lock(&ftrace_lock); 2170 mutex_lock(&ftrace_lock);
1620 if (enable) 2171 ftrace_hash_clear(hash);
1621 ftrace_filtered = 0;
1622 do_for_each_ftrace_rec(pg, rec) {
1623 if (rec->flags & FTRACE_FL_FAILED)
1624 continue;
1625 rec->flags &= ~type;
1626 } while_for_each_ftrace_rec();
1627 mutex_unlock(&ftrace_lock); 2172 mutex_unlock(&ftrace_lock);
1628} 2173}
1629 2174
1630static int 2175static int
1631ftrace_regex_open(struct inode *inode, struct file *file, int enable) 2176ftrace_regex_open(struct ftrace_ops *ops, int flag,
2177 struct inode *inode, struct file *file)
1632{ 2178{
1633 struct ftrace_iterator *iter; 2179 struct ftrace_iterator *iter;
2180 struct ftrace_hash *hash;
1634 int ret = 0; 2181 int ret = 0;
1635 2182
1636 if (unlikely(ftrace_disabled)) 2183 if (unlikely(ftrace_disabled))
@@ -1645,21 +2192,42 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1645 return -ENOMEM; 2192 return -ENOMEM;
1646 } 2193 }
1647 2194
2195 if (flag & FTRACE_ITER_NOTRACE)
2196 hash = ops->notrace_hash;
2197 else
2198 hash = ops->filter_hash;
2199
2200 iter->ops = ops;
2201 iter->flags = flag;
2202
2203 if (file->f_mode & FMODE_WRITE) {
2204 mutex_lock(&ftrace_lock);
2205 iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
2206 mutex_unlock(&ftrace_lock);
2207
2208 if (!iter->hash) {
2209 trace_parser_put(&iter->parser);
2210 kfree(iter);
2211 return -ENOMEM;
2212 }
2213 }
2214
1648 mutex_lock(&ftrace_regex_lock); 2215 mutex_lock(&ftrace_regex_lock);
2216
1649 if ((file->f_mode & FMODE_WRITE) && 2217 if ((file->f_mode & FMODE_WRITE) &&
1650 (file->f_flags & O_TRUNC)) 2218 (file->f_flags & O_TRUNC))
1651 ftrace_filter_reset(enable); 2219 ftrace_filter_reset(iter->hash);
1652 2220
1653 if (file->f_mode & FMODE_READ) { 2221 if (file->f_mode & FMODE_READ) {
1654 iter->pg = ftrace_pages_start; 2222 iter->pg = ftrace_pages_start;
1655 iter->flags = enable ? FTRACE_ITER_FILTER :
1656 FTRACE_ITER_NOTRACE;
1657 2223
1658 ret = seq_open(file, &show_ftrace_seq_ops); 2224 ret = seq_open(file, &show_ftrace_seq_ops);
1659 if (!ret) { 2225 if (!ret) {
1660 struct seq_file *m = file->private_data; 2226 struct seq_file *m = file->private_data;
1661 m->private = iter; 2227 m->private = iter;
1662 } else { 2228 } else {
2229 /* Failed */
2230 free_ftrace_hash(iter->hash);
1663 trace_parser_put(&iter->parser); 2231 trace_parser_put(&iter->parser);
1664 kfree(iter); 2232 kfree(iter);
1665 } 2233 }
@@ -1673,13 +2241,15 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1673static int 2241static int
1674ftrace_filter_open(struct inode *inode, struct file *file) 2242ftrace_filter_open(struct inode *inode, struct file *file)
1675{ 2243{
1676 return ftrace_regex_open(inode, file, 1); 2244 return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER,
2245 inode, file);
1677} 2246}
1678 2247
1679static int 2248static int
1680ftrace_notrace_open(struct inode *inode, struct file *file) 2249ftrace_notrace_open(struct inode *inode, struct file *file)
1681{ 2250{
1682 return ftrace_regex_open(inode, file, 0); 2251 return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE,
2252 inode, file);
1683} 2253}
1684 2254
1685static loff_t 2255static loff_t
@@ -1724,86 +2294,99 @@ static int ftrace_match(char *str, char *regex, int len, int type)
1724} 2294}
1725 2295
1726static int 2296static int
1727ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) 2297enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not)
2298{
2299 struct ftrace_func_entry *entry;
2300 int ret = 0;
2301
2302 entry = ftrace_lookup_ip(hash, rec->ip);
2303 if (not) {
2304 /* Do nothing if it doesn't exist */
2305 if (!entry)
2306 return 0;
2307
2308 free_hash_entry(hash, entry);
2309 } else {
2310 /* Do nothing if it exists */
2311 if (entry)
2312 return 0;
2313
2314 ret = add_hash_entry(hash, rec->ip);
2315 }
2316 return ret;
2317}
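
enter_record() keeps a per-ops hash of instruction pointers in step with what the user wrote: a plain pattern adds matching records, a '!'-prefixed pattern removes them. The intended effect of the hashes, roughly sketched with the helpers named in this patch (illustrative only, not the patch's exact fast-path code):

/*
 * Sketch only: an empty filter hash means "trace everything",
 * a populated one restricts tracing to its entries, and the
 * notrace hash always wins.  Hash pointers are protected by
 * ftrace_lock in the real code.
 */
static int demo_ops_traces_ip(struct ftrace_ops *ops, unsigned long ip)
{
	if (ops->filter_hash->count &&
	    !ftrace_lookup_ip(ops->filter_hash, ip))
		return 0;

	if (ftrace_lookup_ip(ops->notrace_hash, ip))
		return 0;

	return 1;
}
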
2318
2319static int
2320ftrace_match_record(struct dyn_ftrace *rec, char *mod,
2321 char *regex, int len, int type)
1728{ 2322{
1729 char str[KSYM_SYMBOL_LEN]; 2323 char str[KSYM_SYMBOL_LEN];
2324 char *modname;
2325
2326 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
2327
2328 if (mod) {
2329 /* module lookup requires matching the module */
2330 if (!modname || strcmp(modname, mod))
2331 return 0;
2332
2333 /* blank search means to match all funcs in the mod */
2334 if (!len)
2335 return 1;
2336 }
1730 2337
1731 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1732 return ftrace_match(str, regex, len, type); 2338 return ftrace_match(str, regex, len, type);
1733} 2339}
1734 2340
1735static int ftrace_match_records(char *buff, int len, int enable) 2341static int
2342match_records(struct ftrace_hash *hash, char *buff,
2343 int len, char *mod, int not)
1736{ 2344{
1737 unsigned int search_len; 2345 unsigned search_len = 0;
1738 struct ftrace_page *pg; 2346 struct ftrace_page *pg;
1739 struct dyn_ftrace *rec; 2347 struct dyn_ftrace *rec;
1740 unsigned long flag; 2348 int type = MATCH_FULL;
1741 char *search; 2349 char *search = buff;
1742 int type;
1743 int not;
1744 int found = 0; 2350 int found = 0;
2351 int ret;
1745 2352
1746 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 2353 if (len) {
1747 type = filter_parse_regex(buff, len, &search, &not); 2354 type = filter_parse_regex(buff, len, &search, &not);
1748 2355 search_len = strlen(search);
1749 search_len = strlen(search); 2356 }
1750 2357
1751 mutex_lock(&ftrace_lock); 2358 mutex_lock(&ftrace_lock);
1752 do_for_each_ftrace_rec(pg, rec) {
1753 2359
1754 if (rec->flags & FTRACE_FL_FAILED) 2360 if (unlikely(ftrace_disabled))
1755 continue; 2361 goto out_unlock;
1756 2362
1757 if (ftrace_match_record(rec, search, search_len, type)) { 2363 do_for_each_ftrace_rec(pg, rec) {
1758 if (not) 2364
1759 rec->flags &= ~flag; 2365 if (ftrace_match_record(rec, mod, search, search_len, type)) {
1760 else 2366 ret = enter_record(hash, rec, not);
1761 rec->flags |= flag; 2367 if (ret < 0) {
2368 found = ret;
2369 goto out_unlock;
2370 }
1762 found = 1; 2371 found = 1;
1763 } 2372 }
1764 /*
1765 * Only enable filtering if we have a function that
1766 * is filtered on.
1767 */
1768 if (enable && (rec->flags & FTRACE_FL_FILTER))
1769 ftrace_filtered = 1;
1770 } while_for_each_ftrace_rec(); 2373 } while_for_each_ftrace_rec();
2374 out_unlock:
1771 mutex_unlock(&ftrace_lock); 2375 mutex_unlock(&ftrace_lock);
1772 2376
1773 return found; 2377 return found;
1774} 2378}
1775 2379
1776static int 2380static int
1777ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, 2381ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
1778 char *regex, int len, int type)
1779{ 2382{
1780 char str[KSYM_SYMBOL_LEN]; 2383 return match_records(hash, buff, len, NULL, 0);
1781 char *modname;
1782
1783 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
1784
1785 if (!modname || strcmp(modname, mod))
1786 return 0;
1787
1788 /* blank search means to match all funcs in the mod */
1789 if (len)
1790 return ftrace_match(str, regex, len, type);
1791 else
1792 return 1;
1793} 2384}
1794 2385
1795static int ftrace_match_module_records(char *buff, char *mod, int enable) 2386static int
2387ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
1796{ 2388{
1797 unsigned search_len = 0;
1798 struct ftrace_page *pg;
1799 struct dyn_ftrace *rec;
1800 int type = MATCH_FULL;
1801 char *search = buff;
1802 unsigned long flag;
1803 int not = 0; 2389 int not = 0;
1804 int found = 0;
1805
1806 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1807 2390
1808 /* blank or '*' mean the same */ 2391 /* blank or '*' mean the same */
1809 if (strcmp(buff, "*") == 0) 2392 if (strcmp(buff, "*") == 0)
@@ -1815,32 +2398,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
1815 not = 1; 2398 not = 1;
1816 } 2399 }
1817 2400
1818 if (strlen(buff)) { 2401 return match_records(hash, buff, strlen(buff), mod, not);
1819 type = filter_parse_regex(buff, strlen(buff), &search, &not);
1820 search_len = strlen(search);
1821 }
1822
1823 mutex_lock(&ftrace_lock);
1824 do_for_each_ftrace_rec(pg, rec) {
1825
1826 if (rec->flags & FTRACE_FL_FAILED)
1827 continue;
1828
1829 if (ftrace_match_module_record(rec, mod,
1830 search, search_len, type)) {
1831 if (not)
1832 rec->flags &= ~flag;
1833 else
1834 rec->flags |= flag;
1835 found = 1;
1836 }
1837 if (enable && (rec->flags & FTRACE_FL_FILTER))
1838 ftrace_filtered = 1;
1839
1840 } while_for_each_ftrace_rec();
1841 mutex_unlock(&ftrace_lock);
1842
1843 return found;
1844} 2402}
1845 2403
1846/* 2404/*
@@ -1851,7 +2409,10 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
1851static int 2409static int
1852ftrace_mod_callback(char *func, char *cmd, char *param, int enable) 2410ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1853{ 2411{
2412 struct ftrace_ops *ops = &global_ops;
2413 struct ftrace_hash *hash;
1854 char *mod; 2414 char *mod;
2415 int ret = -EINVAL;
1855 2416
1856 /* 2417 /*
1857 * cmd == 'mod' because we only registered this func 2418 * cmd == 'mod' because we only registered this func
@@ -1863,15 +2424,24 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1863 2424
1864 /* we must have a module name */ 2425 /* we must have a module name */
1865 if (!param) 2426 if (!param)
1866 return -EINVAL; 2427 return ret;
1867 2428
1868 mod = strsep(&param, ":"); 2429 mod = strsep(&param, ":");
1869 if (!strlen(mod)) 2430 if (!strlen(mod))
1870 return -EINVAL; 2431 return ret;
1871 2432
1872 if (ftrace_match_module_records(func, mod, enable)) 2433 if (enable)
1873 return 0; 2434 hash = ops->filter_hash;
1874 return -EINVAL; 2435 else
2436 hash = ops->notrace_hash;
2437
2438 ret = ftrace_match_module_records(hash, func, mod);
2439 if (!ret)
2440 ret = -EINVAL;
2441 if (ret < 0)
2442 return ret;
2443
2444 return 0;
1875} 2445}
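
ftrace_mod_cmd below hooks the "mod" keyword into the set_ftrace_filter parser; other keywords can be added the same way. A minimal sketch, assuming the register_ftrace_command() interface that pairs with the unregister_ftrace_command() shown later in this diff (hypothetical "demo" command):

#include <linux/ftrace.h>
#include <linux/init.h>
#include <linux/kernel.h>

/* Hypothetical example, not part of this commit. */
static int demo_cmd_callback(char *func, char *cmd, char *param, int enable)
{
	pr_info("demo: func=%s cmd=%s param=%s enable=%d\n",
		func, cmd, param ? param : "(none)", enable);
	return 0;
}

static struct ftrace_func_command demo_cmd = {
	.name	= "demo",
	.func	= demo_cmd_callback,
};

static int __init demo_cmd_init(void)
{
	/* afterwards: echo 'do_fork:demo:hello' > set_ftrace_filter */
	return register_ftrace_command(&demo_cmd);
}
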
1876 2446
1877static struct ftrace_func_command ftrace_mod_cmd = { 2447static struct ftrace_func_command ftrace_mod_cmd = {
@@ -1922,6 +2492,7 @@ static int ftrace_probe_registered;
1922 2492
1923static void __enable_ftrace_function_probe(void) 2493static void __enable_ftrace_function_probe(void)
1924{ 2494{
2495 int ret;
1925 int i; 2496 int i;
1926 2497
1927 if (ftrace_probe_registered) 2498 if (ftrace_probe_registered)
@@ -1936,13 +2507,16 @@ static void __enable_ftrace_function_probe(void)
1936 if (i == FTRACE_FUNC_HASHSIZE) 2507 if (i == FTRACE_FUNC_HASHSIZE)
1937 return; 2508 return;
1938 2509
1939 __register_ftrace_function(&trace_probe_ops); 2510 ret = __register_ftrace_function(&trace_probe_ops);
1940 ftrace_startup(0); 2511 if (!ret)
2512 ret = ftrace_startup(&trace_probe_ops, 0);
2513
1941 ftrace_probe_registered = 1; 2514 ftrace_probe_registered = 1;
1942} 2515}
1943 2516
1944static void __disable_ftrace_function_probe(void) 2517static void __disable_ftrace_function_probe(void)
1945{ 2518{
2519 int ret;
1946 int i; 2520 int i;
1947 2521
1948 if (!ftrace_probe_registered) 2522 if (!ftrace_probe_registered)
@@ -1955,8 +2529,10 @@ static void __disable_ftrace_function_probe(void)
1955 } 2529 }
1956 2530
1957 /* no more funcs left */ 2531 /* no more funcs left */
1958 __unregister_ftrace_function(&trace_probe_ops); 2532 ret = __unregister_ftrace_function(&trace_probe_ops);
1959 ftrace_shutdown(0); 2533 if (!ret)
2534 ftrace_shutdown(&trace_probe_ops, 0);
2535
1960 ftrace_probe_registered = 0; 2536 ftrace_probe_registered = 0;
1961} 2537}
1962 2538
@@ -1992,12 +2568,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1992 return -EINVAL; 2568 return -EINVAL;
1993 2569
1994 mutex_lock(&ftrace_lock); 2570 mutex_lock(&ftrace_lock);
1995 do_for_each_ftrace_rec(pg, rec) {
1996 2571
1997 if (rec->flags & FTRACE_FL_FAILED) 2572 if (unlikely(ftrace_disabled))
1998 continue; 2573 goto out_unlock;
2574
2575 do_for_each_ftrace_rec(pg, rec) {
1999 2576
2000 if (!ftrace_match_record(rec, search, len, type)) 2577 if (!ftrace_match_record(rec, NULL, search, len, type))
2001 continue; 2578 continue;
2002 2579
2003 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 2580 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
@@ -2158,7 +2735,8 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd)
2158 return ret; 2735 return ret;
2159} 2736}
2160 2737
2161static int ftrace_process_regex(char *buff, int len, int enable) 2738static int ftrace_process_regex(struct ftrace_hash *hash,
2739 char *buff, int len, int enable)
2162{ 2740{
2163 char *func, *command, *next = buff; 2741 char *func, *command, *next = buff;
2164 struct ftrace_func_command *p; 2742 struct ftrace_func_command *p;
@@ -2167,9 +2745,12 @@ static int ftrace_process_regex(char *buff, int len, int enable)
2167 func = strsep(&next, ":"); 2745 func = strsep(&next, ":");
2168 2746
2169 if (!next) { 2747 if (!next) {
2170 if (ftrace_match_records(func, len, enable)) 2748 ret = ftrace_match_records(hash, func, len);
2171 return 0; 2749 if (!ret)
2172 return ret; 2750 ret = -EINVAL;
2751 if (ret < 0)
2752 return ret;
2753 return 0;
2173 } 2754 }
2174 2755
2175 /* command found */ 2756 /* command found */
@@ -2202,6 +2783,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2202 2783
2203 mutex_lock(&ftrace_regex_lock); 2784 mutex_lock(&ftrace_regex_lock);
2204 2785
2786 ret = -ENODEV;
2787 if (unlikely(ftrace_disabled))
2788 goto out_unlock;
2789
2205 if (file->f_mode & FMODE_READ) { 2790 if (file->f_mode & FMODE_READ) {
2206 struct seq_file *m = file->private_data; 2791 struct seq_file *m = file->private_data;
2207 iter = m->private; 2792 iter = m->private;
@@ -2213,7 +2798,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2213 2798
2214 if (read >= 0 && trace_parser_loaded(parser) && 2799 if (read >= 0 && trace_parser_loaded(parser) &&
2215 !trace_parser_cont(parser)) { 2800 !trace_parser_cont(parser)) {
2216 ret = ftrace_process_regex(parser->buffer, 2801 ret = ftrace_process_regex(iter->hash, parser->buffer,
2217 parser->idx, enable); 2802 parser->idx, enable);
2218 trace_parser_clear(parser); 2803 trace_parser_clear(parser);
2219 if (ret) 2804 if (ret)
@@ -2241,22 +2826,49 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
2241 return ftrace_regex_write(file, ubuf, cnt, ppos, 0); 2826 return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
2242} 2827}
2243 2828
2244static void 2829static int
2245ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) 2830ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
2831 int reset, int enable)
2246{ 2832{
2833 struct ftrace_hash **orig_hash;
2834 struct ftrace_hash *hash;
2835 int ret;
2836
2837	/* All global ops use the global ops filters */
2838 if (ops->flags & FTRACE_OPS_FL_GLOBAL)
2839 ops = &global_ops;
2840
2247 if (unlikely(ftrace_disabled)) 2841 if (unlikely(ftrace_disabled))
2248 return; 2842 return -ENODEV;
2843
2844 if (enable)
2845 orig_hash = &ops->filter_hash;
2846 else
2847 orig_hash = &ops->notrace_hash;
2848
2849 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
2850 if (!hash)
2851 return -ENOMEM;
2249 2852
2250 mutex_lock(&ftrace_regex_lock); 2853 mutex_lock(&ftrace_regex_lock);
2251 if (reset) 2854 if (reset)
2252 ftrace_filter_reset(enable); 2855 ftrace_filter_reset(hash);
2253 if (buf) 2856 if (buf)
2254 ftrace_match_records(buf, len, enable); 2857 ftrace_match_records(hash, buf, len);
2858
2859 mutex_lock(&ftrace_lock);
2860 ret = ftrace_hash_move(orig_hash, hash);
2861 mutex_unlock(&ftrace_lock);
2862
2255 mutex_unlock(&ftrace_regex_lock); 2863 mutex_unlock(&ftrace_regex_lock);
2864
2865 free_ftrace_hash(hash);
2866 return ret;
2256} 2867}
2257 2868
2258/** 2869/**
2259 * ftrace_set_filter - set a function to filter on in ftrace 2870 * ftrace_set_filter - set a function to filter on in ftrace
2871 * @ops - the ops to set the filter with
2260 * @buf - the string that holds the function filter text. 2872 * @buf - the string that holds the function filter text.
2261 * @len - the length of the string. 2873 * @len - the length of the string.
2262 * @reset - non zero to reset all filters before applying this filter. 2874 * @reset - non zero to reset all filters before applying this filter.
@@ -2264,13 +2876,16 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
2264 * Filters denote which functions should be enabled when tracing is enabled. 2876 * Filters denote which functions should be enabled when tracing is enabled.
2265 * If @buf is NULL and reset is set, all functions will be enabled for tracing. 2877 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
2266 */ 2878 */
2267void ftrace_set_filter(unsigned char *buf, int len, int reset) 2879void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
2880 int len, int reset)
2268{ 2881{
2269 ftrace_set_regex(buf, len, reset, 1); 2882 ftrace_set_regex(ops, buf, len, reset, 1);
2270} 2883}
2884EXPORT_SYMBOL_GPL(ftrace_set_filter);
2271 2885
2272/** 2886/**
2273 * ftrace_set_notrace - set a function to not trace in ftrace 2887 * ftrace_set_notrace - set a function to not trace in ftrace
2888 * @ops - the ops to set the notrace filter with
2274 * @buf - the string that holds the function notrace text. 2889 * @buf - the string that holds the function notrace text.
2275 * @len - the length of the string. 2890 * @len - the length of the string.
2276 * @reset - non zero to reset all filters before applying this filter. 2891 * @reset - non zero to reset all filters before applying this filter.
@@ -2279,10 +2894,44 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset)
2279 * is enabled. If @buf is NULL and reset is set, all functions will be enabled 2894 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
2280 * for tracing. 2895 * for tracing.
2281 */ 2896 */
2282void ftrace_set_notrace(unsigned char *buf, int len, int reset) 2897void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
2898 int len, int reset)
2283{ 2899{
2284 ftrace_set_regex(buf, len, reset, 0); 2900 ftrace_set_regex(ops, buf, len, reset, 0);
2285} 2901}
2902EXPORT_SYMBOL_GPL(ftrace_set_notrace);
2903/**
2904 * ftrace_set_global_filter - set a function to filter on in ftrace
2905 * @ops - the ops to set the filter with
2906 * @buf - the string that holds the function filter text.
2907 * @len - the length of the string.
2908 * @reset - non zero to reset all filters before applying this filter.
2909 *
2910 * Filters denote which functions should be enabled when tracing is enabled.
2911 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
2912 */
2913void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
2914{
2915 ftrace_set_regex(&global_ops, buf, len, reset, 1);
2916}
2917EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
2918
2919/**
2920 * ftrace_set_global_notrace - set a function to not trace in ftrace
2921 * @ops - the ops to set the notrace filter with
2922 * @buf - the string that holds the function notrace text.
2923 * @len - the length of the string.
2924 * @reset - non zero to reset all filters before applying this filter.
2925 *
2926 * Notrace Filters denote which functions should not be enabled when tracing
2927 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
2928 * for tracing.
2929 */
2930void ftrace_set_global_notrace(unsigned char *buf, int len, int reset)
2931{
2932 ftrace_set_regex(&global_ops, buf, len, reset, 0);
2933}
2934EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
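
With filters now tied to a particular ftrace_ops, an in-kernel user sets its filter before registering the callback. A minimal usage sketch (my_* names are hypothetical; the two-argument callback prototype is the one used throughout this tree):

#include <linux/ftrace.h>
#include <linux/init.h>

static void my_trace_func(unsigned long ip, unsigned long parent_ip)
{
	/* called for every function the filter selects */
}

static struct ftrace_ops my_ops = {
	.func = my_trace_func,
};

static int __init my_tracer_init(void)
{
	unsigned char buf[] = "schedule";

	/* trace only the scheduler entry point; reset any old filter */
	ftrace_set_filter(&my_ops, buf, sizeof(buf) - 1, 1);
	return register_ftrace_function(&my_ops);
}
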
2286 2935
2287/* 2936/*
2288 * command line interface to allow users to set filters on boot up. 2937 * command line interface to allow users to set filters on boot up.
@@ -2333,22 +2982,23 @@ static void __init set_ftrace_early_graph(char *buf)
2333} 2982}
2334#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2983#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2335 2984
2336static void __init set_ftrace_early_filter(char *buf, int enable) 2985static void __init
2986set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
2337{ 2987{
2338 char *func; 2988 char *func;
2339 2989
2340 while (buf) { 2990 while (buf) {
2341 func = strsep(&buf, ","); 2991 func = strsep(&buf, ",");
2342 ftrace_set_regex(func, strlen(func), 0, enable); 2992 ftrace_set_regex(ops, func, strlen(func), 0, enable);
2343 } 2993 }
2344} 2994}
2345 2995
2346static void __init set_ftrace_early_filters(void) 2996static void __init set_ftrace_early_filters(void)
2347{ 2997{
2348 if (ftrace_filter_buf[0]) 2998 if (ftrace_filter_buf[0])
2349 set_ftrace_early_filter(ftrace_filter_buf, 1); 2999 set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1);
2350 if (ftrace_notrace_buf[0]) 3000 if (ftrace_notrace_buf[0])
2351 set_ftrace_early_filter(ftrace_notrace_buf, 0); 3001 set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0);
2352#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3002#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2353 if (ftrace_graph_buf[0]) 3003 if (ftrace_graph_buf[0])
2354 set_ftrace_early_graph(ftrace_graph_buf); 3004 set_ftrace_early_graph(ftrace_graph_buf);
@@ -2356,11 +3006,14 @@ static void __init set_ftrace_early_filters(void)
2356} 3006}
2357 3007
2358static int 3008static int
2359ftrace_regex_release(struct inode *inode, struct file *file, int enable) 3009ftrace_regex_release(struct inode *inode, struct file *file)
2360{ 3010{
2361 struct seq_file *m = (struct seq_file *)file->private_data; 3011 struct seq_file *m = (struct seq_file *)file->private_data;
2362 struct ftrace_iterator *iter; 3012 struct ftrace_iterator *iter;
3013 struct ftrace_hash **orig_hash;
2363 struct trace_parser *parser; 3014 struct trace_parser *parser;
3015 int filter_hash;
3016 int ret;
2364 3017
2365 mutex_lock(&ftrace_regex_lock); 3018 mutex_lock(&ftrace_regex_lock);
2366 if (file->f_mode & FMODE_READ) { 3019 if (file->f_mode & FMODE_READ) {
@@ -2373,33 +3026,41 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2373 parser = &iter->parser; 3026 parser = &iter->parser;
2374 if (trace_parser_loaded(parser)) { 3027 if (trace_parser_loaded(parser)) {
2375 parser->buffer[parser->idx] = 0; 3028 parser->buffer[parser->idx] = 0;
2376 ftrace_match_records(parser->buffer, parser->idx, enable); 3029 ftrace_match_records(iter->hash, parser->buffer, parser->idx);
2377 } 3030 }
2378 3031
2379 mutex_lock(&ftrace_lock);
2380 if (ftrace_start_up && ftrace_enabled)
2381 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2382 mutex_unlock(&ftrace_lock);
2383
2384 trace_parser_put(parser); 3032 trace_parser_put(parser);
3033
3034 if (file->f_mode & FMODE_WRITE) {
3035 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
3036
3037 if (filter_hash)
3038 orig_hash = &iter->ops->filter_hash;
3039 else
3040 orig_hash = &iter->ops->notrace_hash;
3041
3042 mutex_lock(&ftrace_lock);
3043 /*
3044 * Remove the current set, update the hash and add
3045 * them back.
3046 */
3047 ftrace_hash_rec_disable(iter->ops, filter_hash);
3048 ret = ftrace_hash_move(orig_hash, iter->hash);
3049 if (!ret) {
3050 ftrace_hash_rec_enable(iter->ops, filter_hash);
3051 if (iter->ops->flags & FTRACE_OPS_FL_ENABLED
3052 && ftrace_enabled)
3053 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
3054 }
3055 mutex_unlock(&ftrace_lock);
3056 }
3057 free_ftrace_hash(iter->hash);
2385 kfree(iter); 3058 kfree(iter);
2386 3059
2387 mutex_unlock(&ftrace_regex_lock); 3060 mutex_unlock(&ftrace_regex_lock);
2388 return 0; 3061 return 0;
2389} 3062}
2390 3063
2391static int
2392ftrace_filter_release(struct inode *inode, struct file *file)
2393{
2394 return ftrace_regex_release(inode, file, 1);
2395}
2396
2397static int
2398ftrace_notrace_release(struct inode *inode, struct file *file)
2399{
2400 return ftrace_regex_release(inode, file, 0);
2401}
2402
2403static const struct file_operations ftrace_avail_fops = { 3064static const struct file_operations ftrace_avail_fops = {
2404 .open = ftrace_avail_open, 3065 .open = ftrace_avail_open,
2405 .read = seq_read, 3066 .read = seq_read,
@@ -2407,8 +3068,8 @@ static const struct file_operations ftrace_avail_fops = {
2407 .release = seq_release_private, 3068 .release = seq_release_private,
2408}; 3069};
2409 3070
2410static const struct file_operations ftrace_failures_fops = { 3071static const struct file_operations ftrace_enabled_fops = {
2411 .open = ftrace_failures_open, 3072 .open = ftrace_enabled_open,
2412 .read = seq_read, 3073 .read = seq_read,
2413 .llseek = seq_lseek, 3074 .llseek = seq_lseek,
2414 .release = seq_release_private, 3075 .release = seq_release_private,
@@ -2418,8 +3079,8 @@ static const struct file_operations ftrace_filter_fops = {
2418 .open = ftrace_filter_open, 3079 .open = ftrace_filter_open,
2419 .read = seq_read, 3080 .read = seq_read,
2420 .write = ftrace_filter_write, 3081 .write = ftrace_filter_write,
2421 .llseek = no_llseek, 3082 .llseek = ftrace_regex_lseek,
2422 .release = ftrace_filter_release, 3083 .release = ftrace_regex_release,
2423}; 3084};
2424 3085
2425static const struct file_operations ftrace_notrace_fops = { 3086static const struct file_operations ftrace_notrace_fops = {
@@ -2427,7 +3088,7 @@ static const struct file_operations ftrace_notrace_fops = {
2427 .read = seq_read, 3088 .read = seq_read,
2428 .write = ftrace_notrace_write, 3089 .write = ftrace_notrace_write,
2429 .llseek = ftrace_regex_lseek, 3090 .llseek = ftrace_regex_lseek,
2430 .release = ftrace_notrace_release, 3091 .release = ftrace_regex_release,
2431}; 3092};
2432 3093
2433#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3094#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -2536,9 +3197,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2536 bool exists; 3197 bool exists;
2537 int i; 3198 int i;
2538 3199
2539 if (ftrace_disabled)
2540 return -ENODEV;
2541
2542 /* decode regex */ 3200 /* decode regex */
2543 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 3201 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2544 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) 3202 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
@@ -2547,12 +3205,18 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2547 search_len = strlen(search); 3205 search_len = strlen(search);
2548 3206
2549 mutex_lock(&ftrace_lock); 3207 mutex_lock(&ftrace_lock);
3208
3209 if (unlikely(ftrace_disabled)) {
3210 mutex_unlock(&ftrace_lock);
3211 return -ENODEV;
3212 }
3213
2550 do_for_each_ftrace_rec(pg, rec) { 3214 do_for_each_ftrace_rec(pg, rec) {
2551 3215
2552 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 3216 if (rec->flags & FTRACE_FL_FREE)
2553 continue; 3217 continue;
2554 3218
2555 if (ftrace_match_record(rec, search, search_len, type)) { 3219 if (ftrace_match_record(rec, NULL, search, search_len, type)) {
2556 /* if it is in the array */ 3220 /* if it is in the array */
2557 exists = false; 3221 exists = false;
2558 for (i = 0; i < *idx; i++) { 3222 for (i = 0; i < *idx; i++) {
@@ -2632,6 +3296,7 @@ static const struct file_operations ftrace_graph_fops = {
2632 .read = seq_read, 3296 .read = seq_read,
2633 .write = ftrace_graph_write, 3297 .write = ftrace_graph_write,
2634 .release = ftrace_graph_release, 3298 .release = ftrace_graph_release,
3299 .llseek = seq_lseek,
2635}; 3300};
2636#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3301#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2637 3302
@@ -2641,8 +3306,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2641 trace_create_file("available_filter_functions", 0444, 3306 trace_create_file("available_filter_functions", 0444,
2642 d_tracer, NULL, &ftrace_avail_fops); 3307 d_tracer, NULL, &ftrace_avail_fops);
2643 3308
2644 trace_create_file("failures", 0444, 3309 trace_create_file("enabled_functions", 0444,
2645 d_tracer, NULL, &ftrace_failures_fops); 3310 d_tracer, NULL, &ftrace_enabled_fops);
2646 3311
2647 trace_create_file("set_ftrace_filter", 0644, d_tracer, 3312 trace_create_file("set_ftrace_filter", 0644, d_tracer,
2648 NULL, &ftrace_filter_fops); 3313 NULL, &ftrace_filter_fops);
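
From userspace the same hashes are driven by writing to the tracing debugfs files. A small illustrative program, assuming debugfs is mounted at /sys/kernel/debug and dynamic ftrace is enabled:

/* Illustrative only; error handling kept minimal. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_str(const char *path, const char *s)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return;
	}
	if (write(fd, s, strlen(s)) < 0)
		perror(path);
	close(fd);
}

int main(void)
{
	const char *dir = "/sys/kernel/debug/tracing";
	char path[256];

	/* trace only vfs_read(), plus everything in the ext4 module */
	snprintf(path, sizeof(path), "%s/set_ftrace_filter", dir);
	write_str(path, "vfs_read\n");
	write_str(path, "*:mod:ext4\n");

	/* and never trace anything matching "*spin_lock*" */
	snprintf(path, sizeof(path), "%s/set_ftrace_notrace", dir);
	write_str(path, "*spin_lock*\n");
	return 0;
}
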
@@ -2682,7 +3347,10 @@ static int ftrace_process_locs(struct module *mod,
2682 ftrace_record_ip(addr); 3347 ftrace_record_ip(addr);
2683 } 3348 }
2684 3349
2685 /* disable interrupts to prevent kstop machine */ 3350 /*
3351 * Disable interrupts to prevent interrupts from executing
3352 * code that is being modified.
3353 */
2686 local_irq_save(flags); 3354 local_irq_save(flags);
2687 ftrace_update_code(mod); 3355 ftrace_update_code(mod);
2688 local_irq_restore(flags); 3356 local_irq_restore(flags);
@@ -2697,10 +3365,11 @@ void ftrace_release_mod(struct module *mod)
2697 struct dyn_ftrace *rec; 3365 struct dyn_ftrace *rec;
2698 struct ftrace_page *pg; 3366 struct ftrace_page *pg;
2699 3367
3368 mutex_lock(&ftrace_lock);
3369
2700 if (ftrace_disabled) 3370 if (ftrace_disabled)
2701 return; 3371 goto out_unlock;
2702 3372
2703 mutex_lock(&ftrace_lock);
2704 do_for_each_ftrace_rec(pg, rec) { 3373 do_for_each_ftrace_rec(pg, rec) {
2705 if (within_module_core(rec->ip, mod)) { 3374 if (within_module_core(rec->ip, mod)) {
2706 /* 3375 /*
@@ -2711,6 +3380,7 @@ void ftrace_release_mod(struct module *mod)
2711 ftrace_free_rec(rec); 3380 ftrace_free_rec(rec);
2712 } 3381 }
2713 } while_for_each_ftrace_rec(); 3382 } while_for_each_ftrace_rec();
3383 out_unlock:
2714 mutex_unlock(&ftrace_lock); 3384 mutex_unlock(&ftrace_lock);
2715} 3385}
2716 3386
@@ -2797,6 +3467,10 @@ void __init ftrace_init(void)
2797 3467
2798#else 3468#else
2799 3469
3470static struct ftrace_ops global_ops = {
3471 .func = ftrace_stub,
3472};
3473
2800static int __init ftrace_nodyn_init(void) 3474static int __init ftrace_nodyn_init(void)
2801{ 3475{
2802 ftrace_enabled = 1; 3476 ftrace_enabled = 1;
@@ -2807,12 +3481,47 @@ device_initcall(ftrace_nodyn_init);
2807static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 3481static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
2808static inline void ftrace_startup_enable(int command) { } 3482static inline void ftrace_startup_enable(int command) { }
2809/* Keep as macros so we do not need to define the commands */ 3483/* Keep as macros so we do not need to define the commands */
2810# define ftrace_startup(command) do { } while (0) 3484# define ftrace_startup(ops, command) \
2811# define ftrace_shutdown(command) do { } while (0) 3485 ({ \
3486 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
3487 0; \
3488 })
3489# define ftrace_shutdown(ops, command) do { } while (0)
2812# define ftrace_startup_sysctl() do { } while (0) 3490# define ftrace_startup_sysctl() do { } while (0)
2813# define ftrace_shutdown_sysctl() do { } while (0) 3491# define ftrace_shutdown_sysctl() do { } while (0)
3492
3493static inline int
3494ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
3495{
3496 return 1;
3497}
3498
2814#endif /* CONFIG_DYNAMIC_FTRACE */ 3499#endif /* CONFIG_DYNAMIC_FTRACE */
2815 3500
3501static void
3502ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
3503{
3504 struct ftrace_ops *op;
3505
3506 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
3507 return;
3508
3509 trace_recursion_set(TRACE_INTERNAL_BIT);
3510 /*
3511 * Some of the ops may be dynamically allocated,
3512 * they must be freed after a synchronize_sched().
3513 */
3514 preempt_disable_notrace();
3515 op = rcu_dereference_raw(ftrace_ops_list);
3516 while (op != &ftrace_list_end) {
3517 if (ftrace_ops_test(op, ip))
3518 op->func(ip, parent_ip);
3519 op = rcu_dereference_raw(op->next);
3520 };
3521 preempt_enable_notrace();
3522 trace_recursion_clear(TRACE_INTERNAL_BIT);
3523}
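
ftrace_ops_list_func() may run every registered callback from nearly any context, so callbacks should avoid locks and sleeping. A sketch of a callback that keeps to per-cpu data (hypothetical, not part of this patch):

#include <linux/ftrace.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, demo_hits);

/* per-cpu increment only: safe in interrupt and preempt-disabled context */
static void demo_count_func(unsigned long ip, unsigned long parent_ip)
{
	this_cpu_inc(demo_hits);
}

static struct ftrace_ops demo_counter_ops = {
	.func = demo_count_func,
};
/* paired with register_ftrace_function()/unregister_ftrace_function() */
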
3524
2816static void clear_ftrace_swapper(void) 3525static void clear_ftrace_swapper(void)
2817{ 3526{
2818 struct task_struct *p; 3527 struct task_struct *p;
@@ -3105,19 +3814,23 @@ void ftrace_kill(void)
3105 */ 3814 */
3106int register_ftrace_function(struct ftrace_ops *ops) 3815int register_ftrace_function(struct ftrace_ops *ops)
3107{ 3816{
3108 int ret; 3817 int ret = -1;
3109
3110 if (unlikely(ftrace_disabled))
3111 return -1;
3112 3818
3113 mutex_lock(&ftrace_lock); 3819 mutex_lock(&ftrace_lock);
3114 3820
3821 if (unlikely(ftrace_disabled))
3822 goto out_unlock;
3823
3115 ret = __register_ftrace_function(ops); 3824 ret = __register_ftrace_function(ops);
3116 ftrace_startup(0); 3825 if (!ret)
3826 ret = ftrace_startup(ops, 0);
3117 3827
3828
3829 out_unlock:
3118 mutex_unlock(&ftrace_lock); 3830 mutex_unlock(&ftrace_lock);
3119 return ret; 3831 return ret;
3120} 3832}
3833EXPORT_SYMBOL_GPL(register_ftrace_function);
3121 3834
3122/** 3835/**
3123 * unregister_ftrace_function - unregister a function for profiling. 3836 * unregister_ftrace_function - unregister a function for profiling.
@@ -3131,25 +3844,27 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
3131 3844
3132 mutex_lock(&ftrace_lock); 3845 mutex_lock(&ftrace_lock);
3133 ret = __unregister_ftrace_function(ops); 3846 ret = __unregister_ftrace_function(ops);
3134 ftrace_shutdown(0); 3847 if (!ret)
3848 ftrace_shutdown(ops, 0);
3135 mutex_unlock(&ftrace_lock); 3849 mutex_unlock(&ftrace_lock);
3136 3850
3137 return ret; 3851 return ret;
3138} 3852}
3853EXPORT_SYMBOL_GPL(unregister_ftrace_function);
3139 3854
3140int 3855int
3141ftrace_enable_sysctl(struct ctl_table *table, int write, 3856ftrace_enable_sysctl(struct ctl_table *table, int write,
3142 void __user *buffer, size_t *lenp, 3857 void __user *buffer, size_t *lenp,
3143 loff_t *ppos) 3858 loff_t *ppos)
3144{ 3859{
3145 int ret; 3860 int ret = -ENODEV;
3146
3147 if (unlikely(ftrace_disabled))
3148 return -ENODEV;
3149 3861
3150 mutex_lock(&ftrace_lock); 3862 mutex_lock(&ftrace_lock);
3151 3863
3152 ret = proc_dointvec(table, write, buffer, lenp, ppos); 3864 if (unlikely(ftrace_disabled))
3865 goto out;
3866
3867 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3153 3868
3154 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) 3869 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3155 goto out; 3870 goto out;
@@ -3161,11 +3876,11 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3161 ftrace_startup_sysctl(); 3876 ftrace_startup_sysctl();
3162 3877
3163 /* we are starting ftrace again */ 3878 /* we are starting ftrace again */
3164 if (ftrace_list != &ftrace_list_end) { 3879 if (ftrace_ops_list != &ftrace_list_end) {
3165 if (ftrace_list->next == &ftrace_list_end) 3880 if (ftrace_ops_list->next == &ftrace_list_end)
3166 ftrace_trace_function = ftrace_list->func; 3881 ftrace_trace_function = ftrace_ops_list->func;
3167 else 3882 else
3168 ftrace_trace_function = ftrace_list_func; 3883 ftrace_trace_function = ftrace_ops_list_func;
3169 } 3884 }
3170 3885
3171 } else { 3886 } else {
@@ -3289,7 +4004,7 @@ static int start_graph_tracing(void)
3289 /* The cpu_boot init_task->ret_stack will never be freed */ 4004 /* The cpu_boot init_task->ret_stack will never be freed */
3290 for_each_online_cpu(cpu) { 4005 for_each_online_cpu(cpu) {
3291 if (!idle_task(cpu)->ret_stack) 4006 if (!idle_task(cpu)->ret_stack)
3292 ftrace_graph_init_task(idle_task(cpu)); 4007 ftrace_graph_init_idle_task(idle_task(cpu), cpu);
3293 } 4008 }
3294 4009
3295 do { 4010 do {
@@ -3354,7 +4069,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
3354 ftrace_graph_return = retfunc; 4069 ftrace_graph_return = retfunc;
3355 ftrace_graph_entry = entryfunc; 4070 ftrace_graph_entry = entryfunc;
3356 4071
3357 ftrace_startup(FTRACE_START_FUNC_RET); 4072 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
3358 4073
3359out: 4074out:
3360 mutex_unlock(&ftrace_lock); 4075 mutex_unlock(&ftrace_lock);
@@ -3371,7 +4086,7 @@ void unregister_ftrace_graph(void)
3371 ftrace_graph_active--; 4086 ftrace_graph_active--;
3372 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 4087 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3373 ftrace_graph_entry = ftrace_graph_entry_stub; 4088 ftrace_graph_entry = ftrace_graph_entry_stub;
3374 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 4089 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
3375 unregister_pm_notifier(&ftrace_suspend_notifier); 4090 unregister_pm_notifier(&ftrace_suspend_notifier);
3376 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 4091 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3377 4092
@@ -3379,6 +4094,49 @@ void unregister_ftrace_graph(void)
3379 mutex_unlock(&ftrace_lock); 4094 mutex_unlock(&ftrace_lock);
3380} 4095}
3381 4096
4097static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
4098
4099static void
4100graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
4101{
4102 atomic_set(&t->tracing_graph_pause, 0);
4103 atomic_set(&t->trace_overrun, 0);
4104 t->ftrace_timestamp = 0;
4105 /* make curr_ret_stack visible before we add the ret_stack */
4106 smp_wmb();
4107 t->ret_stack = ret_stack;
4108}
4109
4110/*
4111 * Allocate a return stack for the idle task. May be the first
4112 * time through, or it may be done by CPU hotplug online.
4113 */
4114void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
4115{
4116 t->curr_ret_stack = -1;
4117 /*
4118 * The idle task has no parent, it either has its own
4119 * stack or no stack at all.
4120 */
4121 if (t->ret_stack)
4122 WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
4123
4124 if (ftrace_graph_active) {
4125 struct ftrace_ret_stack *ret_stack;
4126
4127 ret_stack = per_cpu(idle_ret_stack, cpu);
4128 if (!ret_stack) {
4129 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
4130 * sizeof(struct ftrace_ret_stack),
4131 GFP_KERNEL);
4132 if (!ret_stack)
4133 return;
4134 per_cpu(idle_ret_stack, cpu) = ret_stack;
4135 }
4136 graph_init_task(t, ret_stack);
4137 }
4138}
4139
3382/* Allocate a return stack for newly created task */ 4140/* Allocate a return stack for newly created task */
3383void ftrace_graph_init_task(struct task_struct *t) 4141void ftrace_graph_init_task(struct task_struct *t)
3384{ 4142{
@@ -3394,12 +4152,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3394 GFP_KERNEL); 4152 GFP_KERNEL);
3395 if (!ret_stack) 4153 if (!ret_stack)
3396 return; 4154 return;
3397 atomic_set(&t->tracing_graph_pause, 0); 4155 graph_init_task(t, ret_stack);
3398 atomic_set(&t->trace_overrun, 0);
3399 t->ftrace_timestamp = 0;
3400 /* make curr_ret_stack visable before we add the ret_stack */
3401 smp_wmb();
3402 t->ret_stack = ret_stack;
3403 } 4156 }
3404} 4157}
3405 4158
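
The function-graph entry points reworked above are consumed in pairs by register_ftrace_graph(). A minimal client sketch (my_* names hypothetical; prototypes as used by the calls in this file):

#include <linux/ftrace.h>
#include <linux/init.h>

/* return non-zero to trace this call and its matching return */
static int my_graph_entry(struct ftrace_graph_ent *trace)
{
	return trace->depth < 3;	/* only the first few nesting levels */
}

static void my_graph_return(struct ftrace_graph_ret *trace)
{
	/* trace->rettime - trace->calltime is the raw duration */
}

static int __init my_graph_init(void)
{
	return register_ftrace_graph(my_graph_return, my_graph_entry);
}

static void __exit my_graph_exit(void)
{
	unregister_ftrace_graph();
}
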
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index a22582a06161..f55fcf61b223 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,8 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); 16#ifdef EVENT_POWER_TRACING_DEPRECATED
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18#endif
19EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
17 20
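
The cpu_idle tracepoint exported here is emitted around idle entry and exit. A rough sketch of the usual call pattern, assuming the state argument and PWR_EVENT_EXIT sentinel from trace/events/power.h:

#include <linux/smp.h>
#include <trace/events/power.h>

static void demo_idle_loop_once(unsigned int target_state)
{
	trace_cpu_idle(target_state, smp_processor_id());	/* entering idle */

	/* ... architecture-specific low-power wait ... */

	trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());	/* leaving idle */
}
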
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bca96377fd4e..b0c7aa407943 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5,7 +5,6 @@
5 */ 5 */
6#include <linux/ring_buffer.h> 6#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h> 7#include <linux/trace_clock.h>
8#include <linux/ftrace_irq.h>
9#include <linux/spinlock.h> 8#include <linux/spinlock.h>
10#include <linux/debugfs.h> 9#include <linux/debugfs.h>
11#include <linux/uaccess.h> 10#include <linux/uaccess.h>
@@ -224,6 +223,9 @@ enum {
224 RB_LEN_TIME_STAMP = 16, 223 RB_LEN_TIME_STAMP = 16,
225}; 224};
226 225
226#define skip_time_extend(event) \
227 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
228
227static inline int rb_null_event(struct ring_buffer_event *event) 229static inline int rb_null_event(struct ring_buffer_event *event)
228{ 230{
229 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; 231 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -248,8 +250,12 @@ rb_event_data_length(struct ring_buffer_event *event)
248 return length + RB_EVNT_HDR_SIZE; 250 return length + RB_EVNT_HDR_SIZE;
249} 251}
250 252
251/* inline for ring buffer fast paths */ 253/*
252static unsigned 254 * Return the length of the given event. Will return
255 * the length of the time extend if the event is a
256 * time extend.
257 */
258static inline unsigned
253rb_event_length(struct ring_buffer_event *event) 259rb_event_length(struct ring_buffer_event *event)
254{ 260{
255 switch (event->type_len) { 261 switch (event->type_len) {
@@ -274,13 +280,41 @@ rb_event_length(struct ring_buffer_event *event)
274 return 0; 280 return 0;
275} 281}
276 282
283/*
284 * Return total length of time extend and data,
285 * or just the event length for all other events.
286 */
287static inline unsigned
288rb_event_ts_length(struct ring_buffer_event *event)
289{
290 unsigned len = 0;
291
292 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
293 /* time extends include the data event after it */
294 len = RB_LEN_TIME_EXTEND;
295 event = skip_time_extend(event);
296 }
297 return len + rb_event_length(event);
298}
299
277/** 300/**
278 * ring_buffer_event_length - return the length of the event 301 * ring_buffer_event_length - return the length of the event
279 * @event: the event to get the length of 302 * @event: the event to get the length of
303 *
304 * Returns the size of the data load of a data event.
305 * If the event is something other than a data event, it
306 * returns the size of the event itself. With the exception
307 * of a TIME EXTEND, where it still returns the size of the
308 * data load of the data event after it.
280 */ 309 */
281unsigned ring_buffer_event_length(struct ring_buffer_event *event) 310unsigned ring_buffer_event_length(struct ring_buffer_event *event)
282{ 311{
283 unsigned length = rb_event_length(event); 312 unsigned length;
313
314 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
315 event = skip_time_extend(event);
316
317 length = rb_event_length(event);
284 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 318 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
285 return length; 319 return length;
286 length -= RB_EVNT_HDR_SIZE; 320 length -= RB_EVNT_HDR_SIZE;
@@ -294,6 +328,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
294static void * 328static void *
295rb_event_data(struct ring_buffer_event *event) 329rb_event_data(struct ring_buffer_event *event)
296{ 330{
331 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
332 event = skip_time_extend(event);
297 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); 333 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
298 /* If length is in len field, then array[0] has the data */ 334 /* If length is in len field, then array[0] has the data */
299 if (event->type_len) 335 if (event->type_len)
@@ -404,9 +440,6 @@ static inline int test_time_stamp(u64 delta)
404/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ 440/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
405#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) 441#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
406 442
407/* Max number of timestamps that can fit on a page */
408#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
409
410int ring_buffer_print_page_header(struct trace_seq *s) 443int ring_buffer_print_page_header(struct trace_seq *s)
411{ 444{
412 struct buffer_data_page field; 445 struct buffer_data_page field;
@@ -635,7 +668,7 @@ static struct list_head *rb_list_head(struct list_head *list)
635 * the reader page). But if the next page is a header page, 668 * the reader page). But if the next page is a header page,
636 * its flags will be non zero. 669 * its flags will be non zero.
637 */ 670 */
638static int inline 671static inline int
639rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, 672rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
640 struct buffer_page *page, struct list_head *list) 673 struct buffer_page *page, struct list_head *list)
641{ 674{
@@ -1395,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1395} 1428}
1396EXPORT_SYMBOL_GPL(ring_buffer_resize); 1429EXPORT_SYMBOL_GPL(ring_buffer_resize);
1397 1430
1431void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
1432{
1433 mutex_lock(&buffer->mutex);
1434 if (val)
1435 buffer->flags |= RB_FL_OVERWRITE;
1436 else
1437 buffer->flags &= ~RB_FL_OVERWRITE;
1438 mutex_unlock(&buffer->mutex);
1439}
1440EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
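
ring_buffer_change_overwrite() toggles RB_FL_OVERWRITE on an existing buffer (in mainline it is driven from the trace "overwrite" option; treat that as an assumption). A minimal usage sketch:

#include <linux/errno.h>
#include <linux/ring_buffer.h>

static struct ring_buffer *demo_buffer;

static int demo_setup(void)
{
	/* 1 MB per cpu, start in overwrite mode */
	demo_buffer = ring_buffer_alloc(1 << 20, RB_FL_OVERWRITE);
	if (!demo_buffer)
		return -ENOMEM;

	/* later: make writers drop new events instead of overwriting old ones */
	ring_buffer_change_overwrite(demo_buffer, 0);
	return 0;
}
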
1441
1398static inline void * 1442static inline void *
1399__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) 1443__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
1400{ 1444{
@@ -1434,7 +1478,7 @@ static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1434 return local_read(&bpage->entries) & RB_WRITE_MASK; 1478 return local_read(&bpage->entries) & RB_WRITE_MASK;
1435} 1479}
1436 1480
1437/* Size is determined by what has been commited */ 1481/* Size is determined by what has been committed */
1438static inline unsigned rb_page_size(struct buffer_page *bpage) 1482static inline unsigned rb_page_size(struct buffer_page *bpage)
1439{ 1483{
1440 return rb_page_commit(bpage); 1484 return rb_page_commit(bpage);
@@ -1546,6 +1590,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1546 iter->head = 0; 1590 iter->head = 0;
1547} 1591}
1548 1592
1593/* Slow path, do not inline */
1594static noinline struct ring_buffer_event *
1595rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1596{
1597 event->type_len = RINGBUF_TYPE_TIME_EXTEND;
1598
1599 /* Not the first event on the page? */
1600 if (rb_event_index(event)) {
1601 event->time_delta = delta & TS_MASK;
1602 event->array[0] = delta >> TS_SHIFT;
1603 } else {
1604 /* nope, just zero it */
1605 event->time_delta = 0;
1606 event->array[0] = 0;
1607 }
1608
1609 return skip_time_extend(event);
1610}
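
rb_add_time_stamp() packs an oversized delta across the event header: the low bits go into the 27-bit time_delta field and the rest into array[0]. A worked sketch of that split, assuming TS_SHIFT is 27 as elsewhere in this file (plain C, illustrative only):

#include <stdint.h>
#include <stdio.h>

#define DEMO_TS_SHIFT	27
#define DEMO_TS_MASK	((1ULL << DEMO_TS_SHIFT) - 1)

int main(void)
{
	uint64_t delta = 5000000000ULL;		/* ~5s in ns: too big for 27 bits */
	uint32_t time_delta = delta & DEMO_TS_MASK;	/* low 27 bits */
	uint32_t array0 = delta >> DEMO_TS_SHIFT;	/* remaining high bits */
	uint64_t rebuilt = ((uint64_t)array0 << DEMO_TS_SHIFT) | time_delta;

	printf("delta=%llu split into %u/%u, rebuilt=%llu\n",
	       (unsigned long long)delta, time_delta, array0,
	       (unsigned long long)rebuilt);
	return 0;
}
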
1611
1549/** 1612/**
1550 * ring_buffer_update_event - update event type and data 1613 * ring_buffer_update_event - update event type and data
1551 * @event: the even to update 1614 * @event: the even to update
@@ -1558,28 +1621,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1558 * data field. 1621 * data field.
1559 */ 1622 */
1560static void 1623static void
1561rb_update_event(struct ring_buffer_event *event, 1624rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
1562 unsigned type, unsigned length) 1625 struct ring_buffer_event *event, unsigned length,
1626 int add_timestamp, u64 delta)
1563{ 1627{
1564 event->type_len = type; 1628 /* Only a commit updates the timestamp */
1565 1629 if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
1566 switch (type) { 1630 delta = 0;
1567
1568 case RINGBUF_TYPE_PADDING:
1569 case RINGBUF_TYPE_TIME_EXTEND:
1570 case RINGBUF_TYPE_TIME_STAMP:
1571 break;
1572 1631
1573 case 0: 1632 /*
1574 length -= RB_EVNT_HDR_SIZE; 1633 * If we need to add a timestamp, then we
1575 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) 1634 * add it to the start of the reserved space.
1576 event->array[0] = length; 1635 */
1577 else 1636 if (unlikely(add_timestamp)) {
1578 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1637 event = rb_add_time_stamp(event, delta);
1579 break; 1638 length -= RB_LEN_TIME_EXTEND;
1580 default: 1639 delta = 0;
1581 BUG();
1582 } 1640 }
1641
1642 event->time_delta = delta;
1643 length -= RB_EVNT_HDR_SIZE;
1644 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
1645 event->type_len = 0;
1646 event->array[0] = length;
1647 } else
1648 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1583} 1649}
1584 1650
1585/* 1651/*
@@ -1823,10 +1889,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1823 local_sub(length, &tail_page->write); 1889 local_sub(length, &tail_page->write);
1824} 1890}
1825 1891
1826static struct ring_buffer_event * 1892/*
1893 * This is the slow path, force gcc not to inline it.
1894 */
1895static noinline struct ring_buffer_event *
1827rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1896rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1828 unsigned long length, unsigned long tail, 1897 unsigned long length, unsigned long tail,
1829 struct buffer_page *tail_page, u64 *ts) 1898 struct buffer_page *tail_page, u64 ts)
1830{ 1899{
1831 struct buffer_page *commit_page = cpu_buffer->commit_page; 1900 struct buffer_page *commit_page = cpu_buffer->commit_page;
1832 struct ring_buffer *buffer = cpu_buffer->buffer; 1901 struct ring_buffer *buffer = cpu_buffer->buffer;
@@ -1909,8 +1978,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1909 * Nested commits always have zero deltas, so 1978 * Nested commits always have zero deltas, so
1910 * just reread the time stamp 1979 * just reread the time stamp
1911 */ 1980 */
1912 *ts = rb_time_stamp(buffer); 1981 ts = rb_time_stamp(buffer);
1913 next_page->page->time_stamp = *ts; 1982 next_page->page->time_stamp = ts;
1914 } 1983 }
1915 1984
1916 out_again: 1985 out_again:
@@ -1929,12 +1998,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1929 1998
1930static struct ring_buffer_event * 1999static struct ring_buffer_event *
1931__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 2000__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1932 unsigned type, unsigned long length, u64 *ts) 2001 unsigned long length, u64 ts,
2002 u64 delta, int add_timestamp)
1933{ 2003{
1934 struct buffer_page *tail_page; 2004 struct buffer_page *tail_page;
1935 struct ring_buffer_event *event; 2005 struct ring_buffer_event *event;
1936 unsigned long tail, write; 2006 unsigned long tail, write;
1937 2007
2008 /*
2009 * If the time delta since the last event is too big to
2010 * hold in the time field of the event, then we append a
2011 * TIME EXTEND event ahead of the data event.
2012 */
2013 if (unlikely(add_timestamp))
2014 length += RB_LEN_TIME_EXTEND;
2015
1938 tail_page = cpu_buffer->tail_page; 2016 tail_page = cpu_buffer->tail_page;
1939 write = local_add_return(length, &tail_page->write); 2017 write = local_add_return(length, &tail_page->write);
1940 2018
@@ -1943,7 +2021,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1943 tail = write - length; 2021 tail = write - length;
1944 2022
1945 /* See if we shot past the end of this buffer page */ 2023 /* See if we shot past the end of this buffer page */
1946 if (write > BUF_PAGE_SIZE) 2024 if (unlikely(write > BUF_PAGE_SIZE))
1947 return rb_move_tail(cpu_buffer, length, tail, 2025 return rb_move_tail(cpu_buffer, length, tail,
1948 tail_page, ts); 2026 tail_page, ts);
1949 2027
@@ -1951,18 +2029,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1951 2029
1952 event = __rb_page_index(tail_page, tail); 2030 event = __rb_page_index(tail_page, tail);
1953 kmemcheck_annotate_bitfield(event, bitfield); 2031 kmemcheck_annotate_bitfield(event, bitfield);
1954 rb_update_event(event, type, length); 2032 rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
1955 2033
1956 /* The passed in type is zero for DATA */ 2034 local_inc(&tail_page->entries);
1957 if (likely(!type))
1958 local_inc(&tail_page->entries);
1959 2035
1960 /* 2036 /*
1961 * If this is the first commit on the page, then update 2037 * If this is the first commit on the page, then update
1962 * its timestamp. 2038 * its timestamp.
1963 */ 2039 */
1964 if (!tail) 2040 if (!tail)
1965 tail_page->page->time_stamp = *ts; 2041 tail_page->page->time_stamp = ts;
1966 2042
1967 return event; 2043 return event;
1968} 2044}
@@ -1977,7 +2053,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1977 unsigned long addr; 2053 unsigned long addr;
1978 2054
1979 new_index = rb_event_index(event); 2055 new_index = rb_event_index(event);
1980 old_index = new_index + rb_event_length(event); 2056 old_index = new_index + rb_event_ts_length(event);
1981 addr = (unsigned long)event; 2057 addr = (unsigned long)event;
1982 addr &= PAGE_MASK; 2058 addr &= PAGE_MASK;
1983 2059
@@ -2003,76 +2079,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2003 return 0; 2079 return 0;
2004} 2080}
2005 2081
2006static int
2007rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2008 u64 *ts, u64 *delta)
2009{
2010 struct ring_buffer_event *event;
2011 int ret;
2012
2013 WARN_ONCE(*delta > (1ULL << 59),
2014 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2015 (unsigned long long)*delta,
2016 (unsigned long long)*ts,
2017 (unsigned long long)cpu_buffer->write_stamp);
2018
2019 /*
2020 * The delta is too big, we to add a
2021 * new timestamp.
2022 */
2023 event = __rb_reserve_next(cpu_buffer,
2024 RINGBUF_TYPE_TIME_EXTEND,
2025 RB_LEN_TIME_EXTEND,
2026 ts);
2027 if (!event)
2028 return -EBUSY;
2029
2030 if (PTR_ERR(event) == -EAGAIN)
2031 return -EAGAIN;
2032
2033 /* Only a commited time event can update the write stamp */
2034 if (rb_event_is_commit(cpu_buffer, event)) {
2035 /*
2036 * If this is the first on the page, then it was
2037 * updated with the page itself. Try to discard it
2038 * and if we can't just make it zero.
2039 */
2040 if (rb_event_index(event)) {
2041 event->time_delta = *delta & TS_MASK;
2042 event->array[0] = *delta >> TS_SHIFT;
2043 } else {
2044 /* try to discard, since we do not need this */
2045 if (!rb_try_to_discard(cpu_buffer, event)) {
2046 /* nope, just zero it */
2047 event->time_delta = 0;
2048 event->array[0] = 0;
2049 }
2050 }
2051 cpu_buffer->write_stamp = *ts;
2052 /* let the caller know this was the commit */
2053 ret = 1;
2054 } else {
2055 /* Try to discard the event */
2056 if (!rb_try_to_discard(cpu_buffer, event)) {
2057 /* Darn, this is just wasted space */
2058 event->time_delta = 0;
2059 event->array[0] = 0;
2060 }
2061 ret = 0;
2062 }
2063
2064 *delta = 0;
2065
2066 return ret;
2067}
2068
2069static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) 2082static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2070{ 2083{
2071 local_inc(&cpu_buffer->committing); 2084 local_inc(&cpu_buffer->committing);
2072 local_inc(&cpu_buffer->commits); 2085 local_inc(&cpu_buffer->commits);
2073} 2086}
2074 2087
2075static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) 2088static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2076{ 2089{
2077 unsigned long commits; 2090 unsigned long commits;
2078 2091
@@ -2110,9 +2123,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2110 unsigned long length) 2123 unsigned long length)
2111{ 2124{
2112 struct ring_buffer_event *event; 2125 struct ring_buffer_event *event;
2113 u64 ts, delta = 0; 2126 u64 ts, delta;
2114 int commit = 0;
2115 int nr_loops = 0; 2127 int nr_loops = 0;
2128 int add_timestamp;
2129 u64 diff;
2116 2130
2117 rb_start_commit(cpu_buffer); 2131 rb_start_commit(cpu_buffer);
2118 2132
@@ -2133,6 +2147,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2133 2147
2134 length = rb_calculate_event_length(length); 2148 length = rb_calculate_event_length(length);
2135 again: 2149 again:
2150 add_timestamp = 0;
2151 delta = 0;
2152
2136 /* 2153 /*
2137 * We allow for interrupts to reenter here and do a trace. 2154 * We allow for interrupts to reenter here and do a trace.
2138 * If one does, it will cause this original code to loop 2155 * If one does, it will cause this original code to loop
@@ -2146,56 +2163,40 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2146 goto out_fail; 2163 goto out_fail;
2147 2164
2148 ts = rb_time_stamp(cpu_buffer->buffer); 2165 ts = rb_time_stamp(cpu_buffer->buffer);
2166 diff = ts - cpu_buffer->write_stamp;
2149 2167
2150 /* 2168 /* make sure this diff is calculated here */
2151 * Only the first commit can update the timestamp. 2169 barrier();
2152 * Yes there is a race here. If an interrupt comes in
2153 * just after the conditional and it traces too, then it
2154 * will also check the deltas. More than one timestamp may
2155 * also be made. But only the entry that did the actual
2156 * commit will be something other than zero.
2157 */
2158 if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
2159 rb_page_write(cpu_buffer->tail_page) ==
2160 rb_commit_index(cpu_buffer))) {
2161 u64 diff;
2162
2163 diff = ts - cpu_buffer->write_stamp;
2164
2165 /* make sure this diff is calculated here */
2166 barrier();
2167
2168 /* Did the write stamp get updated already? */
2169 if (unlikely(ts < cpu_buffer->write_stamp))
2170 goto get_event;
2171 2170
2171 /* Did the write stamp get updated already? */
2172 if (likely(ts >= cpu_buffer->write_stamp)) {
2172 delta = diff; 2173 delta = diff;
2173 if (unlikely(test_time_stamp(delta))) { 2174 if (unlikely(test_time_stamp(delta))) {
2174 2175 int local_clock_stable = 1;
2175 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 2176#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2176 if (commit == -EBUSY) 2177 local_clock_stable = sched_clock_stable;
2177 goto out_fail; 2178#endif
2178 2179 WARN_ONCE(delta > (1ULL << 59),
2179 if (commit == -EAGAIN) 2180 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
2180 goto again; 2181 (unsigned long long)delta,
2181 2182 (unsigned long long)ts,
2182 RB_WARN_ON(cpu_buffer, commit < 0); 2183 (unsigned long long)cpu_buffer->write_stamp,
2184 local_clock_stable ? "" :
2185 "If you just came from a suspend/resume,\n"
2186 "please switch to the trace global clock:\n"
2187 " echo global > /sys/kernel/debug/tracing/trace_clock\n");
2188 add_timestamp = 1;
2183 } 2189 }
2184 } 2190 }
2185 2191
2186 get_event: 2192 event = __rb_reserve_next(cpu_buffer, length, ts,
2187 event = __rb_reserve_next(cpu_buffer, 0, length, &ts); 2193 delta, add_timestamp);
2188 if (unlikely(PTR_ERR(event) == -EAGAIN)) 2194 if (unlikely(PTR_ERR(event) == -EAGAIN))
2189 goto again; 2195 goto again;
2190 2196
2191 if (!event) 2197 if (!event)
2192 goto out_fail; 2198 goto out_fail;
2193 2199
2194 if (!rb_event_is_commit(cpu_buffer, event))
2195 delta = 0;
2196
2197 event->time_delta = delta;
2198
2199 return event; 2200 return event;
2200 2201
2201 out_fail: 2202 out_fail:
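In the rewritten rb_reserve_next_event above, the delta against write_stamp is computed once up front and a TIME_EXTEND is only requested when that delta no longer fits the event's own field. A small sketch of that decision; test_time_stamp is modelled here as a simple width check and the 27-bit width is an assumption, the real macro lives elsewhere in ring_buffer.c:

#include <stdint.h>
#include <stdio.h>

#define TS_SHIFT 27                              /* assumed width of the in-event delta field */

/* Non-zero when the delta no longer fits the event's own field. */
static int test_time_stamp(uint64_t delta)
{
        return delta >> TS_SHIFT ? 1 : 0;
}

/* Returns 1 when a TIME_EXTEND must accompany the next event. */
static int need_time_extend(uint64_t ts, uint64_t write_stamp, uint64_t *delta)
{
        *delta = 0;
        if (ts < write_stamp)                    /* write_stamp already moved past us */
                return 0;
        *delta = ts - write_stamp;
        return test_time_stamp(*delta);
}

int main(void)
{
        uint64_t delta;

        printf("small gap: %d\n", need_time_extend(1000, 900, &delta));
        printf("huge gap:  %d\n", need_time_extend(1ULL << 40, 0, &delta));
        return 0;
}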
@@ -2207,32 +2208,39 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2207 2208
2208#define TRACE_RECURSIVE_DEPTH 16 2209#define TRACE_RECURSIVE_DEPTH 16
2209 2210
2210static int trace_recursive_lock(void) 2211/* Keep this code out of the fast path cache */
2212static noinline void trace_recursive_fail(void)
2211{ 2213{
2212 current->trace_recursion++;
2213
2214 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2215 return 0;
2216
2217 /* Disable all tracing before we do anything else */ 2214 /* Disable all tracing before we do anything else */
2218 tracing_off_permanent(); 2215 tracing_off_permanent();
2219 2216
2220 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" 2217 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
2221 "HC[%lu]:SC[%lu]:NMI[%lu]\n", 2218 "HC[%lu]:SC[%lu]:NMI[%lu]\n",
2222 current->trace_recursion, 2219 trace_recursion_buffer(),
2223 hardirq_count() >> HARDIRQ_SHIFT, 2220 hardirq_count() >> HARDIRQ_SHIFT,
2224 softirq_count() >> SOFTIRQ_SHIFT, 2221 softirq_count() >> SOFTIRQ_SHIFT,
2225 in_nmi()); 2222 in_nmi());
2226 2223
2227 WARN_ON_ONCE(1); 2224 WARN_ON_ONCE(1);
2225}
2226
2227static inline int trace_recursive_lock(void)
2228{
2229 trace_recursion_inc();
2230
2231 if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
2232 return 0;
2233
2234 trace_recursive_fail();
2235
2228 return -1; 2236 return -1;
2229} 2237}
2230 2238
2231static void trace_recursive_unlock(void) 2239static inline void trace_recursive_unlock(void)
2232{ 2240{
2233 WARN_ON_ONCE(!current->trace_recursion); 2241 WARN_ON_ONCE(!trace_recursion_buffer());
2234 2242
2235 current->trace_recursion--; 2243 trace_recursion_dec();
2236} 2244}
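trace_recursive_lock now keeps only the counter increment and depth test inline and moves the warning path into a separate noinline function, so the common case stays small and cache friendly. The same pattern in stand-alone form; the thread-local counter is a userspace stand-in for current->trace_recursion and the 16-deep limit mirrors TRACE_RECURSIVE_DEPTH above:

#include <stdio.h>

#define RECURSIVE_DEPTH 16

static __thread unsigned long recursion;         /* stand-in for the per-task counter */

/* Slow path: kept out of line so the fast path stays compact. */
static void __attribute__((noinline)) recursive_fail(void)
{
        fprintf(stderr, "recursion too deep: %lu\n", recursion);
}

static inline int recursive_lock(void)
{
        if (++recursion < RECURSIVE_DEPTH)
                return 0;
        recursive_fail();
        return -1;
}

static inline void recursive_unlock(void)
{
        recursion--;
}

int main(void)
{
        if (!recursive_lock()) {
                /* ... traced work would go here ... */
                recursive_unlock();
        }
        return 0;
}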
2237 2245
2238#else 2246#else
@@ -2308,12 +2316,28 @@ static void
2308rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, 2316rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2309 struct ring_buffer_event *event) 2317 struct ring_buffer_event *event)
2310{ 2318{
2319 u64 delta;
2320
2311 /* 2321 /*
2312 * The event first in the commit queue updates the 2322 * The event first in the commit queue updates the
2313 * time stamp. 2323 * time stamp.
2314 */ 2324 */
2315 if (rb_event_is_commit(cpu_buffer, event)) 2325 if (rb_event_is_commit(cpu_buffer, event)) {
2316 cpu_buffer->write_stamp += event->time_delta; 2326 /*
2327 * A commit event that is first on a page
2328 * updates the write timestamp with the page stamp
2329 */
2330 if (!rb_event_index(event))
2331 cpu_buffer->write_stamp =
2332 cpu_buffer->commit_page->page->time_stamp;
2333 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
2334 delta = event->array[0];
2335 delta <<= TS_SHIFT;
2336 delta += event->time_delta;
2337 cpu_buffer->write_stamp += delta;
2338 } else
2339 cpu_buffer->write_stamp += event->time_delta;
2340 }
2317} 2341}
2318 2342
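When the committed event is a TIME_EXTEND, the new rb_update_write_stamp code rebuilds the full delta from array[0] and time_delta before advancing write_stamp. A matching sketch of the reconstruction, again assuming a 27-bit TS_SHIFT; rebuild_delta is an illustrative name:

#include <stdint.h>
#include <stdio.h>

#define TS_SHIFT 27                              /* assumed width of the in-event delta field */

/* Rebuild the delta that a TIME_EXTEND header carried in two pieces. */
static uint64_t rebuild_delta(uint32_t array0, uint32_t time_delta)
{
        uint64_t delta = array0;

        delta <<= TS_SHIFT;
        delta  += time_delta;
        return delta;
}

int main(void)
{
        /* 8 << 27 == 2^30: the example value from the split sketch earlier */
        printf("delta=%llu\n", (unsigned long long)rebuild_delta(8, 0));
        return 0;
}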
2319static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2343static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2353,6 +2377,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
2353 2377
2354static inline void rb_event_discard(struct ring_buffer_event *event) 2378static inline void rb_event_discard(struct ring_buffer_event *event)
2355{ 2379{
2380 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
2381 event = skip_time_extend(event);
2382
2356 /* array[0] holds the actual length for the discarded event */ 2383 /* array[0] holds the actual length for the discarded event */
2357 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; 2384 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
2358 event->type_len = RINGBUF_TYPE_PADDING; 2385 event->type_len = RINGBUF_TYPE_PADDING;
@@ -2606,6 +2633,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2606} 2633}
2607EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); 2634EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2608 2635
2636/*
 2637 * The total number of entries in the ring buffer is the running
 2638 * counter of entries entered into the ring buffer, minus the sum of
 2639 * the entries read from the ring buffer and the number of
 2640 * entries that were overwritten.

2641 */
2642static inline unsigned long
2643rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2644{
2645 return local_read(&cpu_buffer->entries) -
2646 (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
2647}
2648
2609/** 2649/**
2610 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 2650 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2611 * @buffer: The ring buffer 2651 * @buffer: The ring buffer
@@ -2614,16 +2654,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2614unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) 2654unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
2615{ 2655{
2616 struct ring_buffer_per_cpu *cpu_buffer; 2656 struct ring_buffer_per_cpu *cpu_buffer;
2617 unsigned long ret;
2618 2657
2619 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2658 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2620 return 0; 2659 return 0;
2621 2660
2622 cpu_buffer = buffer->buffers[cpu]; 2661 cpu_buffer = buffer->buffers[cpu];
2623 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
2624 - cpu_buffer->read;
2625 2662
2626 return ret; 2663 return rb_num_of_entries(cpu_buffer);
2627} 2664}
2628EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 2665EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
2629 2666
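rb_num_of_entries centralizes the arithmetic that ring_buffer_entries_cpu and ring_buffer_entries used to duplicate: what remains in the buffer is everything ever written minus what was overwritten and what was read, e.g. 1000 writes with 200 overruns and 300 reads leave 1000 - (200 + 300) = 500 entries. A trivially checkable sketch of the same formula:

#include <assert.h>

/* entries remaining = written - (overwritten + read) */
static unsigned long num_of_entries(unsigned long written,
                                    unsigned long overrun,
                                    unsigned long read)
{
        return written - (overrun + read);
}

int main(void)
{
        assert(num_of_entries(1000, 200, 300) == 500);
        return 0;
}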
@@ -2684,8 +2721,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2684 /* if you care about this being correct, lock the buffer */ 2721 /* if you care about this being correct, lock the buffer */
2685 for_each_buffer_cpu(buffer, cpu) { 2722 for_each_buffer_cpu(buffer, cpu) {
2686 cpu_buffer = buffer->buffers[cpu]; 2723 cpu_buffer = buffer->buffers[cpu];
2687 entries += (local_read(&cpu_buffer->entries) - 2724 entries += rb_num_of_entries(cpu_buffer);
2688 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2689 } 2725 }
2690 2726
2691 return entries; 2727 return entries;
@@ -2896,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2896 /* 2932 /*
2897 * cpu_buffer->pages just needs to point to the buffer, it 2933 * cpu_buffer->pages just needs to point to the buffer, it
2898 * has no specific buffer page to point to. Lets move it out 2934 * has no specific buffer page to point to. Lets move it out
2899 * of our way so we don't accidently swap it. 2935 * of our way so we don't accidentally swap it.
2900 */ 2936 */
2901 cpu_buffer->pages = reader->list.prev; 2937 cpu_buffer->pages = reader->list.prev;
2902 2938
@@ -3040,12 +3076,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3040 3076
3041 again: 3077 again:
3042 /* 3078 /*
3043 * We repeat when a timestamp is encountered. It is possible 3079 * We repeat when a time extend is encountered.
3044 * to get multiple timestamps from an interrupt entering just 3080 * Since the time extend is always attached to a data event,
3045 * as one timestamp is about to be written, or from discarded 3081 * we should never loop more than once.
3046 * commits. The most that we can have is the number on a single page. 3082 * (We never hit the following condition more than twice).
3047 */ 3083 */
3048 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3084 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3049 return NULL; 3085 return NULL;
3050 3086
3051 reader = rb_get_reader_page(cpu_buffer); 3087 reader = rb_get_reader_page(cpu_buffer);
@@ -3121,14 +3157,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3121 return NULL; 3157 return NULL;
3122 3158
3123 /* 3159 /*
3124 * We repeat when a timestamp is encountered. 3160 * We repeat when a time extend is encountered.
3125 * We can get multiple timestamps by nested interrupts or also 3161 * Since the time extend is always attached to a data event,
3126 * if filtering is on (discarding commits). Since discarding 3162 * we should never loop more than once.
3127 * commits can be frequent we can get a lot of timestamps. 3163 * (We never hit the following condition more than twice).
3128 * But we limit them by not adding timestamps if they begin
3129 * at the start of a page.
3130 */ 3164 */
3131 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3165 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3132 return NULL; 3166 return NULL;
3133 3167
3134 if (rb_per_cpu_empty(cpu_buffer)) 3168 if (rb_per_cpu_empty(cpu_buffer))
@@ -3826,7 +3860,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3826 if (len > (commit - read)) 3860 if (len > (commit - read))
3827 len = (commit - read); 3861 len = (commit - read);
3828 3862
3829 size = rb_event_length(event); 3863 /* Always keep the time extend and data together */
3864 size = rb_event_ts_length(event);
3830 3865
3831 if (len < size) 3866 if (len < size)
3832 goto out_unlock; 3867 goto out_unlock;
@@ -3836,6 +3871,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3836 3871
3837 /* Need to copy one event at a time */ 3872 /* Need to copy one event at a time */
3838 do { 3873 do {
3874 /* We need the size of one event, because
3875 * rb_advance_reader only advances by one event,
3876 * whereas rb_event_ts_length may include the size of
3877 * one or two events.
3878 * We have already ensured there's enough space if this
3879 * is a time extend. */
3880 size = rb_event_length(event);
3839 memcpy(bpage->data + pos, rpage->data + rpos, size); 3881 memcpy(bpage->data + pos, rpage->data + rpos, size);
3840 3882
3841 len -= size; 3883 len -= size;
@@ -3848,8 +3890,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3848 break; 3890 break;
3849 3891
3850 event = rb_reader_event(cpu_buffer); 3892 event = rb_reader_event(cpu_buffer);
3851 size = rb_event_length(event); 3893 /* Always keep the time extend and data together */
3852 } while (len > size); 3894 size = rb_event_ts_length(event);
3895 } while (len >= size);
3853 3896
3854 /* update bpage */ 3897 /* update bpage */
3855 local_set(&bpage->commit, pos); 3898 local_set(&bpage->commit, pos);
@@ -3965,6 +4008,7 @@ static const struct file_operations rb_simple_fops = {
3965 .open = tracing_open_generic, 4008 .open = tracing_open_generic,
3966 .read = rb_simple_read, 4009 .read = rb_simple_read,
3967 .write = rb_simple_write, 4010 .write = rb_simple_write,
4011 .llseek = default_llseek,
3968}; 4012};
3969 4013
3970 4014
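The ring_buffer_read_page hunks above size the space check with rb_event_ts_length, so a TIME_EXTEND header is never split from the data event attached to it, while the inner copy still advances one plain rb_event_length at a time. A minimal sketch of that sizing rule; the 8-byte RB_LEN_TIME_EXTEND value and the helper name event_ts_length are assumptions for illustration:

#include <assert.h>

#define RB_LEN_TIME_EXTEND 8                     /* assumed header size for the sketch */

/*
 * Size used for the space check: a time extend is always accounted
 * together with the data event attached to it.
 */
static unsigned int event_ts_length(int is_time_extend, unsigned int data_len)
{
        return (is_time_extend ? RB_LEN_TIME_EXTEND : 0) + data_len;
}

int main(void)
{
        assert(event_ts_length(0, 24) == 24);    /* plain data event */
        assert(event_ts_length(1, 24) == 32);    /* extend + data stay together */
        return 0;
}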
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9ec59f541156..ee9c921d7f21 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,7 +17,6 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
21#include <linux/notifier.h> 20#include <linux/notifier.h>
22#include <linux/irqflags.h> 21#include <linux/irqflags.h>
23#include <linux/debugfs.h> 22#include <linux/debugfs.h>
@@ -42,8 +41,6 @@
42#include "trace.h" 41#include "trace.h"
43#include "trace_output.h" 42#include "trace_output.h"
44 43
45#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
46
47/* 44/*
48 * On boot up, the ring buffer is set to the minimum size, so that 45 * On boot up, the ring buffer is set to the minimum size, so that
49 * we do not waste memory on systems that are not using tracing. 46 * we do not waste memory on systems that are not using tracing.
@@ -341,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
341/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
342unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
343 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
344 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
345 342
346static int trace_stop_count; 343static int trace_stop_count;
347static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
@@ -426,6 +423,7 @@ static const char *trace_options[] = {
426 "sleep-time", 423 "sleep-time",
427 "graph-time", 424 "graph-time",
428 "record-cmd", 425 "record-cmd",
426 "overwrite",
429 NULL 427 NULL
430}; 428};
431 429
@@ -781,6 +779,11 @@ __acquires(kernel_lock)
781 tracing_reset_online_cpus(tr); 779 tracing_reset_online_cpus(tr);
782 780
783 current_trace = type; 781 current_trace = type;
782
783 /* If we expanded the buffers, make sure the max is expanded too */
784 if (ring_buffer_expanded && type->use_max_tr)
785 ring_buffer_resize(max_tr.buffer, trace_buf_size);
786
784 /* the test is responsible for initializing and enabling */ 787 /* the test is responsible for initializing and enabling */
785 pr_info("Testing tracer %s: ", type->name); 788 pr_info("Testing tracer %s: ", type->name);
786 ret = type->selftest(type, tr); 789 ret = type->selftest(type, tr);
@@ -793,6 +796,10 @@ __acquires(kernel_lock)
793 /* Only reset on passing, to avoid touching corrupted buffers */ 796 /* Only reset on passing, to avoid touching corrupted buffers */
794 tracing_reset_online_cpus(tr); 797 tracing_reset_online_cpus(tr);
795 798
799 /* Shrink the max buffer again */
800 if (ring_buffer_expanded && type->use_max_tr)
801 ring_buffer_resize(max_tr.buffer, 1);
802
796 printk(KERN_CONT "PASSED\n"); 803 printk(KERN_CONT "PASSED\n");
797 } 804 }
798#endif 805#endif
@@ -1103,7 +1110,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1103 1110
1104 entry->preempt_count = pc & 0xff; 1111 entry->preempt_count = pc & 0xff;
1105 entry->pid = (tsk) ? tsk->pid : 0; 1112 entry->pid = (tsk) ? tsk->pid : 0;
1106 entry->lock_depth = (tsk) ? tsk->lock_depth : 0; 1113 entry->padding = 0;
1107 entry->flags = 1114 entry->flags =
1108#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1115#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1109 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1116 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1284,6 +1291,8 @@ void trace_dump_stack(void)
1284 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); 1291 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
1285} 1292}
1286 1293
1294static DEFINE_PER_CPU(int, user_stack_count);
1295
1287void 1296void
1288ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) 1297ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1289{ 1298{
@@ -1302,10 +1311,20 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1302 if (unlikely(in_nmi())) 1311 if (unlikely(in_nmi()))
1303 return; 1312 return;
1304 1313
1314 /*
1315 * prevent recursion, since the user stack tracing may
1316 * trigger other kernel events.
1317 */
1318 preempt_disable();
1319 if (__this_cpu_read(user_stack_count))
1320 goto out;
1321
1322 __this_cpu_inc(user_stack_count);
1323
1305 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1324 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1306 sizeof(*entry), flags, pc); 1325 sizeof(*entry), flags, pc);
1307 if (!event) 1326 if (!event)
1308 return; 1327 goto out_drop_count;
1309 entry = ring_buffer_event_data(event); 1328 entry = ring_buffer_event_data(event);
1310 1329
1311 entry->tgid = current->tgid; 1330 entry->tgid = current->tgid;
@@ -1319,6 +1338,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1319 save_stack_trace_user(&trace); 1338 save_stack_trace_user(&trace);
1320 if (!filter_check_discard(call, entry, buffer, event)) 1339 if (!filter_check_discard(call, entry, buffer, event))
1321 ring_buffer_unlock_commit(buffer, event); 1340 ring_buffer_unlock_commit(buffer, event);
1341
1342 out_drop_count:
1343 __this_cpu_dec(user_stack_count);
1344 out:
1345 preempt_enable();
1322} 1346}
1323 1347
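ftrace_trace_userstack now protects itself with a per-CPU counter under preempt_disable so that a user stack walk that itself triggers tracing cannot recurse. A userspace analogue of the pattern, using a thread-local flag in place of the per-CPU counter and preemption control; record_user_stack is an illustrative name:

#include <stdio.h>

static __thread int user_stack_count;            /* stand-in for the per-CPU counter */

static void record_user_stack(void)
{
        if (user_stack_count)                    /* already inside: refuse to recurse */
                return;
        user_stack_count++;

        /* ... capture and emit the stack trace here ... */
        puts("captured user stack");

        user_stack_count--;
}

int main(void)
{
        record_user_stack();
        return 0;
}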
1324#ifdef UNUSED 1348#ifdef UNUSED
@@ -1733,10 +1757,9 @@ static void print_lat_help_header(struct seq_file *m)
1733 seq_puts(m, "# | / _----=> need-resched \n"); 1757 seq_puts(m, "# | / _----=> need-resched \n");
1734 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1758 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1735 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1759 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1736 seq_puts(m, "# |||| /_--=> lock-depth \n"); 1760 seq_puts(m, "# |||| / delay \n");
1737 seq_puts(m, "# |||||/ delay \n"); 1761 seq_puts(m, "# cmd pid ||||| time | caller \n");
1738 seq_puts(m, "# cmd pid |||||| time | caller \n"); 1762 seq_puts(m, "# \\ / ||||| \\ | / \n");
1739 seq_puts(m, "# \\ / |||||| \\ | / \n");
1740} 1763}
1741 1764
1742static void print_func_help_header(struct seq_file *m) 1765static void print_func_help_header(struct seq_file *m)
@@ -1991,9 +2014,10 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
1991{ 2014{
1992 enum print_line_t ret; 2015 enum print_line_t ret;
1993 2016
1994 if (iter->lost_events) 2017 if (iter->lost_events &&
1995 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", 2018 !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
1996 iter->cpu, iter->lost_events); 2019 iter->cpu, iter->lost_events))
2020 return TRACE_TYPE_PARTIAL_LINE;
1997 2021
1998 if (iter->trace && iter->trace->print_line) { 2022 if (iter->trace && iter->trace->print_line) {
1999 ret = iter->trace->print_line(iter); 2023 ret = iter->trace->print_line(iter);
@@ -2196,7 +2220,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2196 2220
2197static int tracing_release(struct inode *inode, struct file *file) 2221static int tracing_release(struct inode *inode, struct file *file)
2198{ 2222{
2199 struct seq_file *m = (struct seq_file *)file->private_data; 2223 struct seq_file *m = file->private_data;
2200 struct trace_iterator *iter; 2224 struct trace_iterator *iter;
2201 int cpu; 2225 int cpu;
2202 2226
@@ -2320,11 +2344,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf,
2320 return count; 2344 return count;
2321} 2345}
2322 2346
2347static loff_t tracing_seek(struct file *file, loff_t offset, int origin)
2348{
2349 if (file->f_mode & FMODE_READ)
2350 return seq_lseek(file, offset, origin);
2351 else
2352 return 0;
2353}
2354
2323static const struct file_operations tracing_fops = { 2355static const struct file_operations tracing_fops = {
2324 .open = tracing_open, 2356 .open = tracing_open,
2325 .read = seq_read, 2357 .read = seq_read,
2326 .write = tracing_write_stub, 2358 .write = tracing_write_stub,
2327 .llseek = seq_lseek, 2359 .llseek = tracing_seek,
2328 .release = tracing_release, 2360 .release = tracing_release,
2329}; 2361};
2330 2362
@@ -2505,6 +2537,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2505 2537
2506 if (mask == TRACE_ITER_RECORD_CMD) 2538 if (mask == TRACE_ITER_RECORD_CMD)
2507 trace_event_enable_cmd_record(enabled); 2539 trace_event_enable_cmd_record(enabled);
2540
2541 if (mask == TRACE_ITER_OVERWRITE)
2542 ring_buffer_change_overwrite(global_trace.buffer, enabled);
2508} 2543}
2509 2544
2510static ssize_t 2545static ssize_t
@@ -2686,6 +2721,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2686 2721
2687 mutex_lock(&trace_types_lock); 2722 mutex_lock(&trace_types_lock);
2688 if (tracer_enabled ^ val) { 2723 if (tracer_enabled ^ val) {
2724
2725 /* Only need to warn if this is used to change the state */
2726 WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
2727
2689 if (val) { 2728 if (val) {
2690 tracer_enabled = 1; 2729 tracer_enabled = 1;
2691 if (current_trace->start) 2730 if (current_trace->start)
@@ -3192,6 +3231,14 @@ waitagain:
3192 3231
3193 if (iter->seq.len >= cnt) 3232 if (iter->seq.len >= cnt)
3194 break; 3233 break;
3234
3235 /*
3236 * Setting the full flag means we reached the trace_seq buffer
3237 * size and we should leave by partial output condition above.
3238 * One of the trace_seq_* functions is not used properly.
3239 */
3240 WARN_ONCE(iter->seq.full, "full flag set for trace type %d",
3241 iter->ent->type);
3195 } 3242 }
3196 trace_access_unlock(iter->cpu_file); 3243 trace_access_unlock(iter->cpu_file);
3197 trace_event_read_unlock(); 3244 trace_event_read_unlock();
@@ -3202,7 +3249,7 @@ waitagain:
3202 trace_seq_init(&iter->seq); 3249 trace_seq_init(&iter->seq);
3203 3250
3204 /* 3251 /*
3205 * If there was nothing to send to user, inspite of consuming trace 3252 * If there was nothing to send to user, in spite of consuming trace
3206 * entries, go back to wait for more entries. 3253 * entries, go back to wait for more entries.
3207 */ 3254 */
3208 if (sret == -EBUSY) 3255 if (sret == -EBUSY)
@@ -3996,13 +4043,9 @@ static void tracing_init_debugfs_percpu(long cpu)
3996{ 4043{
3997 struct dentry *d_percpu = tracing_dentry_percpu(); 4044 struct dentry *d_percpu = tracing_dentry_percpu();
3998 struct dentry *d_cpu; 4045 struct dentry *d_cpu;
3999 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ 4046 char cpu_dir[30]; /* 30 characters should be more than enough */
4000 char cpu_dir[7];
4001 4047
4002 if (cpu > 999 || cpu < 0) 4048 snprintf(cpu_dir, 30, "cpu%ld", cpu);
4003 return;
4004
4005 sprintf(cpu_dir, "cpu%ld", cpu);
4006 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 4049 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
4007 if (!d_cpu) { 4050 if (!d_cpu) {
4008 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); 4051 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
@@ -4531,9 +4574,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4531__init static int tracer_alloc_buffers(void) 4574__init static int tracer_alloc_buffers(void)
4532{ 4575{
4533 int ring_buf_size; 4576 int ring_buf_size;
4577 enum ring_buffer_flags rb_flags;
4534 int i; 4578 int i;
4535 int ret = -ENOMEM; 4579 int ret = -ENOMEM;
4536 4580
4581
4537 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) 4582 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
4538 goto out; 4583 goto out;
4539 4584
@@ -4546,12 +4591,13 @@ __init static int tracer_alloc_buffers(void)
4546 else 4591 else
4547 ring_buf_size = 1; 4592 ring_buf_size = 1;
4548 4593
4594 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
4595
4549 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4596 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
4550 cpumask_copy(tracing_cpumask, cpu_all_mask); 4597 cpumask_copy(tracing_cpumask, cpu_all_mask);
4551 4598
4552 /* TODO: make the number of buffers hot pluggable with CPUS */ 4599 /* TODO: make the number of buffers hot pluggable with CPUS */
4553 global_trace.buffer = ring_buffer_alloc(ring_buf_size, 4600 global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
4554 TRACE_BUFFER_FLAGS);
4555 if (!global_trace.buffer) { 4601 if (!global_trace.buffer) {
4556 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 4602 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
4557 WARN_ON(1); 4603 WARN_ON(1);
@@ -4561,7 +4607,7 @@ __init static int tracer_alloc_buffers(void)
4561 4607
4562 4608
4563#ifdef CONFIG_TRACER_MAX_TRACE 4609#ifdef CONFIG_TRACER_MAX_TRACE
4564 max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); 4610 max_tr.buffer = ring_buffer_alloc(1, rb_flags);
4565 if (!max_tr.buffer) { 4611 if (!max_tr.buffer) {
4566 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4612 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
4567 WARN_ON(1); 4613 WARN_ON(1);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d39b3c5454a5..229f8591f61d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,8 +272,8 @@ struct tracer {
272 /* If you handled the flag setting, return 0 */ 272 /* If you handled the flag setting, return 0 */
273 int (*set_flag)(u32 old_flags, u32 bit, int set); 273 int (*set_flag)(u32 old_flags, u32 bit, int set);
274 struct tracer *next; 274 struct tracer *next;
275 int print_max;
276 struct tracer_flags *flags; 275 struct tracer_flags *flags;
276 int print_max;
277 int use_max_tr; 277 int use_max_tr;
278}; 278};
279 279
@@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr,
343 unsigned long ip, 343 unsigned long ip,
344 unsigned long parent_ip, 344 unsigned long parent_ip,
345 unsigned long flags, int pc); 345 unsigned long flags, int pc);
346void trace_graph_function(struct trace_array *tr,
347 unsigned long ip,
348 unsigned long parent_ip,
349 unsigned long flags, int pc);
346void trace_default_header(struct seq_file *m); 350void trace_default_header(struct seq_file *m);
347void print_trace_header(struct seq_file *m, struct trace_iterator *iter); 351void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
348int trace_empty(struct trace_iterator *iter); 352int trace_empty(struct trace_iterator *iter);
@@ -415,6 +419,8 @@ extern void trace_find_cmdline(int pid, char comm[]);
415extern unsigned long ftrace_update_tot_cnt; 419extern unsigned long ftrace_update_tot_cnt;
416#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func 420#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
417extern int DYN_FTRACE_TEST_NAME(void); 421extern int DYN_FTRACE_TEST_NAME(void);
422#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
423extern int DYN_FTRACE_TEST_NAME2(void);
418#endif 424#endif
419 425
420extern int ring_buffer_expanded; 426extern int ring_buffer_expanded;
@@ -602,6 +608,7 @@ enum trace_iterator_flags {
602 TRACE_ITER_SLEEP_TIME = 0x40000, 608 TRACE_ITER_SLEEP_TIME = 0x40000,
603 TRACE_ITER_GRAPH_TIME = 0x80000, 609 TRACE_ITER_GRAPH_TIME = 0x80000,
604 TRACE_ITER_RECORD_CMD = 0x100000, 610 TRACE_ITER_RECORD_CMD = 0x100000,
611 TRACE_ITER_OVERWRITE = 0x200000,
605}; 612};
606 613
607/* 614/*
@@ -657,8 +664,10 @@ struct ftrace_event_field {
657}; 664};
658 665
659struct event_filter { 666struct event_filter {
660 int n_preds; 667 int n_preds; /* Number assigned */
661 struct filter_pred **preds; 668 int a_preds; /* allocated */
669 struct filter_pred *preds;
670 struct filter_pred *root;
662 char *filter_string; 671 char *filter_string;
663}; 672};
664 673
@@ -670,11 +679,23 @@ struct event_subsystem {
670 int nr_events; 679 int nr_events;
671}; 680};
672 681
682#define FILTER_PRED_INVALID ((unsigned short)-1)
683#define FILTER_PRED_IS_RIGHT (1 << 15)
684#define FILTER_PRED_FOLD (1 << 15)
685
686/*
687 * The max preds is the size of unsigned short with
688 * two flags at the MSBs. One bit is used for both the IS_RIGHT
689 * and FOLD flags. The other is reserved.
690 *
691 * 2^14 preds is way more than enough.
692 */
693#define MAX_FILTER_PRED 16384
694
673struct filter_pred; 695struct filter_pred;
674struct regex; 696struct regex;
675 697
676typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, 698typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
677 int val1, int val2);
678 699
679typedef int (*regex_match_func)(char *str, struct regex *r, int len); 700typedef int (*regex_match_func)(char *str, struct regex *r, int len);
680 701
@@ -696,11 +717,23 @@ struct filter_pred {
696 filter_pred_fn_t fn; 717 filter_pred_fn_t fn;
697 u64 val; 718 u64 val;
698 struct regex regex; 719 struct regex regex;
699 char *field_name; 720 /*
721 * Leaf nodes use field_name, ops is used by AND and OR
722 * nodes. The field_name is always freed when freeing a pred.
723 * We can overload field_name for ops and have it freed
724 * as well.
725 */
726 union {
727 char *field_name;
728 unsigned short *ops;
729 };
700 int offset; 730 int offset;
701 int not; 731 int not;
702 int op; 732 int op;
703 int pop_n; 733 unsigned short index;
734 unsigned short parent;
735 unsigned short left;
736 unsigned short right;
704}; 737};
705 738
706extern struct list_head ftrace_common_fields; 739extern struct list_head ftrace_common_fields;
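With the flattened layout above, a predicate's parent, left and right fields are indexes into a single preds[] array, and the top bit of the parent field records whether the node is its parent's right child (FILTER_PRED_IS_RIGHT). A small sketch of packing and unpacking that index/flag pair; the helper names are illustrative:

#include <assert.h>

#define PRED_IS_RIGHT (1 << 15)                  /* mirrors FILTER_PRED_IS_RIGHT */

/* Pack a parent index together with the "I am the right child" flag. */
static unsigned short pack_parent(unsigned short index, int is_right)
{
        return index | (is_right ? PRED_IS_RIGHT : 0);
}

static unsigned short parent_index(unsigned short parent)
{
        return parent & ~PRED_IS_RIGHT;
}

static int came_from_right(unsigned short parent)
{
        return !!(parent & PRED_IS_RIGHT);
}

int main(void)
{
        unsigned short p = pack_parent(42, 1);

        assert(parent_index(p) == 42);
        assert(came_from_right(p));
        return 0;
}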
@@ -751,4 +784,19 @@ extern const char *__stop___trace_bprintk_fmt[];
751 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 784 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
752#include "trace_entries.h" 785#include "trace_entries.h"
753 786
787/* Only current can touch trace_recursion */
788#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
789#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
790
791/* Ring buffer has the 10 LSB bits to count */
792#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
793
794/* for function tracing recursion */
795#define TRACE_INTERNAL_BIT (1<<11)
796#define TRACE_GLOBAL_BIT (1<<12)
797
798#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
799#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
800#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
801
754#endif /* _LINUX_KERNEL_TRACE_H */ 802#endif /* _LINUX_KERNEL_TRACE_H */
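The new trace_recursion helpers carve one per-task word into a counter in the low 10 bits plus individual flag bits above it (TRACE_INTERNAL_BIT, TRACE_GLOBAL_BIT). A stand-alone sketch of that layout, with an ordinary variable standing in for current->trace_recursion:

#include <assert.h>

#define RECURSION_COUNT_MASK 0x3ff               /* low 10 bits: the recursion counter */
#define INTERNAL_BIT (1 << 11)                   /* per-caller flag bits above it */
#define GLOBAL_BIT   (1 << 12)

static unsigned long trace_recursion;            /* stand-in for current->trace_recursion */

static unsigned long recursion_buffer(void)
{
        return trace_recursion & RECURSION_COUNT_MASK;
}

int main(void)
{
        trace_recursion |= GLOBAL_BIT;           /* set a flag */
        trace_recursion++;                       /* bump the counter */

        assert(recursion_buffer() == 1);         /* flag bits do not leak into the count */
        assert(trace_recursion & GLOBAL_BIT);
        return 0;
}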
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 685a67d55db0..6302747a1398 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -46,7 +46,7 @@ u64 notrace trace_clock_local(void)
46} 46}
47 47
48/* 48/*
49 * trace_clock(): 'inbetween' trace clock. Not completely serialized, 49 * trace_clock(): 'between' trace clock. Not completely serialized,
50 * but not completely incorrect when crossing CPUs either. 50 * but not completely incorrect when crossing CPUs either.
51 * 51 *
52 * This is based on cpu_clock(), which will allow at most ~1 jiffy of 52 * This is based on cpu_clock(), which will allow at most ~1 jiffy of
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e3dfecaf13e6..e32744c84d94 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -27,7 +27,7 @@
27 * in the structure. 27 * in the structure.
28 * 28 *
29 * * for structures within structures, the format of the internal 29 * * for structures within structures, the format of the internal
30 * structure is layed out. This allows the internal structure 30 * structure is laid out. This allows the internal structure
31 * to be deciphered for the format file. Although these macros 31 * to be deciphered for the format file. Although these macros
32 * may become out of sync with the internal structure, they 32 * may become out of sync with the internal structure, they
33 * will create a compile error if it happens. Since the 33 * will create a compile error if it happens. Since the
@@ -53,7 +53,7 @@
53 */ 53 */
54 54
55/* 55/*
56 * Function trace entry - function address and parent function addres: 56 * Function trace entry - function address and parent function address:
57 */ 57 */
58FTRACE_ENTRY(function, ftrace_entry, 58FTRACE_ENTRY(function, ftrace_entry,
59 59
@@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
109 */ 109 */
110#define FTRACE_CTX_FIELDS \ 110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \ 111 __field( unsigned int, prev_pid ) \
112 __field( unsigned int, next_pid ) \
113 __field( unsigned int, next_cpu ) \
112 __field( unsigned char, prev_prio ) \ 114 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \ 115 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \ 116 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \ 117 __field( unsigned char, next_state )
117 __field( unsigned int, next_cpu )
118 118
119FTRACE_ENTRY(context_switch, ctx_switch_entry, 119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120 120
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 31cc4cb0dbf2..19a359d5e6d5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,7 +9,7 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12static char *perf_trace_buf[4]; 12static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
13 13
14/* 14/*
15 * Force it to be aligned to unsigned long to avoid misaligned accesses 15 * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
21/* Count the events in use (per event id, not per instance) */ 21/* Count the events in use (per event id, not per instance) */
22static int total_ref_count; 22static int total_ref_count;
23 23
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event)
26{
27 /* No tracing, just counting, so no obvious leak */
28 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
29 return 0;
30
31 /* Some events are ok to be traced by non-root users... */
32 if (p_event->attach_state == PERF_ATTACH_TASK) {
33 if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
34 return 0;
35 }
36
37 /*
38 * ...otherwise raw tracepoint data can be a severe data leak,
39 * only allow root to have these.
40 */
41 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
42 return -EPERM;
43
44 return 0;
45}
46
24static int perf_trace_event_init(struct ftrace_event_call *tp_event, 47static int perf_trace_event_init(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 48 struct perf_event *p_event)
26{ 49{
27 struct hlist_head *list; 50 struct hlist_head __percpu *list;
28 int ret = -ENOMEM; 51 int ret;
29 int cpu; 52 int cpu;
30 53
54 ret = perf_trace_event_perm(tp_event, p_event);
55 if (ret)
56 return ret;
57
31 p_event->tp_event = tp_event; 58 p_event->tp_event = tp_event;
32 if (tp_event->perf_refcount++ > 0) 59 if (tp_event->perf_refcount++ > 0)
33 return 0; 60 return 0;
34 61
62 ret = -ENOMEM;
63
35 list = alloc_percpu(struct hlist_head); 64 list = alloc_percpu(struct hlist_head);
36 if (!list) 65 if (!list)
37 goto fail; 66 goto fail;
@@ -42,11 +71,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
42 tp_event->perf_events = list; 71 tp_event->perf_events = list;
43 72
44 if (!total_ref_count) { 73 if (!total_ref_count) {
45 char *buf; 74 char __percpu *buf;
46 int i; 75 int i;
47 76
48 for (i = 0; i < 4; i++) { 77 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
49 buf = (char *)alloc_percpu(perf_trace_t); 78 buf = (char __percpu *)alloc_percpu(perf_trace_t);
50 if (!buf) 79 if (!buf)
51 goto fail; 80 goto fail;
52 81
@@ -65,7 +94,7 @@ fail:
65 if (!total_ref_count) { 94 if (!total_ref_count) {
66 int i; 95 int i;
67 96
68 for (i = 0; i < 4; i++) { 97 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
69 free_percpu(perf_trace_buf[i]); 98 free_percpu(perf_trace_buf[i]);
70 perf_trace_buf[i] = NULL; 99 perf_trace_buf[i] = NULL;
71 } 100 }
@@ -101,22 +130,26 @@ int perf_trace_init(struct perf_event *p_event)
101 return ret; 130 return ret;
102} 131}
103 132
104int perf_trace_enable(struct perf_event *p_event) 133int perf_trace_add(struct perf_event *p_event, int flags)
105{ 134{
106 struct ftrace_event_call *tp_event = p_event->tp_event; 135 struct ftrace_event_call *tp_event = p_event->tp_event;
136 struct hlist_head __percpu *pcpu_list;
107 struct hlist_head *list; 137 struct hlist_head *list;
108 138
109 list = tp_event->perf_events; 139 pcpu_list = tp_event->perf_events;
110 if (WARN_ON_ONCE(!list)) 140 if (WARN_ON_ONCE(!pcpu_list))
111 return -EINVAL; 141 return -EINVAL;
112 142
113 list = this_cpu_ptr(list); 143 if (!(flags & PERF_EF_START))
144 p_event->hw.state = PERF_HES_STOPPED;
145
146 list = this_cpu_ptr(pcpu_list);
114 hlist_add_head_rcu(&p_event->hlist_entry, list); 147 hlist_add_head_rcu(&p_event->hlist_entry, list);
115 148
116 return 0; 149 return 0;
117} 150}
118 151
119void perf_trace_disable(struct perf_event *p_event) 152void perf_trace_del(struct perf_event *p_event, int flags)
120{ 153{
121 hlist_del_rcu(&p_event->hlist_entry); 154 hlist_del_rcu(&p_event->hlist_entry);
122} 155}
@@ -142,7 +175,7 @@ void perf_trace_destroy(struct perf_event *p_event)
142 tp_event->perf_events = NULL; 175 tp_event->perf_events = NULL;
143 176
144 if (!--total_ref_count) { 177 if (!--total_ref_count) {
145 for (i = 0; i < 4; i++) { 178 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
146 free_percpu(perf_trace_buf[i]); 179 free_percpu(perf_trace_buf[i]);
147 perf_trace_buf[i] = NULL; 180 perf_trace_buf[i] = NULL;
148 } 181 }
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4c758f146328..686ec399f2a8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,6 +27,12 @@
27 27
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30DEFINE_MUTEX(event_storage_mutex);
31EXPORT_SYMBOL_GPL(event_storage_mutex);
32
33char event_storage[EVENT_STORAGE_SIZE];
34EXPORT_SYMBOL_GPL(event_storage);
35
30LIST_HEAD(ftrace_events); 36LIST_HEAD(ftrace_events);
31LIST_HEAD(ftrace_common_fields); 37LIST_HEAD(ftrace_common_fields);
32 38
@@ -110,7 +116,7 @@ static int trace_define_common_fields(void)
110 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
111 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
112 __common_field(int, pid); 118 __common_field(int, pid);
113 __common_field(int, lock_depth); 119 __common_field(int, padding);
114 120
115 return ret; 121 return ret;
116} 122}
@@ -320,6 +326,7 @@ int trace_set_clr_event(const char *system, const char *event, int set)
320{ 326{
321 return __ftrace_set_clr_event(NULL, system, event, set); 327 return __ftrace_set_clr_event(NULL, system, event, set);
322} 328}
329EXPORT_SYMBOL_GPL(trace_set_clr_event);
323 330
324/* 128 should be much more than enough */ 331/* 128 should be much more than enough */
325#define EVENT_BUF_SIZE 127 332#define EVENT_BUF_SIZE 127
@@ -600,21 +607,29 @@ out:
600 607
601enum { 608enum {
602 FORMAT_HEADER = 1, 609 FORMAT_HEADER = 1,
603 FORMAT_PRINTFMT = 2, 610 FORMAT_FIELD_SEPERATOR = 2,
611 FORMAT_PRINTFMT = 3,
604}; 612};
605 613
606static void *f_next(struct seq_file *m, void *v, loff_t *pos) 614static void *f_next(struct seq_file *m, void *v, loff_t *pos)
607{ 615{
608 struct ftrace_event_call *call = m->private; 616 struct ftrace_event_call *call = m->private;
609 struct ftrace_event_field *field; 617 struct ftrace_event_field *field;
610 struct list_head *head; 618 struct list_head *common_head = &ftrace_common_fields;
619 struct list_head *head = trace_get_fields(call);
611 620
612 (*pos)++; 621 (*pos)++;
613 622
614 switch ((unsigned long)v) { 623 switch ((unsigned long)v) {
615 case FORMAT_HEADER: 624 case FORMAT_HEADER:
616 head = &ftrace_common_fields; 625 if (unlikely(list_empty(common_head)))
626 return NULL;
617 627
628 field = list_entry(common_head->prev,
629 struct ftrace_event_field, link);
630 return field;
631
632 case FORMAT_FIELD_SEPERATOR:
618 if (unlikely(list_empty(head))) 633 if (unlikely(list_empty(head)))
619 return NULL; 634 return NULL;
620 635
@@ -626,31 +641,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
626 return NULL; 641 return NULL;
627 } 642 }
628 643
629 head = trace_get_fields(call);
630
631 /*
632 * To separate common fields from event fields, the
633 * LSB is set on the first event field. Clear it in case.
634 */
635 v = (void *)((unsigned long)v & ~1L);
636
637 field = v; 644 field = v;
638 /* 645 if (field->link.prev == common_head)
639 * If this is a common field, and at the end of the list, then 646 return (void *)FORMAT_FIELD_SEPERATOR;
640 * continue with main list. 647 else if (field->link.prev == head)
641 */
642 if (field->link.prev == &ftrace_common_fields) {
643 if (unlikely(list_empty(head)))
644 return NULL;
645 field = list_entry(head->prev, struct ftrace_event_field, link);
646 /* Set the LSB to notify f_show to print an extra newline */
647 field = (struct ftrace_event_field *)
648 ((unsigned long)field | 1);
649 return field;
650 }
651
652 /* If we are done tell f_show to print the format */
653 if (field->link.prev == head)
654 return (void *)FORMAT_PRINTFMT; 648 return (void *)FORMAT_PRINTFMT;
655 649
656 field = list_entry(field->link.prev, struct ftrace_event_field, link); 650 field = list_entry(field->link.prev, struct ftrace_event_field, link);
@@ -688,22 +682,16 @@ static int f_show(struct seq_file *m, void *v)
688 seq_printf(m, "format:\n"); 682 seq_printf(m, "format:\n");
689 return 0; 683 return 0;
690 684
685 case FORMAT_FIELD_SEPERATOR:
686 seq_putc(m, '\n');
687 return 0;
688
691 case FORMAT_PRINTFMT: 689 case FORMAT_PRINTFMT:
692 seq_printf(m, "\nprint fmt: %s\n", 690 seq_printf(m, "\nprint fmt: %s\n",
693 call->print_fmt); 691 call->print_fmt);
694 return 0; 692 return 0;
695 } 693 }
696 694
697 /*
698 * To separate common fields from event fields, the
699 * LSB is set on the first event field. Clear it and
700 * print a newline if it is set.
701 */
702 if ((unsigned long)v & 1) {
703 seq_putc(m, '\n');
704 v = (void *)((unsigned long)v & ~1L);
705 }
706
707 field = v; 695 field = v;
708 696
709 /* 697 /*
@@ -951,6 +939,7 @@ static const struct file_operations ftrace_enable_fops = {
951 .open = tracing_open_generic, 939 .open = tracing_open_generic,
952 .read = event_enable_read, 940 .read = event_enable_read,
953 .write = event_enable_write, 941 .write = event_enable_write,
942 .llseek = default_llseek,
954}; 943};
955 944
956static const struct file_operations ftrace_event_format_fops = { 945static const struct file_operations ftrace_event_format_fops = {
@@ -963,29 +952,34 @@ static const struct file_operations ftrace_event_format_fops = {
963static const struct file_operations ftrace_event_id_fops = { 952static const struct file_operations ftrace_event_id_fops = {
964 .open = tracing_open_generic, 953 .open = tracing_open_generic,
965 .read = event_id_read, 954 .read = event_id_read,
955 .llseek = default_llseek,
966}; 956};
967 957
968static const struct file_operations ftrace_event_filter_fops = { 958static const struct file_operations ftrace_event_filter_fops = {
969 .open = tracing_open_generic, 959 .open = tracing_open_generic,
970 .read = event_filter_read, 960 .read = event_filter_read,
971 .write = event_filter_write, 961 .write = event_filter_write,
962 .llseek = default_llseek,
972}; 963};
973 964
974static const struct file_operations ftrace_subsystem_filter_fops = { 965static const struct file_operations ftrace_subsystem_filter_fops = {
975 .open = tracing_open_generic, 966 .open = tracing_open_generic,
976 .read = subsystem_filter_read, 967 .read = subsystem_filter_read,
977 .write = subsystem_filter_write, 968 .write = subsystem_filter_write,
969 .llseek = default_llseek,
978}; 970};
979 971
980static const struct file_operations ftrace_system_enable_fops = { 972static const struct file_operations ftrace_system_enable_fops = {
981 .open = tracing_open_generic, 973 .open = tracing_open_generic,
982 .read = system_enable_read, 974 .read = system_enable_read,
983 .write = system_enable_write, 975 .write = system_enable_write,
976 .llseek = default_llseek,
984}; 977};
985 978
986static const struct file_operations ftrace_show_header_fops = { 979static const struct file_operations ftrace_show_header_fops = {
987 .open = tracing_open_generic, 980 .open = tracing_open_generic,
988 .read = show_header, 981 .read = show_header,
982 .llseek = default_llseek,
989}; 983};
990 984
991static struct dentry *event_trace_events_dir(void) 985static struct dentry *event_trace_events_dir(void)
@@ -1291,7 +1285,7 @@ trace_create_file_ops(struct module *mod)
1291static void trace_module_add_events(struct module *mod) 1285static void trace_module_add_events(struct module *mod)
1292{ 1286{
1293 struct ftrace_module_file_ops *file_ops = NULL; 1287 struct ftrace_module_file_ops *file_ops = NULL;
1294 struct ftrace_event_call *call, *start, *end; 1288 struct ftrace_event_call **call, **start, **end;
1295 1289
1296 start = mod->trace_events; 1290 start = mod->trace_events;
1297 end = mod->trace_events + mod->num_trace_events; 1291 end = mod->trace_events + mod->num_trace_events;
@@ -1304,7 +1298,7 @@ static void trace_module_add_events(struct module *mod)
1304 return; 1298 return;
1305 1299
1306 for_each_event(call, start, end) { 1300 for_each_event(call, start, end) {
1307 __trace_add_event_call(call, mod, 1301 __trace_add_event_call(*call, mod,
1308 &file_ops->id, &file_ops->enable, 1302 &file_ops->id, &file_ops->enable,
1309 &file_ops->filter, &file_ops->format); 1303 &file_ops->filter, &file_ops->format);
1310 } 1304 }
@@ -1374,8 +1368,8 @@ static struct notifier_block trace_module_nb = {
1374 .priority = 0, 1368 .priority = 0,
1375}; 1369};
1376 1370
1377extern struct ftrace_event_call __start_ftrace_events[]; 1371extern struct ftrace_event_call *__start_ftrace_events[];
1378extern struct ftrace_event_call __stop_ftrace_events[]; 1372extern struct ftrace_event_call *__stop_ftrace_events[];
1379 1373
1380static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; 1374static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1381 1375
@@ -1391,7 +1385,7 @@ __setup("trace_event=", setup_trace_event);
1391 1385
1392static __init int event_trace_init(void) 1386static __init int event_trace_init(void)
1393{ 1387{
1394 struct ftrace_event_call *call; 1388 struct ftrace_event_call **call;
1395 struct dentry *d_tracer; 1389 struct dentry *d_tracer;
1396 struct dentry *entry; 1390 struct dentry *entry;
1397 struct dentry *d_events; 1391 struct dentry *d_events;
@@ -1437,7 +1431,7 @@ static __init int event_trace_init(void)
1437 pr_warning("tracing: Failed to allocate common fields"); 1431 pr_warning("tracing: Failed to allocate common fields");
1438 1432
1439 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { 1433 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
1440 __trace_add_event_call(call, NULL, &ftrace_event_id_fops, 1434 __trace_add_event_call(*call, NULL, &ftrace_event_id_fops,
1441 &ftrace_enable_fops, 1435 &ftrace_enable_fops,
1442 &ftrace_event_filter_fops, 1436 &ftrace_event_filter_fops,
1443 &ftrace_event_format_fops); 1437 &ftrace_event_format_fops);
@@ -1663,7 +1657,12 @@ static struct ftrace_ops trace_ops __initdata =
1663 1657
1664static __init void event_trace_self_test_with_function(void) 1658static __init void event_trace_self_test_with_function(void)
1665{ 1659{
1666 register_ftrace_function(&trace_ops); 1660 int ret;
1661 ret = register_ftrace_function(&trace_ops);
1662 if (WARN_ON(ret < 0)) {
1663 pr_info("Failed to enable function tracer for event tests\n");
1664 return;
1665 }
1667 pr_info("Running tests again, along with the function tracer\n"); 1666 pr_info("Running tests again, along with the function tracer\n");
1668 event_trace_self_tests(); 1667 event_trace_self_tests();
1669 unregister_ftrace_function(&trace_ops); 1668 unregister_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 36d40104b17f..8008ddcfbf20 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -123,9 +123,13 @@ struct filter_parse_state {
123 } operand; 123 } operand;
124}; 124};
125 125
126struct pred_stack {
127 struct filter_pred **preds;
128 int index;
129};
130
126#define DEFINE_COMPARISON_PRED(type) \ 131#define DEFINE_COMPARISON_PRED(type) \
127static int filter_pred_##type(struct filter_pred *pred, void *event, \ 132static int filter_pred_##type(struct filter_pred *pred, void *event) \
128 int val1, int val2) \
129{ \ 133{ \
130 type *addr = (type *)(event + pred->offset); \ 134 type *addr = (type *)(event + pred->offset); \
131 type val = (type)pred->val; \ 135 type val = (type)pred->val; \
@@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \
152} 156}
153 157
154#define DEFINE_EQUALITY_PRED(size) \ 158#define DEFINE_EQUALITY_PRED(size) \
155static int filter_pred_##size(struct filter_pred *pred, void *event, \ 159static int filter_pred_##size(struct filter_pred *pred, void *event) \
156 int val1, int val2) \
157{ \ 160{ \
158 u##size *addr = (u##size *)(event + pred->offset); \ 161 u##size *addr = (u##size *)(event + pred->offset); \
159 u##size val = (u##size)pred->val; \ 162 u##size val = (u##size)pred->val; \
@@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32);
178DEFINE_EQUALITY_PRED(16); 181DEFINE_EQUALITY_PRED(16);
179DEFINE_EQUALITY_PRED(8); 182DEFINE_EQUALITY_PRED(8);
180 183
181static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
182 void *event __attribute((unused)),
183 int val1, int val2)
184{
185 return val1 && val2;
186}
187
188static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
189 void *event __attribute((unused)),
190 int val1, int val2)
191{
192 return val1 || val2;
193}
194
195/* Filter predicate for fixed sized arrays of characters */ 184/* Filter predicate for fixed sized arrays of characters */
196static int filter_pred_string(struct filter_pred *pred, void *event, 185static int filter_pred_string(struct filter_pred *pred, void *event)
197 int val1, int val2)
198{ 186{
199 char *addr = (char *)(event + pred->offset); 187 char *addr = (char *)(event + pred->offset);
200 int cmp, match; 188 int cmp, match;
@@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
207} 195}
208 196
209/* Filter predicate for char * pointers */ 197/* Filter predicate for char * pointers */
210static int filter_pred_pchar(struct filter_pred *pred, void *event, 198static int filter_pred_pchar(struct filter_pred *pred, void *event)
211 int val1, int val2)
212{ 199{
213 char **addr = (char **)(event + pred->offset); 200 char **addr = (char **)(event + pred->offset);
214 int cmp, match; 201 int cmp, match;
@@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
231 * and add it to the address of the entry, and at last we have 218 * and add it to the address of the entry, and at last we have
232 * the address of the string. 219 * the address of the string.
233 */ 220 */
234static int filter_pred_strloc(struct filter_pred *pred, void *event, 221static int filter_pred_strloc(struct filter_pred *pred, void *event)
235 int val1, int val2)
236{ 222{
237 u32 str_item = *(u32 *)(event + pred->offset); 223 u32 str_item = *(u32 *)(event + pred->offset);
238 int str_loc = str_item & 0xffff; 224 int str_loc = str_item & 0xffff;
@@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
247 return match; 233 return match;
248} 234}
249 235
250static int filter_pred_none(struct filter_pred *pred, void *event, 236static int filter_pred_none(struct filter_pred *pred, void *event)
251 int val1, int val2)
252{ 237{
253 return 0; 238 return 0;
254} 239}
@@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred)
377 pred->not ^= not; 362 pred->not ^= not;
378} 363}
379 364
365enum move_type {
366 MOVE_DOWN,
367 MOVE_UP_FROM_LEFT,
368 MOVE_UP_FROM_RIGHT
369};
370
371static struct filter_pred *
372get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
373 int index, enum move_type *move)
374{
375 if (pred->parent & FILTER_PRED_IS_RIGHT)
376 *move = MOVE_UP_FROM_RIGHT;
377 else
378 *move = MOVE_UP_FROM_LEFT;
379 pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
380
381 return pred;
382}
383
384/*
385 * A series of ANDs or ORs was found together. Instead of
386 * climbing up and down the tree branches, an array of the
387 * ops was made in the order of the checks. We can just move across
388 * the array and short circuit if needed.
389 */
390static int process_ops(struct filter_pred *preds,
391 struct filter_pred *op, void *rec)
392{
393 struct filter_pred *pred;
394 int match = 0;
395 int type;
396 int i;
397
398 /*
399 * Micro-optimization: We set type to true if op
400 * is an OR and false otherwise (AND). Then we
401 * just need to test if the match is equal to
402 * the type, and if it is, we can short circuit the
403 * rest of the checks:
404 *
405 * if ((match && op->op == OP_OR) ||
406 * (!match && op->op == OP_AND))
407 * return match;
408 */
409 type = op->op == OP_OR;
410
411 for (i = 0; i < op->val; i++) {
412 pred = &preds[op->ops[i]];
413 match = pred->fn(pred, rec);
414 if (!!match == type)
415 return match;
416 }
417 return match;
418}
419
380/* return 1 if event matches, 0 otherwise (discard) */ 420/* return 1 if event matches, 0 otherwise (discard) */
381int filter_match_preds(struct event_filter *filter, void *rec) 421int filter_match_preds(struct event_filter *filter, void *rec)
382{ 422{
383 int match, top = 0, val1 = 0, val2 = 0; 423 int match = -1;
384 int stack[MAX_FILTER_PRED]; 424 enum move_type move = MOVE_DOWN;
425 struct filter_pred *preds;
385 struct filter_pred *pred; 426 struct filter_pred *pred;
386 int i; 427 struct filter_pred *root;
428 int n_preds;
429 int done = 0;
430
431 /* no filter is considered a match */
432 if (!filter)
433 return 1;
434
435 n_preds = filter->n_preds;
436
437 if (!n_preds)
438 return 1;
439
440 /*
441	 * n_preds, root and filter->preds are protected by keeping preemption disabled.
442 */
443 preds = rcu_dereference_sched(filter->preds);
444 root = rcu_dereference_sched(filter->root);
445 if (!root)
446 return 1;
447
448 pred = root;
387 449
388 for (i = 0; i < filter->n_preds; i++) { 450 /* match is currently meaningless */
389 pred = filter->preds[i]; 451 match = -1;
390 if (!pred->pop_n) { 452
391 match = pred->fn(pred, rec, val1, val2); 453 do {
392 stack[top++] = match; 454 switch (move) {
455 case MOVE_DOWN:
456 /* only AND and OR have children */
457 if (pred->left != FILTER_PRED_INVALID) {
458 /* If ops is set, then it was folded. */
459 if (!pred->ops) {
460				/* keep going down the left side */
461 pred = &preds[pred->left];
462 continue;
463 }
464 /* We can treat folded ops as a leaf node */
465 match = process_ops(preds, pred, rec);
466 } else
467 match = pred->fn(pred, rec);
468 /* If this pred is the only pred */
469 if (pred == root)
470 break;
471 pred = get_pred_parent(pred, preds,
472 pred->parent, &move);
473 continue;
474 case MOVE_UP_FROM_LEFT:
475 /*
476 * Check for short circuits.
477 *
478 * Optimization: !!match == (pred->op == OP_OR)
479 * is the same as:
480 * if ((match && pred->op == OP_OR) ||
481 * (!match && pred->op == OP_AND))
482 */
483 if (!!match == (pred->op == OP_OR)) {
484 if (pred == root)
485 break;
486 pred = get_pred_parent(pred, preds,
487 pred->parent, &move);
488 continue;
489 }
490 /* now go down the right side of the tree. */
491 pred = &preds[pred->right];
492 move = MOVE_DOWN;
493 continue;
494 case MOVE_UP_FROM_RIGHT:
495 /* We finished this equation. */
496 if (pred == root)
497 break;
498 pred = get_pred_parent(pred, preds,
499 pred->parent, &move);
393 continue; 500 continue;
394 } 501 }
395 if (pred->pop_n > top) { 502 done = 1;
396 WARN_ON_ONCE(1); 503 } while (!done);
397 return 0;
398 }
399 val1 = stack[--top];
400 val2 = stack[--top];
401 match = pred->fn(pred, rec, val1, val2);
402 stack[top++] = match;
403 }
404 504
405 return stack[--top]; 505 return match;
406} 506}
407EXPORT_SYMBOL_GPL(filter_match_preds); 507EXPORT_SYMBOL_GPL(filter_match_preds);
408 508
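
filter_match_preds() now evaluates the expression as an explicit binary tree: the old postfix evaluation stack (stack[], val1/val2, pop_n) is gone, replaced by walking down left children, evaluating leaves, and climbing back up through the parent index stored in each node, short-circuiting an OR on the first true result and an AND on the first false one, exactly as the comments above describe. Here is a condensed user-space model of that walk with a simplified node layout; this is not the kernel's struct filter_pred, and the FILTER_PRED_IS_RIGHT bit is replaced by an explicit is_right field.

    #include <stdio.h>

    enum { OP_LEAF, OP_AND, OP_OR };
    enum move { MOVE_DOWN, MOVE_UP_FROM_LEFT, MOVE_UP_FROM_RIGHT };

    #define INVALID -1

    struct node {
            int op;         /* OP_LEAF, OP_AND or OP_OR */
            int value;      /* leaf result (stand-in for pred->fn(pred, rec)) */
            int left, right, parent;
            int is_right;   /* am I my parent's right child? */
    };

    /* Iterative evaluation: no recursion, no value stack, just parent links. */
    static int eval(struct node *n, int root)
    {
            enum move move = MOVE_DOWN;
            int cur = root;
            int match = 0;

            for (;;) {
                    switch (move) {
                    case MOVE_DOWN:
                            if (n[cur].left != INVALID) {
                                    cur = n[cur].left;
                                    continue;
                            }
                            match = n[cur].value;   /* leaf */
                            break;
                    case MOVE_UP_FROM_LEFT:
                            /* Short circuit: a true OR or a false AND is decided. */
                            if (!!match == (n[cur].op == OP_OR))
                                    break;
                            cur = n[cur].right;
                            move = MOVE_DOWN;
                            continue;
                    case MOVE_UP_FROM_RIGHT:
                            break;  /* this sub-expression is finished */
                    }
                    if (cur == root)
                            return match;
                    move = n[cur].is_right ? MOVE_UP_FROM_RIGHT : MOVE_UP_FROM_LEFT;
                    cur = n[cur].parent;
            }
    }

    int main(void)
    {
            /* (0 || 1) && 1  ->  node 0: AND, node 1: OR, nodes 2-4: leaves */
            struct node n[] = {
                    { OP_AND,  0, 1, 4, INVALID, 0 },
                    { OP_OR,   0, 2, 3, 0, 0 },
                    { OP_LEAF, 0, INVALID, INVALID, 1, 0 },
                    { OP_LEAF, 1, INVALID, INVALID, 1, 1 },
                    { OP_LEAF, 1, INVALID, INVALID, 0, 1 },
            };

            printf("match = %d\n", eval(n, 0));     /* prints 1 */
            return 0;
    }

The example evaluates (0 || 1) && 1; the right leaf of the OR is only visited because the left one was false.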
@@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos)
414 514
415static void remove_filter_string(struct event_filter *filter) 515static void remove_filter_string(struct event_filter *filter)
416{ 516{
517 if (!filter)
518 return;
519
417 kfree(filter->filter_string); 520 kfree(filter->filter_string);
418 filter->filter_string = NULL; 521 filter->filter_string = NULL;
419} 522}
@@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps,
473 576
474void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) 577void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
475{ 578{
476 struct event_filter *filter = call->filter; 579 struct event_filter *filter;
477 580
478 mutex_lock(&event_mutex); 581 mutex_lock(&event_mutex);
582 filter = call->filter;
479 if (filter && filter->filter_string) 583 if (filter && filter->filter_string)
480 trace_seq_printf(s, "%s\n", filter->filter_string); 584 trace_seq_printf(s, "%s\n", filter->filter_string);
481 else 585 else
@@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
486void print_subsystem_event_filter(struct event_subsystem *system, 590void print_subsystem_event_filter(struct event_subsystem *system,
487 struct trace_seq *s) 591 struct trace_seq *s)
488{ 592{
489 struct event_filter *filter = system->filter; 593 struct event_filter *filter;
490 594
491 mutex_lock(&event_mutex); 595 mutex_lock(&event_mutex);
596 filter = system->filter;
492 if (filter && filter->filter_string) 597 if (filter && filter->filter_string)
493 trace_seq_printf(s, "%s\n", filter->filter_string); 598 trace_seq_printf(s, "%s\n", filter->filter_string);
494 else 599 else
@@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred)
539 pred->regex.len = 0; 644 pred->regex.len = 0;
540} 645}
541 646
542static int filter_set_pred(struct filter_pred *dest, 647static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
648{
649 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
650 if (!stack->preds)
651 return -ENOMEM;
652 stack->index = n_preds;
653 return 0;
654}
655
656static void __free_pred_stack(struct pred_stack *stack)
657{
658 kfree(stack->preds);
659 stack->index = 0;
660}
661
662static int __push_pred_stack(struct pred_stack *stack,
663 struct filter_pred *pred)
664{
665 int index = stack->index;
666
667 if (WARN_ON(index == 0))
668 return -ENOSPC;
669
670 stack->preds[--index] = pred;
671 stack->index = index;
672 return 0;
673}
674
675static struct filter_pred *
676__pop_pred_stack(struct pred_stack *stack)
677{
678 struct filter_pred *pred;
679 int index = stack->index;
680
681 pred = stack->preds[index++];
682 if (!pred)
683 return NULL;
684
685 stack->index = index;
686 return pred;
687}
688
689static int filter_set_pred(struct event_filter *filter,
690 int idx,
691 struct pred_stack *stack,
543 struct filter_pred *src, 692 struct filter_pred *src,
544 filter_pred_fn_t fn) 693 filter_pred_fn_t fn)
545{ 694{
695 struct filter_pred *dest = &filter->preds[idx];
696 struct filter_pred *left;
697 struct filter_pred *right;
698
546 *dest = *src; 699 *dest = *src;
547 if (src->field_name) { 700 if (src->field_name) {
548 dest->field_name = kstrdup(src->field_name, GFP_KERNEL); 701 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
@@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest,
550 return -ENOMEM; 703 return -ENOMEM;
551 } 704 }
552 dest->fn = fn; 705 dest->fn = fn;
706 dest->index = idx;
553 707
554 return 0; 708 if (dest->op == OP_OR || dest->op == OP_AND) {
709 right = __pop_pred_stack(stack);
710 left = __pop_pred_stack(stack);
711 if (!left || !right)
712 return -EINVAL;
713 /*
714 * If both children can be folded
715 * and they are the same op as this op or a leaf,
716 * then this op can be folded.
717 */
718 if (left->index & FILTER_PRED_FOLD &&
719 (left->op == dest->op ||
720 left->left == FILTER_PRED_INVALID) &&
721 right->index & FILTER_PRED_FOLD &&
722 (right->op == dest->op ||
723 right->left == FILTER_PRED_INVALID))
724 dest->index |= FILTER_PRED_FOLD;
725
726 dest->left = left->index & ~FILTER_PRED_FOLD;
727 dest->right = right->index & ~FILTER_PRED_FOLD;
728 left->parent = dest->index & ~FILTER_PRED_FOLD;
729 right->parent = dest->index | FILTER_PRED_IS_RIGHT;
730 } else {
731 /*
732 * Make dest->left invalid to be used as a quick
733 * way to know this is a leaf node.
734 */
735 dest->left = FILTER_PRED_INVALID;
736
737 /* All leafs allow folding the parent ops. */
738 dest->index |= FILTER_PRED_FOLD;
739 }
740
741 return __push_pred_stack(stack, dest);
555} 742}
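
filter_set_pred() above is the step that turns the parser's postfix list into that tree: leaves are pushed on the pred stack, and every AND/OR pops its two children, wires up left/right/parent (with FILTER_PRED_IS_RIGHT marking the right child), and is pushed back, so the single item left at the end is the root. Below is a minimal user-space sketch of the same shunting step, assuming a toy postfix string where 'L' stands for any leaf predicate; the types and names are illustrative, not the kernel's.

    #include <stdio.h>

    #define INVALID   -1
    #define MAX_NODES 16

    struct node {
            char op;        /* '&', '|' or 'L' for a leaf */
            int left, right, parent, is_right;
    };

    /* Build a tree from a postfix stream, the way filter_set_pred() does
     * with __push_pred_stack()/__pop_pred_stack(). Returns the root index. */
    static int build(const char *postfix, struct node *n)
    {
            int stack[MAX_NODES], top = 0, idx = 0;

            for (; *postfix; postfix++, idx++) {
                    n[idx].op = *postfix;
                    n[idx].left = n[idx].right = INVALID;
                    n[idx].parent = INVALID;
                    n[idx].is_right = 0;

                    if (*postfix == '&' || *postfix == '|') {
                            int right = stack[--top];
                            int left  = stack[--top];

                            n[idx].left = left;
                            n[idx].right = right;
                            n[left].parent = idx;
                            n[right].parent = idx;
                            n[right].is_right = 1;
                    }
                    stack[top++] = idx;     /* push leaf or freshly linked op */
            }
            return stack[--top];            /* one item left: the root */
    }

    int main(void)
    {
            struct node n[MAX_NODES];
            /* "a b | c &"  ==  (a || b) && c */
            int root = build("LL|L&", n);

            printf("root=%d op=%c left=%d right=%d\n",
                   root, n[root].op, n[root].left, n[root].right);
            return 0;
    }

For "LL|L&", that is (a || b) && c, it reports the '&' node as root with the '|' node on its left and the last leaf on its right.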
556 743
557static void filter_disable_preds(struct ftrace_event_call *call) 744static void __free_preds(struct event_filter *filter)
558{ 745{
559 struct event_filter *filter = call->filter;
560 int i; 746 int i;
561 747
562 call->flags &= ~TRACE_EVENT_FL_FILTERED; 748 if (filter->preds) {
749 for (i = 0; i < filter->a_preds; i++)
750 kfree(filter->preds[i].field_name);
751 kfree(filter->preds);
752 filter->preds = NULL;
753 }
754 filter->a_preds = 0;
563 filter->n_preds = 0; 755 filter->n_preds = 0;
564
565 for (i = 0; i < MAX_FILTER_PRED; i++)
566 filter->preds[i]->fn = filter_pred_none;
567} 756}
568 757
569static void __free_preds(struct event_filter *filter) 758static void filter_disable(struct ftrace_event_call *call)
570{ 759{
571 int i; 760 call->flags &= ~TRACE_EVENT_FL_FILTERED;
761}
572 762
763static void __free_filter(struct event_filter *filter)
764{
573 if (!filter) 765 if (!filter)
574 return; 766 return;
575 767
576 for (i = 0; i < MAX_FILTER_PRED; i++) { 768 __free_preds(filter);
577 if (filter->preds[i])
578 filter_free_pred(filter->preds[i]);
579 }
580 kfree(filter->preds);
581 kfree(filter->filter_string); 769 kfree(filter->filter_string);
582 kfree(filter); 770 kfree(filter);
583} 771}
584 772
773/*
774 * Called when destroying the ftrace_event_call.
775 * The call is being freed, so we do not need to worry about
776 * the call being currently used. This is for module code removing
777 * the tracepoints from within it.
778 */
585void destroy_preds(struct ftrace_event_call *call) 779void destroy_preds(struct ftrace_event_call *call)
586{ 780{
587 __free_preds(call->filter); 781 __free_filter(call->filter);
588 call->filter = NULL; 782 call->filter = NULL;
589 call->flags &= ~TRACE_EVENT_FL_FILTERED;
590} 783}
591 784
592static struct event_filter *__alloc_preds(void) 785static struct event_filter *__alloc_filter(void)
593{ 786{
594 struct event_filter *filter; 787 struct event_filter *filter;
788
789 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
790 return filter;
791}
792
793static int __alloc_preds(struct event_filter *filter, int n_preds)
794{
595 struct filter_pred *pred; 795 struct filter_pred *pred;
596 int i; 796 int i;
597 797
598 filter = kzalloc(sizeof(*filter), GFP_KERNEL); 798 if (filter->preds)
599 if (!filter) 799 __free_preds(filter);
600 return ERR_PTR(-ENOMEM);
601 800
602 filter->n_preds = 0; 801 filter->preds =
802 kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
603 803
604 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
605 if (!filter->preds) 804 if (!filter->preds)
606 goto oom; 805 return -ENOMEM;
607 806
608 for (i = 0; i < MAX_FILTER_PRED; i++) { 807 filter->a_preds = n_preds;
609 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 808 filter->n_preds = 0;
610 if (!pred) 809
611 goto oom; 810 for (i = 0; i < n_preds; i++) {
811 pred = &filter->preds[i];
612 pred->fn = filter_pred_none; 812 pred->fn = filter_pred_none;
613 filter->preds[i] = pred;
614 } 813 }
615 814
616 return filter;
617
618oom:
619 __free_preds(filter);
620 return ERR_PTR(-ENOMEM);
621}
622
623static int init_preds(struct ftrace_event_call *call)
624{
625 if (call->filter)
626 return 0;
627
628 call->flags &= ~TRACE_EVENT_FL_FILTERED;
629 call->filter = __alloc_preds();
630 if (IS_ERR(call->filter))
631 return PTR_ERR(call->filter);
632
633 return 0; 815 return 0;
634} 816}
635 817
636static int init_subsystem_preds(struct event_subsystem *system) 818static void filter_free_subsystem_preds(struct event_subsystem *system)
637{ 819{
638 struct ftrace_event_call *call; 820 struct ftrace_event_call *call;
639 int err;
640 821
641 list_for_each_entry(call, &ftrace_events, list) { 822 list_for_each_entry(call, &ftrace_events, list) {
642 if (strcmp(call->class->system, system->name) != 0) 823 if (strcmp(call->class->system, system->name) != 0)
643 continue; 824 continue;
644 825
645 err = init_preds(call); 826 filter_disable(call);
646 if (err) 827 remove_filter_string(call->filter);
647 return err;
648 } 828 }
649
650 return 0;
651} 829}
652 830
653static void filter_free_subsystem_preds(struct event_subsystem *system) 831static void filter_free_subsystem_filters(struct event_subsystem *system)
654{ 832{
655 struct ftrace_event_call *call; 833 struct ftrace_event_call *call;
656 834
657 list_for_each_entry(call, &ftrace_events, list) { 835 list_for_each_entry(call, &ftrace_events, list) {
658 if (strcmp(call->class->system, system->name) != 0) 836 if (strcmp(call->class->system, system->name) != 0)
659 continue; 837 continue;
660 838 __free_filter(call->filter);
661 filter_disable_preds(call); 839 call->filter = NULL;
662 remove_filter_string(call->filter);
663 } 840 }
664} 841}
665 842
@@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
667 struct ftrace_event_call *call, 844 struct ftrace_event_call *call,
668 struct event_filter *filter, 845 struct event_filter *filter,
669 struct filter_pred *pred, 846 struct filter_pred *pred,
847 struct pred_stack *stack,
670 filter_pred_fn_t fn) 848 filter_pred_fn_t fn)
671{ 849{
672 int idx, err; 850 int idx, err;
673 851
674 if (filter->n_preds == MAX_FILTER_PRED) { 852 if (WARN_ON(filter->n_preds == filter->a_preds)) {
675 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 853 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
676 return -ENOSPC; 854 return -ENOSPC;
677 } 855 }
678 856
679 idx = filter->n_preds; 857 idx = filter->n_preds;
680 filter_clear_pred(filter->preds[idx]); 858 filter_clear_pred(&filter->preds[idx]);
681 err = filter_set_pred(filter->preds[idx], pred, fn); 859 err = filter_set_pred(filter, idx, stack, pred, fn);
682 if (err) 860 if (err)
683 return err; 861 return err;
684 862
@@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
763 struct ftrace_event_call *call, 941 struct ftrace_event_call *call,
764 struct event_filter *filter, 942 struct event_filter *filter,
765 struct filter_pred *pred, 943 struct filter_pred *pred,
944 struct pred_stack *stack,
766 bool dry_run) 945 bool dry_run)
767{ 946{
768 struct ftrace_event_field *field; 947 struct ftrace_event_field *field;
@@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
770 unsigned long long val; 949 unsigned long long val;
771 int ret; 950 int ret;
772 951
773 pred->fn = filter_pred_none; 952 fn = pred->fn = filter_pred_none;
774 953
775 if (pred->op == OP_AND) { 954 if (pred->op == OP_AND)
776 pred->pop_n = 2;
777 fn = filter_pred_and;
778 goto add_pred_fn; 955 goto add_pred_fn;
779 } else if (pred->op == OP_OR) { 956 else if (pred->op == OP_OR)
780 pred->pop_n = 2;
781 fn = filter_pred_or;
782 goto add_pred_fn; 957 goto add_pred_fn;
783 }
784 958
785 field = find_event_field(call, pred->field_name); 959 field = find_event_field(call, pred->field_name);
786 if (!field) { 960 if (!field) {
@@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
829 1003
830add_pred_fn: 1004add_pred_fn:
831 if (!dry_run) 1005 if (!dry_run)
832 return filter_add_pred_fn(ps, call, filter, pred, fn); 1006 return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
833 return 0; 1007 return 0;
834} 1008}
835 1009
@@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps)
1187 return 0; 1361 return 0;
1188} 1362}
1189 1363
1364static int count_preds(struct filter_parse_state *ps)
1365{
1366 struct postfix_elt *elt;
1367 int n_preds = 0;
1368
1369 list_for_each_entry(elt, &ps->postfix, list) {
1370 if (elt->op == OP_NONE)
1371 continue;
1372 n_preds++;
1373 }
1374
1375 return n_preds;
1376}
1377
1378/*
1379 * The tree is walked when filtering an event. If the tree is not correctly
1380 * built, it may cause an infinite loop. Check here that the tree does
1381 * indeed terminate.
1382 */
1383static int check_pred_tree(struct event_filter *filter,
1384 struct filter_pred *root)
1385{
1386 struct filter_pred *preds;
1387 struct filter_pred *pred;
1388 enum move_type move = MOVE_DOWN;
1389 int count = 0;
1390 int done = 0;
1391 int max;
1392
1393 /*
1394 * The max that we can hit a node is three times.
1395 * Once going down, once coming up from left, and
1396 * once coming up from right. This is more than enough
1397 * since leafs are only hit a single time.
1398 */
1399 max = 3 * filter->n_preds;
1400
1401 preds = filter->preds;
1402 if (!preds)
1403 return -EINVAL;
1404 pred = root;
1405
1406 do {
1407 if (WARN_ON(count++ > max))
1408 return -EINVAL;
1409
1410 switch (move) {
1411 case MOVE_DOWN:
1412 if (pred->left != FILTER_PRED_INVALID) {
1413 pred = &preds[pred->left];
1414 continue;
1415 }
1416 /* A leaf at the root is just a leaf in the tree */
1417 if (pred == root)
1418 break;
1419 pred = get_pred_parent(pred, preds,
1420 pred->parent, &move);
1421 continue;
1422 case MOVE_UP_FROM_LEFT:
1423 pred = &preds[pred->right];
1424 move = MOVE_DOWN;
1425 continue;
1426 case MOVE_UP_FROM_RIGHT:
1427 if (pred == root)
1428 break;
1429 pred = get_pred_parent(pred, preds,
1430 pred->parent, &move);
1431 continue;
1432 }
1433 done = 1;
1434 } while (!done);
1435
1436 /* We are fine. */
1437 return 0;
1438}
1439
1440static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1441{
1442 struct filter_pred *pred;
1443 enum move_type move = MOVE_DOWN;
1444 int count = 0;
1445 int done = 0;
1446
1447 pred = root;
1448
1449 do {
1450 switch (move) {
1451 case MOVE_DOWN:
1452 if (pred->left != FILTER_PRED_INVALID) {
1453 pred = &preds[pred->left];
1454 continue;
1455 }
1456 /* A leaf at the root is just a leaf in the tree */
1457 if (pred == root)
1458 return 1;
1459 count++;
1460 pred = get_pred_parent(pred, preds,
1461 pred->parent, &move);
1462 continue;
1463 case MOVE_UP_FROM_LEFT:
1464 pred = &preds[pred->right];
1465 move = MOVE_DOWN;
1466 continue;
1467 case MOVE_UP_FROM_RIGHT:
1468 if (pred == root)
1469 break;
1470 pred = get_pred_parent(pred, preds,
1471 pred->parent, &move);
1472 continue;
1473 }
1474 done = 1;
1475 } while (!done);
1476
1477 return count;
1478}
1479
1480static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1481{
1482 struct filter_pred *pred;
1483 enum move_type move = MOVE_DOWN;
1484 int count = 0;
1485 int children;
1486 int done = 0;
1487
1488 /* No need to keep the fold flag */
1489 root->index &= ~FILTER_PRED_FOLD;
1490
1491 /* If the root is a leaf then do nothing */
1492 if (root->left == FILTER_PRED_INVALID)
1493 return 0;
1494
1495 /* count the children */
1496 children = count_leafs(preds, &preds[root->left]);
1497 children += count_leafs(preds, &preds[root->right]);
1498
1499 root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL);
1500 if (!root->ops)
1501 return -ENOMEM;
1502
1503 root->val = children;
1504
1505 pred = root;
1506 do {
1507 switch (move) {
1508 case MOVE_DOWN:
1509 if (pred->left != FILTER_PRED_INVALID) {
1510 pred = &preds[pred->left];
1511 continue;
1512 }
1513 if (WARN_ON(count == children))
1514 return -EINVAL;
1515 pred->index &= ~FILTER_PRED_FOLD;
1516 root->ops[count++] = pred->index;
1517 pred = get_pred_parent(pred, preds,
1518 pred->parent, &move);
1519 continue;
1520 case MOVE_UP_FROM_LEFT:
1521 pred = &preds[pred->right];
1522 move = MOVE_DOWN;
1523 continue;
1524 case MOVE_UP_FROM_RIGHT:
1525 if (pred == root)
1526 break;
1527 pred = get_pred_parent(pred, preds,
1528 pred->parent, &move);
1529 continue;
1530 }
1531 done = 1;
1532 } while (!done);
1533
1534 return 0;
1535}
1536
1537/*
1538 * To optimize the processing of the ops, if we have several "ors" or
1539 * "ands" together, we can put them in an array and process them all
1540 * together, speeding up the filter logic.
1541 */
1542static int fold_pred_tree(struct event_filter *filter,
1543 struct filter_pred *root)
1544{
1545 struct filter_pred *preds;
1546 struct filter_pred *pred;
1547 enum move_type move = MOVE_DOWN;
1548 int done = 0;
1549 int err;
1550
1551 preds = filter->preds;
1552 if (!preds)
1553 return -EINVAL;
1554 pred = root;
1555
1556 do {
1557 switch (move) {
1558 case MOVE_DOWN:
1559 if (pred->index & FILTER_PRED_FOLD) {
1560 err = fold_pred(preds, pred);
1561 if (err)
1562 return err;
1563 /* Folded nodes are like leafs */
1564 } else if (pred->left != FILTER_PRED_INVALID) {
1565 pred = &preds[pred->left];
1566 continue;
1567 }
1568
1569 /* A leaf at the root is just a leaf in the tree */
1570 if (pred == root)
1571 break;
1572 pred = get_pred_parent(pred, preds,
1573 pred->parent, &move);
1574 continue;
1575 case MOVE_UP_FROM_LEFT:
1576 pred = &preds[pred->right];
1577 move = MOVE_DOWN;
1578 continue;
1579 case MOVE_UP_FROM_RIGHT:
1580 if (pred == root)
1581 break;
1582 pred = get_pred_parent(pred, preds,
1583 pred->parent, &move);
1584 continue;
1585 }
1586 done = 1;
1587 } while (!done);
1588
1589 return 0;
1590}
1591
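
The folding pass above is easier to see on a concrete filter; the filter strings below are illustrative, not taken from this commit. For pid == 1 || pid == 2 || pid == 3 the postfix build produces a left-leaning chain of ORs. Every node ends up with FILTER_PRED_FOLD set, because each child is either a leaf or an OR like its parent, so fold_pred() collects all three leaves into the top OR's ops[] array (val = 3) and filter_match_preds() handles the whole expression with one process_ops() scan that stops at the first true result:

                OR   (root, ops[] = { pid==1, pid==2, pid==3 })
               /  \
             OR    pid == 3
            /  \
       pid == 1  pid == 2

For (pid == 1 || pid == 2) && prio > 5, the OR still folds its two leaves, but the AND root does not, since its left child is a different op and not a leaf; the match walk then treats the folded OR as a single leaf and falls back to the ordinary tree walk for the AND.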
1190static int replace_preds(struct ftrace_event_call *call, 1592static int replace_preds(struct ftrace_event_call *call,
1191 struct event_filter *filter, 1593 struct event_filter *filter,
1192 struct filter_parse_state *ps, 1594 struct filter_parse_state *ps,
@@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call,
1195{ 1597{
1196 char *operand1 = NULL, *operand2 = NULL; 1598 char *operand1 = NULL, *operand2 = NULL;
1197 struct filter_pred *pred; 1599 struct filter_pred *pred;
1600 struct filter_pred *root;
1198 struct postfix_elt *elt; 1601 struct postfix_elt *elt;
1602 struct pred_stack stack = { }; /* init to NULL */
1199 int err; 1603 int err;
1200 int n_preds = 0; 1604 int n_preds = 0;
1201 1605
1606 n_preds = count_preds(ps);
1607 if (n_preds >= MAX_FILTER_PRED) {
1608 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1609 return -ENOSPC;
1610 }
1611
1202 err = check_preds(ps); 1612 err = check_preds(ps);
1203 if (err) 1613 if (err)
1204 return err; 1614 return err;
1205 1615
1616 if (!dry_run) {
1617 err = __alloc_pred_stack(&stack, n_preds);
1618 if (err)
1619 return err;
1620 err = __alloc_preds(filter, n_preds);
1621 if (err)
1622 goto fail;
1623 }
1624
1625 n_preds = 0;
1206 list_for_each_entry(elt, &ps->postfix, list) { 1626 list_for_each_entry(elt, &ps->postfix, list) {
1207 if (elt->op == OP_NONE) { 1627 if (elt->op == OP_NONE) {
1208 if (!operand1) 1628 if (!operand1)
@@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call,
1211 operand2 = elt->operand; 1631 operand2 = elt->operand;
1212 else { 1632 else {
1213 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); 1633 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
1214 return -EINVAL; 1634 err = -EINVAL;
1635 goto fail;
1215 } 1636 }
1216 continue; 1637 continue;
1217 } 1638 }
1218 1639
1219 if (n_preds++ == MAX_FILTER_PRED) { 1640 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
1220 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 1641 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1221 return -ENOSPC; 1642 err = -ENOSPC;
1643 goto fail;
1222 } 1644 }
1223 1645
1224 if (elt->op == OP_AND || elt->op == OP_OR) { 1646 if (elt->op == OP_AND || elt->op == OP_OR) {
@@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call,
1228 1650
1229 if (!operand1 || !operand2) { 1651 if (!operand1 || !operand2) {
1230 parse_error(ps, FILT_ERR_MISSING_FIELD, 0); 1652 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1231 return -EINVAL; 1653 err = -EINVAL;
1654 goto fail;
1232 } 1655 }
1233 1656
1234 pred = create_pred(elt->op, operand1, operand2); 1657 pred = create_pred(elt->op, operand1, operand2);
1235add_pred: 1658add_pred:
1236 if (!pred) 1659 if (!pred) {
1237 return -ENOMEM; 1660 err = -ENOMEM;
1238 err = filter_add_pred(ps, call, filter, pred, dry_run); 1661 goto fail;
1662 }
1663 err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
1239 filter_free_pred(pred); 1664 filter_free_pred(pred);
1240 if (err) 1665 if (err)
1241 return err; 1666 goto fail;
1242 1667
1243 operand1 = operand2 = NULL; 1668 operand1 = operand2 = NULL;
1244 } 1669 }
1245 1670
1246 return 0; 1671 if (!dry_run) {
1672 /* We should have one item left on the stack */
1673 pred = __pop_pred_stack(&stack);
1674 if (!pred)
1675 return -EINVAL;
1676 /* This item is where we start from in matching */
1677 root = pred;
1678 /* Make sure the stack is empty */
1679 pred = __pop_pred_stack(&stack);
1680 if (WARN_ON(pred)) {
1681 err = -EINVAL;
1682 filter->root = NULL;
1683 goto fail;
1684 }
1685 err = check_pred_tree(filter, root);
1686 if (err)
1687 goto fail;
1688
1689 /* Optimize the tree */
1690 err = fold_pred_tree(filter, root);
1691 if (err)
1692 goto fail;
1693
1694 /* We don't set root until we know it works */
1695 barrier();
1696 filter->root = root;
1697 }
1698
1699 err = 0;
1700fail:
1701 __free_pred_stack(&stack);
1702 return err;
1247} 1703}
1248 1704
1705struct filter_list {
1706 struct list_head list;
1707 struct event_filter *filter;
1708};
1709
1249static int replace_system_preds(struct event_subsystem *system, 1710static int replace_system_preds(struct event_subsystem *system,
1250 struct filter_parse_state *ps, 1711 struct filter_parse_state *ps,
1251 char *filter_string) 1712 char *filter_string)
1252{ 1713{
1253 struct ftrace_event_call *call; 1714 struct ftrace_event_call *call;
1715 struct filter_list *filter_item;
1716 struct filter_list *tmp;
1717 LIST_HEAD(filter_list);
1254 bool fail = true; 1718 bool fail = true;
1255 int err; 1719 int err;
1256 1720
1257 list_for_each_entry(call, &ftrace_events, list) { 1721 list_for_each_entry(call, &ftrace_events, list) {
1258 struct event_filter *filter = call->filter;
1259 1722
1260 if (strcmp(call->class->system, system->name) != 0) 1723 if (strcmp(call->class->system, system->name) != 0)
1261 continue; 1724 continue;
1262 1725
1263 /* try to see if the filter can be applied */ 1726 /*
1264 err = replace_preds(call, filter, ps, filter_string, true); 1727 * Try to see if the filter can be applied
1728 * (filter arg is ignored on dry_run)
1729 */
1730 err = replace_preds(call, NULL, ps, filter_string, true);
1265 if (err) 1731 if (err)
1732 goto fail;
1733 }
1734
1735 list_for_each_entry(call, &ftrace_events, list) {
1736 struct event_filter *filter;
1737
1738 if (strcmp(call->class->system, system->name) != 0)
1266 continue; 1739 continue;
1267 1740
1268 /* really apply the filter */ 1741 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
1269 filter_disable_preds(call); 1742 if (!filter_item)
1270 err = replace_preds(call, filter, ps, filter_string, false); 1743 goto fail_mem;
1744
1745 list_add_tail(&filter_item->list, &filter_list);
1746
1747 filter_item->filter = __alloc_filter();
1748 if (!filter_item->filter)
1749 goto fail_mem;
1750 filter = filter_item->filter;
1751
1752 /* Can only fail on no memory */
1753 err = replace_filter_string(filter, filter_string);
1271 if (err) 1754 if (err)
1272 filter_disable_preds(call); 1755 goto fail_mem;
1273 else { 1756
1757 err = replace_preds(call, filter, ps, filter_string, false);
1758 if (err) {
1759 filter_disable(call);
1760 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1761 append_filter_err(ps, filter);
1762 } else
1274 call->flags |= TRACE_EVENT_FL_FILTERED; 1763 call->flags |= TRACE_EVENT_FL_FILTERED;
1275 replace_filter_string(filter, filter_string); 1764 /*
1276 } 1765 * Regardless of if this returned an error, we still
1766 * replace the filter for the call.
1767 */
1768 filter = call->filter;
1769 call->filter = filter_item->filter;
1770 filter_item->filter = filter;
1771
1277 fail = false; 1772 fail = false;
1278 } 1773 }
1279 1774
1280 if (fail) { 1775 if (fail)
1281 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1776 goto fail;
1282 return -EINVAL; 1777
1778 /*
1779 * The calls can still be using the old filters.
1780 * Do a synchronize_sched() to ensure all calls are
1781 * done with them before we free them.
1782 */
1783 synchronize_sched();
1784 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1785 __free_filter(filter_item->filter);
1786 list_del(&filter_item->list);
1787 kfree(filter_item);
1283 } 1788 }
1284 return 0; 1789 return 0;
1790 fail:
1791 /* No call succeeded */
1792 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1793 list_del(&filter_item->list);
1794 kfree(filter_item);
1795 }
1796 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1797 return -EINVAL;
1798 fail_mem:
1799 /* If any call succeeded, we still need to sync */
1800 if (!fail)
1801 synchronize_sched();
1802 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1803 __free_filter(filter_item->filter);
1804 list_del(&filter_item->list);
1805 kfree(filter_item);
1806 }
1807 return -ENOMEM;
1285} 1808}
1286 1809
1287int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1810int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1288{ 1811{
1289 int err;
1290 struct filter_parse_state *ps; 1812 struct filter_parse_state *ps;
1813 struct event_filter *filter;
1814 struct event_filter *tmp;
1815 int err = 0;
1291 1816
1292 mutex_lock(&event_mutex); 1817 mutex_lock(&event_mutex);
1293 1818
1294 err = init_preds(call);
1295 if (err)
1296 goto out_unlock;
1297
1298 if (!strcmp(strstrip(filter_string), "0")) { 1819 if (!strcmp(strstrip(filter_string), "0")) {
1299 filter_disable_preds(call); 1820 filter_disable(call);
1300 remove_filter_string(call->filter); 1821 filter = call->filter;
1822 if (!filter)
1823 goto out_unlock;
1824 call->filter = NULL;
1825 /* Make sure the filter is not being used */
1826 synchronize_sched();
1827 __free_filter(filter);
1301 goto out_unlock; 1828 goto out_unlock;
1302 } 1829 }
1303 1830
@@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1306 if (!ps) 1833 if (!ps)
1307 goto out_unlock; 1834 goto out_unlock;
1308 1835
1309 filter_disable_preds(call); 1836 filter = __alloc_filter();
1310 replace_filter_string(call->filter, filter_string); 1837 if (!filter) {
1838 kfree(ps);
1839 goto out_unlock;
1840 }
1841
1842 replace_filter_string(filter, filter_string);
1311 1843
1312 parse_init(ps, filter_ops, filter_string); 1844 parse_init(ps, filter_ops, filter_string);
1313 err = filter_parse(ps); 1845 err = filter_parse(ps);
1314 if (err) { 1846 if (err) {
1315 append_filter_err(ps, call->filter); 1847 append_filter_err(ps, filter);
1316 goto out; 1848 goto out;
1317 } 1849 }
1318 1850
1319 err = replace_preds(call, call->filter, ps, filter_string, false); 1851 err = replace_preds(call, filter, ps, filter_string, false);
1320 if (err) 1852 if (err) {
1321 append_filter_err(ps, call->filter); 1853 filter_disable(call);
1322 else 1854 append_filter_err(ps, filter);
1855 } else
1323 call->flags |= TRACE_EVENT_FL_FILTERED; 1856 call->flags |= TRACE_EVENT_FL_FILTERED;
1324out: 1857out:
1858 /*
1859 * Always swap the call filter with the new filter
1860 * even if there was an error. If there was an error
1861 * in the filter, we disable the filter and show the error
1862 * string
1863 */
1864 tmp = call->filter;
1865 call->filter = filter;
1866 if (tmp) {
1867 /* Make sure the call is done with the filter */
1868 synchronize_sched();
1869 __free_filter(tmp);
1870 }
1325 filter_opstack_clear(ps); 1871 filter_opstack_clear(ps);
1326 postfix_clear(ps); 1872 postfix_clear(ps);
1327 kfree(ps); 1873 kfree(ps);
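
apply_event_filter() above always builds a complete new event_filter, swaps it into call->filter, and only frees the previous one after synchronize_sched(), because filter_match_preds() reads the filter under nothing stronger than disabled preemption (see the rcu_dereference_sched() comment earlier in this file). A user-space sketch of that replace-then-reclaim pattern follows; the types are stand-ins and wait_for_readers() stands in for synchronize_sched().

    #include <stdlib.h>
    #include <stdio.h>

    /* Simplified stand-ins: in the kernel the reader is filter_match_preds()
     * running with preemption disabled, and wait_for_readers() is
     * synchronize_sched(). */
    struct event_filter { char *filter_string; };
    struct ftrace_event_call { struct event_filter *filter; };

    static void wait_for_readers(void)
    {
            /* stand-in for synchronize_sched(): all preempt-disabled
             * sections that could still see the old pointer have ended */
    }

    static void filter_free(struct event_filter *filter)
    {
            if (!filter)
                    return;
            free(filter->filter_string);
            free(filter);
    }

    /* Publish @new on @call and reclaim whatever was there before. */
    static void install_filter(struct ftrace_event_call *call,
                               struct event_filter *new)
    {
            struct event_filter *old = call->filter;

            call->filter = new;             /* readers now pick up the new filter */
            if (old) {
                    wait_for_readers();     /* let in-flight matches finish */
                    filter_free(old);       /* only then free the old one */
            }
    }

    int main(void)
    {
            struct ftrace_event_call call = { .filter = NULL };
            struct event_filter *f = calloc(1, sizeof(*f));

            install_filter(&call, f);       /* first install: nothing to free */
            install_filter(&call, NULL);    /* "0" case: drop the filter */
            printf("filter=%p\n", (void *)call.filter);
            return 0;
    }

The same ordering shows up in replace_system_preds() above, where the old per-call filters are collected on a list and freed in one batch after a single grace period.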
@@ -1334,18 +1880,21 @@ out_unlock:
1334int apply_subsystem_event_filter(struct event_subsystem *system, 1880int apply_subsystem_event_filter(struct event_subsystem *system,
1335 char *filter_string) 1881 char *filter_string)
1336{ 1882{
1337 int err;
1338 struct filter_parse_state *ps; 1883 struct filter_parse_state *ps;
1884 struct event_filter *filter;
1885 int err = 0;
1339 1886
1340 mutex_lock(&event_mutex); 1887 mutex_lock(&event_mutex);
1341 1888
1342 err = init_subsystem_preds(system);
1343 if (err)
1344 goto out_unlock;
1345
1346 if (!strcmp(strstrip(filter_string), "0")) { 1889 if (!strcmp(strstrip(filter_string), "0")) {
1347 filter_free_subsystem_preds(system); 1890 filter_free_subsystem_preds(system);
1348 remove_filter_string(system->filter); 1891 remove_filter_string(system->filter);
1892 filter = system->filter;
1893 system->filter = NULL;
1894 /* Ensure all filters are no longer used */
1895 synchronize_sched();
1896 filter_free_subsystem_filters(system);
1897 __free_filter(filter);
1349 goto out_unlock; 1898 goto out_unlock;
1350 } 1899 }
1351 1900
@@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1354 if (!ps) 1903 if (!ps)
1355 goto out_unlock; 1904 goto out_unlock;
1356 1905
1357 replace_filter_string(system->filter, filter_string); 1906 filter = __alloc_filter();
1907 if (!filter)
1908 goto out;
1909
1910 replace_filter_string(filter, filter_string);
1911 /*
1912	 * No event actually uses the system filter,
1913	 * so we can free it without synchronize_sched().
1914 */
1915 __free_filter(system->filter);
1916 system->filter = filter;
1358 1917
1359 parse_init(ps, filter_ops, filter_string); 1918 parse_init(ps, filter_ops, filter_string);
1360 err = filter_parse(ps); 1919 err = filter_parse(ps);
@@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event)
1384 struct event_filter *filter = event->filter; 1943 struct event_filter *filter = event->filter;
1385 1944
1386 event->filter = NULL; 1945 event->filter = NULL;
1387 __free_preds(filter); 1946 __free_filter(filter);
1388} 1947}
1389 1948
1390int ftrace_profile_set_filter(struct perf_event *event, int event_id, 1949int ftrace_profile_set_filter(struct perf_event *event, int event_id,
@@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1410 if (event->filter) 1969 if (event->filter)
1411 goto out_unlock; 1970 goto out_unlock;
1412 1971
1413 filter = __alloc_preds(); 1972 filter = __alloc_filter();
1414 if (IS_ERR(filter)) { 1973 if (!filter) {
1415 err = PTR_ERR(filter); 1974 err = PTR_ERR(filter);
1416 goto out_unlock; 1975 goto out_unlock;
1417 } 1976 }
@@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1419 err = -ENOMEM; 1978 err = -ENOMEM;
1420 ps = kzalloc(sizeof(*ps), GFP_KERNEL); 1979 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1421 if (!ps) 1980 if (!ps)
1422 goto free_preds; 1981 goto free_filter;
1423 1982
1424 parse_init(ps, filter_ops, filter_str); 1983 parse_init(ps, filter_ops, filter_str);
1425 err = filter_parse(ps); 1984 err = filter_parse(ps);
@@ -1435,9 +1994,9 @@ free_ps:
1435 postfix_clear(ps); 1994 postfix_clear(ps);
1436 kfree(ps); 1995 kfree(ps);
1437 1996
1438free_preds: 1997free_filter:
1439 if (err) 1998 if (err)
1440 __free_preds(filter); 1999 __free_filter(filter);
1441 2000
1442out_unlock: 2001out_unlock:
1443 mutex_unlock(&event_mutex); 2002 mutex_unlock(&event_mutex);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4ba44deaac25..bbeec31e0ae3 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void) \
83 83
84#undef __array 84#undef __array
85#define __array(type, item, len) \ 85#define __array(type, item, len) \
86 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 86 do { \
87 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 87 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
88 mutex_lock(&event_storage_mutex); \
89 snprintf(event_storage, sizeof(event_storage), \
90 "%s[%d]", #type, len); \
91 ret = trace_define_field(event_call, event_storage, #item, \
88 offsetof(typeof(field), item), \ 92 offsetof(typeof(field), item), \
89 sizeof(field.item), \ 93 sizeof(field.item), \
90 is_signed_type(type), FILTER_OTHER); \ 94 is_signed_type(type), FILTER_OTHER); \
91 if (ret) \ 95 mutex_unlock(&event_storage_mutex); \
92 return ret; 96 if (ret) \
97 return ret; \
98 } while (0);
93 99
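
The reworked __array() macro formats the "type[len]" string into the shared static event_storage buffer, so it takes event_storage_mutex around both the snprintf() and the trace_define_field() call that consumes the buffer. Here is a tiny pthread-based sketch of that shared-scratch-buffer idea; the names are illustrative, not the kernel's.

    #include <pthread.h>
    #include <stdio.h>

    /* One shared scratch buffer for building "type[len]" strings, as the
     * __array() macro now does with event_storage/event_storage_mutex. */
    static char type_storage[128];
    static pthread_mutex_t type_storage_mutex = PTHREAD_MUTEX_INITIALIZER;

    static int define_array_field(const char *type, const char *name, int len)
    {
            int ret;

            pthread_mutex_lock(&type_storage_mutex);
            snprintf(type_storage, sizeof(type_storage), "%s[%d]", type, len);
            /* stand-in for trace_define_field(call, type_storage, name, ...) */
            ret = printf("field %s: %s\n", name, type_storage);
            pthread_mutex_unlock(&type_storage_mutex);

            return ret < 0 ? ret : 0;
    }

    int main(void)
    {
            return define_array_field("char", "comm", 16);
    }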
94#undef __array_desc 100#undef __array_desc
95#define __array_desc(type, container, item, len) \ 101#define __array_desc(type, container, item, len) \
@@ -155,13 +161,13 @@ struct ftrace_event_class event_class_ftrace_##call = { \
155 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ 161 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
156}; \ 162}; \
157 \ 163 \
158struct ftrace_event_call __used \ 164struct ftrace_event_call __used event_##call = { \
159__attribute__((__aligned__(4))) \
160__attribute__((section("_ftrace_events"))) event_##call = { \
161 .name = #call, \ 165 .name = #call, \
162 .event.type = etype, \ 166 .event.type = etype, \
163 .class = &event_class_ftrace_##call, \ 167 .class = &event_class_ftrace_##call, \
164 .print_fmt = print, \ 168 .print_fmt = print, \
165}; \ 169}; \
170struct ftrace_event_call __used \
171__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
166 172
167#include "trace_entries.h" 173#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 16aee4d44e8f..8d0e1cc4e974 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -149,11 +149,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
149static struct ftrace_ops trace_ops __read_mostly = 149static struct ftrace_ops trace_ops __read_mostly =
150{ 150{
151 .func = function_trace_call, 151 .func = function_trace_call,
152 .flags = FTRACE_OPS_FL_GLOBAL,
152}; 153};
153 154
154static struct ftrace_ops trace_stack_ops __read_mostly = 155static struct ftrace_ops trace_stack_ops __read_mostly =
155{ 156{
156 .func = function_stack_trace_call, 157 .func = function_stack_trace_call,
158 .flags = FTRACE_OPS_FL_GLOBAL,
157}; 159};
158 160
159/* Our two options */ 161/* Our two options */
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 6f233698518e..962cdb24ed81 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,15 +15,19 @@
15#include "trace.h" 15#include "trace.h"
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18/* When set, irq functions will be ignored */
19static int ftrace_graph_skip_irqs;
20
18struct fgraph_cpu_data { 21struct fgraph_cpu_data {
19 pid_t last_pid; 22 pid_t last_pid;
20 int depth; 23 int depth;
24 int depth_irq;
21 int ignore; 25 int ignore;
22 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; 26 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
23}; 27};
24 28
25struct fgraph_data { 29struct fgraph_data {
26 struct fgraph_cpu_data *cpu_data; 30 struct fgraph_cpu_data __percpu *cpu_data;
27 31
28 /* Place to preserve last processed entry. */ 32 /* Place to preserve last processed entry. */
29 struct ftrace_graph_ent_entry ent; 33 struct ftrace_graph_ent_entry ent;
@@ -41,6 +45,7 @@ struct fgraph_data {
41#define TRACE_GRAPH_PRINT_PROC 0x8 45#define TRACE_GRAPH_PRINT_PROC 0x8
42#define TRACE_GRAPH_PRINT_DURATION 0x10 46#define TRACE_GRAPH_PRINT_DURATION 0x10
43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40
44 49
45static struct tracer_opt trace_opts[] = { 50static struct tracer_opt trace_opts[] = {
46 /* Display overruns? (for self-debug purpose) */ 51 /* Display overruns? (for self-debug purpose) */
@@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = {
55 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, 60 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
56 /* Display absolute time of an entry */ 61 /* Display absolute time of an entry */
57 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, 62 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
63 /* Display interrupts */
64 { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
58 { } /* Empty entry */ 65 { } /* Empty entry */
59}; 66};
60 67
61static struct tracer_flags tracer_flags = { 68static struct tracer_flags tracer_flags = {
62 /* Don't display overruns and proc by default */ 69 /* Don't display overruns and proc by default */
63 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | 70 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
64 TRACE_GRAPH_PRINT_DURATION, 71 TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
65 .opts = trace_opts 72 .opts = trace_opts
66}; 73};
67 74
@@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr,
204 return 1; 211 return 1;
205} 212}
206 213
214static inline int ftrace_graph_ignore_irqs(void)
215{
216 if (!ftrace_graph_skip_irqs)
217 return 0;
218
219 return in_irq();
220}
221
207int trace_graph_entry(struct ftrace_graph_ent *trace) 222int trace_graph_entry(struct ftrace_graph_ent *trace)
208{ 223{
209 struct trace_array *tr = graph_array; 224 struct trace_array *tr = graph_array;
@@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
218 return 0; 233 return 0;
219 234
220 /* trace it when it is-nested-in or is a function enabled. */ 235 /* trace it when it is-nested-in or is a function enabled. */
221 if (!(trace->depth || ftrace_graph_addr(trace->func))) 236 if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
237 ftrace_graph_ignore_irqs())
222 return 0; 238 return 0;
223 239
224 local_irq_save(flags); 240 local_irq_save(flags);
@@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
246 return trace_graph_entry(trace); 262 return trace_graph_entry(trace);
247} 263}
248 264
265static void
266__trace_graph_function(struct trace_array *tr,
267 unsigned long ip, unsigned long flags, int pc)
268{
269 u64 time = trace_clock_local();
270 struct ftrace_graph_ent ent = {
271 .func = ip,
272 .depth = 0,
273 };
274 struct ftrace_graph_ret ret = {
275 .func = ip,
276 .depth = 0,
277 .calltime = time,
278 .rettime = time,
279 };
280
281 __trace_graph_entry(tr, &ent, flags, pc);
282 __trace_graph_return(tr, &ret, flags, pc);
283}
284
285void
286trace_graph_function(struct trace_array *tr,
287 unsigned long ip, unsigned long parent_ip,
288 unsigned long flags, int pc)
289{
290 __trace_graph_function(tr, ip, flags, pc);
291}
292
249void __trace_graph_return(struct trace_array *tr, 293void __trace_graph_return(struct trace_array *tr,
250 struct ftrace_graph_ret *trace, 294 struct ftrace_graph_ret *trace,
251 unsigned long flags, 295 unsigned long flags,
@@ -649,8 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
649 693
650 /* Print nsecs (we don't want to exceed 7 numbers) */ 694 /* Print nsecs (we don't want to exceed 7 numbers) */
651 if (len < 7) { 695 if (len < 7) {
652 snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", 696 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
653 nsecs_rem); 697
698 snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
654 ret = trace_seq_printf(s, ".%s", nsecs_str); 699 ret = trace_seq_printf(s, ".%s", nsecs_str);
655 if (!ret) 700 if (!ret)
656 return TRACE_TYPE_PARTIAL_LINE; 701 return TRACE_TYPE_PARTIAL_LINE;
@@ -855,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
855 return 0; 900 return 0;
856} 901}
857 902
903/*
904 * Entry check for irq code
905 *
906 * returns 1 if
907 * - we are inside irq code
908 * - we just entered irq code
909 *
910 * returns 0 if
911 * - funcgraph-interrupts option is set
912 * - we are not inside irq code
913 */
914static int
915check_irq_entry(struct trace_iterator *iter, u32 flags,
916 unsigned long addr, int depth)
917{
918 int cpu = iter->cpu;
919 int *depth_irq;
920 struct fgraph_data *data = iter->private;
921
922 /*
923 * If we are either displaying irqs, or we got called as
924 * a graph event and private data does not exist,
925 * then we bypass the irq check.
926 */
927 if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
928 (!data))
929 return 0;
930
931 depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
932
933 /*
934 * We are inside the irq code
935 */
936 if (*depth_irq >= 0)
937 return 1;
938
939 if ((addr < (unsigned long)__irqentry_text_start) ||
940 (addr >= (unsigned long)__irqentry_text_end))
941 return 0;
942
943 /*
944 * We are entering irq code.
945 */
946 *depth_irq = depth;
947 return 1;
948}
949
950/*
951 * Return check for irq code
952 *
953 * returns 1 if
954 * - we are inside irq code
955 * - we just left irq code
956 *
957 * returns 0 if
958 * - funcgraph-interrupts option is set
959 * - we are not inside irq code
960 */
961static int
962check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
963{
964 int cpu = iter->cpu;
965 int *depth_irq;
966 struct fgraph_data *data = iter->private;
967
968 /*
969 * If we are either displaying irqs, or we got called as
970 * a graph event and private data does not exist,
971 * then we bypass the irq check.
972 */
973 if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
974 (!data))
975 return 0;
976
977 depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
978
979 /*
980 * We are not inside the irq code.
981 */
982 if (*depth_irq == -1)
983 return 0;
984
985 /*
986 * We are inside the irq code, and this is returning entry.
987 * Let's not trace it and clear the entry depth, since
988 * we are out of irq code.
989 *
990 * This condition ensures that we 'leave the irq code' once
991 * we are out of the entry depth. Thus protecting us from
992 * the RETURN entry loss.
993 */
994 if (*depth_irq >= depth) {
995 *depth_irq = -1;
996 return 1;
997 }
998
999 /*
1000 * We are inside the irq code, and this is not the entry.
1001 */
1002 return 1;
1003}
1004
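
The funcgraph-irqs option added in this file works at two levels: at record time trace_graph_entry() drops events when ftrace_graph_skip_irqs is set and in_irq() is true, and at output time check_irq_entry()/check_irq_return() hide everything between an entry into the irq text section and the matching return, using the per-cpu depth_irq field initialised to -1 in graph_trace_open(). Below is a user-space model of that depth bookkeeping, with a single global instead of per-cpu data and illustrative names.

    #include <stdio.h>

    /* User-space model of the per-cpu depth_irq bookkeeping used by
     * check_irq_entry()/check_irq_return(): remember the depth at which we
     * entered irq code, hide everything until we return back past it. */
    static int depth_irq = -1;      /* -1: not inside irq code */

    static int hide_entry(int is_irq_func, int depth)
    {
            if (depth_irq >= 0)     /* already inside irq code */
                    return 1;
            if (!is_irq_func)
                    return 0;
            depth_irq = depth;      /* just entered irq code */
            return 1;
    }

    static int hide_return(int depth)
    {
            if (depth_irq == -1)    /* not inside irq code */
                    return 0;
            if (depth_irq >= depth) /* returning past the entry depth */
                    depth_irq = -1;
            return 1;               /* still (or just left) irq code: hide */
    }

    int main(void)
    {
            /* an irq entry at depth 2 calls a handler at depth 3, then returns */
            printf("%d", hide_entry(1, 2));   /* entering irq code: hidden      */
            printf(" %d", hide_entry(0, 3));  /* nested handler: hidden         */
            printf(" %d", hide_return(3));    /* handler return: hidden         */
            printf(" %d\n", hide_return(2));  /* final return: hidden, reset    */
            return 0;
    }

hide_return() resets the state only when the return depth reaches the recorded entry depth, which is what the "RETURN entry loss" comment above is guarding against.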
858static enum print_line_t 1005static enum print_line_t
859print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 1006print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
860 struct trace_iterator *iter, u32 flags) 1007 struct trace_iterator *iter, u32 flags)
@@ -865,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
865 static enum print_line_t ret; 1012 static enum print_line_t ret;
866 int cpu = iter->cpu; 1013 int cpu = iter->cpu;
867 1014
1015 if (check_irq_entry(iter, flags, call->func, call->depth))
1016 return TRACE_TYPE_HANDLED;
1017
868 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) 1018 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
869 return TRACE_TYPE_PARTIAL_LINE; 1019 return TRACE_TYPE_PARTIAL_LINE;
870 1020
@@ -902,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
902 int ret; 1052 int ret;
903 int i; 1053 int i;
904 1054
1055 if (check_irq_return(iter, flags, trace->depth))
1056 return TRACE_TYPE_HANDLED;
1057
905 if (data) { 1058 if (data) {
906 struct fgraph_cpu_data *cpu_data; 1059 struct fgraph_cpu_data *cpu_data;
907 int cpu = iter->cpu; 1060 int cpu = iter->cpu;
@@ -1054,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1054 1207
1055 1208
1056enum print_line_t 1209enum print_line_t
1057print_graph_function_flags(struct trace_iterator *iter, u32 flags) 1210__print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1058{ 1211{
1059 struct ftrace_graph_ent_entry *field; 1212 struct ftrace_graph_ent_entry *field;
1060 struct fgraph_data *data = iter->private; 1213 struct fgraph_data *data = iter->private;
@@ -1117,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1117static enum print_line_t 1270static enum print_line_t
1118print_graph_function(struct trace_iterator *iter) 1271print_graph_function(struct trace_iterator *iter)
1119{ 1272{
1120 return print_graph_function_flags(iter, tracer_flags.val); 1273 return __print_graph_function_flags(iter, tracer_flags.val);
1274}
1275
1276enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
1277 u32 flags)
1278{
1279 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1280 flags |= TRACE_GRAPH_PRINT_DURATION;
1281 else
1282 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1283
1284 return __print_graph_function_flags(iter, flags);
1121} 1285}
1122 1286
1123static enum print_line_t 1287static enum print_line_t
@@ -1149,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
1149 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1313 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1150} 1314}
1151 1315
1152void print_graph_headers_flags(struct seq_file *s, u32 flags) 1316static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1153{ 1317{
1154 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1318 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1155 1319
@@ -1190,6 +1354,23 @@ void print_graph_headers(struct seq_file *s)
1190 print_graph_headers_flags(s, tracer_flags.val); 1354 print_graph_headers_flags(s, tracer_flags.val);
1191} 1355}
1192 1356
1357void print_graph_headers_flags(struct seq_file *s, u32 flags)
1358{
1359 struct trace_iterator *iter = s->private;
1360
1361 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
1362 /* print nothing if the buffers are empty */
1363 if (trace_empty(iter))
1364 return;
1365
1366 print_trace_header(s, iter);
1367 flags |= TRACE_GRAPH_PRINT_DURATION;
1368 } else
1369 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1370
1371 __print_graph_headers_flags(s, flags);
1372}
1373
1193void graph_trace_open(struct trace_iterator *iter) 1374void graph_trace_open(struct trace_iterator *iter)
1194{ 1375{
1195 /* pid and depth on the last trace processed */ 1376 /* pid and depth on the last trace processed */
@@ -1210,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter)
1210 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); 1391 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
1211 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 1392 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
1212 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); 1393 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
1394 int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
1395
1213 *pid = -1; 1396 *pid = -1;
1214 *depth = 0; 1397 *depth = 0;
1215 *ignore = 0; 1398 *ignore = 0;
1399 *depth_irq = -1;
1216 } 1400 }
1217 1401
1218 iter->private = data; 1402 iter->private = data;
@@ -1235,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter)
1235 } 1419 }
1236} 1420}
1237 1421
1422static int func_graph_set_flag(u32 old_flags, u32 bit, int set)
1423{
1424 if (bit == TRACE_GRAPH_PRINT_IRQS)
1425 ftrace_graph_skip_irqs = !set;
1426
1427 return 0;
1428}
1429
1238static struct trace_event_functions graph_functions = { 1430static struct trace_event_functions graph_functions = {
1239 .trace = print_graph_function_event, 1431 .trace = print_graph_function_event,
1240}; 1432};
@@ -1261,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = {
1261 .print_line = print_graph_function, 1453 .print_line = print_graph_function,
1262 .print_header = print_graph_headers, 1454 .print_header = print_graph_headers,
1263 .flags = &tracer_flags, 1455 .flags = &tracer_flags,
1456 .set_flag = func_graph_set_flag,
1264#ifdef CONFIG_FTRACE_SELFTEST 1457#ifdef CONFIG_FTRACE_SELFTEST
1265 .selftest = trace_selftest_startup_function_graph, 1458 .selftest = trace_selftest_startup_function_graph,
1266#endif 1459#endif
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 73a6b0601f2e..c77424be284d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -80,21 +80,29 @@ static struct tracer_flags tracer_flags = {
80 * skip the latency if the sequence has changed - some other section 80 * skip the latency if the sequence has changed - some other section
81 * did a maximum and could disturb our measurement with serial console 81 * did a maximum and could disturb our measurement with serial console
82 * printouts, etc. Truly coinciding maximum latencies should be rare 82 * printouts, etc. Truly coinciding maximum latencies should be rare
83 * and what happens together happens separately as well, so this doesnt 83 * and what happens together happens separately as well, so this doesn't
84 * decrease the validity of the maximum found: 84 * decrease the validity of the maximum found:
85 */ 85 */
86static __cacheline_aligned_in_smp unsigned long max_sequence; 86static __cacheline_aligned_in_smp unsigned long max_sequence;
87 87
88#ifdef CONFIG_FUNCTION_TRACER 88#ifdef CONFIG_FUNCTION_TRACER
89/* 89/*
90 * irqsoff uses its own tracer function to keep the overhead down: 90 * Prologue for the preempt and irqs off function tracers.
91 *
92 * Returns 1 if it is OK to continue, and data->disabled is
93 * incremented.
94 * 0 if the trace is to be ignored, and data->disabled
95 * is kept the same.
96 *
97 * Note, this function is also used outside this ifdef but
98 * inside the #ifdef of the function graph tracer below.
99 * This is OK, since the function graph tracer is
100 * dependent on the function tracer.
91 */ 101 */
92static void 102static int func_prolog_dec(struct trace_array *tr,
93irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) 103 struct trace_array_cpu **data,
104 unsigned long *flags)
94{ 105{
95 struct trace_array *tr = irqsoff_trace;
96 struct trace_array_cpu *data;
97 unsigned long flags;
98 long disabled; 106 long disabled;
99 int cpu; 107 int cpu;
100 108
@@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
106 */ 114 */
107 cpu = raw_smp_processor_id(); 115 cpu = raw_smp_processor_id();
108 if (likely(!per_cpu(tracing_cpu, cpu))) 116 if (likely(!per_cpu(tracing_cpu, cpu)))
109 return; 117 return 0;
110 118
111 local_save_flags(flags); 119 local_save_flags(*flags);
112 /* slight chance to get a false positive on tracing_cpu */ 120 /* slight chance to get a false positive on tracing_cpu */
113 if (!irqs_disabled_flags(flags)) 121 if (!irqs_disabled_flags(*flags))
114 return; 122 return 0;
115 123
116 data = tr->data[cpu]; 124 *data = tr->data[cpu];
117 disabled = atomic_inc_return(&data->disabled); 125 disabled = atomic_inc_return(&(*data)->disabled);
118 126
119 if (likely(disabled == 1)) 127 if (likely(disabled == 1))
120 trace_function(tr, ip, parent_ip, flags, preempt_count()); 128 return 1;
129
130 atomic_dec(&(*data)->disabled);
131
132 return 0;
133}
134
135/*
136 * irqsoff uses its own tracer function to keep the overhead down:
137 */
138static void
139irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
140{
141 struct trace_array *tr = irqsoff_trace;
142 struct trace_array_cpu *data;
143 unsigned long flags;
144
145 if (!func_prolog_dec(tr, &data, &flags))
146 return;
147
148 trace_function(tr, ip, parent_ip, flags, preempt_count());
121 149
122 atomic_dec(&data->disabled); 150 atomic_dec(&data->disabled);
123} 151}
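
func_prolog_dec() factors out the boilerplate shared by the irqsoff function and function-graph callbacks: check the per-cpu tracing_cpu flag, confirm that interrupts really are disabled, and take a reference on data->disabled, backing off if the callback is already nested. On success the caller records its entry and drops the reference itself. A condensed user-space sketch of that calling convention follows, using C11 atomics in place of atomic_inc_return() and stand-in types; the irqs-disabled check is folded into the tracing_cpu flag here.

    #include <stdatomic.h>
    #include <stdio.h>

    /* Stand-ins for struct trace_array_cpu and the per-cpu tracing_cpu flag. */
    struct cpu_data { atomic_int disabled; };

    static int tracing_cpu;                 /* is the latency window open? */
    static struct cpu_data this_cpu_data;

    /* Mirrors func_prolog_dec(): returns 1 and leaves @data->disabled
     * incremented when the caller may record a trace entry, 0 otherwise. */
    static int func_prolog_dec(struct cpu_data **data)
    {
            int disabled;

            if (!tracing_cpu)               /* not in an irqs-off section */
                    return 0;

            *data = &this_cpu_data;
            disabled = atomic_fetch_add(&(*data)->disabled, 1) + 1;
            if (disabled == 1)
                    return 1;               /* first user: safe to trace */

            atomic_fetch_sub(&(*data)->disabled, 1);
            return 0;                       /* recursion: back off */
    }

    static void tracer_call(const char *func)
    {
            struct cpu_data *data;

            if (!func_prolog_dec(&data))
                    return;
            printf("trace %s\n", func);             /* record the entry */
            atomic_fetch_sub(&data->disabled, 1);   /* caller drops the ref */
    }

    int main(void)
    {
            tracing_cpu = 1;
            tracer_call("schedule");
            return 0;
    }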
@@ -125,6 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
125static struct ftrace_ops trace_ops __read_mostly = 153static struct ftrace_ops trace_ops __read_mostly =
126{ 154{
127 .func = irqsoff_tracer_call, 155 .func = irqsoff_tracer_call,
156 .flags = FTRACE_OPS_FL_GLOBAL,
128}; 157};
129#endif /* CONFIG_FUNCTION_TRACER */ 158#endif /* CONFIG_FUNCTION_TRACER */
130 159
@@ -155,30 +184,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
155 struct trace_array *tr = irqsoff_trace; 184 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data; 185 struct trace_array_cpu *data;
157 unsigned long flags; 186 unsigned long flags;
158 long disabled;
159 int ret; 187 int ret;
160 int cpu;
161 int pc; 188 int pc;
162 189
163 cpu = raw_smp_processor_id(); 190 if (!func_prolog_dec(tr, &data, &flags))
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0;
166
167 local_save_flags(flags);
168 /* slight chance to get a false positive on tracing_cpu */
169 if (!irqs_disabled_flags(flags))
170 return 0; 191 return 0;
171 192
172 data = tr->data[cpu]; 193 pc = preempt_count();
173 disabled = atomic_inc_return(&data->disabled); 194 ret = __trace_graph_entry(tr, trace, flags, pc);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled); 195 atomic_dec(&data->disabled);
196
182 return ret; 197 return ret;
183} 198}
184 199
@@ -187,27 +202,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
187 struct trace_array *tr = irqsoff_trace; 202 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data; 203 struct trace_array_cpu *data;
189 unsigned long flags; 204 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc; 205 int pc;
193 206
194 cpu = raw_smp_processor_id(); 207 if (!func_prolog_dec(tr, &data, &flags))
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return;
197
198 local_save_flags(flags);
199 /* slight chance to get a false positive on tracing_cpu */
200 if (!irqs_disabled_flags(flags))
201 return; 208 return;
202 209
203 data = tr->data[cpu]; 210 pc = preempt_count();
204 disabled = atomic_inc_return(&data->disabled); 211 __trace_graph_return(tr, trace, flags, pc);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled); 212 atomic_dec(&data->disabled);
212} 213}
213 214
@@ -229,75 +230,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
229 230
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) 231static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{ 232{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /* 233 /*
240 * In graph mode call the graph tracer output function, 234 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler 235 * otherwise go with the TRACE_FN event handler
242 */ 236 */
243 if (is_graph()) 237 if (is_graph())
244 return print_graph_function_flags(iter, flags); 238 return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
245 239
246 return TRACE_TYPE_UNHANDLED; 240 return TRACE_TYPE_UNHANDLED;
247} 241}
248 242
249static void irqsoff_print_header(struct seq_file *s) 243static void irqsoff_print_header(struct seq_file *s)
250{ 244{
251 if (is_graph()) { 245 if (is_graph())
252 struct trace_iterator *iter = s->private; 246 print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
253 u32 flags = GRAPH_TRACER_FLAGS; 247 else
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s); 248 trace_default_header(s);
268} 249}
269 250
270static void 251static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr, 252__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip, 253 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc) 254 unsigned long flags, int pc)
294{ 255{
295 if (!is_graph()) 256 if (is_graph())
257 trace_graph_function(tr, ip, parent_ip, flags, pc);
258 else
296 trace_function(tr, ip, parent_ip, flags, pc); 259 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301} 260}
302 261
303#else 262#else
@@ -495,14 +454,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
495 * Stubs: 454 * Stubs:
496 */ 455 */
497 456
498void early_boot_irqs_off(void)
499{
500}
501
502void early_boot_irqs_on(void)
503{
504}
505
506void trace_softirqs_on(unsigned long ip) 457void trace_softirqs_on(unsigned long ip)
507{ 458{
508} 459}
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 7b8ecd751d93..3c5c5dfea0b3 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -13,7 +13,6 @@
13#include <linux/kdb.h> 13#include <linux/kdb.h>
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15 15
16#include "../debug/kdb/kdb_private.h"
17#include "trace.h" 16#include "trace.h"
18#include "trace_output.h" 17#include "trace_output.h"
19 18
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 544301d29dee..27d13b36b8be 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -31,7 +31,6 @@
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h> 32#include <linux/stringify.h>
33#include <linux/limits.h> 33#include <linux/limits.h>
34#include <linux/uaccess.h>
35#include <asm/bitsperlong.h> 34#include <asm/bitsperlong.h>
36 35
37#include "trace.h" 36#include "trace.h"
@@ -54,7 +53,6 @@ const char *reserved_field_names[] = {
54 "common_preempt_count", 53 "common_preempt_count",
55 "common_pid", 54 "common_pid",
56 "common_tgid", 55 "common_tgid",
57 "common_lock_depth",
58 FIELD_STRING_IP, 56 FIELD_STRING_IP,
59 FIELD_STRING_RETIP, 57 FIELD_STRING_RETIP,
60 FIELD_STRING_FUNC, 58 FIELD_STRING_FUNC,
@@ -354,6 +352,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
354 kfree(data); 352 kfree(data);
355} 353}
356 354
355/* Bitfield fetch function */
356struct bitfield_fetch_param {
357 struct fetch_param orig;
358 unsigned char hi_shift;
359 unsigned char low_shift;
360};
361
362#define DEFINE_FETCH_bitfield(type) \
363static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
364 void *data, void *dest) \
365{ \
366 struct bitfield_fetch_param *bprm = data; \
367 type buf = 0; \
368 call_fetch(&bprm->orig, regs, &buf); \
369 if (buf) { \
370 buf <<= bprm->hi_shift; \
371 buf >>= bprm->low_shift; \
372 } \
373 *(type *)dest = buf; \
374}
375DEFINE_BASIC_FETCH_FUNCS(bitfield)
376#define fetch_bitfield_string NULL
377#define fetch_bitfield_string_size NULL
378
379static __kprobes void
380free_bitfield_fetch_param(struct bitfield_fetch_param *data)
381{
382 /*
383 * Don't check the bitfield itself, because this must be the
384 * last fetch function.
385 */
386 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
387 free_deref_fetch_param(data->orig.data);
388 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
389 free_symbol_cache(data->orig.data);
390 kfree(data);
391}
357/* Default (unsigned long) fetch type */ 392/* Default (unsigned long) fetch type */
358#define __DEFAULT_FETCH_TYPE(t) u##t 393#define __DEFAULT_FETCH_TYPE(t) u##t
359#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 394#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
@@ -368,6 +403,7 @@ enum {
368 FETCH_MTD_memory, 403 FETCH_MTD_memory,
369 FETCH_MTD_symbol, 404 FETCH_MTD_symbol,
370 FETCH_MTD_deref, 405 FETCH_MTD_deref,
406 FETCH_MTD_bitfield,
371 FETCH_MTD_END, 407 FETCH_MTD_END,
372}; 408};
373 409
@@ -388,6 +424,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \
388ASSIGN_FETCH_FUNC(memory, ftype), \ 424ASSIGN_FETCH_FUNC(memory, ftype), \
389ASSIGN_FETCH_FUNC(symbol, ftype), \ 425ASSIGN_FETCH_FUNC(symbol, ftype), \
390ASSIGN_FETCH_FUNC(deref, ftype), \ 426ASSIGN_FETCH_FUNC(deref, ftype), \
427ASSIGN_FETCH_FUNC(bitfield, ftype), \
391 } \ 428 } \
392 } 429 }
393 430
@@ -431,9 +468,33 @@ static const struct fetch_type *find_fetch_type(const char *type)
431 if (!type) 468 if (!type)
432 type = DEFAULT_FETCH_TYPE_STR; 469 type = DEFAULT_FETCH_TYPE_STR;
433 470
471 /* Special case: bitfield */
472 if (*type == 'b') {
473 unsigned long bs;
474 type = strchr(type, '/');
475 if (!type)
476 goto fail;
477 type++;
478 if (strict_strtoul(type, 0, &bs))
479 goto fail;
480 switch (bs) {
481 case 8:
482 return find_fetch_type("u8");
483 case 16:
484 return find_fetch_type("u16");
485 case 32:
486 return find_fetch_type("u32");
487 case 64:
488 return find_fetch_type("u64");
489 default:
490 goto fail;
491 }
492 }
493
434 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) 494 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
435 if (strcmp(type, fetch_type_table[i].name) == 0) 495 if (strcmp(type, fetch_type_table[i].name) == 0)
436 return &fetch_type_table[i]; 496 return &fetch_type_table[i];
497fail:
437 return NULL; 498 return NULL;
438} 499}
439 500
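
The special case above only looks at the container size after the '/' in a bitfield type string of the form b<width>@<offset>/<container-bits>, then reuses the matching integer fetch type. A small user-space sketch of that mapping (hypothetical helper, strtoul standing in for strict_strtoul):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Map a bitfield type string such as "b4@8/32" to its container type name. */
static const char *bitfield_container(const char *type)
{
        const char *slash;
        unsigned long bs;

        if (*type != 'b')
                return NULL;
        slash = strchr(type, '/');
        if (!slash)
                return NULL;
        bs = strtoul(slash + 1, NULL, 0);
        switch (bs) {
        case 8:  return "u8";
        case 16: return "u16";
        case 32: return "u32";
        case 64: return "u64";
        default: return NULL;
        }
}

int main(void)
{
        printf("%s\n", bitfield_container("b4@8/32"));  /* u32 */
        printf("%s\n", bitfield_container("b1@0/8"));   /* u8  */
        return 0;
}
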
@@ -587,7 +648,9 @@ error:
587 648
588static void free_probe_arg(struct probe_arg *arg) 649static void free_probe_arg(struct probe_arg *arg)
589{ 650{
590 if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) 651 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
652 free_bitfield_fetch_param(arg->fetch.data);
653 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
591 free_deref_fetch_param(arg->fetch.data); 654 free_deref_fetch_param(arg->fetch.data);
592 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) 655 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
593 free_symbol_cache(arg->fetch.data); 656 free_symbol_cache(arg->fetch.data);
@@ -648,7 +711,7 @@ static int register_trace_probe(struct trace_probe *tp)
648 } 711 }
649 ret = register_probe_event(tp); 712 ret = register_probe_event(tp);
650 if (ret) { 713 if (ret) {
651 pr_warning("Faild to register probe event(%d)\n", ret); 714 pr_warning("Failed to register probe event(%d)\n", ret);
652 goto end; 715 goto end;
653 } 716 }
654 717
@@ -768,16 +831,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
768 } 831 }
769 break; 832 break;
770 case '+': /* deref memory */ 833 case '+': /* deref memory */
834 arg++; /* Skip '+', because strict_strtol() rejects it. */
771 case '-': 835 case '-':
772 tmp = strchr(arg, '('); 836 tmp = strchr(arg, '(');
773 if (!tmp) 837 if (!tmp)
774 break; 838 break;
775 *tmp = '\0'; 839 *tmp = '\0';
776 ret = strict_strtol(arg + 1, 0, &offset); 840 ret = strict_strtol(arg, 0, &offset);
777 if (ret) 841 if (ret)
778 break; 842 break;
779 if (arg[0] == '-')
780 offset = -offset;
781 arg = tmp + 1; 843 arg = tmp + 1;
782 tmp = strrchr(arg, ')'); 844 tmp = strrchr(arg, ')');
783 if (tmp) { 845 if (tmp) {
@@ -808,6 +870,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
808 return ret; 870 return ret;
809} 871}
810 872
873#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
874
875/* Bitfield type needs to be parsed into a fetch function */
876static int __parse_bitfield_probe_arg(const char *bf,
877 const struct fetch_type *t,
878 struct fetch_param *f)
879{
880 struct bitfield_fetch_param *bprm;
881 unsigned long bw, bo;
882 char *tail;
883
884 if (*bf != 'b')
885 return 0;
886
887 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
888 if (!bprm)
889 return -ENOMEM;
890 bprm->orig = *f;
891 f->fn = t->fetch[FETCH_MTD_bitfield];
892 f->data = (void *)bprm;
893
894 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
895 if (bw == 0 || *tail != '@')
896 return -EINVAL;
897
898 bf = tail + 1;
899 bo = simple_strtoul(bf, &tail, 0);
900 if (tail == bf || *tail != '/')
901 return -EINVAL;
902
903 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
904 bprm->low_shift = bprm->hi_shift + bo;
905 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
906}
907
811/* String length checking wrapper */ 908/* String length checking wrapper */
812static int parse_probe_arg(char *arg, struct trace_probe *tp, 909static int parse_probe_arg(char *arg, struct trace_probe *tp,
813 struct probe_arg *parg, int is_return) 910 struct probe_arg *parg, int is_return)
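
__parse_bitfield_probe_arg() turns <width>@<offset> into two shift counts: hi_shift pushes the field to the top of the container, low_shift then brings it back down right-aligned. A stand-alone sketch of the same arithmetic for a 32-bit container; for "b3@5/32" it gives hi_shift = 24 and low_shift = 29:

#include <stdint.h>
#include <stdio.h>

/* Extract a <bw>-bit field starting at bit <bo> from a 32-bit container,
 * using the same two-shift scheme as the bitfield fetch function. */
static uint32_t extract_bitfield32(uint32_t val, unsigned bw, unsigned bo)
{
        unsigned hi_shift  = 32 - (bw + bo);    /* BYTES_TO_BITS(4) - (bw + bo) */
        unsigned low_shift = hi_shift + bo;

        return (val << hi_shift) >> low_shift;
}

int main(void)
{
        /* "b3@5/32": bits 5..7 of the container. 0xE0 has exactly those set. */
        printf("0x%x\n", extract_bitfield32(0xE0, 3, 5));   /* prints 0x7 */
        printf("0x%x\n", extract_bitfield32(0x20, 3, 5));   /* prints 0x1 */
        return 0;
}
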
@@ -837,6 +934,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
837 parg->offset = tp->size; 934 parg->offset = tp->size;
838 tp->size += parg->type->size; 935 tp->size += parg->type->size;
839 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); 936 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
937 if (ret >= 0 && t != NULL)
938 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
840 if (ret >= 0) { 939 if (ret >= 0) {
841 parg->fetch_size.fn = get_fetch_size_function(parg->type, 940 parg->fetch_size.fn = get_fetch_size_function(parg->type,
842 parg->fetch.fn); 941 parg->fetch.fn);
@@ -1131,7 +1230,7 @@ static int command_trace_probe(const char *buf)
1131 return ret; 1230 return ret;
1132} 1231}
1133 1232
1134#define WRITE_BUFSIZE 128 1233#define WRITE_BUFSIZE 4096
1135 1234
1136static ssize_t probes_write(struct file *file, const char __user *buffer, 1235static ssize_t probes_write(struct file *file, const char __user *buffer,
1137 size_t count, loff_t *ppos) 1236 size_t count, loff_t *ppos)
@@ -1739,7 +1838,7 @@ static void unregister_probe_event(struct trace_probe *tp)
1739 kfree(tp->call.print_fmt); 1838 kfree(tp->call.print_fmt);
1740} 1839}
1741 1840
1742/* Make a debugfs interface for controling probe points */ 1841/* Make a debugfs interface for controlling probe points */
1743static __init int init_kprobe_trace(void) 1842static __init int init_kprobe_trace(void)
1744{ 1843{
1745 struct dentry *d_tracer; 1844 struct dentry *d_tracer;
@@ -1771,8 +1870,12 @@ fs_initcall(init_kprobe_trace);
1771 1870
1772#ifdef CONFIG_FTRACE_STARTUP_TEST 1871#ifdef CONFIG_FTRACE_STARTUP_TEST
1773 1872
1774static int kprobe_trace_selftest_target(int a1, int a2, int a3, 1873/*
1775 int a4, int a5, int a6) 1874 * The "__used" keeps gcc from removing the function symbol
1875 * from the kallsyms table.
1876 */
1877static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
1878 int a4, int a5, int a6)
1776{ 1879{
1777 return a1 + a2 + a3 + a4 + a5 + a6; 1880 return a1 + a2 + a3 + a4 + a5 + a6;
1778} 1881}
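
The selftest target is only ever reached through its kallsyms entry and a kprobe, so without __used the compiler could discard it. A hedged user-space illustration of the underlying attribute (gcc/clang __attribute__((used)), hypothetical function name):

#include <stdio.h>

/* Without "used", a static function with no callers may be dropped entirely
 * by the compiler; with it, the symbol survives (check with nm or objdump). */
static __attribute__((used)) int selftest_target(int a, int b)
{
        return a + b;
}

int main(void)
{
        puts("selftest_target is only reached via its symbol, never called here");
        return 0;
}
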
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 02272baa2206..e37de492a9e1 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -353,6 +353,33 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
353} 353}
354EXPORT_SYMBOL(ftrace_print_symbols_seq); 354EXPORT_SYMBOL(ftrace_print_symbols_seq);
355 355
356#if BITS_PER_LONG == 32
357const char *
358ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
359 const struct trace_print_flags_u64 *symbol_array)
360{
361 int i;
362 const char *ret = p->buffer + p->len;
363
364 for (i = 0; symbol_array[i].name; i++) {
365
366 if (val != symbol_array[i].mask)
367 continue;
368
369 trace_seq_puts(p, symbol_array[i].name);
370 break;
371 }
372
373 if (!p->len)
374 trace_seq_printf(p, "0x%llx", val);
375
376 trace_seq_putc(p, 0);
377
378 return ret;
379}
380EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
381#endif
382
356const char * 383const char *
357ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) 384ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
358{ 385{
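
ftrace_print_symbols_seq_u64() is a straight {mask, name} table lookup that falls back to printing the raw value in hex; it is built only for 32-bit kernels, presumably because the plain ftrace_print_symbols_seq() already covers 64-bit longs elsewhere. A user-space model of the lookup (printf standing in for trace_seq output, made-up symbol table):

#include <stdio.h>
#include <stdint.h>

struct sym64 { uint64_t mask; const char *name; };

/* Print the symbolic name for a 64-bit value, or fall back to hex. */
static void print_sym_u64(uint64_t val, const struct sym64 *tbl)
{
        for (; tbl->name; tbl++) {
                if (val == tbl->mask) {
                        printf("%s\n", tbl->name);
                        return;
                }
        }
        printf("0x%llx\n", (unsigned long long)val);
}

int main(void)
{
        static const struct sym64 states[] = {
                { 1ULL << 32, "STATE_HI" },
                { 2,          "STATE_LO" },
                { 0, NULL }
        };

        print_sym_u64(1ULL << 32, states);   /* STATE_HI */
        print_sym_u64(42, states);           /* 0x2a     */
        return 0;
}
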
@@ -529,24 +556,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
529 * @entry: The trace entry field from the ring buffer 556 * @entry: The trace entry field from the ring buffer
530 * 557 *
531 * Prints the generic fields of irqs off, in hard or softirq, preempt 558 * Prints the generic fields of irqs off, in hard or softirq, preempt
532 * count and lock depth. 559 * count.
533 */ 560 */
534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) 561int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
535{ 562{
536 int hardirq, softirq; 563 char hardsoft_irq;
564 char need_resched;
565 char irqs_off;
566 int hardirq;
567 int softirq;
537 int ret; 568 int ret;
538 569
539 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 570 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
540 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 571 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
541 572
573 irqs_off =
574 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
575 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
576 '.';
577 need_resched =
578 (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
579 hardsoft_irq =
580 (hardirq && softirq) ? 'H' :
581 hardirq ? 'h' :
582 softirq ? 's' :
583 '.';
584
542 if (!trace_seq_printf(s, "%c%c%c", 585 if (!trace_seq_printf(s, "%c%c%c",
543 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 586 irqs_off, need_resched, hardsoft_irq))
544 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
545 'X' : '.',
546 (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
547 'N' : '.',
548 (hardirq && softirq) ? 'H' :
549 hardirq ? 'h' : softirq ? 's' : '.'))
550 return 0; 587 return 0;
551 588
552 if (entry->preempt_count) 589 if (entry->preempt_count)
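
The rewrite above computes the three latency-format characters up front instead of inside one long trace_seq_printf() call; the precedence is unchanged. A user-space model of the character selection (the flag values here are made up for the example):

#include <stdio.h>

#define FLAG_IRQS_OFF       0x01
#define FLAG_IRQS_NOSUPPORT 0x02
#define FLAG_NEED_RESCHED   0x04
#define FLAG_HARDIRQ        0x08
#define FLAG_SOFTIRQ        0x10

/* Build the three latency-format characters ("dNh" etc.) from entry flags,
 * following the same precedence as trace_print_lat_fmt(). */
static void lat_chars(unsigned flags, char out[4])
{
        int hardirq = flags & FLAG_HARDIRQ;
        int softirq = flags & FLAG_SOFTIRQ;

        out[0] = (flags & FLAG_IRQS_OFF)       ? 'd' :
                 (flags & FLAG_IRQS_NOSUPPORT) ? 'X' : '.';
        out[1] = (flags & FLAG_NEED_RESCHED)   ? 'N' : '.';
        out[2] = (hardirq && softirq) ? 'H' :
                 hardirq ? 'h' : softirq ? 's' : '.';
        out[3] = '\0';
}

int main(void)
{
        char buf[4];

        lat_chars(FLAG_IRQS_OFF | FLAG_NEED_RESCHED | FLAG_HARDIRQ, buf);
        printf("%s\n", buf);    /* dNh */
        lat_chars(0, buf);
        printf("%s\n", buf);    /* ... */
        return 0;
}
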
@@ -554,13 +591,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
554 else 591 else
555 ret = trace_seq_putc(s, '.'); 592 ret = trace_seq_putc(s, '.');
556 593
557 if (!ret) 594 return ret;
558 return 0;
559
560 if (entry->lock_depth < 0)
561 return trace_seq_putc(s, '.');
562
563 return trace_seq_printf(s, "%d", entry->lock_depth);
564} 595}
565 596
566static int 597static int
@@ -826,6 +857,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
826enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, 857enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
827 struct trace_event *event) 858 struct trace_event *event)
828{ 859{
860 if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type))
861 return TRACE_TYPE_PARTIAL_LINE;
862
829 return TRACE_TYPE_HANDLED; 863 return TRACE_TYPE_HANDLED;
830} 864}
831 865
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2547d8813cf0..1f06468a10d7 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -32,7 +32,7 @@ static DEFINE_MUTEX(btrace_mutex);
32 32
33struct trace_bprintk_fmt { 33struct trace_bprintk_fmt {
34 struct list_head list; 34 struct list_head list;
35 char fmt[0]; 35 const char *fmt;
36}; 36};
37 37
38static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) 38static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
@@ -49,6 +49,7 @@ static
49void hold_module_trace_bprintk_format(const char **start, const char **end) 49void hold_module_trace_bprintk_format(const char **start, const char **end)
50{ 50{
51 const char **iter; 51 const char **iter;
52 char *fmt;
52 53
53 mutex_lock(&btrace_mutex); 54 mutex_lock(&btrace_mutex);
54 for (iter = start; iter < end; iter++) { 55 for (iter = start; iter < end; iter++) {
@@ -58,14 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
58 continue; 59 continue;
59 } 60 }
60 61
61 tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) 62 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
62 + strlen(*iter) + 1, GFP_KERNEL); 63 if (tb_fmt)
63 if (tb_fmt) { 64 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
65 if (tb_fmt && fmt) {
64 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); 66 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
65 strcpy(tb_fmt->fmt, *iter); 67 strcpy(fmt, *iter);
68 tb_fmt->fmt = fmt;
66 *iter = tb_fmt->fmt; 69 *iter = tb_fmt->fmt;
67 } else 70 } else {
71 kfree(tb_fmt);
68 *iter = NULL; 72 *iter = NULL;
73 }
69 } 74 }
70 mutex_unlock(&btrace_mutex); 75 mutex_unlock(&btrace_mutex);
71} 76}
@@ -84,6 +89,76 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self,
84 return 0; 89 return 0;
85} 90}
86 91
92/*
93 * The debugfs/tracing/printk_formats file maps the addresses with
94 * the ASCII formats that are used in the bprintk events in the
95 * buffer. For userspace tools to be able to decode the events from
96 * the buffer, they need to be able to map the address with the format.
97 *
98 * The addresses of the bprintk formats are in their own section
 99 * __trace_printk_fmt. But for modules we copy them into a linked list.
100 * The code to print the formats and their addresses passes around the
101 * address of the fmt string. If the fmt address passed into the seq
102 * functions is within the kernel core __trace_printk_fmt section, then
103 * it simply uses the next pointer in the list.
104 *
105 * When the fmt pointer is outside the kernel core __trace_printk_fmt
 106 * section, then we need to read the linked list pointers. The trick is
107 * we pass the address of the string to the seq function just like
108 * we do for the kernel core formats. To get back the structure that
 109 * holds the format, we simply use container_of() and then go to the
110 * next format in the list.
111 */
112static const char **
113find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
114{
115 struct trace_bprintk_fmt *mod_fmt;
116
117 if (list_empty(&trace_bprintk_fmt_list))
118 return NULL;
119
120 /*
121 * v will point to the address of the fmt record from t_next
122 * v will be NULL from t_start.
123 * If this is the first pointer or called from start
124 * then we need to walk the list.
125 */
126 if (!v || start_index == *pos) {
127 struct trace_bprintk_fmt *p;
128
129 /* search the module list */
130 list_for_each_entry(p, &trace_bprintk_fmt_list, list) {
131 if (start_index == *pos)
132 return &p->fmt;
133 start_index++;
134 }
135 /* pos > index */
136 return NULL;
137 }
138
139 /*
140 * v points to the address of the fmt field in the mod list
141 * structure that holds the module print format.
142 */
143 mod_fmt = container_of(v, typeof(*mod_fmt), fmt);
144 if (mod_fmt->list.next == &trace_bprintk_fmt_list)
145 return NULL;
146
147 mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list);
148
149 return &mod_fmt->fmt;
150}
151
152static void format_mod_start(void)
153{
154 mutex_lock(&btrace_mutex);
155}
156
157static void format_mod_stop(void)
158{
159 mutex_unlock(&btrace_mutex);
160}
161
87#else /* !CONFIG_MODULES */ 162#else /* !CONFIG_MODULES */
88__init static int 163__init static int
89module_trace_bprintk_format_notify(struct notifier_block *self, 164module_trace_bprintk_format_notify(struct notifier_block *self,
@@ -91,6 +166,13 @@ module_trace_bprintk_format_notify(struct notifier_block *self,
91{ 166{
92 return 0; 167 return 0;
93} 168}
169static inline const char **
170find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
171{
172 return NULL;
173}
174static inline void format_mod_start(void) { }
175static inline void format_mod_stop(void) { }
94#endif /* CONFIG_MODULES */ 176#endif /* CONFIG_MODULES */
95 177
96 178
@@ -153,20 +235,30 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
153} 235}
154EXPORT_SYMBOL_GPL(__ftrace_vprintk); 236EXPORT_SYMBOL_GPL(__ftrace_vprintk);
155 237
238static const char **find_next(void *v, loff_t *pos)
239{
240 const char **fmt = v;
241 int start_index;
242
243 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
244
245 if (*pos < start_index)
246 return __start___trace_bprintk_fmt + *pos;
247
248 return find_next_mod_format(start_index, v, fmt, pos);
249}
250
156static void * 251static void *
157t_start(struct seq_file *m, loff_t *pos) 252t_start(struct seq_file *m, loff_t *pos)
158{ 253{
159 const char **fmt = __start___trace_bprintk_fmt + *pos; 254 format_mod_start();
160 255 return find_next(NULL, pos);
161 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
162 return NULL;
163 return fmt;
164} 256}
165 257
166static void *t_next(struct seq_file *m, void * v, loff_t *pos) 258static void *t_next(struct seq_file *m, void * v, loff_t *pos)
167{ 259{
168 (*pos)++; 260 (*pos)++;
169 return t_start(m, pos); 261 return find_next(v, pos);
170} 262}
171 263
172static int t_show(struct seq_file *m, void *v) 264static int t_show(struct seq_file *m, void *v)
@@ -205,6 +297,7 @@ static int t_show(struct seq_file *m, void *v)
205 297
206static void t_stop(struct seq_file *m, void *p) 298static void t_stop(struct seq_file *m, void *p)
207{ 299{
300 format_mod_stop();
208} 301}
209 302
210static const struct seq_operations show_format_seq_ops = { 303static const struct seq_operations show_format_seq_ops = {
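
t_start()/t_next() now funnel through find_next(): positions inside the core __trace_printk_fmt section are served straight from the section, larger positions fall through to the module list walk, and btrace_mutex is held from t_start() to t_stop(). A minimal user-space model of that two-phase lookup (a plain array and a hand-rolled list in place of the section and trace_bprintk_fmt_list):

#include <stdio.h>
#include <stddef.h>

struct mod_fmt { const char *fmt; struct mod_fmt *next; };

static const char *core_fmts[] = { "core fmt A %d\n", "core fmt B %s\n" };
static struct mod_fmt mod_b = { "module fmt Y %u\n", NULL };
static struct mod_fmt mod_a = { "module fmt X %lu\n", &mod_b };
static struct mod_fmt *mod_list = &mod_a;

/* Return the format at position pos: first the built-in array, then the
 * module list -- the same two-phase walk the printk_formats file performs. */
static const char *find_fmt(size_t pos)
{
        size_t ncore = sizeof(core_fmts) / sizeof(core_fmts[0]);
        struct mod_fmt *m;

        if (pos < ncore)
                return core_fmts[pos];

        pos -= ncore;
        for (m = mod_list; m; m = m->next)
                if (pos-- == 0)
                        return m->fmt;
        return NULL;
}

int main(void)
{
        const char *fmt;
        size_t pos;

        for (pos = 0; (fmt = find_fmt(pos)); pos++)
                printf("%p : %s", (const void *)fmt, fmt);
        return 0;
}
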
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8f758d070c43..7e62c0a18456 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
247 ctx_trace = tr; 247 ctx_trace = tr;
248} 248}
249 249
250static void stop_sched_trace(struct trace_array *tr)
251{
252 tracing_stop_sched_switch_record();
253}
254
255static int sched_switch_trace_init(struct trace_array *tr)
256{
257 ctx_trace = tr;
258 tracing_reset_online_cpus(tr);
259 tracing_start_sched_switch_record();
260 return 0;
261}
262
263static void sched_switch_trace_reset(struct trace_array *tr)
264{
265 if (sched_ref)
266 stop_sched_trace(tr);
267}
268
269static void sched_switch_trace_start(struct trace_array *tr)
270{
271 sched_stopped = 0;
272}
273
274static void sched_switch_trace_stop(struct trace_array *tr)
275{
276 sched_stopped = 1;
277}
278
279static struct tracer sched_switch_trace __read_mostly =
280{
281 .name = "sched_switch",
282 .init = sched_switch_trace_init,
283 .reset = sched_switch_trace_reset,
284 .start = sched_switch_trace_start,
285 .stop = sched_switch_trace_stop,
286 .wait_pipe = poll_wait_pipe,
287#ifdef CONFIG_FTRACE_SELFTEST
288 .selftest = trace_selftest_startup_sched_switch,
289#endif
290};
291
292__init static int init_sched_switch_trace(void)
293{
294 return register_tracer(&sched_switch_trace);
295}
296device_initcall(init_sched_switch_trace);
297
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 4086eae6e81b..f029dd4fd2ca 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -31,57 +31,258 @@ static int wakeup_rt;
31static arch_spinlock_t wakeup_lock = 31static arch_spinlock_t wakeup_lock =
32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
33 33
34static void wakeup_reset(struct trace_array *tr);
34static void __wakeup_reset(struct trace_array *tr); 35static void __wakeup_reset(struct trace_array *tr);
36static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
37static void wakeup_graph_return(struct ftrace_graph_ret *trace);
35 38
36static int save_lat_flag; 39static int save_lat_flag;
37 40
41#define TRACE_DISPLAY_GRAPH 1
42
43static struct tracer_opt trace_opts[] = {
44#ifdef CONFIG_FUNCTION_GRAPH_TRACER
45 /* display latency trace as call graph */
46 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
47#endif
48 { } /* Empty entry */
49};
50
51static struct tracer_flags tracer_flags = {
52 .val = 0,
53 .opts = trace_opts,
54};
55
56#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
57
38#ifdef CONFIG_FUNCTION_TRACER 58#ifdef CONFIG_FUNCTION_TRACER
59
39/* 60/*
40 * irqsoff uses its own tracer function to keep the overhead down: 61 * Prologue for the wakeup function tracers.
62 *
63 * Returns 1 if it is OK to continue, and preemption
64 * is disabled and data->disabled is incremented.
65 * 0 if the trace is to be ignored, and preemption
66 * is not disabled and data->disabled is
67 * kept the same.
68 *
69 * Note, this function is also used outside this ifdef but
70 * inside the #ifdef of the function graph tracer below.
71 * This is OK, since the function graph tracer is
72 * dependent on the function tracer.
41 */ 73 */
42static void 74static int
43wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) 75func_prolog_preempt_disable(struct trace_array *tr,
76 struct trace_array_cpu **data,
77 int *pc)
44{ 78{
45 struct trace_array *tr = wakeup_trace;
46 struct trace_array_cpu *data;
47 unsigned long flags;
48 long disabled; 79 long disabled;
49 int cpu; 80 int cpu;
50 int pc;
51 81
52 if (likely(!wakeup_task)) 82 if (likely(!wakeup_task))
53 return; 83 return 0;
54 84
55 pc = preempt_count(); 85 *pc = preempt_count();
56 preempt_disable_notrace(); 86 preempt_disable_notrace();
57 87
58 cpu = raw_smp_processor_id(); 88 cpu = raw_smp_processor_id();
59 if (cpu != wakeup_current_cpu) 89 if (cpu != wakeup_current_cpu)
60 goto out_enable; 90 goto out_enable;
61 91
62 data = tr->data[cpu]; 92 *data = tr->data[cpu];
63 disabled = atomic_inc_return(&data->disabled); 93 disabled = atomic_inc_return(&(*data)->disabled);
64 if (unlikely(disabled != 1)) 94 if (unlikely(disabled != 1))
65 goto out; 95 goto out;
66 96
67 local_irq_save(flags); 97 return 1;
68 98
69 trace_function(tr, ip, parent_ip, flags, pc); 99out:
100 atomic_dec(&(*data)->disabled);
101
102out_enable:
103 preempt_enable_notrace();
104 return 0;
105}
70 106
107/*
108 * wakeup uses its own tracer function to keep the overhead down:
109 */
110static void
111wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
112{
113 struct trace_array *tr = wakeup_trace;
114 struct trace_array_cpu *data;
115 unsigned long flags;
116 int pc;
117
118 if (!func_prolog_preempt_disable(tr, &data, &pc))
119 return;
120
121 local_irq_save(flags);
122 trace_function(tr, ip, parent_ip, flags, pc);
71 local_irq_restore(flags); 123 local_irq_restore(flags);
72 124
73 out:
74 atomic_dec(&data->disabled); 125 atomic_dec(&data->disabled);
75 out_enable:
76 preempt_enable_notrace(); 126 preempt_enable_notrace();
77} 127}
78 128
79static struct ftrace_ops trace_ops __read_mostly = 129static struct ftrace_ops trace_ops __read_mostly =
80{ 130{
81 .func = wakeup_tracer_call, 131 .func = wakeup_tracer_call,
132 .flags = FTRACE_OPS_FL_GLOBAL,
82}; 133};
83#endif /* CONFIG_FUNCTION_TRACER */ 134#endif /* CONFIG_FUNCTION_TRACER */
84 135
136static int start_func_tracer(int graph)
137{
138 int ret;
139
140 if (!graph)
141 ret = register_ftrace_function(&trace_ops);
142 else
143 ret = register_ftrace_graph(&wakeup_graph_return,
144 &wakeup_graph_entry);
145
146 if (!ret && tracing_is_enabled())
147 tracer_enabled = 1;
148 else
149 tracer_enabled = 0;
150
151 return ret;
152}
153
154static void stop_func_tracer(int graph)
155{
156 tracer_enabled = 0;
157
158 if (!graph)
159 unregister_ftrace_function(&trace_ops);
160 else
161 unregister_ftrace_graph();
162}
163
164#ifdef CONFIG_FUNCTION_GRAPH_TRACER
165static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
166{
167
168 if (!(bit & TRACE_DISPLAY_GRAPH))
169 return -EINVAL;
170
171 if (!(is_graph() ^ set))
172 return 0;
173
174 stop_func_tracer(!set);
175
176 wakeup_reset(wakeup_trace);
177 tracing_max_latency = 0;
178
179 return start_func_tracer(set);
180}
181
182static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
183{
184 struct trace_array *tr = wakeup_trace;
185 struct trace_array_cpu *data;
186 unsigned long flags;
187 int pc, ret = 0;
188
189 if (!func_prolog_preempt_disable(tr, &data, &pc))
190 return 0;
191
192 local_save_flags(flags);
193 ret = __trace_graph_entry(tr, trace, flags, pc);
194 atomic_dec(&data->disabled);
195 preempt_enable_notrace();
196
197 return ret;
198}
199
200static void wakeup_graph_return(struct ftrace_graph_ret *trace)
201{
202 struct trace_array *tr = wakeup_trace;
203 struct trace_array_cpu *data;
204 unsigned long flags;
205 int pc;
206
207 if (!func_prolog_preempt_disable(tr, &data, &pc))
208 return;
209
210 local_save_flags(flags);
211 __trace_graph_return(tr, trace, flags, pc);
212 atomic_dec(&data->disabled);
213
214 preempt_enable_notrace();
215 return;
216}
217
218static void wakeup_trace_open(struct trace_iterator *iter)
219{
220 if (is_graph())
221 graph_trace_open(iter);
222}
223
224static void wakeup_trace_close(struct trace_iterator *iter)
225{
226 if (iter->private)
227 graph_trace_close(iter);
228}
229
230#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC)
231
232static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
233{
234 /*
235 * In graph mode call the graph tracer output function,
236 * otherwise go with the TRACE_FN event handler
237 */
238 if (is_graph())
239 return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
240
241 return TRACE_TYPE_UNHANDLED;
242}
243
244static void wakeup_print_header(struct seq_file *s)
245{
246 if (is_graph())
247 print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
248 else
249 trace_default_header(s);
250}
251
252static void
253__trace_function(struct trace_array *tr,
254 unsigned long ip, unsigned long parent_ip,
255 unsigned long flags, int pc)
256{
257 if (is_graph())
258 trace_graph_function(tr, ip, parent_ip, flags, pc);
259 else
260 trace_function(tr, ip, parent_ip, flags, pc);
261}
262#else
263#define __trace_function trace_function
264
265static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
266{
267 return -EINVAL;
268}
269
270static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
271{
272 return -1;
273}
274
275static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
276{
277 return TRACE_TYPE_UNHANDLED;
278}
279
280static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
281static void wakeup_print_header(struct seq_file *s) { }
282static void wakeup_trace_open(struct trace_iterator *iter) { }
283static void wakeup_trace_close(struct trace_iterator *iter) { }
284#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
285
85/* 286/*
86 * Should this new latency be reported/recorded? 287 * Should this new latency be reported/recorded?
87 */ 288 */
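
wakeup_set_flag() uses is_graph() ^ set to notice that the requested display mode already matches the current one and bail out early; only a real change stops one tracer flavour and starts the other. A small sketch of that toggle logic (printf standing in for the register/unregister calls):

#include <stdio.h>

static int graph_enabled;       /* stands in for is_graph() */

/* Flip between "function" and "graph" output only when the requested state
 * actually differs from the current one -- the is_graph() ^ set test. */
static int set_display_graph(int set)
{
        if (!(graph_enabled ^ set))
                return 0;                       /* already in the requested mode */

        printf("stop %s tracer\n",  graph_enabled ? "graph" : "function");
        graph_enabled = set;
        printf("start %s tracer\n", graph_enabled ? "graph" : "function");
        return 1;
}

int main(void)
{
        set_display_graph(1);   /* switches to graph */
        set_display_graph(1);   /* no-op             */
        set_display_graph(0);   /* back to function  */
        return 0;
}
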
@@ -152,7 +353,7 @@ probe_wakeup_sched_switch(void *ignore,
152 /* The task we are waiting for is waking up */ 353 /* The task we are waiting for is waking up */
153 data = wakeup_trace->data[wakeup_cpu]; 354 data = wakeup_trace->data[wakeup_cpu];
154 355
155 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 356 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
156 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 357 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
157 358
158 T0 = data->preempt_timestamp; 359 T0 = data->preempt_timestamp;
@@ -252,7 +453,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
252 * is not called by an assembly function (where as schedule is) 453 * is not called by an assembly function (where as schedule is)
253 * it should be safe to use it here. 454 * it should be safe to use it here.
254 */ 455 */
255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 456 __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
256 457
257out_locked: 458out_locked:
258 arch_spin_unlock(&wakeup_lock); 459 arch_spin_unlock(&wakeup_lock);
@@ -303,12 +504,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
303 */ 504 */
304 smp_wmb(); 505 smp_wmb();
305 506
306 register_ftrace_function(&trace_ops); 507 if (start_func_tracer(is_graph()))
307 508 printk(KERN_ERR "failed to start wakeup tracer\n");
308 if (tracing_is_enabled())
309 tracer_enabled = 1;
310 else
311 tracer_enabled = 0;
312 509
313 return; 510 return;
314fail_deprobe_wake_new: 511fail_deprobe_wake_new:
@@ -320,7 +517,7 @@ fail_deprobe:
320static void stop_wakeup_tracer(struct trace_array *tr) 517static void stop_wakeup_tracer(struct trace_array *tr)
321{ 518{
322 tracer_enabled = 0; 519 tracer_enabled = 0;
323 unregister_ftrace_function(&trace_ops); 520 stop_func_tracer(is_graph());
324 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); 521 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
325 unregister_trace_sched_wakeup_new(probe_wakeup, NULL); 522 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
326 unregister_trace_sched_wakeup(probe_wakeup, NULL); 523 unregister_trace_sched_wakeup(probe_wakeup, NULL);
@@ -379,9 +576,15 @@ static struct tracer wakeup_tracer __read_mostly =
379 .start = wakeup_tracer_start, 576 .start = wakeup_tracer_start,
380 .stop = wakeup_tracer_stop, 577 .stop = wakeup_tracer_stop,
381 .print_max = 1, 578 .print_max = 1,
579 .print_header = wakeup_print_header,
580 .print_line = wakeup_print_line,
581 .flags = &tracer_flags,
582 .set_flag = wakeup_set_flag,
382#ifdef CONFIG_FTRACE_SELFTEST 583#ifdef CONFIG_FTRACE_SELFTEST
383 .selftest = trace_selftest_startup_wakeup, 584 .selftest = trace_selftest_startup_wakeup,
384#endif 585#endif
586 .open = wakeup_trace_open,
587 .close = wakeup_trace_close,
385 .use_max_tr = 1, 588 .use_max_tr = 1,
386}; 589};
387 590
@@ -394,9 +597,15 @@ static struct tracer wakeup_rt_tracer __read_mostly =
394 .stop = wakeup_tracer_stop, 597 .stop = wakeup_tracer_stop,
395 .wait_pipe = poll_wait_pipe, 598 .wait_pipe = poll_wait_pipe,
396 .print_max = 1, 599 .print_max = 1,
600 .print_header = wakeup_print_header,
601 .print_line = wakeup_print_line,
602 .flags = &tracer_flags,
603 .set_flag = wakeup_set_flag,
397#ifdef CONFIG_FTRACE_SELFTEST 604#ifdef CONFIG_FTRACE_SELFTEST
398 .selftest = trace_selftest_startup_wakeup, 605 .selftest = trace_selftest_startup_wakeup,
399#endif 606#endif
607 .open = wakeup_trace_open,
608 .close = wakeup_trace_close,
400 .use_max_tr = 1, 609 .use_max_tr = 1,
401}; 610};
402 611
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 155a415b3209..288541f977fb 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -101,6 +101,206 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
101 101
102#ifdef CONFIG_DYNAMIC_FTRACE 102#ifdef CONFIG_DYNAMIC_FTRACE
103 103
104static int trace_selftest_test_probe1_cnt;
105static void trace_selftest_test_probe1_func(unsigned long ip,
106 unsigned long pip)
107{
108 trace_selftest_test_probe1_cnt++;
109}
110
111static int trace_selftest_test_probe2_cnt;
112static void trace_selftest_test_probe2_func(unsigned long ip,
113 unsigned long pip)
114{
115 trace_selftest_test_probe2_cnt++;
116}
117
118static int trace_selftest_test_probe3_cnt;
119static void trace_selftest_test_probe3_func(unsigned long ip,
120 unsigned long pip)
121{
122 trace_selftest_test_probe3_cnt++;
123}
124
125static int trace_selftest_test_global_cnt;
126static void trace_selftest_test_global_func(unsigned long ip,
127 unsigned long pip)
128{
129 trace_selftest_test_global_cnt++;
130}
131
132static int trace_selftest_test_dyn_cnt;
133static void trace_selftest_test_dyn_func(unsigned long ip,
134 unsigned long pip)
135{
136 trace_selftest_test_dyn_cnt++;
137}
138
139static struct ftrace_ops test_probe1 = {
140 .func = trace_selftest_test_probe1_func,
141};
142
143static struct ftrace_ops test_probe2 = {
144 .func = trace_selftest_test_probe2_func,
145};
146
147static struct ftrace_ops test_probe3 = {
148 .func = trace_selftest_test_probe3_func,
149};
150
151static struct ftrace_ops test_global = {
152 .func = trace_selftest_test_global_func,
153 .flags = FTRACE_OPS_FL_GLOBAL,
154};
155
156static void print_counts(void)
157{
158 printk("(%d %d %d %d %d) ",
159 trace_selftest_test_probe1_cnt,
160 trace_selftest_test_probe2_cnt,
161 trace_selftest_test_probe3_cnt,
162 trace_selftest_test_global_cnt,
163 trace_selftest_test_dyn_cnt);
164}
165
166static void reset_counts(void)
167{
168 trace_selftest_test_probe1_cnt = 0;
169 trace_selftest_test_probe2_cnt = 0;
170 trace_selftest_test_probe3_cnt = 0;
171 trace_selftest_test_global_cnt = 0;
172 trace_selftest_test_dyn_cnt = 0;
173}
174
175static int trace_selftest_ops(int cnt)
176{
177 int save_ftrace_enabled = ftrace_enabled;
178 struct ftrace_ops *dyn_ops;
179 char *func1_name;
180 char *func2_name;
181 int len1;
182 int len2;
183 int ret = -1;
184
185 printk(KERN_CONT "PASSED\n");
186 pr_info("Testing dynamic ftrace ops #%d: ", cnt);
187
188 ftrace_enabled = 1;
189 reset_counts();
190
191 /* Handle PPC64 '.' name */
192 func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
193 func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2);
194 len1 = strlen(func1_name);
195 len2 = strlen(func2_name);
196
197 /*
198 * Probe 1 will trace function 1.
199 * Probe 2 will trace function 2.
200 * Probe 3 will trace functions 1 and 2.
201 */
202 ftrace_set_filter(&test_probe1, func1_name, len1, 1);
203 ftrace_set_filter(&test_probe2, func2_name, len2, 1);
204 ftrace_set_filter(&test_probe3, func1_name, len1, 1);
205 ftrace_set_filter(&test_probe3, func2_name, len2, 0);
206
207 register_ftrace_function(&test_probe1);
208 register_ftrace_function(&test_probe2);
209 register_ftrace_function(&test_probe3);
210 register_ftrace_function(&test_global);
211
212 DYN_FTRACE_TEST_NAME();
213
214 print_counts();
215
216 if (trace_selftest_test_probe1_cnt != 1)
217 goto out;
218 if (trace_selftest_test_probe2_cnt != 0)
219 goto out;
220 if (trace_selftest_test_probe3_cnt != 1)
221 goto out;
222 if (trace_selftest_test_global_cnt == 0)
223 goto out;
224
225 DYN_FTRACE_TEST_NAME2();
226
227 print_counts();
228
229 if (trace_selftest_test_probe1_cnt != 1)
230 goto out;
231 if (trace_selftest_test_probe2_cnt != 1)
232 goto out;
233 if (trace_selftest_test_probe3_cnt != 2)
234 goto out;
235
236 /* Add a dynamic probe */
237 dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL);
238 if (!dyn_ops) {
239 printk("MEMORY ERROR ");
240 goto out;
241 }
242
243 dyn_ops->func = trace_selftest_test_dyn_func;
244
245 register_ftrace_function(dyn_ops);
246
247 trace_selftest_test_global_cnt = 0;
248
249 DYN_FTRACE_TEST_NAME();
250
251 print_counts();
252
253 if (trace_selftest_test_probe1_cnt != 2)
254 goto out_free;
255 if (trace_selftest_test_probe2_cnt != 1)
256 goto out_free;
257 if (trace_selftest_test_probe3_cnt != 3)
258 goto out_free;
259 if (trace_selftest_test_global_cnt == 0)
260 goto out;
261 if (trace_selftest_test_dyn_cnt == 0)
262 goto out_free;
263
264 DYN_FTRACE_TEST_NAME2();
265
266 print_counts();
267
268 if (trace_selftest_test_probe1_cnt != 2)
269 goto out_free;
270 if (trace_selftest_test_probe2_cnt != 2)
271 goto out_free;
272 if (trace_selftest_test_probe3_cnt != 4)
273 goto out_free;
274
275 ret = 0;
276 out_free:
277 unregister_ftrace_function(dyn_ops);
278 kfree(dyn_ops);
279
280 out:
281 /* Purposely unregister in the same order */
282 unregister_ftrace_function(&test_probe1);
283 unregister_ftrace_function(&test_probe2);
284 unregister_ftrace_function(&test_probe3);
285 unregister_ftrace_function(&test_global);
286
287 /* Make sure everything is off */
288 reset_counts();
289 DYN_FTRACE_TEST_NAME();
290 DYN_FTRACE_TEST_NAME();
291
292 if (trace_selftest_test_probe1_cnt ||
293 trace_selftest_test_probe2_cnt ||
294 trace_selftest_test_probe3_cnt ||
295 trace_selftest_test_global_cnt ||
296 trace_selftest_test_dyn_cnt)
297 ret = -1;
298
299 ftrace_enabled = save_ftrace_enabled;
300
301 return ret;
302}
303
104/* Test dynamic code modification and ftrace filters */ 304/* Test dynamic code modification and ftrace filters */
105int trace_selftest_startup_dynamic_tracing(struct tracer *trace, 305int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
106 struct trace_array *tr, 306 struct trace_array *tr,
@@ -131,7 +331,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
131 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); 331 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
132 332
133 /* filter only on our function */ 333 /* filter only on our function */
134 ftrace_set_filter(func_name, strlen(func_name), 1); 334 ftrace_set_global_filter(func_name, strlen(func_name), 1);
135 335
136 /* enable tracing */ 336 /* enable tracing */
137 ret = tracer_init(trace, tr); 337 ret = tracer_init(trace, tr);
@@ -166,22 +366,30 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
166 366
167 /* check the trace buffer */ 367 /* check the trace buffer */
168 ret = trace_test_buffer(tr, &count); 368 ret = trace_test_buffer(tr, &count);
169 trace->reset(tr);
170 tracing_start(); 369 tracing_start();
171 370
172 /* we should only have one item */ 371 /* we should only have one item */
173 if (!ret && count != 1) { 372 if (!ret && count != 1) {
373 trace->reset(tr);
174 printk(KERN_CONT ".. filter failed count=%ld ..", count); 374 printk(KERN_CONT ".. filter failed count=%ld ..", count);
175 ret = -1; 375 ret = -1;
176 goto out; 376 goto out;
177 } 377 }
178 378
379 /* Test the ops with global tracing running */
380 ret = trace_selftest_ops(1);
381 trace->reset(tr);
382
179 out: 383 out:
180 ftrace_enabled = save_ftrace_enabled; 384 ftrace_enabled = save_ftrace_enabled;
181 tracer_enabled = save_tracer_enabled; 385 tracer_enabled = save_tracer_enabled;
182 386
183 /* Enable tracing on all functions again */ 387 /* Enable tracing on all functions again */
184 ftrace_set_filter(NULL, 0, 1); 388 ftrace_set_global_filter(NULL, 0, 1);
389
390 /* Test the ops with global tracing off */
391 if (!ret)
392 ret = trace_selftest_ops(2);
185 393
186 return ret; 394 return ret;
187} 395}
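
trace_selftest_ops() relies on each ftrace_ops now carrying its own filter: probe1 matches only the first test function, probe2 only the second, probe3 both, and the counters must land exactly on those expectations. A user-space model of per-callback filtering (arrays of names in place of ftrace filter hashes):

#include <stdio.h>
#include <string.h>

struct probe {
        const char *name;
        const char *filter[2];  /* functions this probe accepts */
        int         hits;
};

static struct probe probes[] = {
        { "probe1", { "func1", NULL    }, 0 },
        { "probe2", { "func2", NULL    }, 0 },
        { "probe3", { "func1", "func2" }, 0 },
};

/* Deliver one "function entry" event to every probe whose filter matches,
 * the way ftrace dispatches to ops with per-ops filters. */
static void call_traced(const char *func)
{
        size_t i, j;

        for (i = 0; i < sizeof(probes) / sizeof(probes[0]); i++)
                for (j = 0; j < 2 && probes[i].filter[j]; j++)
                        if (!strcmp(probes[i].filter[j], func))
                                probes[i].hits++;
}

int main(void)
{
        size_t i;

        call_traced("func1");
        call_traced("func2");
        for (i = 0; i < sizeof(probes) / sizeof(probes[0]); i++)
                printf("%s: %d\n", probes[i].name, probes[i].hits);
        /* probe1: 1, probe2: 1, probe3: 2 -- the counts the selftest expects */
        return 0;
}
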
@@ -558,7 +766,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
558static int trace_wakeup_test_thread(void *data) 766static int trace_wakeup_test_thread(void *data)
559{ 767{
560 /* Make this a RT thread, doesn't need to be too high */ 768 /* Make this a RT thread, doesn't need to be too high */
561 struct sched_param param = { .sched_priority = 5 }; 769 static const struct sched_param param = { .sched_priority = 5 };
562 struct completion *x = data; 770 struct completion *x = data;
563 771
564 sched_setscheduler(current, SCHED_FIFO, &param); 772 sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
index 54dd77cce5bf..b4c475a0a48b 100644
--- a/kernel/trace/trace_selftest_dynamic.c
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -5,3 +5,9 @@ int DYN_FTRACE_TEST_NAME(void)
5 /* used to call mcount */ 5 /* used to call mcount */
6 return 0; 6 return 0;
7} 7}
8
9int DYN_FTRACE_TEST_NAME2(void)
10{
11 /* used to call mcount */
12 return 0;
13}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index a6b7e0e0f3eb..b0b53b8e4c25 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -133,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
133static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
134{ 134{
135 .func = stack_trace_call, 135 .func = stack_trace_call,
136 .flags = FTRACE_OPS_FL_GLOBAL,
136}; 137};
137 138
138static ssize_t 139static ssize_t
@@ -195,6 +196,7 @@ static const struct file_operations stack_max_size_fops = {
195 .open = tracing_open_generic, 196 .open = tracing_open_generic,
196 .read = stack_max_size_read, 197 .read = stack_max_size_read,
197 .write = stack_max_size_write, 198 .write = stack_max_size_write,
199 .llseek = default_llseek,
198}; 200};
199 201
200static void * 202static void *
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index bac752f0cfb5..ee7b5a0bb9f8 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event,
23static int syscall_enter_define_fields(struct ftrace_event_call *call); 23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call); 24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25 25
26/* All syscall exit events have the same fields */
27static LIST_HEAD(syscall_exit_fields);
28
29static struct list_head * 26static struct list_head *
30syscall_get_enter_fields(struct ftrace_event_call *call) 27syscall_get_enter_fields(struct ftrace_event_call *call)
31{ 28{
@@ -34,61 +31,66 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
34 return &entry->enter_fields; 31 return &entry->enter_fields;
35} 32}
36 33
37static struct list_head *
38syscall_get_exit_fields(struct ftrace_event_call *call)
39{
40 return &syscall_exit_fields;
41}
42
43struct trace_event_functions enter_syscall_print_funcs = { 34struct trace_event_functions enter_syscall_print_funcs = {
44 .trace = print_syscall_enter, 35 .trace = print_syscall_enter,
45}; 36};
46 37
47struct trace_event_functions exit_syscall_print_funcs = { 38struct trace_event_functions exit_syscall_print_funcs = {
48 .trace = print_syscall_exit, 39 .trace = print_syscall_exit,
49}; 40};
50 41
51struct ftrace_event_class event_class_syscall_enter = { 42struct ftrace_event_class event_class_syscall_enter = {
52 .system = "syscalls", 43 .system = "syscalls",
53 .reg = syscall_enter_register, 44 .reg = syscall_enter_register,
54 .define_fields = syscall_enter_define_fields, 45 .define_fields = syscall_enter_define_fields,
55 .get_fields = syscall_get_enter_fields, 46 .get_fields = syscall_get_enter_fields,
56 .raw_init = init_syscall_trace, 47 .raw_init = init_syscall_trace,
57}; 48};
58 49
59struct ftrace_event_class event_class_syscall_exit = { 50struct ftrace_event_class event_class_syscall_exit = {
60 .system = "syscalls", 51 .system = "syscalls",
61 .reg = syscall_exit_register, 52 .reg = syscall_exit_register,
62 .define_fields = syscall_exit_define_fields, 53 .define_fields = syscall_exit_define_fields,
63 .get_fields = syscall_get_exit_fields, 54 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
64 .raw_init = init_syscall_trace, 55 .raw_init = init_syscall_trace,
65}; 56};
66 57
67extern unsigned long __start_syscalls_metadata[]; 58extern struct syscall_metadata *__start_syscalls_metadata[];
68extern unsigned long __stop_syscalls_metadata[]; 59extern struct syscall_metadata *__stop_syscalls_metadata[];
69 60
70static struct syscall_metadata **syscalls_metadata; 61static struct syscall_metadata **syscalls_metadata;
71 62
72static struct syscall_metadata *find_syscall_meta(unsigned long syscall) 63#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
64static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
65{
66 /*
67 * Only compare after the "sys" prefix. Archs that use
 68 * syscall wrappers may have syscall symbol aliases prefixed
69 * with "SyS" instead of "sys", leading to an unwanted
70 * mismatch.
71 */
72 return !strcmp(sym + 3, name + 3);
73}
74#endif
75
76static __init struct syscall_metadata *
77find_syscall_meta(unsigned long syscall)
73{ 78{
74 struct syscall_metadata *start; 79 struct syscall_metadata **start;
75 struct syscall_metadata *stop; 80 struct syscall_metadata **stop;
76 char str[KSYM_SYMBOL_LEN]; 81 char str[KSYM_SYMBOL_LEN];
77 82
78 83
79 start = (struct syscall_metadata *)__start_syscalls_metadata; 84 start = __start_syscalls_metadata;
80 stop = (struct syscall_metadata *)__stop_syscalls_metadata; 85 stop = __stop_syscalls_metadata;
81 kallsyms_lookup(syscall, NULL, NULL, NULL, str); 86 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
82 87
88 if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
89 return NULL;
90
83 for ( ; start < stop; start++) { 91 for ( ; start < stop; start++) {
84 /* 92 if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
85 * Only compare after the "sys" prefix. Archs that use 93 return *start;
86 * syscall wrappers may have syscalls symbols aliases prefixed
87 * with "SyS" instead of "sys", leading to an unwanted
88 * mismatch.
89 */
90 if (start->name && !strcmp(start->name + 3, str + 3))
91 return start;
92 } 94 }
93 return NULL; 95 return NULL;
94} 96}
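
arch_syscall_match_sym_name() skips the first three characters on both sides so that wrapper aliases spelled "SyS_*" still match the "sys_*" metadata name. A one-function sketch of that comparison:

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

/* Compare only after the 3-byte prefix so that a "SyS_read" wrapper alias
 * still matches the metadata name "sys_read". */
static bool syscall_match_sym_name(const char *sym, const char *name)
{
        return strcmp(sym + 3, name + 3) == 0;
}

int main(void)
{
        printf("%d\n", syscall_match_sym_name("SyS_read", "sys_read"));   /* 1 */
        printf("%d\n", syscall_match_sym_name("sys_read", "sys_write"));  /* 0 */
        return 0;
}
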
@@ -367,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
367 int num; 369 int num;
368 370
369 num = ((struct syscall_metadata *)call->data)->syscall_nr; 371 num = ((struct syscall_metadata *)call->data)->syscall_nr;
370 if (num < 0 || num >= NR_syscalls) 372 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
371 return -ENOSYS; 373 return -ENOSYS;
372 mutex_lock(&syscall_trace_lock); 374 mutex_lock(&syscall_trace_lock);
373 if (!sys_refcount_enter) 375 if (!sys_refcount_enter)
@@ -385,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
385 int num; 387 int num;
386 388
387 num = ((struct syscall_metadata *)call->data)->syscall_nr; 389 num = ((struct syscall_metadata *)call->data)->syscall_nr;
388 if (num < 0 || num >= NR_syscalls) 390 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
389 return; 391 return;
390 mutex_lock(&syscall_trace_lock); 392 mutex_lock(&syscall_trace_lock);
391 sys_refcount_enter--; 393 sys_refcount_enter--;
@@ -401,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
401 int num; 403 int num;
402 404
403 num = ((struct syscall_metadata *)call->data)->syscall_nr; 405 num = ((struct syscall_metadata *)call->data)->syscall_nr;
404 if (num < 0 || num >= NR_syscalls) 406 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
405 return -ENOSYS; 407 return -ENOSYS;
406 mutex_lock(&syscall_trace_lock); 408 mutex_lock(&syscall_trace_lock);
407 if (!sys_refcount_exit) 409 if (!sys_refcount_exit)
@@ -419,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
419 int num; 421 int num;
420 422
421 num = ((struct syscall_metadata *)call->data)->syscall_nr; 423 num = ((struct syscall_metadata *)call->data)->syscall_nr;
422 if (num < 0 || num >= NR_syscalls) 424 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
423 return; 425 return;
424 mutex_lock(&syscall_trace_lock); 426 mutex_lock(&syscall_trace_lock);
425 sys_refcount_exit--; 427 sys_refcount_exit--;
@@ -432,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
432int init_syscall_trace(struct ftrace_event_call *call) 434int init_syscall_trace(struct ftrace_event_call *call)
433{ 435{
434 int id; 436 int id;
437 int num;
438
439 num = ((struct syscall_metadata *)call->data)->syscall_nr;
440 if (num < 0 || num >= NR_syscalls) {
441 pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
442 ((struct syscall_metadata *)call->data)->name);
443 return -ENOSYS;
444 }
435 445
436 if (set_syscall_print_fmt(call) < 0) 446 if (set_syscall_print_fmt(call) < 0)
437 return -ENOMEM; 447 return -ENOMEM;
@@ -446,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call)
446 return id; 456 return id;
447} 457}
448 458
449unsigned long __init arch_syscall_addr(int nr) 459unsigned long __init __weak arch_syscall_addr(int nr)
450{ 460{
451 return (unsigned long)sys_call_table[nr]; 461 return (unsigned long)sys_call_table[nr];
452} 462}
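
Marking arch_syscall_addr() __weak lets an architecture supply its own strong definition while everyone else keeps the generic sys_call_table lookup. A hedged user-space illustration of the weak-symbol mechanism (the values are placeholders):

#include <stdio.h>

/* Default, overridable definition -- another object file can provide a
 * strong version of the same symbol and it will win at link time. */
__attribute__((weak)) long syscall_addr(int nr)
{
        return 0x1000 + nr;     /* placeholder table lookup */
}

int main(void)
{
        printf("%#lx\n", syscall_addr(3));
        return 0;
}
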
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index a7cc3793baf6..209b379a4721 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void)
263{ 263{
264 int ret, cpu; 264 int ret, cpu;
265 265
266 for_each_possible_cpu(cpu) {
267 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
268 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
269 }
270
266 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); 271 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
267 if (ret) 272 if (ret)
268 goto out; 273 goto out;
@@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void)
279 if (ret) 284 if (ret)
280 goto no_creation; 285 goto no_creation;
281 286
282 for_each_possible_cpu(cpu) {
283 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
284 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
285 }
286
287 return 0; 287 return 0;
288 288
289no_creation: 289no_creation:
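
The hunks above move the per-CPU stat initialization ahead of the tracepoint registrations, since a probe may fire as soon as it is registered and would otherwise touch uninitialized locks and lists. A small model of the ordering rule (a plain function pointer standing in for the tracepoint probe):

#include <stdio.h>

struct cpu_stat { int ready; int count; };
static struct cpu_stat stat_of[2];

static void (*probe)(int cpu);  /* installed hook, may fire as soon as it is set */

static void insertion_probe(int cpu)
{
        if (!stat_of[cpu].ready) {
                printf("probe fired before init -- would crash in the kernel\n");
                return;
        }
        stat_of[cpu].count++;
}

int main(void)
{
        int cpu;

        /* Initialize the per-CPU data first ... */
        for (cpu = 0; cpu < 2; cpu++)
                stat_of[cpu].ready = 1;

        /* ... and only then register the probe that uses it. */
        probe = insertion_probe;
        probe(0);
        printf("cpu0 count = %d\n", stat_of[0].count);
        return 0;
}
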
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index c77f3eceea25..b219f1449c54 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,9 +25,10 @@
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/jump_label.h>
28 29
29extern struct tracepoint __start___tracepoints[]; 30extern struct tracepoint * const __start___tracepoints_ptrs[];
30extern struct tracepoint __stop___tracepoints[]; 31extern struct tracepoint * const __stop___tracepoints_ptrs[];
31 32
32/* Set to 1 to enable tracepoint debug output */ 33/* Set to 1 to enable tracepoint debug output */
33static const int tracepoint_debug; 34static const int tracepoint_debug;
@@ -250,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
250{ 251{
251 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 252 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
252 253
253 if (elem->regfunc && !elem->state && active) 254 if (elem->regfunc && !jump_label_enabled(&elem->key) && active)
254 elem->regfunc(); 255 elem->regfunc();
255 else if (elem->unregfunc && elem->state && !active) 256 else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active)
256 elem->unregfunc(); 257 elem->unregfunc();
257 258
258 /* 259 /*
@@ -263,7 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
263 * is used. 264 * is used.
264 */ 265 */
265 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 266 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
266 elem->state = active; 267 if (active && !jump_label_enabled(&elem->key))
268 jump_label_inc(&elem->key);
269 else if (!active && jump_label_enabled(&elem->key))
270 jump_label_dec(&elem->key);
267} 271}
268 272
269/* 273/*
@@ -274,10 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
274 */ 278 */
275static void disable_tracepoint(struct tracepoint *elem) 279static void disable_tracepoint(struct tracepoint *elem)
276{ 280{
277 if (elem->unregfunc && elem->state) 281 if (elem->unregfunc && jump_label_enabled(&elem->key))
278 elem->unregfunc(); 282 elem->unregfunc();
279 283
280 elem->state = 0; 284 if (jump_label_enabled(&elem->key))
285 jump_label_dec(&elem->key);
281 rcu_assign_pointer(elem->funcs, NULL); 286 rcu_assign_pointer(elem->funcs, NULL);
282} 287}
283 288
@@ -288,10 +293,10 @@ static void disable_tracepoint(struct tracepoint *elem)
288 * 293 *
289 * Updates the probe callback corresponding to a range of tracepoints. 294 * Updates the probe callback corresponding to a range of tracepoints.
290 */ 295 */
291void 296void tracepoint_update_probe_range(struct tracepoint * const *begin,
292tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) 297 struct tracepoint * const *end)
293{ 298{
294 struct tracepoint *iter; 299 struct tracepoint * const *iter;
295 struct tracepoint_entry *mark_entry; 300 struct tracepoint_entry *mark_entry;
296 301
297 if (!begin) 302 if (!begin)
@@ -299,12 +304,12 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
299 304
300 mutex_lock(&tracepoints_mutex); 305 mutex_lock(&tracepoints_mutex);
301 for (iter = begin; iter < end; iter++) { 306 for (iter = begin; iter < end; iter++) {
302 mark_entry = get_tracepoint(iter->name); 307 mark_entry = get_tracepoint((*iter)->name);
303 if (mark_entry) { 308 if (mark_entry) {
304 set_tracepoint(&mark_entry, iter, 309 set_tracepoint(&mark_entry, *iter,
305 !!mark_entry->refcount); 310 !!mark_entry->refcount);
306 } else { 311 } else {
307 disable_tracepoint(iter); 312 disable_tracepoint(*iter);
308 } 313 }
309 } 314 }
310 mutex_unlock(&tracepoints_mutex); 315 mutex_unlock(&tracepoints_mutex);
@@ -316,8 +321,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
316static void tracepoint_update_probes(void) 321static void tracepoint_update_probes(void)
317{ 322{
318 /* Core kernel tracepoints */ 323 /* Core kernel tracepoints */
319 tracepoint_update_probe_range(__start___tracepoints, 324 tracepoint_update_probe_range(__start___tracepoints_ptrs,
320 __stop___tracepoints); 325 __stop___tracepoints_ptrs);
321 /* tracepoints in modules. */ 326 /* tracepoints in modules. */
322 module_update_tracepoints(); 327 module_update_tracepoints();
323} 328}
@@ -504,8 +509,8 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
504 * Will return the first tracepoint in the range if the input tracepoint is 509 * Will return the first tracepoint in the range if the input tracepoint is
505 * NULL. 510 * NULL.
506 */ 511 */
507int tracepoint_get_iter_range(struct tracepoint **tracepoint, 512int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
508 struct tracepoint *begin, struct tracepoint *end) 513 struct tracepoint * const *begin, struct tracepoint * const *end)
509{ 514{
510 if (!*tracepoint && begin != end) { 515 if (!*tracepoint && begin != end) {
511 *tracepoint = begin; 516 *tracepoint = begin;
@@ -524,7 +529,8 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
524 /* Core kernel tracepoints */ 529 /* Core kernel tracepoints */
525 if (!iter->module) { 530 if (!iter->module) {
526 found = tracepoint_get_iter_range(&iter->tracepoint, 531 found = tracepoint_get_iter_range(&iter->tracepoint,
527 __start___tracepoints, __stop___tracepoints); 532 __start___tracepoints_ptrs,
533 __stop___tracepoints_ptrs);
528 if (found) 534 if (found)
529 goto end; 535 goto end;
530 } 536 }
@@ -575,8 +581,8 @@ int tracepoint_module_notify(struct notifier_block *self,
575 switch (val) { 581 switch (val) {
576 case MODULE_STATE_COMING: 582 case MODULE_STATE_COMING:
577 case MODULE_STATE_GOING: 583 case MODULE_STATE_GOING:
578 tracepoint_update_probe_range(mod->tracepoints, 584 tracepoint_update_probe_range(mod->tracepoints_ptrs,
579 mod->tracepoints + mod->num_tracepoints); 585 mod->tracepoints_ptrs + mod->num_tracepoints);
580 break; 586 break;
581 } 587 }
582 return 0; 588 return 0;
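
In the tracepoint.c changes above, the boolean elem->state is replaced by a reference-counted jump-label key, and set_tracepoint()/disable_tracepoint() only bump or drop the key when the enabled state actually flips. A small userspace model of that invariant, using a plain counter instead of the kernel's jump_label API:

    #include <stdio.h>
    #include <stdbool.h>

    /* Stand-in for the jump-label key: enabled iff the count is non-zero. */
    struct key { int count; };

    static bool key_enabled(struct key *k) { return k->count > 0; }
    static void key_inc(struct key *k)     { k->count++; }
    static void key_dec(struct key *k)     { k->count--; }

    /* Mirrors the new logic: change the key only on an actual transition,
     * so repeated calls with the same 'active' value are idempotent. */
    static void set_active(struct key *k, bool active)
    {
            if (active && !key_enabled(k))
                    key_inc(k);
            else if (!active && key_enabled(k))
                    key_dec(k);
    }

    int main(void)
    {
            struct key k = { 0 };

            set_active(&k, true);
            set_active(&k, true);    /* no double increment */
            printf("enabled=%d count=%d\n", key_enabled(&k), k.count);  /* 1 1 */

            set_active(&k, false);
            set_active(&k, false);   /* no underflow */
            printf("enabled=%d count=%d\n", key_enabled(&k), k.count);  /* 0 0 */
            return 0;
    }
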
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 0a67e041edf8..24dc60d9fa1f 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
63 stats->ac_ppid = pid_alive(tsk) ? 63 stats->ac_ppid = pid_alive(tsk) ?
64 rcu_dereference(tsk->real_parent)->tgid : 0; 64 rcu_dereference(tsk->real_parent)->tgid : 0;
65 rcu_read_unlock(); 65 rcu_read_unlock();
66 stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; 66 stats->ac_utime = cputime_to_usecs(tsk->utime);
67 stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; 67 stats->ac_stime = cputime_to_usecs(tsk->stime);
68 stats->ac_utimescaled = 68 stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled);
69 cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; 69 stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled);
70 stats->ac_stimescaled =
71 cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
72 stats->ac_minflt = tsk->min_flt; 70 stats->ac_minflt = tsk->min_flt;
73 stats->ac_majflt = tsk->maj_flt; 71 stats->ac_majflt = tsk->maj_flt;
74 72
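
The tsacct.c hunk fixes a precision bug: converting to milliseconds first and then scaling back to microseconds throws away everything below a millisecond, while converting straight to microseconds keeps it. A tiny arithmetic check, assuming a cputime value corresponding to 1500 microseconds:

    #include <stdio.h>

    #define USEC_PER_MSEC 1000ULL

    int main(void)
    {
            /* Pretend the task accumulated 1500 microseconds of CPU time. */
            unsigned long long usecs = 1500;

            unsigned long long old_way = (usecs / 1000) * USEC_PER_MSEC; /* to_msecs() then scale */
            unsigned long long new_way = usecs;                          /* to_usecs() directly   */

            printf("old: %llu us, new: %llu us\n", old_way, new_way);    /* old: 1000, new: 1500 */
            return 0;
    }
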
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 419209893d87..51c6e89e8619 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
189 struct group_info *group_info; 189 struct group_info *group_info;
190 int retval; 190 int retval;
191 191
192 if (!capable(CAP_SETGID)) 192 if (!nsown_capable(CAP_SETGID))
193 return -EPERM; 193 return -EPERM;
194 if ((unsigned)gidsetsize > NGROUPS_MAX) 194 if ((unsigned)gidsetsize > NGROUPS_MAX)
195 return -EINVAL; 195 return -EINVAL;
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index eb27fd3430a2..92cb706c7fc8 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -20,7 +20,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
20 20
21/* 21/*
22 * Removes a registered user return notifier. Must be called from atomic 22 * Removes a registered user return notifier. Must be called from atomic
23 * context, and from the same cpu registration occured in. 23 * context, and from the same cpu registration occurred in.
24 */ 24 */
25void user_return_notifier_unregister(struct user_return_notifier *urn) 25void user_return_notifier_unregister(struct user_return_notifier *urn)
26{ 26{
diff --git a/kernel/user.c b/kernel/user.c
index 7e72614b736d..9e03e9c1df8d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,9 +17,13 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19 19
20/*
21 * userns count is 1 for root user, 1 for init_uts_ns,
22 * and 1 for... ?
23 */
20struct user_namespace init_user_ns = { 24struct user_namespace init_user_ns = {
21 .kref = { 25 .kref = {
22 .refcount = ATOMIC_INIT(2), 26 .refcount = ATOMIC_INIT(3),
23 }, 27 },
24 .creator = &root_user, 28 .creator = &root_user,
25}; 29};
@@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep;
47 */ 51 */
48static DEFINE_SPINLOCK(uidhash_lock); 52static DEFINE_SPINLOCK(uidhash_lock);
49 53
50/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ 54/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
51struct user_struct root_user = { 55struct user_struct root_user = {
52 .__count = ATOMIC_INIT(2), 56 .__count = ATOMIC_INIT(2),
53 .processes = ATOMIC_INIT(1), 57 .processes = ATOMIC_INIT(1),
@@ -91,6 +95,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
91 * upon function exit. 95 * upon function exit.
92 */ 96 */
93static void free_user(struct user_struct *up, unsigned long flags) 97static void free_user(struct user_struct *up, unsigned long flags)
98 __releases(&uidhash_lock)
94{ 99{
95 uid_hash_remove(up); 100 uid_hash_remove(up);
96 spin_unlock_irqrestore(&uidhash_lock, flags); 101 spin_unlock_irqrestore(&uidhash_lock, flags);
@@ -157,6 +162,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
157 spin_lock_irq(&uidhash_lock); 162 spin_lock_irq(&uidhash_lock);
158 up = uid_hash_find(uid, hashent); 163 up = uid_hash_find(uid, hashent);
159 if (up) { 164 if (up) {
165 put_user_ns(ns);
160 key_put(new->uid_keyring); 166 key_put(new->uid_keyring);
161 key_put(new->session_keyring); 167 key_put(new->session_keyring);
162 kmem_cache_free(uid_cachep, new); 168 kmem_cache_free(uid_cachep, new);
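
The user.c hunk adds the missing put_user_ns() on the lookup-won-the-race path: alloc_uid() takes a namespace reference and allocates a new entry speculatively, re-checks the hash under the lock, and when another CPU inserted first it has to give back everything it took, including that namespace reference. A userspace model of the lose-the-race path (toy types and names, not the kernel structures):

    #include <stdio.h>
    #include <stdlib.h>

    struct ns  { int refcount; };
    struct uid { int uid; struct ns *ns; };

    static struct ns  the_ns = { 1 };
    static struct uid cached = { 1000, &the_ns };   /* entry a racing CPU inserted first */

    static struct ns *get_ns(struct ns *ns) { ns->refcount++; return ns; }
    static void       put_ns(struct ns *ns) { ns->refcount--; }

    static struct uid *alloc_uid(struct ns *ns, int uid)
    {
            struct uid *new = malloc(sizeof(*new));

            new->uid = uid;
            new->ns  = get_ns(ns);                  /* reference taken up front */

            if (cached.uid == uid) {                /* re-check "under the lock" */
                    put_ns(ns);                     /* lost the race: drop the ref ... */
                    free(new);                      /* ... and the speculative allocation */
                    return &cached;
            }
            return new;
    }

    int main(void)
    {
            struct uid *up = alloc_uid(&the_ns, 1000);

            printf("reused existing: %d, ns refcount: %d\n",
                   up == &cached, the_ns.refcount);        /* 1, 1 */
            return 0;
    }
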
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 25915832291a..9da289c34f22 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -12,6 +12,8 @@
12#include <linux/highuid.h> 12#include <linux/highuid.h>
13#include <linux/cred.h> 13#include <linux/cred.h>
14 14
15static struct kmem_cache *user_ns_cachep __read_mostly;
16
15/* 17/*
16 * Create a new user namespace, deriving the creator from the user in the 18 * Create a new user namespace, deriving the creator from the user in the
17 * passed credentials, and replacing that user with the new root user for the 19 * passed credentials, and replacing that user with the new root user for the
@@ -26,7 +28,7 @@ int create_user_ns(struct cred *new)
26 struct user_struct *root_user; 28 struct user_struct *root_user;
27 int n; 29 int n;
28 30
29 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); 31 ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);
30 if (!ns) 32 if (!ns)
31 return -ENOMEM; 33 return -ENOMEM;
32 34
@@ -38,7 +40,7 @@ int create_user_ns(struct cred *new)
38 /* Alloc new root user. */ 40 /* Alloc new root user. */
39 root_user = alloc_uid(ns, 0); 41 root_user = alloc_uid(ns, 0);
40 if (!root_user) { 42 if (!root_user) {
41 kfree(ns); 43 kmem_cache_free(user_ns_cachep, ns);
42 return -ENOMEM; 44 return -ENOMEM;
43 } 45 }
44 46
@@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work)
71 struct user_namespace *ns = 73 struct user_namespace *ns =
72 container_of(work, struct user_namespace, destroyer); 74 container_of(work, struct user_namespace, destroyer);
73 free_uid(ns->creator); 75 free_uid(ns->creator);
74 kfree(ns); 76 kmem_cache_free(user_ns_cachep, ns);
75} 77}
76 78
77void free_user_ns(struct kref *kref) 79void free_user_ns(struct kref *kref)
@@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t
126 /* No useful relationship so no mapping */ 128 /* No useful relationship so no mapping */
127 return overflowgid; 129 return overflowgid;
128} 130}
131
132static __init int user_namespaces_init(void)
133{
134 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
135 return 0;
136}
137module_init(user_namespaces_init);
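
The user_namespace.c change swaps kmalloc()/kfree() for a dedicated slab cache, which gives the allocator fixed-size, type-tracked objects. The shape of that pattern, as a kernel-style fragment (not buildable on its own; struct foo and the init hook are placeholders):

    #include <linux/slab.h>
    #include <linux/module.h>
    #include <linux/init.h>

    struct foo { int payload; };

    static struct kmem_cache *foo_cachep __read_mostly;

    static struct foo *foo_alloc(void)
    {
            return kmem_cache_alloc(foo_cachep, GFP_KERNEL);   /* was: kmalloc(sizeof(struct foo), ...) */
    }

    static void foo_free(struct foo *f)
    {
            kmem_cache_free(foo_cachep, f);                    /* was: kfree(f) */
    }

    static int __init foo_cache_init(void)
    {
            /* KMEM_CACHE() derives name, size and alignment from the struct;
             * SLAB_PANIC makes boot fail loudly if the cache cannot be created. */
            foo_cachep = KMEM_CACHE(foo, SLAB_PANIC);
            return 0;
    }
    module_init(foo_cache_init);
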
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 8a82b4b8ea52..bff131b9510a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,8 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/user_namespace.h>
18#include <linux/proc_fs.h>
17 19
18static struct uts_namespace *create_uts_ns(void) 20static struct uts_namespace *create_uts_ns(void)
19{ 21{
@@ -30,7 +32,8 @@ static struct uts_namespace *create_uts_ns(void)
30 * @old_ns: namespace to clone 32 * @old_ns: namespace to clone
31 * Return NULL on error (failure to kmalloc), new ns otherwise 33 * Return NULL on error (failure to kmalloc), new ns otherwise
32 */ 34 */
33static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) 35static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
36 struct uts_namespace *old_ns)
34{ 37{
35 struct uts_namespace *ns; 38 struct uts_namespace *ns;
36 39
@@ -40,6 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
40 43
41 down_read(&uts_sem); 44 down_read(&uts_sem);
42 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 45 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
46 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
43 up_read(&uts_sem); 47 up_read(&uts_sem);
44 return ns; 48 return ns;
45} 49}
@@ -50,8 +54,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
50 * utsname of this process won't be seen by parent, and vice 54 * utsname of this process won't be seen by parent, and vice
51 * versa. 55 * versa.
52 */ 56 */
53struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) 57struct uts_namespace *copy_utsname(unsigned long flags,
58 struct task_struct *tsk)
54{ 59{
60 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
55 struct uts_namespace *new_ns; 61 struct uts_namespace *new_ns;
56 62
57 BUG_ON(!old_ns); 63 BUG_ON(!old_ns);
@@ -60,7 +66,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol
60 if (!(flags & CLONE_NEWUTS)) 66 if (!(flags & CLONE_NEWUTS))
61 return old_ns; 67 return old_ns;
62 68
63 new_ns = clone_uts_ns(old_ns); 69 new_ns = clone_uts_ns(tsk, old_ns);
64 70
65 put_uts_ns(old_ns); 71 put_uts_ns(old_ns);
66 return new_ns; 72 return new_ns;
@@ -71,5 +77,44 @@ void free_uts_ns(struct kref *kref)
71 struct uts_namespace *ns; 77 struct uts_namespace *ns;
72 78
73 ns = container_of(kref, struct uts_namespace, kref); 79 ns = container_of(kref, struct uts_namespace, kref);
80 put_user_ns(ns->user_ns);
74 kfree(ns); 81 kfree(ns);
75} 82}
83
84static void *utsns_get(struct task_struct *task)
85{
86 struct uts_namespace *ns = NULL;
87 struct nsproxy *nsproxy;
88
89 rcu_read_lock();
90 nsproxy = task_nsproxy(task);
91 if (nsproxy) {
92 ns = nsproxy->uts_ns;
93 get_uts_ns(ns);
94 }
95 rcu_read_unlock();
96
97 return ns;
98}
99
100static void utsns_put(void *ns)
101{
102 put_uts_ns(ns);
103}
104
105static int utsns_install(struct nsproxy *nsproxy, void *ns)
106{
107 get_uts_ns(ns);
108 put_uts_ns(nsproxy->uts_ns);
109 nsproxy->uts_ns = ns;
110 return 0;
111}
112
113const struct proc_ns_operations utsns_operations = {
114 .name = "uts",
115 .type = CLONE_NEWUTS,
116 .get = utsns_get,
117 .put = utsns_put,
118 .install = utsns_install,
119};
120
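
The new utsns_operations table added above is essentially a refcount discipline behind three callbacks: get takes a reference on the task's current namespace, put drops one, and install swaps the nsproxy pointer while keeping the counts balanced. A compact userspace model of that contract (toy types, not the kernel structures):

    #include <stdio.h>

    struct ns      { int refcount; const char *name; };
    struct nsproxy { struct ns *uts_ns; };

    struct ns_ops {
            struct ns *(*get)(struct nsproxy *p);
            void       (*put)(struct ns *ns);
            int        (*install)(struct nsproxy *p, struct ns *ns);
    };

    static struct ns *ns_get(struct nsproxy *p) { p->uts_ns->refcount++; return p->uts_ns; }
    static void       ns_put(struct ns *ns)     { ns->refcount--; }

    static int ns_install(struct nsproxy *p, struct ns *ns)
    {
            ns->refcount++;            /* new namespace gains the proxy's reference */
            p->uts_ns->refcount--;     /* old namespace loses it */
            p->uts_ns = ns;
            return 0;
    }

    static const struct ns_ops uts_ops = { ns_get, ns_put, ns_install };

    int main(void)
    {
            struct ns a = { 1, "init_uts" }, b = { 1, "container_uts" };
            struct nsproxy proxy = { &a };

            struct ns *held = uts_ops.get(&proxy);     /* e.g. opening /proc/<pid>/ns/uts */
            uts_ops.install(&proxy, &b);               /* e.g. switching into the other ns */
            uts_ops.put(held);                         /* done with the handle             */

            printf("proxy uses %s, refcounts: a=%d b=%d\n",
                   proxy.uts_ns->name, a.refcount, b.refcount);   /* container_uts, a=0 b=2 */
            return 0;
    }
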
diff --git a/kernel/wait.c b/kernel/wait.c
index c4bd3d825f35..f45ea8d2a1ce 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -92,7 +92,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
92} 92}
93EXPORT_SYMBOL(prepare_to_wait_exclusive); 93EXPORT_SYMBOL(prepare_to_wait_exclusive);
94 94
95/* 95/**
96 * finish_wait - clean up after waiting in a queue 96 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on 97 * @q: waitqueue waited on
98 * @wait: wait descriptor 98 * @wait: wait descriptor
@@ -127,11 +127,11 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
127} 127}
128EXPORT_SYMBOL(finish_wait); 128EXPORT_SYMBOL(finish_wait);
129 129
130/* 130/**
131 * abort_exclusive_wait - abort exclusive waiting in a queue 131 * abort_exclusive_wait - abort exclusive waiting in a queue
132 * @q: waitqueue waited on 132 * @q: waitqueue waited on
133 * @wait: wait descriptor 133 * @wait: wait descriptor
134 * @state: runstate of the waiter to be woken 134 * @mode: runstate of the waiter to be woken
135 * @key: key to identify a wait bit queue or %NULL 135 * @key: key to identify a wait bit queue or %NULL
136 * 136 *
137 * Sets current thread back to running state and removes 137 * Sets current thread back to running state and removes
@@ -142,7 +142,7 @@ EXPORT_SYMBOL(finish_wait);
142 * woken up through the queue. 142 * woken up through the queue.
143 * 143 *
144 * This prevents waiter starvation where an exclusive waiter 144 * This prevents waiter starvation where an exclusive waiter
145 * aborts and is woken up concurrently and noone wakes up 145 * aborts and is woken up concurrently and no one wakes up
146 * the next waiter. 146 * the next waiter.
147 */ 147 */
148void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, 148void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7f9c3c52ecc1..3d0c56ad4792 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -27,8 +27,8 @@
27#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
28#include <linux/perf_event.h> 28#include <linux/perf_event.h>
29 29
30int watchdog_enabled; 30int watchdog_enabled = 1;
31int __read_mostly softlockup_thresh = 60; 31int __read_mostly watchdog_thresh = 10;
32 32
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -43,21 +43,22 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 44#endif
45 45
46static int __read_mostly did_panic;
47static int __initdata no_watchdog;
48
49
50/* boot commands */ 46/* boot commands */
51/* 47/*
52 * Should we panic when a soft-lockup or hard-lockup occurs: 48 * Should we panic when a soft-lockup or hard-lockup occurs:
53 */ 49 */
54#ifdef CONFIG_HARDLOCKUP_DETECTOR 50#ifdef CONFIG_HARDLOCKUP_DETECTOR
55static int hardlockup_panic; 51static int hardlockup_panic =
52 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
56 53
57static int __init hardlockup_panic_setup(char *str) 54static int __init hardlockup_panic_setup(char *str)
58{ 55{
59 if (!strncmp(str, "panic", 5)) 56 if (!strncmp(str, "panic", 5))
60 hardlockup_panic = 1; 57 hardlockup_panic = 1;
58 else if (!strncmp(str, "nopanic", 7))
59 hardlockup_panic = 0;
60 else if (!strncmp(str, "0", 1))
61 watchdog_enabled = 0;
61 return 1; 62 return 1;
62} 63}
63__setup("nmi_watchdog=", hardlockup_panic_setup); 64__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -76,7 +77,7 @@ __setup("softlockup_panic=", softlockup_panic_setup);
76 77
77static int __init nowatchdog_setup(char *str) 78static int __init nowatchdog_setup(char *str)
78{ 79{
79 no_watchdog = 1; 80 watchdog_enabled = 0;
80 return 1; 81 return 1;
81} 82}
82__setup("nowatchdog", nowatchdog_setup); 83__setup("nowatchdog", nowatchdog_setup);
@@ -84,12 +85,23 @@ __setup("nowatchdog", nowatchdog_setup);
84/* deprecated */ 85/* deprecated */
85static int __init nosoftlockup_setup(char *str) 86static int __init nosoftlockup_setup(char *str)
86{ 87{
87 no_watchdog = 1; 88 watchdog_enabled = 0;
88 return 1; 89 return 1;
89} 90}
90__setup("nosoftlockup", nosoftlockup_setup); 91__setup("nosoftlockup", nosoftlockup_setup);
91/* */ 92/* */
92 93
94/*
95 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
96 * lockups can have false positives under extreme conditions. So we generally
97 * want a higher threshold for soft lockups than for hard lockups. So we couple
98 * the thresholds with a factor: we make the soft threshold twice the amount of
99 * time the hard threshold is.
100 */
101static int get_softlockup_thresh(void)
102{
103 return watchdog_thresh * 2;
104}
93 105
94/* 106/*
95 * Returns seconds, approximately. We don't need nanosecond 107 * Returns seconds, approximately. We don't need nanosecond
@@ -104,12 +116,12 @@ static unsigned long get_timestamp(int this_cpu)
104static unsigned long get_sample_period(void) 116static unsigned long get_sample_period(void)
105{ 117{
106 /* 118 /*
107 * convert softlockup_thresh from seconds to ns 119 * convert watchdog_thresh from seconds to ns
108 * the divide by 5 is to give hrtimer 5 chances to 120 * the divide by 5 is to give hrtimer 5 chances to
109 * increment before the hardlockup detector generates 121 * increment before the hardlockup detector generates
110 * a warning 122 * a warning
111 */ 123 */
112 return softlockup_thresh / 5 * NSEC_PER_SEC; 124 return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
113} 125}
114 126
115/* Commands for resetting the watchdog */ 127/* Commands for resetting the watchdog */
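
With the single watchdog_thresh knob, the soft-lockup threshold is derived as twice the hard-lockup threshold, and the hrtimer sample period is a fifth of that so the hard-lockup detector gets five chances to see the hrtimer count move. Note the rewritten expression multiplies by NSEC_PER_SEC/5 instead of dividing the threshold by 5 first, which would truncate to zero for small thresholds. Worked out for the default of 10 seconds:

    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000ULL

    int main(void)
    {
            unsigned long long watchdog_thresh = 10;                 /* new default, seconds */

            unsigned long long soft_thresh   = watchdog_thresh * 2;  /* get_softlockup_thresh() */
            unsigned long long sample_period = soft_thresh * (NSEC_PER_SEC / 5);

            printf("soft-lockup threshold: %llus\n", soft_thresh);   /* 20s */
            printf("sample period: %llu ns (%llus)\n",
                   sample_period, sample_period / NSEC_PER_SEC);     /* 4000000000 ns, 4s */

            /* Old-style ordering for comparison: thresh / 5 * NSEC_PER_SEC
             * truncates to zero for any threshold below 5 seconds. */
            unsigned long long old_style = 3 / 5 * NSEC_PER_SEC;
            printf("old ordering with a 3s threshold: %llu ns\n", old_style);  /* 0 */
            return 0;
    }
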
@@ -117,12 +129,12 @@ static void __touch_watchdog(void)
117{ 129{
118 int this_cpu = smp_processor_id(); 130 int this_cpu = smp_processor_id();
119 131
120 __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); 132 __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
121} 133}
122 134
123void touch_softlockup_watchdog(void) 135void touch_softlockup_watchdog(void)
124{ 136{
125 __raw_get_cpu_var(watchdog_touch_ts) = 0; 137 __this_cpu_write(watchdog_touch_ts, 0);
126} 138}
127EXPORT_SYMBOL(touch_softlockup_watchdog); 139EXPORT_SYMBOL(touch_softlockup_watchdog);
128 140
@@ -166,12 +178,12 @@ void touch_softlockup_watchdog_sync(void)
166/* watchdog detector functions */ 178/* watchdog detector functions */
167static int is_hardlockup(void) 179static int is_hardlockup(void)
168{ 180{
169 unsigned long hrint = __get_cpu_var(hrtimer_interrupts); 181 unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
170 182
171 if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) 183 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
172 return 1; 184 return 1;
173 185
174 __get_cpu_var(hrtimer_interrupts_saved) = hrint; 186 __this_cpu_write(hrtimer_interrupts_saved, hrint);
175 return 0; 187 return 0;
176} 188}
177#endif 189#endif
@@ -181,24 +193,12 @@ static int is_softlockup(unsigned long touch_ts)
181 unsigned long now = get_timestamp(smp_processor_id()); 193 unsigned long now = get_timestamp(smp_processor_id());
182 194
183 /* Warn about unreasonable delays: */ 195 /* Warn about unreasonable delays: */
184 if (time_after(now, touch_ts + softlockup_thresh)) 196 if (time_after(now, touch_ts + get_softlockup_thresh()))
185 return now - touch_ts; 197 return now - touch_ts;
186 198
187 return 0; 199 return 0;
188} 200}
189 201
190static int
191watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
192{
193 did_panic = 1;
194
195 return NOTIFY_DONE;
196}
197
198static struct notifier_block panic_block = {
199 .notifier_call = watchdog_panic,
200};
201
202#ifdef CONFIG_HARDLOCKUP_DETECTOR 202#ifdef CONFIG_HARDLOCKUP_DETECTOR
203static struct perf_event_attr wd_hw_attr = { 203static struct perf_event_attr wd_hw_attr = {
204 .type = PERF_TYPE_HARDWARE, 204 .type = PERF_TYPE_HARDWARE,
@@ -209,15 +209,15 @@ static struct perf_event_attr wd_hw_attr = {
209}; 209};
210 210
211/* Callback function for perf event subsystem */ 211/* Callback function for perf event subsystem */
212void watchdog_overflow_callback(struct perf_event *event, int nmi, 212static void watchdog_overflow_callback(struct perf_event *event, int nmi,
213 struct perf_sample_data *data, 213 struct perf_sample_data *data,
214 struct pt_regs *regs) 214 struct pt_regs *regs)
215{ 215{
216 /* Ensure the watchdog never gets throttled */ 216 /* Ensure the watchdog never gets throttled */
217 event->hw.interrupts = 0; 217 event->hw.interrupts = 0;
218 218
219 if (__get_cpu_var(watchdog_nmi_touch) == true) { 219 if (__this_cpu_read(watchdog_nmi_touch) == true) {
220 __get_cpu_var(watchdog_nmi_touch) = false; 220 __this_cpu_write(watchdog_nmi_touch, false);
221 return; 221 return;
222 } 222 }
223 223
@@ -231,7 +231,7 @@ void watchdog_overflow_callback(struct perf_event *event, int nmi,
231 int this_cpu = smp_processor_id(); 231 int this_cpu = smp_processor_id();
232 232
233 /* only print hardlockups once */ 233 /* only print hardlockups once */
234 if (__get_cpu_var(hard_watchdog_warn) == true) 234 if (__this_cpu_read(hard_watchdog_warn) == true)
235 return; 235 return;
236 236
237 if (hardlockup_panic) 237 if (hardlockup_panic)
@@ -239,16 +239,16 @@ void watchdog_overflow_callback(struct perf_event *event, int nmi,
239 else 239 else
240 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); 240 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
241 241
242 __get_cpu_var(hard_watchdog_warn) = true; 242 __this_cpu_write(hard_watchdog_warn, true);
243 return; 243 return;
244 } 244 }
245 245
246 __get_cpu_var(hard_watchdog_warn) = false; 246 __this_cpu_write(hard_watchdog_warn, false);
247 return; 247 return;
248} 248}
249static void watchdog_interrupt_count(void) 249static void watchdog_interrupt_count(void)
250{ 250{
251 __get_cpu_var(hrtimer_interrupts)++; 251 __this_cpu_inc(hrtimer_interrupts);
252} 252}
253#else 253#else
254static inline void watchdog_interrupt_count(void) { return; } 254static inline void watchdog_interrupt_count(void) { return; }
@@ -257,7 +257,7 @@ static inline void watchdog_interrupt_count(void) { return; }
257/* watchdog kicker functions */ 257/* watchdog kicker functions */
258static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 258static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
259{ 259{
260 unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); 260 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
261 struct pt_regs *regs = get_irq_regs(); 261 struct pt_regs *regs = get_irq_regs();
262 int duration; 262 int duration;
263 263
@@ -265,18 +265,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
265 watchdog_interrupt_count(); 265 watchdog_interrupt_count();
266 266
267 /* kick the softlockup detector */ 267 /* kick the softlockup detector */
268 wake_up_process(__get_cpu_var(softlockup_watchdog)); 268 wake_up_process(__this_cpu_read(softlockup_watchdog));
269 269
270 /* .. and repeat */ 270 /* .. and repeat */
271 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); 271 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
272 272
273 if (touch_ts == 0) { 273 if (touch_ts == 0) {
274 if (unlikely(__get_cpu_var(softlockup_touch_sync))) { 274 if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
275 /* 275 /*
276 * If the time stamp was touched atomically 276 * If the time stamp was touched atomically
277 * make sure the scheduler tick is up to date. 277 * make sure the scheduler tick is up to date.
278 */ 278 */
279 __get_cpu_var(softlockup_touch_sync) = false; 279 __this_cpu_write(softlockup_touch_sync, false);
280 sched_clock_tick(); 280 sched_clock_tick();
281 } 281 }
282 __touch_watchdog(); 282 __touch_watchdog();
@@ -292,7 +292,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
292 duration = is_softlockup(touch_ts); 292 duration = is_softlockup(touch_ts);
293 if (unlikely(duration)) { 293 if (unlikely(duration)) {
294 /* only warn once */ 294 /* only warn once */
295 if (__get_cpu_var(soft_watchdog_warn) == true) 295 if (__this_cpu_read(soft_watchdog_warn) == true)
296 return HRTIMER_RESTART; 296 return HRTIMER_RESTART;
297 297
298 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 298 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
@@ -307,9 +307,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
307 307
308 if (softlockup_panic) 308 if (softlockup_panic)
309 panic("softlockup: hung tasks"); 309 panic("softlockup: hung tasks");
310 __get_cpu_var(soft_watchdog_warn) = true; 310 __this_cpu_write(soft_watchdog_warn, true);
311 } else 311 } else
312 __get_cpu_var(soft_watchdog_warn) = false; 312 __this_cpu_write(soft_watchdog_warn, false);
313 313
314 return HRTIMER_RESTART; 314 return HRTIMER_RESTART;
315} 315}
@@ -320,7 +320,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
320 */ 320 */
321static int watchdog(void *unused) 321static int watchdog(void *unused)
322{ 322{
323 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 323 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
324 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 324 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
325 325
326 sched_setscheduler(current, SCHED_FIFO, &param); 326 sched_setscheduler(current, SCHED_FIFO, &param);
@@ -370,15 +370,22 @@ static int watchdog_nmi_enable(int cpu)
370 370
371 /* Try to register using hardware perf events */ 371 /* Try to register using hardware perf events */
372 wd_attr = &wd_hw_attr; 372 wd_attr = &wd_hw_attr;
373 wd_attr->sample_period = hw_nmi_get_sample_period(); 373 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
374 event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); 374 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
375 if (!IS_ERR(event)) { 375 if (!IS_ERR(event)) {
376 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 376 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
377 goto out_save; 377 goto out_save;
378 } 378 }
379 379
380 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); 380
381 return -1; 381 /* vary the KERN level based on the returned errno */
382 if (PTR_ERR(event) == -EOPNOTSUPP)
383 printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
384 else if (PTR_ERR(event) == -ENOENT)
385 printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu);
386 else
387 printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event));
388 return PTR_ERR(event);
382 389
383 /* success path */ 390 /* success path */
384out_save: 391out_save:
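
Instead of one KERN_ERR line with a raw pointer, the failure path above now inspects the error embedded in the ERR_PTR and picks a log level to match: missing hardware support is merely informational, a missing event is a warning, and anything else stays an error. A userspace model of that mapping (errno values only, no perf API):

    #include <stdio.h>
    #include <errno.h>

    static const char *nmi_watchdog_severity(long err)
    {
            switch (err) {
            case -EOPNOTSUPP:
                    return "INFO: not supported (no LAPIC?)";
            case -ENOENT:
                    return "WARNING: hardware events not enabled";
            default:
                    return "ERR: unable to create perf event";
            }
    }

    int main(void)
    {
            long errs[] = { -EOPNOTSUPP, -ENOENT, -EINVAL };

            for (int i = 0; i < 3; i++)
                    printf("err %ld -> %s\n", errs[i], nmi_watchdog_severity(errs[i]));
            return 0;
    }
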
@@ -408,31 +415,37 @@ static void watchdog_nmi_disable(int cpu) { return; }
408#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 415#endif /* CONFIG_HARDLOCKUP_DETECTOR */
409 416
410/* prepare/enable/disable routines */ 417/* prepare/enable/disable routines */
411static int watchdog_prepare_cpu(int cpu) 418static void watchdog_prepare_cpu(int cpu)
412{ 419{
413 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); 420 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
414 421
415 WARN_ON(per_cpu(softlockup_watchdog, cpu)); 422 WARN_ON(per_cpu(softlockup_watchdog, cpu));
416 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 423 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
417 hrtimer->function = watchdog_timer_fn; 424 hrtimer->function = watchdog_timer_fn;
418
419 return 0;
420} 425}
421 426
422static int watchdog_enable(int cpu) 427static int watchdog_enable(int cpu)
423{ 428{
424 struct task_struct *p = per_cpu(softlockup_watchdog, cpu); 429 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
430 int err = 0;
425 431
426 /* enable the perf event */ 432 /* enable the perf event */
427 if (watchdog_nmi_enable(cpu) != 0) 433 err = watchdog_nmi_enable(cpu);
428 return -1; 434
435 /* Regardless of err above, fall through and start softlockup */
429 436
430 /* create the watchdog thread */ 437 /* create the watchdog thread */
431 if (!p) { 438 if (!p) {
432 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 439 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
433 if (IS_ERR(p)) { 440 if (IS_ERR(p)) {
434 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 441 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
435 return -1; 442 if (!err) {
443 /* if hardlockup hasn't already set this */
444 err = PTR_ERR(p);
445 /* and disable the perf event */
446 watchdog_nmi_disable(cpu);
447 }
448 goto out;
436 } 449 }
437 kthread_bind(p, cpu); 450 kthread_bind(p, cpu);
438 per_cpu(watchdog_touch_ts, cpu) = 0; 451 per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -440,10 +453,8 @@ static int watchdog_enable(int cpu)
440 wake_up_process(p); 453 wake_up_process(p);
441 } 454 }
442 455
443 /* if any cpu succeeds, watchdog is considered enabled for the system */ 456out:
444 watchdog_enabled = 1; 457 return err;
445
446 return 0;
447} 458}
448 459
449static void watchdog_disable(int cpu) 460static void watchdog_disable(int cpu)
@@ -470,12 +481,16 @@ static void watchdog_disable(int cpu)
470static void watchdog_enable_all_cpus(void) 481static void watchdog_enable_all_cpus(void)
471{ 482{
472 int cpu; 483 int cpu;
473 int result = 0; 484
485 watchdog_enabled = 0;
474 486
475 for_each_online_cpu(cpu) 487 for_each_online_cpu(cpu)
476 result += watchdog_enable(cpu); 488 if (!watchdog_enable(cpu))
489 /* if any cpu succeeds, watchdog is considered
490 enabled for the system */
491 watchdog_enabled = 1;
477 492
478 if (result) 493 if (!watchdog_enabled)
479 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); 494 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
480 495
481} 496}
@@ -495,26 +510,25 @@ static void watchdog_disable_all_cpus(void)
495/* sysctl functions */ 510/* sysctl functions */
496#ifdef CONFIG_SYSCTL 511#ifdef CONFIG_SYSCTL
497/* 512/*
498 * proc handler for /proc/sys/kernel/nmi_watchdog 513 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
499 */ 514 */
500 515
501int proc_dowatchdog_enabled(struct ctl_table *table, int write, 516int proc_dowatchdog(struct ctl_table *table, int write,
502 void __user *buffer, size_t *length, loff_t *ppos) 517 void __user *buffer, size_t *lenp, loff_t *ppos)
503{ 518{
504 proc_dointvec(table, write, buffer, length, ppos); 519 int ret;
520
521 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
522 if (ret || !write)
523 goto out;
505 524
506 if (watchdog_enabled) 525 if (watchdog_enabled && watchdog_thresh)
507 watchdog_enable_all_cpus(); 526 watchdog_enable_all_cpus();
508 else 527 else
509 watchdog_disable_all_cpus(); 528 watchdog_disable_all_cpus();
510 return 0;
511}
512 529
513int proc_dowatchdog_thresh(struct ctl_table *table, int write, 530out:
514 void __user *buffer, 531 return ret;
515 size_t *lenp, loff_t *ppos)
516{
517 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
518} 532}
519#endif /* CONFIG_SYSCTL */ 533#endif /* CONFIG_SYSCTL */
520 534
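
The two separate proc handlers collapse into one above: parse the write with the range-checked helper first, and only if that succeeded and it really was a write, re-evaluate whether the watchdogs should run (both the enable flag and a non-zero threshold are required). A userspace sketch of that control flow, with plain ints in place of the ctl_table plumbing:

    #include <stdio.h>

    static int watchdog_enabled = 1;
    static int watchdog_thresh  = 10;
    static int watchdog_running;

    static void watchdog_enable_all(void)  { watchdog_running = 1; }
    static void watchdog_disable_all(void) { watchdog_running = 0; }

    /* Mirrors the merged handler: validate and store first, then act
     * only on a successful write; reads change nothing. */
    static int do_watchdog_knob(int *knob, int value, int write, int min, int max)
    {
            if (!write)
                    return 0;

            if (value < min || value > max)
                    return -1;          /* proc_dointvec_minmax() would reject it */
            *knob = value;

            if (watchdog_enabled && watchdog_thresh)
                    watchdog_enable_all();
            else
                    watchdog_disable_all();
            return 0;
    }

    int main(void)
    {
            do_watchdog_knob(&watchdog_thresh, 0, 1, 0, 60);    /* thresh=0 turns it off */
            printf("running=%d\n", watchdog_running);           /* 0 */

            do_watchdog_knob(&watchdog_thresh, 10, 1, 0, 60);   /* back on */
            printf("running=%d\n", watchdog_running);           /* 1 */
            return 0;
    }
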
@@ -530,13 +544,12 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
530 switch (action) { 544 switch (action) {
531 case CPU_UP_PREPARE: 545 case CPU_UP_PREPARE:
532 case CPU_UP_PREPARE_FROZEN: 546 case CPU_UP_PREPARE_FROZEN:
533 if (watchdog_prepare_cpu(hotcpu)) 547 watchdog_prepare_cpu(hotcpu);
534 return NOTIFY_BAD;
535 break; 548 break;
536 case CPU_ONLINE: 549 case CPU_ONLINE:
537 case CPU_ONLINE_FROZEN: 550 case CPU_ONLINE_FROZEN:
538 if (watchdog_enable(hotcpu)) 551 if (watchdog_enabled)
539 return NOTIFY_BAD; 552 watchdog_enable(hotcpu);
540 break; 553 break;
541#ifdef CONFIG_HOTPLUG_CPU 554#ifdef CONFIG_HOTPLUG_CPU
542 case CPU_UP_CANCELED: 555 case CPU_UP_CANCELED:
@@ -549,6 +562,12 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
549 break; 562 break;
550#endif /* CONFIG_HOTPLUG_CPU */ 563#endif /* CONFIG_HOTPLUG_CPU */
551 } 564 }
565
566 /*
567 * hardlockup and softlockup are not important enough
568 * to block cpu bring up. Just always succeed and
569 * rely on printk output to flag problems.
570 */
552 return NOTIFY_OK; 571 return NOTIFY_OK;
553} 572}
554 573
@@ -556,22 +575,16 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
556 .notifier_call = cpu_callback 575 .notifier_call = cpu_callback
557}; 576};
558 577
559static int __init spawn_watchdog_task(void) 578void __init lockup_detector_init(void)
560{ 579{
561 void *cpu = (void *)(long)smp_processor_id(); 580 void *cpu = (void *)(long)smp_processor_id();
562 int err; 581 int err;
563 582
564 if (no_watchdog)
565 return 0;
566
567 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 583 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
568 WARN_ON(err == NOTIFY_BAD); 584 WARN_ON(notifier_to_errno(err));
569 585
570 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 586 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
571 register_cpu_notifier(&cpu_nfb); 587 register_cpu_notifier(&cpu_nfb);
572 588
573 atomic_notifier_chain_register(&panic_notifier_list, &panic_block); 589 return;
574
575 return 0;
576} 590}
577early_initcall(spawn_watchdog_task);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f77afd939229..0400553f0d04 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -42,9 +42,6 @@
42#include <linux/lockdep.h> 42#include <linux/lockdep.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44 44
45#define CREATE_TRACE_POINTS
46#include <trace/events/workqueue.h>
47
48#include "workqueue_sched.h" 45#include "workqueue_sched.h"
49 46
50enum { 47enum {
@@ -82,7 +79,9 @@ enum {
82 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ 79 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
83 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ 80 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
84 81
85 MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ 82 MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2,
83 /* call for help after 10ms
84 (min two ticks) */
86 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ 85 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
87 CREATE_COOLDOWN = HZ, /* time to breath after fail */ 86 CREATE_COOLDOWN = HZ, /* time to breath after fail */
88 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ 87 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
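
The mayday timeout above is nominally HZ/100 (10 ms), but with a small HZ that integer division can come out below two ticks, and a one-tick timer may expire almost immediately; the conditional clamps it. A quick check for common HZ values:

    #include <stdio.h>

    static long mayday_initial_timeout(long hz)
    {
            return hz / 100 >= 2 ? hz / 100 : 2;     /* at least two ticks */
    }

    int main(void)
    {
            long hzs[] = { 100, 250, 300, 1000 };

            for (int i = 0; i < 4; i++)
                    printf("HZ=%4ld -> %ld tick(s) (plain HZ/100 would be %ld)\n",
                           hzs[i], mayday_initial_timeout(hzs[i]), hzs[i] / 100);
            return 0;
    }
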
@@ -252,10 +251,15 @@ struct workqueue_struct *system_wq __read_mostly;
252struct workqueue_struct *system_long_wq __read_mostly; 251struct workqueue_struct *system_long_wq __read_mostly;
253struct workqueue_struct *system_nrt_wq __read_mostly; 252struct workqueue_struct *system_nrt_wq __read_mostly;
254struct workqueue_struct *system_unbound_wq __read_mostly; 253struct workqueue_struct *system_unbound_wq __read_mostly;
254struct workqueue_struct *system_freezable_wq __read_mostly;
255EXPORT_SYMBOL_GPL(system_wq); 255EXPORT_SYMBOL_GPL(system_wq);
256EXPORT_SYMBOL_GPL(system_long_wq); 256EXPORT_SYMBOL_GPL(system_long_wq);
257EXPORT_SYMBOL_GPL(system_nrt_wq); 257EXPORT_SYMBOL_GPL(system_nrt_wq);
258EXPORT_SYMBOL_GPL(system_unbound_wq); 258EXPORT_SYMBOL_GPL(system_unbound_wq);
259EXPORT_SYMBOL_GPL(system_freezable_wq);
260
261#define CREATE_TRACE_POINTS
262#include <trace/events/workqueue.h>
259 263
260#define for_each_busy_worker(worker, i, pos, gcwq) \ 264#define for_each_busy_worker(worker, i, pos, gcwq) \
261 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ 265 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
@@ -310,25 +314,15 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
310 (cpu) < WORK_CPU_NONE; \ 314 (cpu) < WORK_CPU_NONE; \
311 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) 315 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
312 316
313#ifdef CONFIG_LOCKDEP
314/**
315 * in_workqueue_context() - in context of specified workqueue?
316 * @wq: the workqueue of interest
317 *
318 * Checks lockdep state to see if the current task is executing from
319 * within a workqueue item. This function exists only if lockdep is
320 * enabled.
321 */
322int in_workqueue_context(struct workqueue_struct *wq)
323{
324 return lock_is_held(&wq->lockdep_map);
325}
326#endif
327
328#ifdef CONFIG_DEBUG_OBJECTS_WORK 317#ifdef CONFIG_DEBUG_OBJECTS_WORK
329 318
330static struct debug_obj_descr work_debug_descr; 319static struct debug_obj_descr work_debug_descr;
331 320
321static void *work_debug_hint(void *addr)
322{
323 return ((struct work_struct *) addr)->func;
324}
325
332/* 326/*
333 * fixup_init is called when: 327 * fixup_init is called when:
334 * - an active object is initialized 328 * - an active object is initialized
@@ -400,6 +394,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
400 394
401static struct debug_obj_descr work_debug_descr = { 395static struct debug_obj_descr work_debug_descr = {
402 .name = "work_struct", 396 .name = "work_struct",
397 .debug_hint = work_debug_hint,
403 .fixup_init = work_fixup_init, 398 .fixup_init = work_fixup_init,
404 .fixup_activate = work_fixup_activate, 399 .fixup_activate = work_fixup_activate,
405 .fixup_free = work_fixup_free, 400 .fixup_free = work_fixup_free,
@@ -604,7 +599,9 @@ static bool keep_working(struct global_cwq *gcwq)
604{ 599{
605 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); 600 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
606 601
607 return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1; 602 return !list_empty(&gcwq->worklist) &&
603 (atomic_read(nr_running) <= 1 ||
604 gcwq->flags & GCWQ_HIGHPRI_PENDING);
608} 605}
609 606
610/* Do we need a new worker? Called from manager. */ 607/* Do we need a new worker? Called from manager. */
@@ -674,7 +671,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
674{ 671{
675 struct worker *worker = kthread_data(task); 672 struct worker *worker = kthread_data(task);
676 673
677 if (likely(!(worker->flags & WORKER_NOT_RUNNING))) 674 if (!(worker->flags & WORKER_NOT_RUNNING))
678 atomic_inc(get_gcwq_nr_running(cpu)); 675 atomic_inc(get_gcwq_nr_running(cpu));
679} 676}
680 677
@@ -700,7 +697,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
700 struct global_cwq *gcwq = get_gcwq(cpu); 697 struct global_cwq *gcwq = get_gcwq(cpu);
701 atomic_t *nr_running = get_gcwq_nr_running(cpu); 698 atomic_t *nr_running = get_gcwq_nr_running(cpu);
702 699
703 if (unlikely(worker->flags & WORKER_NOT_RUNNING)) 700 if (worker->flags & WORKER_NOT_RUNNING)
704 return NULL; 701 return NULL;
705 702
706 /* this can only happen on the local cpu */ 703 /* this can only happen on the local cpu */
@@ -781,7 +778,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
781 778
782 worker->flags &= ~flags; 779 worker->flags &= ~flags;
783 780
784 /* if transitioning out of NOT_RUNNING, increment nr_running */ 781 /*
782 * If transitioning out of NOT_RUNNING, increment nr_running. Note
783 * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
784 * of multiple flags, not a single flag.
785 */
785 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 786 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
786 if (!(worker->flags & WORKER_NOT_RUNNING)) 787 if (!(worker->flags & WORKER_NOT_RUNNING))
787 atomic_inc(get_gcwq_nr_running(gcwq->cpu)); 788 atomic_inc(get_gcwq_nr_running(gcwq->cpu));
@@ -945,6 +946,38 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
945 wake_up_worker(gcwq); 946 wake_up_worker(gcwq);
946} 947}
947 948
949/*
950 * Test whether @work is being queued from another work executing on the
951 * same workqueue. This is rather expensive and should only be used from
952 * cold paths.
953 */
954static bool is_chained_work(struct workqueue_struct *wq)
955{
956 unsigned long flags;
957 unsigned int cpu;
958
959 for_each_gcwq_cpu(cpu) {
960 struct global_cwq *gcwq = get_gcwq(cpu);
961 struct worker *worker;
962 struct hlist_node *pos;
963 int i;
964
965 spin_lock_irqsave(&gcwq->lock, flags);
966 for_each_busy_worker(worker, i, pos, gcwq) {
967 if (worker->task != current)
968 continue;
969 spin_unlock_irqrestore(&gcwq->lock, flags);
970 /*
971 * I'm @worker, no locking necessary. See if @work
972 * is headed to the same workqueue.
973 */
974 return worker->current_cwq->wq == wq;
975 }
976 spin_unlock_irqrestore(&gcwq->lock, flags);
977 }
978 return false;
979}
980
948static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, 981static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
949 struct work_struct *work) 982 struct work_struct *work)
950{ 983{
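
The is_chained_work() helper added above relaxes the blanket WARN on WQ_DYING: flushing a workqueue during destruction can itself run work items that legitimately queue follow-up work on the same workqueue, so queueing on a dying workqueue is tolerated only when the caller is one of that workqueue's own workers. A userspace model of the check (single-threaded, so "current worker" is just a global):

    #include <stdio.h>
    #include <stdbool.h>

    struct workqueue { bool dying; const char *name; };
    struct worker    { struct workqueue *wq; };

    static struct worker *current_worker;   /* worker whose callback is running, if any */

    static bool is_chained_work(struct workqueue *wq)
    {
            return current_worker && current_worker->wq == wq;
    }

    static bool queue_work(struct workqueue *wq)
    {
            if (wq->dying && !is_chained_work(wq)) {
                    printf("rejected: %s is being destroyed\n", wq->name);
                    return false;
            }
            printf("queued on %s\n", wq->name);
            return true;
    }

    int main(void)
    {
            struct workqueue wq = { .dying = true, .name = "dying_wq" };
            struct worker    me = { .wq = &wq };

            queue_work(&wq);                 /* from outside: rejected             */

            current_worker = &me;            /* now pretend we run on wq's worker  */
            queue_work(&wq);                 /* chained work: allowed              */
            return 0;
    }
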
@@ -956,7 +989,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
956 989
957 debug_work_activate(work); 990 debug_work_activate(work);
958 991
959 if (WARN_ON_ONCE(wq->flags & WQ_DYING)) 992 /* if dying, only works from the same workqueue are allowed */
993 if (unlikely(wq->flags & WQ_DYING) &&
994 WARN_ON_ONCE(!is_chained_work(wq)))
960 return; 995 return;
961 996
962 /* determine gcwq to use */ 997 /* determine gcwq to use */
@@ -997,6 +1032,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
997 1032
998 /* gcwq determined, get cwq and queue */ 1033 /* gcwq determined, get cwq and queue */
999 cwq = get_cwq(gcwq->cpu, wq); 1034 cwq = get_cwq(gcwq->cpu, wq);
1035 trace_workqueue_queue_work(cpu, cwq, work);
1000 1036
1001 BUG_ON(!list_empty(&work->entry)); 1037 BUG_ON(!list_empty(&work->entry));
1002 1038
@@ -1004,6 +1040,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1004 work_flags = work_color_to_flags(cwq->work_color); 1040 work_flags = work_color_to_flags(cwq->work_color);
1005 1041
1006 if (likely(cwq->nr_active < cwq->max_active)) { 1042 if (likely(cwq->nr_active < cwq->max_active)) {
1043 trace_workqueue_activate_work(work);
1007 cwq->nr_active++; 1044 cwq->nr_active++;
1008 worklist = gcwq_determine_ins_pos(gcwq, cwq); 1045 worklist = gcwq_determine_ins_pos(gcwq, cwq);
1009 } else { 1046 } else {
@@ -1254,8 +1291,14 @@ __acquires(&gcwq->lock)
1254 return true; 1291 return true;
1255 spin_unlock_irq(&gcwq->lock); 1292 spin_unlock_irq(&gcwq->lock);
1256 1293
1257 /* CPU has come up inbetween, retry migration */ 1294 /*
1295 * We've raced with CPU hot[un]plug. Give it a breather
1296 * and retry migration. cond_resched() is required here;
1297 * otherwise, we might deadlock against cpu_stop trying to
1298 * bring down the CPU on non-preemptive kernel.
1299 */
1258 cpu_relax(); 1300 cpu_relax();
1301 cond_resched();
1259 } 1302 }
1260} 1303}
1261 1304
@@ -1329,8 +1372,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1329 worker->id = id; 1372 worker->id = id;
1330 1373
1331 if (!on_unbound_cpu) 1374 if (!on_unbound_cpu)
1332 worker->task = kthread_create(worker_thread, worker, 1375 worker->task = kthread_create_on_node(worker_thread,
1333 "kworker/%u:%d", gcwq->cpu, id); 1376 worker,
1377 cpu_to_node(gcwq->cpu),
1378 "kworker/%u:%d", gcwq->cpu, id);
1334 else 1379 else
1335 worker->task = kthread_create(worker_thread, worker, 1380 worker->task = kthread_create(worker_thread, worker,
1336 "kworker/u:%d", id); 1381 "kworker/u:%d", id);
@@ -1679,6 +1724,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1679 struct work_struct, entry); 1724 struct work_struct, entry);
1680 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); 1725 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1681 1726
1727 trace_workqueue_activate_work(work);
1682 move_linked_works(work, pos, NULL); 1728 move_linked_works(work, pos, NULL);
1683 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); 1729 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1684 cwq->nr_active++; 1730 cwq->nr_active++;
@@ -1816,7 +1862,7 @@ __acquires(&gcwq->lock)
1816 spin_unlock_irq(&gcwq->lock); 1862 spin_unlock_irq(&gcwq->lock);
1817 1863
1818 work_clear_pending(work); 1864 work_clear_pending(work);
1819 lock_map_acquire(&cwq->wq->lockdep_map); 1865 lock_map_acquire_read(&cwq->wq->lockdep_map);
1820 lock_map_acquire(&lockdep_map); 1866 lock_map_acquire(&lockdep_map);
1821 trace_workqueue_execute_start(work); 1867 trace_workqueue_execute_start(work);
1822 f(work); 1868 f(work);
@@ -2019,6 +2065,15 @@ repeat:
2019 move_linked_works(work, scheduled, &n); 2065 move_linked_works(work, scheduled, &n);
2020 2066
2021 process_scheduled_works(rescuer); 2067 process_scheduled_works(rescuer);
2068
2069 /*
2070 * Leave this gcwq. If keep_working() is %true, notify a
2071 * regular worker; otherwise, we end up with 0 concurrency
2072 * and stalling the execution.
2073 */
2074 if (keep_working(gcwq))
2075 wake_up_worker(gcwq);
2076
2022 spin_unlock_irq(&gcwq->lock); 2077 spin_unlock_irq(&gcwq->lock);
2023 } 2078 }
2024 2079
@@ -2074,7 +2129,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2074 * checks and call back into the fixup functions where we 2129 * checks and call back into the fixup functions where we
2075 * might deadlock. 2130 * might deadlock.
2076 */ 2131 */
2077 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); 2132 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2078 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); 2133 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2079 init_completion(&barr->done); 2134 init_completion(&barr->done);
2080 2135
@@ -2326,27 +2381,17 @@ out_unlock:
2326} 2381}
2327EXPORT_SYMBOL_GPL(flush_workqueue); 2382EXPORT_SYMBOL_GPL(flush_workqueue);
2328 2383
2329/** 2384static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2330 * flush_work - block until a work_struct's callback has terminated 2385 bool wait_executing)
2331 * @work: the work which is to be flushed
2332 *
2333 * Returns false if @work has already terminated.
2334 *
2335 * It is expected that, prior to calling flush_work(), the caller has
2336 * arranged for the work to not be requeued, otherwise it doesn't make
2337 * sense to use this function.
2338 */
2339int flush_work(struct work_struct *work)
2340{ 2386{
2341 struct worker *worker = NULL; 2387 struct worker *worker = NULL;
2342 struct global_cwq *gcwq; 2388 struct global_cwq *gcwq;
2343 struct cpu_workqueue_struct *cwq; 2389 struct cpu_workqueue_struct *cwq;
2344 struct wq_barrier barr;
2345 2390
2346 might_sleep(); 2391 might_sleep();
2347 gcwq = get_work_gcwq(work); 2392 gcwq = get_work_gcwq(work);
2348 if (!gcwq) 2393 if (!gcwq)
2349 return 0; 2394 return false;
2350 2395
2351 spin_lock_irq(&gcwq->lock); 2396 spin_lock_irq(&gcwq->lock);
2352 if (!list_empty(&work->entry)) { 2397 if (!list_empty(&work->entry)) {
@@ -2359,28 +2404,137 @@ int flush_work(struct work_struct *work)
2359 cwq = get_work_cwq(work); 2404 cwq = get_work_cwq(work);
2360 if (unlikely(!cwq || gcwq != cwq->gcwq)) 2405 if (unlikely(!cwq || gcwq != cwq->gcwq))
2361 goto already_gone; 2406 goto already_gone;
2362 } else { 2407 } else if (wait_executing) {
2363 worker = find_worker_executing_work(gcwq, work); 2408 worker = find_worker_executing_work(gcwq, work);
2364 if (!worker) 2409 if (!worker)
2365 goto already_gone; 2410 goto already_gone;
2366 cwq = worker->current_cwq; 2411 cwq = worker->current_cwq;
2367 } 2412 } else
2413 goto already_gone;
2368 2414
2369 insert_wq_barrier(cwq, &barr, work, worker); 2415 insert_wq_barrier(cwq, barr, work, worker);
2370 spin_unlock_irq(&gcwq->lock); 2416 spin_unlock_irq(&gcwq->lock);
2371 2417
2372 lock_map_acquire(&cwq->wq->lockdep_map); 2418 /*
2419 * If @max_active is 1 or rescuer is in use, flushing another work
2420 * item on the same workqueue may lead to deadlock. Make sure the
2421 * flusher is not running on the same workqueue by verifying write
2422 * access.
2423 */
2424 if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
2425 lock_map_acquire(&cwq->wq->lockdep_map);
2426 else
2427 lock_map_acquire_read(&cwq->wq->lockdep_map);
2373 lock_map_release(&cwq->wq->lockdep_map); 2428 lock_map_release(&cwq->wq->lockdep_map);
2374 2429
2375 wait_for_completion(&barr.done); 2430 return true;
2376 destroy_work_on_stack(&barr.work);
2377 return 1;
2378already_gone: 2431already_gone:
2379 spin_unlock_irq(&gcwq->lock); 2432 spin_unlock_irq(&gcwq->lock);
2380 return 0; 2433 return false;
2434}
2435
2436/**
2437 * flush_work - wait for a work to finish executing the last queueing instance
2438 * @work: the work to flush
2439 *
2440 * Wait until @work has finished execution. This function considers
2441 * only the last queueing instance of @work. If @work has been
2442 * enqueued across different CPUs on a non-reentrant workqueue or on
2443 * multiple workqueues, @work might still be executing on return on
2444 * some of the CPUs from earlier queueing.
2445 *
2446 * If @work was queued only on a non-reentrant, ordered or unbound
2447 * workqueue, @work is guaranteed to be idle on return if it hasn't
2448 * been requeued since flush started.
2449 *
2450 * RETURNS:
2451 * %true if flush_work() waited for the work to finish execution,
2452 * %false if it was already idle.
2453 */
2454bool flush_work(struct work_struct *work)
2455{
2456 struct wq_barrier barr;
2457
2458 if (start_flush_work(work, &barr, true)) {
2459 wait_for_completion(&barr.done);
2460 destroy_work_on_stack(&barr.work);
2461 return true;
2462 } else
2463 return false;
2381} 2464}
2382EXPORT_SYMBOL_GPL(flush_work); 2465EXPORT_SYMBOL_GPL(flush_work);
2383 2466
2467static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
2468{
2469 struct wq_barrier barr;
2470 struct worker *worker;
2471
2472 spin_lock_irq(&gcwq->lock);
2473
2474 worker = find_worker_executing_work(gcwq, work);
2475 if (unlikely(worker))
2476 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2477
2478 spin_unlock_irq(&gcwq->lock);
2479
2480 if (unlikely(worker)) {
2481 wait_for_completion(&barr.done);
2482 destroy_work_on_stack(&barr.work);
2483 return true;
2484 } else
2485 return false;
2486}
2487
2488static bool wait_on_work(struct work_struct *work)
2489{
2490 bool ret = false;
2491 int cpu;
2492
2493 might_sleep();
2494
2495 lock_map_acquire(&work->lockdep_map);
2496 lock_map_release(&work->lockdep_map);
2497
2498 for_each_gcwq_cpu(cpu)
2499 ret |= wait_on_cpu_work(get_gcwq(cpu), work);
2500 return ret;
2501}
2502
2503/**
2504 * flush_work_sync - wait until a work has finished execution
2505 * @work: the work to flush
2506 *
2507 * Wait until @work has finished execution. On return, it's
2508 * guaranteed that all queueing instances of @work which happened
2509 * before this function is called are finished. In other words, if
2510 * @work hasn't been requeued since this function was called, @work is
2511 * guaranteed to be idle on return.
2512 *
2513 * RETURNS:
2514 * %true if flush_work_sync() waited for the work to finish execution,
2515 * %false if it was already idle.
2516 */
2517bool flush_work_sync(struct work_struct *work)
2518{
2519 struct wq_barrier barr;
2520 bool pending, waited;
2521
2522 /* we'll wait for executions separately, queue barr only if pending */
2523 pending = start_flush_work(work, &barr, false);
2524
2525 /* wait for executions to finish */
2526 waited = wait_on_work(work);
2527
2528 /* wait for the pending one */
2529 if (pending) {
2530 wait_for_completion(&barr.done);
2531 destroy_work_on_stack(&barr.work);
2532 }
2533
2534 return pending || waited;
2535}
2536EXPORT_SYMBOL_GPL(flush_work_sync);
2537
2384/* 2538/*
2385 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, 2539 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
2386 * so this work can't be re-armed in any way. 2540 * so this work can't be re-armed in any way.
@@ -2423,39 +2577,7 @@ static int try_to_grab_pending(struct work_struct *work)
2423 return ret; 2577 return ret;
2424} 2578}
2425 2579
2426static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) 2580static bool __cancel_work_timer(struct work_struct *work,
2427{
2428 struct wq_barrier barr;
2429 struct worker *worker;
2430
2431 spin_lock_irq(&gcwq->lock);
2432
2433 worker = find_worker_executing_work(gcwq, work);
2434 if (unlikely(worker))
2435 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2436
2437 spin_unlock_irq(&gcwq->lock);
2438
2439 if (unlikely(worker)) {
2440 wait_for_completion(&barr.done);
2441 destroy_work_on_stack(&barr.work);
2442 }
2443}
2444
2445static void wait_on_work(struct work_struct *work)
2446{
2447 int cpu;
2448
2449 might_sleep();
2450
2451 lock_map_acquire(&work->lockdep_map);
2452 lock_map_release(&work->lockdep_map);
2453
2454 for_each_gcwq_cpu(cpu)
2455 wait_on_cpu_work(get_gcwq(cpu), work);
2456}
2457
2458static int __cancel_work_timer(struct work_struct *work,
2459 struct timer_list* timer) 2581 struct timer_list* timer)
2460{ 2582{
2461 int ret; 2583 int ret;
@@ -2472,42 +2594,81 @@ static int __cancel_work_timer(struct work_struct *work,
2472} 2594}
2473 2595
2474/** 2596/**
2475 * cancel_work_sync - block until a work_struct's callback has terminated 2597 * cancel_work_sync - cancel a work and wait for it to finish
2476 * @work: the work which is to be flushed 2598 * @work: the work to cancel
2477 * 2599 *
2478 * Returns true if @work was pending. 2600 * Cancel @work and wait for its execution to finish. This function
2479 * 2601 * can be used even if the work re-queues itself or migrates to
2480 * cancel_work_sync() will cancel the work if it is queued. If the work's 2602 * another workqueue. On return from this function, @work is
2481 * callback appears to be running, cancel_work_sync() will block until it 2603 * guaranteed to be not pending or executing on any CPU.
2482 * has completed.
2483 *
2484 * It is possible to use this function if the work re-queues itself. It can
2485 * cancel the work even if it migrates to another workqueue, however in that
2486 * case it only guarantees that work->func() has completed on the last queued
2487 * workqueue.
2488 * 2604 *
2489 * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not 2605 * cancel_work_sync(&delayed_work->work) must not be used for
2490 * pending, otherwise it goes into a busy-wait loop until the timer expires. 2606 * delayed_work's. Use cancel_delayed_work_sync() instead.
2491 * 2607 *
2492 * The caller must ensure that workqueue_struct on which this work was last 2608 * The caller must ensure that the workqueue on which @work was last
2493 * queued can't be destroyed before this function returns. 2609 * queued can't be destroyed before this function returns.
2610 *
2611 * RETURNS:
2612 * %true if @work was pending, %false otherwise.
2494 */ 2613 */
2495int cancel_work_sync(struct work_struct *work) 2614bool cancel_work_sync(struct work_struct *work)
2496{ 2615{
2497 return __cancel_work_timer(work, NULL); 2616 return __cancel_work_timer(work, NULL);
2498} 2617}
2499EXPORT_SYMBOL_GPL(cancel_work_sync); 2618EXPORT_SYMBOL_GPL(cancel_work_sync);
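A hedged teardown sketch (bar_ctx and bar_teardown are assumptions for illustration): cancel_work_sync() is what makes it safe to free the object embedding the work item.

    #include <linux/slab.h>
    #include <linux/workqueue.h>

    /* Assume bar->work is queued from an interrupt handler via
     * schedule_work(&bar->work). */
    static void bar_teardown(struct bar_ctx *bar)
    {
            /* After this returns, the callback is neither pending nor
             * running on any CPU, even if it used to re-queue itself. */
            cancel_work_sync(&bar->work);
            kfree(bar);
    }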
2500 2619
2501/** 2620/**
2502 * cancel_delayed_work_sync - reliably kill off a delayed work. 2621 * flush_delayed_work - wait for a dwork to finish executing the last queueing
2503 * @dwork: the delayed work struct 2622 * @dwork: the delayed work to flush
2623 *
2624 * Delayed timer is cancelled and the pending work is queued for
2625 * immediate execution. Like flush_work(), this function only
2626 * considers the last queueing instance of @dwork.
2627 *
2628 * RETURNS:
2629 * %true if flush_work() waited for the work to finish execution,
2630 * %false if it was already idle.
2631 */
2632bool flush_delayed_work(struct delayed_work *dwork)
2633{
2634 if (del_timer_sync(&dwork->timer))
2635 __queue_work(raw_smp_processor_id(),
2636 get_work_cwq(&dwork->work)->wq, &dwork->work);
2637 return flush_work(&dwork->work);
2638}
2639EXPORT_SYMBOL(flush_delayed_work);
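A sketch of the typical use (cache and cache_flush_fn are illustrative names): push a deferred write out immediately instead of waiting for its timer.

    /* Assume INIT_DELAYED_WORK(&cache->flush_dwork, cache_flush_fn). */
    static void cache_sync_now(struct cache *cache)
    {
            /* The timer is cancelled, the work is queued right away,
             * and we wait for that (last) queueing instance to finish. */
            flush_delayed_work(&cache->flush_dwork);
    }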
2640
2641/**
2642 * flush_delayed_work_sync - wait for a dwork to finish
2643 * @dwork: the delayed work to flush
2644 *
2645 * Delayed timer is cancelled and the pending work is queued for
2646 * execution immediately. Other than timer handling, its behavior
2647 * is identical to flush_work_sync().
2648 *
2649 * RETURNS:
2650 * %true if flush_work_sync() waited for the work to finish execution,
2651 * %false if it was already idle.
2652 */
2653bool flush_delayed_work_sync(struct delayed_work *dwork)
2654{
2655 if (del_timer_sync(&dwork->timer))
2656 __queue_work(raw_smp_processor_id(),
2657 get_work_cwq(&dwork->work)->wq, &dwork->work);
2658 return flush_work_sync(&dwork->work);
2659}
2660EXPORT_SYMBOL(flush_delayed_work_sync);
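The _sync variant differs only in also waiting for an instance of the callback that is already running elsewhere; a hedged suspend-path sketch using the same illustrative names:

    static int cache_suspend(struct cache *cache)
    {
            /* Like flush_delayed_work(), but also waits for a
             * cache_flush_fn() concurrently executing on another CPU. */
            flush_delayed_work_sync(&cache->flush_dwork);
            return 0;
    }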
2661
2662/**
2663 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
 2664 * @dwork: the delayed work to cancel
2504 * 2665 *
2505 * Returns true if @dwork was pending. 2666 * This is cancel_work_sync() for delayed works.
2506 * 2667 *
2507 * It is possible to use this function if @dwork rearms itself via queue_work() 2668 * RETURNS:
2508 * or queue_delayed_work(). See also the comment for cancel_work_sync(). 2669 * %true if @dwork was pending, %false otherwise.
2509 */ 2670 */
2510int cancel_delayed_work_sync(struct delayed_work *dwork) 2671bool cancel_delayed_work_sync(struct delayed_work *dwork)
2511{ 2672{
2512 return __cancel_work_timer(&dwork->work, &dwork->timer); 2673 return __cancel_work_timer(&dwork->work, &dwork->timer);
2513} 2674}
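A sketch of stopping a self-rearming poll loop on device removal (baz_dev and the poll callback are assumptions, not from this diff):

    /* The poll callback re-arms itself with queue_delayed_work(); once
     * cancel_delayed_work_sync() returns it can no longer do so. */
    static void baz_remove(struct baz_dev *baz)
    {
            cancel_delayed_work_sync(&baz->poll_dwork);
    }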
@@ -2559,23 +2720,6 @@ int schedule_delayed_work(struct delayed_work *dwork,
2559EXPORT_SYMBOL(schedule_delayed_work); 2720EXPORT_SYMBOL(schedule_delayed_work);
2560 2721
2561/** 2722/**
2562 * flush_delayed_work - block until a dwork_struct's callback has terminated
2563 * @dwork: the delayed work which is to be flushed
2564 *
2565 * Any timeout is cancelled, and any pending work is run immediately.
2566 */
2567void flush_delayed_work(struct delayed_work *dwork)
2568{
2569 if (del_timer_sync(&dwork->timer)) {
2570 __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq,
2571 &dwork->work);
2572 put_cpu();
2573 }
2574 flush_work(&dwork->work);
2575}
2576EXPORT_SYMBOL(flush_delayed_work);
2577
2578/**
2579 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 2723 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2580 * @cpu: cpu to use 2724 * @cpu: cpu to use
2581 * @dwork: job to be done 2725 * @dwork: job to be done
@@ -2592,13 +2736,15 @@ int schedule_delayed_work_on(int cpu,
2592EXPORT_SYMBOL(schedule_delayed_work_on); 2736EXPORT_SYMBOL(schedule_delayed_work_on);
2593 2737
2594/** 2738/**
2595 * schedule_on_each_cpu - call a function on each online CPU from keventd 2739 * schedule_on_each_cpu - execute a function synchronously on each online CPU
2596 * @func: the function to call 2740 * @func: the function to call
2597 * 2741 *
2598 * Returns zero on success. 2742 * schedule_on_each_cpu() executes @func on each online CPU using the
2599 * Returns -ve errno on failure. 2743 * system workqueue and blocks until all CPUs have completed.
2600 *
2601 * schedule_on_each_cpu() is very slow. 2744 * schedule_on_each_cpu() is very slow.
2745 *
2746 * RETURNS:
2747 * 0 on success, -errno on failure.
2602 */ 2748 */
2603int schedule_on_each_cpu(work_func_t func) 2749int schedule_on_each_cpu(work_func_t func)
2604{ 2750{
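A short sketch of the call pattern (drain_local_cache is an illustrative callback, not from this diff); the callback runs on a per-CPU bound worker, so smp_processor_id() is valid inside it:

    #include <linux/kernel.h>
    #include <linux/smp.h>
    #include <linux/workqueue.h>

    static void drain_local_cache(struct work_struct *unused)
    {
            pr_info("draining on CPU %d\n", smp_processor_id());
    }

    static int drain_all_caches(void)
    {
            /* Blocks until every online CPU has run the callback;
             * returns 0 on success or -errno (e.g. -ENOMEM). */
            return schedule_on_each_cpu(drain_local_cache);
    }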
@@ -2764,6 +2910,13 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2764 unsigned int cpu; 2910 unsigned int cpu;
2765 2911
2766 /* 2912 /*
2913 * Workqueues which may be used during memory reclaim should
2914 * have a rescuer to guarantee forward progress.
2915 */
2916 if (flags & WQ_MEM_RECLAIM)
2917 flags |= WQ_RESCUER;
2918
2919 /*
2767 * Unbound workqueues aren't concurrency managed and should be 2920 * Unbound workqueues aren't concurrency managed and should be
2768 * dispatched to workers immediately. 2921 * dispatched to workers immediately.
2769 */ 2922 */
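A hedged allocation sketch ("foo_io" is an illustrative name): a workqueue used on the I/O or writeback path should pass WQ_MEM_RECLAIM so the rescuer implied above guarantees forward progress under memory pressure.

    static struct workqueue_struct *foo_io_wq;

    static int foo_init(void)
    {
            /* WQ_MEM_RECLAIM now implies WQ_RESCUER, per the check above. */
            foo_io_wq = alloc_workqueue("foo_io", WQ_MEM_RECLAIM, 1);
            return foo_io_wq ? 0 : -ENOMEM;
    }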
@@ -2828,7 +2981,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2828 */ 2981 */
2829 spin_lock(&workqueue_lock); 2982 spin_lock(&workqueue_lock);
2830 2983
2831 if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) 2984 if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
2832 for_each_cwq_cpu(cpu, wq) 2985 for_each_cwq_cpu(cpu, wq)
2833 get_cwq(cpu, wq)->max_active = 0; 2986 get_cwq(cpu, wq)->max_active = 0;
2834 2987
@@ -2856,11 +3009,35 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
2856 */ 3009 */
2857void destroy_workqueue(struct workqueue_struct *wq) 3010void destroy_workqueue(struct workqueue_struct *wq)
2858{ 3011{
3012 unsigned int flush_cnt = 0;
2859 unsigned int cpu; 3013 unsigned int cpu;
2860 3014
3015 /*
3016 * Mark @wq dying and drain all pending works. Once WQ_DYING is
3017 * set, only chain queueing is allowed. IOW, only currently
3018 * pending or running work items on @wq can queue further work
3019 * items on it. @wq is flushed repeatedly until it becomes empty.
 3021 * The number of flushes is determined by the depth of chaining and
3021 * should be relatively short. Whine if it takes too long.
3022 */
2861 wq->flags |= WQ_DYING; 3023 wq->flags |= WQ_DYING;
3024reflush:
2862 flush_workqueue(wq); 3025 flush_workqueue(wq);
2863 3026
3027 for_each_cwq_cpu(cpu, wq) {
3028 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3029
3030 if (!cwq->nr_active && list_empty(&cwq->delayed_works))
3031 continue;
3032
3033 if (++flush_cnt == 10 ||
3034 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
3035 printk(KERN_WARNING "workqueue %s: flush on "
3036 "destruction isn't complete after %u tries\n",
3037 wq->name, flush_cnt);
3038 goto reflush;
3039 }
3040
2864 /* 3041 /*
2865 * wq list is used to freeze wq, remove from list after 3042 * wq list is used to freeze wq, remove from list after
2866 * flushing is complete in case freeze races us. 3043 * flushing is complete in case freeze races us.
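A sketch of the teardown order this drain loop expects (foo_dev and foo_stop_irq are assumptions): stop all external sources of new work first; work items already on the queue that chain-queue themselves are drained by the WQ_DYING reflush above.

    static void foo_exit(struct foo_dev *foo)
    {
            foo_stop_irq(foo);              /* no new external queueing */
            destroy_workqueue(foo->wq);     /* drains chained works, then frees */
    }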
@@ -2916,7 +3093,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
2916 3093
2917 spin_lock_irq(&gcwq->lock); 3094 spin_lock_irq(&gcwq->lock);
2918 3095
2919 if (!(wq->flags & WQ_FREEZEABLE) || 3096 if (!(wq->flags & WQ_FREEZABLE) ||
2920 !(gcwq->flags & GCWQ_FREEZING)) 3097 !(gcwq->flags & GCWQ_FREEZING))
2921 get_cwq(gcwq->cpu, wq)->max_active = max_active; 3098 get_cwq(gcwq->cpu, wq)->max_active = max_active;
2922 3099
@@ -3166,7 +3343,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
3166 * want to get it over with ASAP - spam rescuers, wake up as 3343 * want to get it over with ASAP - spam rescuers, wake up as
3167 * many idlers as necessary and create new ones till the 3344 * many idlers as necessary and create new ones till the
3168 * worklist is empty. Note that if the gcwq is frozen, there 3345 * worklist is empty. Note that if the gcwq is frozen, there
3169 * may be frozen works in freezeable cwqs. Don't declare 3346 * may be frozen works in freezable cwqs. Don't declare
3170 * completion while frozen. 3347 * completion while frozen.
3171 */ 3348 */
3172 while (gcwq->nr_workers != gcwq->nr_idle || 3349 while (gcwq->nr_workers != gcwq->nr_idle ||
@@ -3424,9 +3601,9 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
3424/** 3601/**
3425 * freeze_workqueues_begin - begin freezing workqueues 3602 * freeze_workqueues_begin - begin freezing workqueues
3426 * 3603 *
3427 * Start freezing workqueues. After this function returns, all 3604 * Start freezing workqueues. After this function returns, all freezable
3428 * freezeable workqueues will queue new works to their frozen_works 3605 * workqueues will queue new works to their frozen_works list instead of
3429 * list instead of gcwq->worklist. 3606 * gcwq->worklist.
3430 * 3607 *
3431 * CONTEXT: 3608 * CONTEXT:
3432 * Grabs and releases workqueue_lock and gcwq->lock's. 3609 * Grabs and releases workqueue_lock and gcwq->lock's.
@@ -3452,7 +3629,7 @@ void freeze_workqueues_begin(void)
3452 list_for_each_entry(wq, &workqueues, list) { 3629 list_for_each_entry(wq, &workqueues, list) {
3453 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3630 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3454 3631
3455 if (cwq && wq->flags & WQ_FREEZEABLE) 3632 if (cwq && wq->flags & WQ_FREEZABLE)
3456 cwq->max_active = 0; 3633 cwq->max_active = 0;
3457 } 3634 }
3458 3635
@@ -3463,7 +3640,7 @@ void freeze_workqueues_begin(void)
3463} 3640}
3464 3641
3465/** 3642/**
3466 * freeze_workqueues_busy - are freezeable workqueues still busy? 3643 * freeze_workqueues_busy - are freezable workqueues still busy?
3467 * 3644 *
3468 * Check whether freezing is complete. This function must be called 3645 * Check whether freezing is complete. This function must be called
3469 * between freeze_workqueues_begin() and thaw_workqueues(). 3646 * between freeze_workqueues_begin() and thaw_workqueues().
@@ -3472,8 +3649,8 @@ void freeze_workqueues_begin(void)
3472 * Grabs and releases workqueue_lock. 3649 * Grabs and releases workqueue_lock.
3473 * 3650 *
3474 * RETURNS: 3651 * RETURNS:
3475 * %true if some freezeable workqueues are still busy. %false if 3652 * %true if some freezable workqueues are still busy. %false if freezing
3476 * freezing is complete. 3653 * is complete.
3477 */ 3654 */
3478bool freeze_workqueues_busy(void) 3655bool freeze_workqueues_busy(void)
3479{ 3656{
@@ -3493,7 +3670,7 @@ bool freeze_workqueues_busy(void)
3493 list_for_each_entry(wq, &workqueues, list) { 3670 list_for_each_entry(wq, &workqueues, list) {
3494 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3671 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3495 3672
3496 if (!cwq || !(wq->flags & WQ_FREEZEABLE)) 3673 if (!cwq || !(wq->flags & WQ_FREEZABLE))
3497 continue; 3674 continue;
3498 3675
3499 BUG_ON(cwq->nr_active < 0); 3676 BUG_ON(cwq->nr_active < 0);
@@ -3538,7 +3715,7 @@ void thaw_workqueues(void)
3538 list_for_each_entry(wq, &workqueues, list) { 3715 list_for_each_entry(wq, &workqueues, list) {
3539 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3716 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3540 3717
3541 if (!cwq || !(wq->flags & WQ_FREEZEABLE)) 3718 if (!cwq || !(wq->flags & WQ_FREEZABLE))
3542 continue; 3719 continue;
3543 3720
3544 /* restore max_active and repopulate worklist */ 3721 /* restore max_active and repopulate worklist */
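A simplified sketch of the sequence these CONFIG_FREEZER-only hooks are driven through by the PM freezer (the real caller, with proper timeout and abort handling, lives in kernel/power/process.c):

    freeze_workqueues_begin();

    while (freeze_workqueues_busy())
            msleep(10);             /* freezable works still draining */

    /* ... suspend/hibernation image is created here ... */

    thaw_workqueues();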
@@ -3612,7 +3789,10 @@ static int __init init_workqueues(void)
3612 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); 3789 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3613 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, 3790 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3614 WQ_UNBOUND_MAX_ACTIVE); 3791 WQ_UNBOUND_MAX_ACTIVE);
3615 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); 3792 system_freezable_wq = alloc_workqueue("events_freezable",
3793 WQ_FREEZABLE, 0);
3794 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3795 !system_unbound_wq || !system_freezable_wq);
3616 return 0; 3796 return 0;
3617} 3797}
3618early_initcall(init_workqueues); 3798early_initcall(init_workqueues);
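With system_freezable_wq in place, work that must not run while the system is freezing or suspended can be queued there instead of on system_wq; a minimal hedged example (foo is illustrative):

    /* Sketch: this work stays frozen between freeze_workqueues_begin()
     * and thaw_workqueues() instead of racing the suspend path. */
    static void foo_kick(struct foo_dev *foo)
    {
            queue_work(system_freezable_wq, &foo->work);
    }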