Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 6
-rw-r--r--  kernel/audit.c | 67
-rw-r--r--  kernel/audit.h | 5
-rw-r--r--  kernel/audit_tree.c | 9
-rw-r--r--  kernel/audit_watch.c | 4
-rw-r--r--  kernel/auditfilter.c | 12
-rw-r--r--  kernel/auditsc.c | 16
-rw-r--r--  kernel/cgroup.c | 160
-rw-r--r--  kernel/cgroup_freezer.c | 72
-rw-r--r--  kernel/compat.c | 21
-rw-r--r--  kernel/configs.c | 1
-rw-r--r--  kernel/cpuset.c | 17
-rw-r--r--  kernel/cred.c | 4
-rw-r--r--  kernel/debug/debug_core.c | 153
-rw-r--r--  kernel/debug/debug_core.h | 1
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 3
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 66
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 41
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 4
-rw-r--r--  kernel/early_res.c | 590
-rw-r--r--  kernel/exit.c | 14
-rw-r--r--  kernel/fork.c | 36
-rw-r--r--  kernel/futex.c | 68
-rw-r--r--  kernel/futex_compat.c | 2
-rw-r--r--  kernel/gcov/fs.c | 245
-rw-r--r--  kernel/groups.c | 5
-rw-r--r--  kernel/hrtimer.c | 16
-rw-r--r--  kernel/hung_task.c | 4
-rw-r--r--  kernel/hw_breakpoint.c | 74
-rw-r--r--  kernel/irq/Kconfig | 53
-rw-r--r--  kernel/irq/Makefile | 3
-rw-r--r--  kernel/irq/autoprobe.c | 15
-rw-r--r--  kernel/irq/chip.c | 378
-rw-r--r--  kernel/irq/dummychip.c | 68
-rw-r--r--  kernel/irq/handle.c | 341
-rw-r--r--  kernel/irq/internals.h | 39
-rw-r--r--  kernel/irq/irqdesc.c | 410
-rw-r--r--  kernel/irq/manage.c | 91
-rw-r--r--  kernel/irq/migration.c | 12
-rw-r--r--  kernel/irq/numa_migrate.c | 120
-rw-r--r--  kernel/irq/proc.c | 26
-rw-r--r--  kernel/irq/resend.c | 5
-rw-r--r--  kernel/irq/spurious.c | 8
-rw-r--r--  kernel/irq_work.c | 164
-rw-r--r--  kernel/jump_label.c | 484
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kfifo.c | 11
-rw-r--r--  kernel/kmod.c | 4
-rw-r--r--  kernel/kprobes.c | 58
-rw-r--r--  kernel/lockdep.c | 51
-rw-r--r--  kernel/module.c | 12
-rw-r--r--  kernel/mutex.c | 23
-rw-r--r--  kernel/ns_cgroup.c | 8
-rw-r--r--  kernel/perf_event.c | 2576
-rw-r--r--  kernel/pid.c | 3
-rw-r--r--  kernel/pm_qos_params.c | 19
-rw-r--r--  kernel/power/Kconfig | 17
-rw-r--r--  kernel/power/hibernate.c | 26
-rw-r--r--  kernel/power/main.c | 29
-rw-r--r--  kernel/power/power.h | 10
-rw-r--r--  kernel/power/process.c | 11
-rw-r--r--  kernel/power/snapshot.c | 117
-rw-r--r--  kernel/power/swap.c | 312
-rw-r--r--  kernel/printk.c | 9
-rw-r--r--  kernel/profile.c | 1
-rw-r--r--  kernel/ptrace.c | 36
-rw-r--r--  kernel/rcupdate.c | 8
-rw-r--r--  kernel/rcutiny.c | 33
-rw-r--r--  kernel/rcutiny_plugin.h | 582
-rw-r--r--  kernel/rcutorture.c | 17
-rw-r--r--  kernel/rcutree.c | 92
-rw-r--r--  kernel/rcutree.h | 20
-rw-r--r--  kernel/rcutree_plugin.h | 47
-rw-r--r--  kernel/rcutree_trace.c | 12
-rw-r--r--  kernel/resource.c | 153
-rw-r--r--  kernel/rtmutex-tester.c | 6
-rw-r--r--  kernel/sched.c | 341
-rw-r--r--  kernel/sched_fair.c | 121
-rw-r--r--  kernel/sched_features.h | 5
-rw-r--r--  kernel/sched_rt.c | 40
-rw-r--r--  kernel/sched_stats.h | 20
-rw-r--r--  kernel/sched_stoptask.c | 108
-rw-r--r--  kernel/signal.c | 13
-rw-r--r--  kernel/smp.c | 25
-rw-r--r--  kernel/softirq.c | 91
-rw-r--r--  kernel/srcu.c | 2
-rw-r--r--  kernel/stop_machine.c | 14
-rw-r--r--  kernel/sys.c | 2
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 21
-rw-r--r--  kernel/sysctl_check.c | 9
-rw-r--r--  kernel/taskstats.c | 172
-rw-r--r--  kernel/test_kprobes.c | 12
-rw-r--r--  kernel/time/ntp.c | 14
-rw-r--r--  kernel/timer.c | 7
-rw-r--r--  kernel/trace/Kconfig | 7
-rw-r--r--  kernel/trace/blktrace.c | 16
-rw-r--r--  kernel/trace/ftrace.c | 144
-rw-r--r--  kernel/trace/ring_buffer.c | 362
-rw-r--r--  kernel/trace/trace.c | 21
-rw-r--r--  kernel/trace/trace.h | 4
-rw-r--r--  kernel/trace/trace_event_perf.c | 31
-rw-r--r--  kernel/trace/trace_events.c | 194
-rw-r--r--  kernel/trace/trace_functions_graph.c | 219
-rw-r--r--  kernel/trace/trace_irqsoff.c | 152
-rw-r--r--  kernel/trace/trace_kdb.c | 1
-rw-r--r--  kernel/trace/trace_kprobe.c | 46
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 256
-rw-r--r--  kernel/trace/trace_stack.c | 3
-rw-r--r--  kernel/trace/trace_workqueue.c | 10
-rw-r--r--  kernel/tracepoint.c | 14
-rw-r--r--  kernel/tsacct.c | 10
-rw-r--r--  kernel/user.c | 1
-rw-r--r--  kernel/wait.c | 6
-rw-r--r--  kernel/watchdog.c | 63
-rw-r--r--  kernel/workqueue.c | 399
118 files changed, 7200 insertions(+), 4019 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b72d1a74be0..0b5ff083fa22 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,8 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o range.o 13 async.o range.o jump_label.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
15obj-y += groups.o 14obj-y += groups.o
16 15
17ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
@@ -23,6 +22,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg
23CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
24CFLAGS_REMOVE_sched_clock.o = -pg 23CFLAGS_REMOVE_sched_clock.o = -pg
25CFLAGS_REMOVE_perf_event.o = -pg 24CFLAGS_REMOVE_perf_event.o = -pg
25CFLAGS_REMOVE_irq_work.o = -pg
26endif 26endif
27 27
28obj-$(CONFIG_FREEZER) += freezer.o 28obj-$(CONFIG_FREEZER) += freezer.o
@@ -86,6 +86,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o
86obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o 86obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
87obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 87obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
88obj-$(CONFIG_TINY_RCU) += rcutiny.o 88obj-$(CONFIG_TINY_RCU) += rcutiny.o
89obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
89obj-$(CONFIG_RELAY) += relay.o 90obj-$(CONFIG_RELAY) += relay.o
90obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 91obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
91obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 92obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -100,6 +101,7 @@ obj-$(CONFIG_TRACING) += trace/
100obj-$(CONFIG_X86_DS) += trace/ 101obj-$(CONFIG_X86_DS) += trace/
101obj-$(CONFIG_RING_BUFFER) += trace/ 102obj-$(CONFIG_RING_BUFFER) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o 103obj-$(CONFIG_SMP) += sched_cpupri.o
104obj-$(CONFIG_IRQ_WORK) += irq_work.o
103obj-$(CONFIG_PERF_EVENTS) += perf_event.o 105obj-$(CONFIG_PERF_EVENTS) += perf_event.o
104obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 106obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
105obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 107obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/audit.c b/kernel/audit.c
index d96045789b54..77770a034d59 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -467,23 +467,16 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
467 struct task_struct *tsk; 467 struct task_struct *tsk;
468 int err; 468 int err;
469 469
470 read_lock(&tasklist_lock); 470 rcu_read_lock();
471 tsk = find_task_by_vpid(pid); 471 tsk = find_task_by_vpid(pid);
472 err = -ESRCH; 472 if (!tsk) {
473 if (!tsk) 473 rcu_read_unlock();
474 goto out; 474 return -ESRCH;
475 err = 0; 475 }
476 476 get_task_struct(tsk);
477 spin_lock_irq(&tsk->sighand->siglock); 477 rcu_read_unlock();
478 if (!tsk->signal->audit_tty) 478 err = tty_audit_push_task(tsk, loginuid, sessionid);
479 err = -EPERM; 479 put_task_struct(tsk);
480 spin_unlock_irq(&tsk->sighand->siglock);
481 if (err)
482 goto out;
483
484 tty_audit_push_task(tsk, loginuid, sessionid);
485out:
486 read_unlock(&tasklist_lock);
487 return err; 480 return err;
488} 481}
489 482
@@ -506,7 +499,7 @@ int audit_send_list(void *_dest)
506} 499}
507 500
508struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 501struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
509 int multi, void *payload, int size) 502 int multi, const void *payload, int size)
510{ 503{
511 struct sk_buff *skb; 504 struct sk_buff *skb;
512 struct nlmsghdr *nlh; 505 struct nlmsghdr *nlh;
@@ -555,8 +548,8 @@ static int audit_send_reply_thread(void *arg)
555 * Allocates an skb, builds the netlink message, and sends it to the pid. 548 * Allocates an skb, builds the netlink message, and sends it to the pid.
556 * No failure notifications. 549 * No failure notifications.
557 */ 550 */
558void audit_send_reply(int pid, int seq, int type, int done, int multi, 551static void audit_send_reply(int pid, int seq, int type, int done, int multi,
559 void *payload, int size) 552 const void *payload, int size)
560{ 553{
561 struct sk_buff *skb; 554 struct sk_buff *skb;
562 struct task_struct *tsk; 555 struct task_struct *tsk;
@@ -880,40 +873,40 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
880 case AUDIT_TTY_GET: { 873 case AUDIT_TTY_GET: {
881 struct audit_tty_status s; 874 struct audit_tty_status s;
882 struct task_struct *tsk; 875 struct task_struct *tsk;
876 unsigned long flags;
883 877
884 read_lock(&tasklist_lock); 878 rcu_read_lock();
885 tsk = find_task_by_vpid(pid); 879 tsk = find_task_by_vpid(pid);
886 if (!tsk) 880 if (tsk && lock_task_sighand(tsk, &flags)) {
887 err = -ESRCH;
888 else {
889 spin_lock_irq(&tsk->sighand->siglock);
890 s.enabled = tsk->signal->audit_tty != 0; 881 s.enabled = tsk->signal->audit_tty != 0;
891 spin_unlock_irq(&tsk->sighand->siglock); 882 unlock_task_sighand(tsk, &flags);
892 } 883 } else
893 read_unlock(&tasklist_lock); 884 err = -ESRCH;
894 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, 885 rcu_read_unlock();
895 &s, sizeof(s)); 886
887 if (!err)
888 audit_send_reply(NETLINK_CB(skb).pid, seq,
889 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
896 break; 890 break;
897 } 891 }
898 case AUDIT_TTY_SET: { 892 case AUDIT_TTY_SET: {
899 struct audit_tty_status *s; 893 struct audit_tty_status *s;
900 struct task_struct *tsk; 894 struct task_struct *tsk;
895 unsigned long flags;
901 896
902 if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) 897 if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
903 return -EINVAL; 898 return -EINVAL;
904 s = data; 899 s = data;
905 if (s->enabled != 0 && s->enabled != 1) 900 if (s->enabled != 0 && s->enabled != 1)
906 return -EINVAL; 901 return -EINVAL;
907 read_lock(&tasklist_lock); 902 rcu_read_lock();
908 tsk = find_task_by_vpid(pid); 903 tsk = find_task_by_vpid(pid);
909 if (!tsk) 904 if (tsk && lock_task_sighand(tsk, &flags)) {
910 err = -ESRCH;
911 else {
912 spin_lock_irq(&tsk->sighand->siglock);
913 tsk->signal->audit_tty = s->enabled != 0; 905 tsk->signal->audit_tty = s->enabled != 0;
914 spin_unlock_irq(&tsk->sighand->siglock); 906 unlock_task_sighand(tsk, &flags);
915 } 907 } else
916 read_unlock(&tasklist_lock); 908 err = -ESRCH;
909 rcu_read_unlock();
917 break; 910 break;
918 } 911 }
919 default: 912 default:
diff --git a/kernel/audit.h b/kernel/audit.h
index f7206db4e13d..91e7071c4d2c 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -84,10 +84,7 @@ extern int audit_compare_dname_path(const char *dname, const char *path,
84 int *dirlen); 84 int *dirlen);
85extern struct sk_buff * audit_make_reply(int pid, int seq, int type, 85extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
86 int done, int multi, 86 int done, int multi,
87 void *payload, int size); 87 const void *payload, int size);
88extern void audit_send_reply(int pid, int seq, int type,
89 int done, int multi,
90 void *payload, int size);
91extern void audit_panic(const char *message); 88extern void audit_panic(const char *message);
92 89
93struct audit_netlink_list { 90struct audit_netlink_list {
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 7f18d3a4527e..37b2bea170c8 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -223,7 +223,7 @@ static void untag_chunk(struct node *p)
223{ 223{
224 struct audit_chunk *chunk = find_chunk(p); 224 struct audit_chunk *chunk = find_chunk(p);
225 struct fsnotify_mark *entry = &chunk->mark; 225 struct fsnotify_mark *entry = &chunk->mark;
226 struct audit_chunk *new; 226 struct audit_chunk *new = NULL;
227 struct audit_tree *owner; 227 struct audit_tree *owner;
228 int size = chunk->count - 1; 228 int size = chunk->count - 1;
229 int i, j; 229 int i, j;
@@ -232,9 +232,14 @@ static void untag_chunk(struct node *p)
232 232
233 spin_unlock(&hash_lock); 233 spin_unlock(&hash_lock);
234 234
235 if (size)
236 new = alloc_chunk(size);
237
235 spin_lock(&entry->lock); 238 spin_lock(&entry->lock);
236 if (chunk->dead || !entry->i.inode) { 239 if (chunk->dead || !entry->i.inode) {
237 spin_unlock(&entry->lock); 240 spin_unlock(&entry->lock);
241 if (new)
242 free_chunk(new);
238 goto out; 243 goto out;
239 } 244 }
240 245
@@ -255,9 +260,9 @@ static void untag_chunk(struct node *p)
255 goto out; 260 goto out;
256 } 261 }
257 262
258 new = alloc_chunk(size);
259 if (!new) 263 if (!new)
260 goto Fallback; 264 goto Fallback;
265
261 fsnotify_duplicate_mark(&new->mark, entry); 266 fsnotify_duplicate_mark(&new->mark, entry);
262 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { 267 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
263 free_chunk(new); 268 free_chunk(new);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index f0c9b2e7542d..d2e3c7866460 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -60,7 +60,7 @@ struct audit_parent {
60}; 60};
61 61
62/* fsnotify handle. */ 62/* fsnotify handle. */
63struct fsnotify_group *audit_watch_group; 63static struct fsnotify_group *audit_watch_group;
64 64
65/* fsnotify events we care about. */ 65/* fsnotify events we care about. */
66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ 66#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
@@ -123,7 +123,7 @@ void audit_put_watch(struct audit_watch *watch)
123 } 123 }
124} 124}
125 125
126void audit_remove_watch(struct audit_watch *watch) 126static void audit_remove_watch(struct audit_watch *watch)
127{ 127{
128 list_del(&watch->wlist); 128 list_del(&watch->wlist);
129 audit_put_parent(watch->parent); 129 audit_put_parent(watch->parent);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index eb7675499fb5..add2819af71b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1252,6 +1252,18 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1252 case AUDIT_LOGINUID: 1252 case AUDIT_LOGINUID:
1253 result = audit_comparator(cb->loginuid, f->op, f->val); 1253 result = audit_comparator(cb->loginuid, f->op, f->val);
1254 break; 1254 break;
1255 case AUDIT_SUBJ_USER:
1256 case AUDIT_SUBJ_ROLE:
1257 case AUDIT_SUBJ_TYPE:
1258 case AUDIT_SUBJ_SEN:
1259 case AUDIT_SUBJ_CLR:
1260 if (f->lsm_rule)
1261 result = security_audit_rule_match(cb->sid,
1262 f->type,
1263 f->op,
1264 f->lsm_rule,
1265 NULL);
1266 break;
1255 } 1267 }
1256 1268
1257 if (!result) 1269 if (!result)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1b31c130d034..f49a0318c2ed 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -241,6 +241,10 @@ struct audit_context {
241 pid_t pid; 241 pid_t pid;
242 struct audit_cap_data cap; 242 struct audit_cap_data cap;
243 } capset; 243 } capset;
244 struct {
245 int fd;
246 int flags;
247 } mmap;
244 }; 248 };
245 int fds[2]; 249 int fds[2];
246 250
@@ -1305,6 +1309,10 @@ static void show_special(struct audit_context *context, int *call_panic)
1305 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); 1309 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
1306 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); 1310 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
1307 break; } 1311 break; }
1312 case AUDIT_MMAP: {
1313 audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
1314 context->mmap.flags);
1315 break; }
1308 } 1316 }
1309 audit_log_end(ab); 1317 audit_log_end(ab);
1310} 1318}
@@ -2476,6 +2484,14 @@ void __audit_log_capset(pid_t pid,
2476 context->type = AUDIT_CAPSET; 2484 context->type = AUDIT_CAPSET;
2477} 2485}
2478 2486
2487void __audit_mmap_fd(int fd, int flags)
2488{
2489 struct audit_context *context = current->audit_context;
2490 context->mmap.fd = fd;
2491 context->mmap.flags = flags;
2492 context->type = AUDIT_MMAP;
2493}
2494
2479/** 2495/**
2480 * audit_core_dumps - record information about processes that end abnormally 2496 * audit_core_dumps - record information about processes that end abnormally
2481 * @signr: signal value 2497 * @signr: signal value
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 192f88c5b0f9..66a416b42c18 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,6 @@
52#include <linux/cgroupstats.h> 52#include <linux/cgroupstats.h>
53#include <linux/hash.h> 53#include <linux/hash.h>
54#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/smp_lock.h>
56#include <linux/pid_namespace.h> 55#include <linux/pid_namespace.h>
57#include <linux/idr.h> 56#include <linux/idr.h>
58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
@@ -138,7 +137,7 @@ struct css_id {
138 * is called after synchronize_rcu(). But for safe use, css_is_removed() 137 * is called after synchronize_rcu(). But for safe use, css_is_removed()
139 * css_tryget() should be used for avoiding race. 138 * css_tryget() should be used for avoiding race.
140 */ 139 */
141 struct cgroup_subsys_state *css; 140 struct cgroup_subsys_state __rcu *css;
142 /* 141 /*
143 * ID of this css. 142 * ID of this css.
144 */ 143 */
@@ -244,6 +243,11 @@ static int notify_on_release(const struct cgroup *cgrp)
244 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 243 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
245} 244}
246 245
246static int clone_children(const struct cgroup *cgrp)
247{
248 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
249}
250
247/* 251/*
248 * for_each_subsys() allows you to iterate on each subsystem attached to 252 * for_each_subsys() allows you to iterate on each subsystem attached to
249 * an active hierarchy 253 * an active hierarchy
@@ -778,6 +782,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
778 struct inode *inode = new_inode(sb); 782 struct inode *inode = new_inode(sb);
779 783
780 if (inode) { 784 if (inode) {
785 inode->i_ino = get_next_ino();
781 inode->i_mode = mode; 786 inode->i_mode = mode;
782 inode->i_uid = current_fsuid(); 787 inode->i_uid = current_fsuid();
783 inode->i_gid = current_fsgid(); 788 inode->i_gid = current_fsgid();
@@ -1040,6 +1045,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1040 seq_puts(seq, ",noprefix"); 1045 seq_puts(seq, ",noprefix");
1041 if (strlen(root->release_agent_path)) 1046 if (strlen(root->release_agent_path))
1042 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1047 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1048 if (clone_children(&root->top_cgroup))
1049 seq_puts(seq, ",clone_children");
1043 if (strlen(root->name)) 1050 if (strlen(root->name))
1044 seq_printf(seq, ",name=%s", root->name); 1051 seq_printf(seq, ",name=%s", root->name);
1045 mutex_unlock(&cgroup_mutex); 1052 mutex_unlock(&cgroup_mutex);
@@ -1050,6 +1057,7 @@ struct cgroup_sb_opts {
1050 unsigned long subsys_bits; 1057 unsigned long subsys_bits;
1051 unsigned long flags; 1058 unsigned long flags;
1052 char *release_agent; 1059 char *release_agent;
1060 bool clone_children;
1053 char *name; 1061 char *name;
1054 /* User explicitly requested empty subsystem */ 1062 /* User explicitly requested empty subsystem */
1055 bool none; 1063 bool none;
@@ -1066,7 +1074,8 @@ struct cgroup_sb_opts {
1066 */ 1074 */
1067static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1075static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1068{ 1076{
1069 char *token, *o = data ?: "all"; 1077 char *token, *o = data;
1078 bool all_ss = false, one_ss = false;
1070 unsigned long mask = (unsigned long)-1; 1079 unsigned long mask = (unsigned long)-1;
1071 int i; 1080 int i;
1072 bool module_pin_failed = false; 1081 bool module_pin_failed = false;
@@ -1082,22 +1091,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1082 while ((token = strsep(&o, ",")) != NULL) { 1091 while ((token = strsep(&o, ",")) != NULL) {
1083 if (!*token) 1092 if (!*token)
1084 return -EINVAL; 1093 return -EINVAL;
1085 if (!strcmp(token, "all")) { 1094 if (!strcmp(token, "none")) {
1086 /* Add all non-disabled subsystems */
1087 opts->subsys_bits = 0;
1088 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1089 struct cgroup_subsys *ss = subsys[i];
1090 if (ss == NULL)
1091 continue;
1092 if (!ss->disabled)
1093 opts->subsys_bits |= 1ul << i;
1094 }
1095 } else if (!strcmp(token, "none")) {
1096 /* Explicitly have no subsystems */ 1095 /* Explicitly have no subsystems */
1097 opts->none = true; 1096 opts->none = true;
1098 } else if (!strcmp(token, "noprefix")) { 1097 continue;
1098 }
1099 if (!strcmp(token, "all")) {
1100 /* Mutually exclusive option 'all' + subsystem name */
1101 if (one_ss)
1102 return -EINVAL;
1103 all_ss = true;
1104 continue;
1105 }
1106 if (!strcmp(token, "noprefix")) {
1099 set_bit(ROOT_NOPREFIX, &opts->flags); 1107 set_bit(ROOT_NOPREFIX, &opts->flags);
1100 } else if (!strncmp(token, "release_agent=", 14)) { 1108 continue;
1109 }
1110 if (!strcmp(token, "clone_children")) {
1111 opts->clone_children = true;
1112 continue;
1113 }
1114 if (!strncmp(token, "release_agent=", 14)) {
1101 /* Specifying two release agents is forbidden */ 1115 /* Specifying two release agents is forbidden */
1102 if (opts->release_agent) 1116 if (opts->release_agent)
1103 return -EINVAL; 1117 return -EINVAL;
@@ -1105,7 +1119,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1105 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); 1119 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1106 if (!opts->release_agent) 1120 if (!opts->release_agent)
1107 return -ENOMEM; 1121 return -ENOMEM;
1108 } else if (!strncmp(token, "name=", 5)) { 1122 continue;
1123 }
1124 if (!strncmp(token, "name=", 5)) {
1109 const char *name = token + 5; 1125 const char *name = token + 5;
1110 /* Can't specify an empty name */ 1126 /* Can't specify an empty name */
1111 if (!strlen(name)) 1127 if (!strlen(name))
@@ -1127,20 +1143,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1127 GFP_KERNEL); 1143 GFP_KERNEL);
1128 if (!opts->name) 1144 if (!opts->name)
1129 return -ENOMEM; 1145 return -ENOMEM;
1130 } else { 1146
1131 struct cgroup_subsys *ss; 1147 continue;
1132 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1148 }
1133 ss = subsys[i]; 1149
1134 if (ss == NULL) 1150 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1135 continue; 1151 struct cgroup_subsys *ss = subsys[i];
1136 if (!strcmp(token, ss->name)) { 1152 if (ss == NULL)
1137 if (!ss->disabled) 1153 continue;
1138 set_bit(i, &opts->subsys_bits); 1154 if (strcmp(token, ss->name))
1139 break; 1155 continue;
1140 } 1156 if (ss->disabled)
1141 } 1157 continue;
1142 if (i == CGROUP_SUBSYS_COUNT) 1158
1143 return -ENOENT; 1159 /* Mutually exclusive option 'all' + subsystem name */
1160 if (all_ss)
1161 return -EINVAL;
1162 set_bit(i, &opts->subsys_bits);
1163 one_ss = true;
1164
1165 break;
1166 }
1167 if (i == CGROUP_SUBSYS_COUNT)
1168 return -ENOENT;
1169 }
1170
1171 /*
1172 * If the 'all' option was specified select all the subsystems,
1173 * otherwise 'all, 'none' and a subsystem name options were not
1174 * specified, let's default to 'all'
1175 */
1176 if (all_ss || (!all_ss && !one_ss && !opts->none)) {
1177 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1178 struct cgroup_subsys *ss = subsys[i];
1179 if (ss == NULL)
1180 continue;
1181 if (ss->disabled)
1182 continue;
1183 set_bit(i, &opts->subsys_bits);
1144 } 1184 }
1145 } 1185 }
1146 1186
@@ -1222,7 +1262,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1222 struct cgroup *cgrp = &root->top_cgroup; 1262 struct cgroup *cgrp = &root->top_cgroup;
1223 struct cgroup_sb_opts opts; 1263 struct cgroup_sb_opts opts;
1224 1264
1225 lock_kernel();
1226 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1265 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1227 mutex_lock(&cgroup_mutex); 1266 mutex_lock(&cgroup_mutex);
1228 1267
@@ -1255,7 +1294,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1255 kfree(opts.name); 1294 kfree(opts.name);
1256 mutex_unlock(&cgroup_mutex); 1295 mutex_unlock(&cgroup_mutex);
1257 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1296 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1258 unlock_kernel();
1259 return ret; 1297 return ret;
1260} 1298}
1261 1299
@@ -1357,6 +1395,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1357 strcpy(root->release_agent_path, opts->release_agent); 1395 strcpy(root->release_agent_path, opts->release_agent);
1358 if (opts->name) 1396 if (opts->name)
1359 strcpy(root->name, opts->name); 1397 strcpy(root->name, opts->name);
1398 if (opts->clone_children)
1399 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
1360 return root; 1400 return root;
1361} 1401}
1362 1402
@@ -1420,9 +1460,9 @@ static int cgroup_get_rootdir(struct super_block *sb)
1420 return 0; 1460 return 0;
1421} 1461}
1422 1462
1423static int cgroup_get_sb(struct file_system_type *fs_type, 1463static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1424 int flags, const char *unused_dev_name, 1464 int flags, const char *unused_dev_name,
1425 void *data, struct vfsmount *mnt) 1465 void *data)
1426{ 1466{
1427 struct cgroup_sb_opts opts; 1467 struct cgroup_sb_opts opts;
1428 struct cgroupfs_root *root; 1468 struct cgroupfs_root *root;
@@ -1556,10 +1596,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1556 drop_parsed_module_refcounts(opts.subsys_bits); 1596 drop_parsed_module_refcounts(opts.subsys_bits);
1557 } 1597 }
1558 1598
1559 simple_set_mnt(mnt, sb);
1560 kfree(opts.release_agent); 1599 kfree(opts.release_agent);
1561 kfree(opts.name); 1600 kfree(opts.name);
1562 return 0; 1601 return dget(sb->s_root);
1563 1602
1564 drop_new_super: 1603 drop_new_super:
1565 deactivate_locked_super(sb); 1604 deactivate_locked_super(sb);
@@ -1568,8 +1607,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1568 out_err: 1607 out_err:
1569 kfree(opts.release_agent); 1608 kfree(opts.release_agent);
1570 kfree(opts.name); 1609 kfree(opts.name);
1571 1610 return ERR_PTR(ret);
1572 return ret;
1573} 1611}
1574 1612
1575static void cgroup_kill_sb(struct super_block *sb) { 1613static void cgroup_kill_sb(struct super_block *sb) {
@@ -1619,7 +1657,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1619 1657
1620static struct file_system_type cgroup_fs_type = { 1658static struct file_system_type cgroup_fs_type = {
1621 .name = "cgroup", 1659 .name = "cgroup",
1622 .get_sb = cgroup_get_sb, 1660 .mount = cgroup_mount,
1623 .kill_sb = cgroup_kill_sb, 1661 .kill_sb = cgroup_kill_sb,
1624}; 1662};
1625 1663
@@ -1791,19 +1829,20 @@ out:
1791} 1829}
1792 1830
1793/** 1831/**
1794 * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup 1832 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
1833 * @from: attach to all cgroups of a given task
1795 * @tsk: the task to be attached 1834 * @tsk: the task to be attached
1796 */ 1835 */
1797int cgroup_attach_task_current_cg(struct task_struct *tsk) 1836int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1798{ 1837{
1799 struct cgroupfs_root *root; 1838 struct cgroupfs_root *root;
1800 struct cgroup *cur_cg;
1801 int retval = 0; 1839 int retval = 0;
1802 1840
1803 cgroup_lock(); 1841 cgroup_lock();
1804 for_each_active_root(root) { 1842 for_each_active_root(root) {
1805 cur_cg = task_cgroup_from_root(current, root); 1843 struct cgroup *from_cg = task_cgroup_from_root(from, root);
1806 retval = cgroup_attach_task(cur_cg, tsk); 1844
1845 retval = cgroup_attach_task(from_cg, tsk);
1807 if (retval) 1846 if (retval)
1808 break; 1847 break;
1809 } 1848 }
@@ -1811,7 +1850,7 @@ int cgroup_attach_task_current_cg(struct task_struct *tsk)
1811 1850
1812 return retval; 1851 return retval;
1813} 1852}
1814EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg); 1853EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
1815 1854
1816/* 1855/*
1817 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex 1856 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
@@ -1882,6 +1921,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1882 const char *buffer) 1921 const char *buffer)
1883{ 1922{
1884 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 1923 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1924 if (strlen(buffer) >= PATH_MAX)
1925 return -EINVAL;
1885 if (!cgroup_lock_live_group(cgrp)) 1926 if (!cgroup_lock_live_group(cgrp))
1886 return -ENODEV; 1927 return -ENODEV;
1887 strcpy(cgrp->root->release_agent_path, buffer); 1928 strcpy(cgrp->root->release_agent_path, buffer);
@@ -3175,6 +3216,23 @@ fail:
3175 return ret; 3216 return ret;
3176} 3217}
3177 3218
3219static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3220 struct cftype *cft)
3221{
3222 return clone_children(cgrp);
3223}
3224
3225static int cgroup_clone_children_write(struct cgroup *cgrp,
3226 struct cftype *cft,
3227 u64 val)
3228{
3229 if (val)
3230 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3231 else
3232 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3233 return 0;
3234}
3235
3178/* 3236/*
3179 * for the common functions, 'private' gives the type of file 3237 * for the common functions, 'private' gives the type of file
3180 */ 3238 */
@@ -3205,6 +3263,11 @@ static struct cftype files[] = {
3205 .write_string = cgroup_write_event_control, 3263 .write_string = cgroup_write_event_control,
3206 .mode = S_IWUGO, 3264 .mode = S_IWUGO,
3207 }, 3265 },
3266 {
3267 .name = "cgroup.clone_children",
3268 .read_u64 = cgroup_clone_children_read,
3269 .write_u64 = cgroup_clone_children_write,
3270 },
3208}; 3271};
3209 3272
3210static struct cftype cft_release_agent = { 3273static struct cftype cft_release_agent = {
@@ -3334,6 +3397,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3334 if (notify_on_release(parent)) 3397 if (notify_on_release(parent))
3335 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3398 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3336 3399
3400 if (clone_children(parent))
3401 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3402
3337 for_each_subsys(root, ss) { 3403 for_each_subsys(root, ss) {
3338 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3404 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3339 3405
@@ -3348,6 +3414,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3348 goto err_destroy; 3414 goto err_destroy;
3349 } 3415 }
3350 /* At error, ->destroy() callback has to free assigned ID. */ 3416 /* At error, ->destroy() callback has to free assigned ID. */
3417 if (clone_children(parent) && ss->post_clone)
3418 ss->post_clone(ss, cgrp);
3351 } 3419 }
3352 3420
3353 cgroup_lock_hierarchy(root); 3421 cgroup_lock_hierarchy(root);
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index ce71ed53e88f..e7bebb7c6c38 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 48 struct freezer, css);
49} 49}
50 50
51int cgroup_freezing_or_frozen(struct task_struct *task) 51static inline int __cgroup_freezing_or_frozen(struct task_struct *task)
52{ 52{
53 struct freezer *freezer; 53 enum freezer_state state = task_freezer(task)->state;
54 enum freezer_state state; 54 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
55}
55 56
57int cgroup_freezing_or_frozen(struct task_struct *task)
58{
59 int result;
56 task_lock(task); 60 task_lock(task);
57 freezer = task_freezer(task); 61 result = __cgroup_freezing_or_frozen(task);
58 if (!freezer->css.cgroup->parent)
59 state = CGROUP_THAWED; /* root cgroup can't be frozen */
60 else
61 state = freezer->state;
62 task_unlock(task); 62 task_unlock(task);
63 63 return result;
64 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
65} 64}
66 65
67/* 66/*
@@ -154,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss,
154 kfree(cgroup_freezer(cgroup)); 153 kfree(cgroup_freezer(cgroup));
155} 154}
156 155
157/* Task is frozen or will freeze immediately when next it gets woken */
158static bool is_task_frozen_enough(struct task_struct *task)
159{
160 return frozen(task) ||
161 (task_is_stopped_or_traced(task) && freezing(task));
162}
163
164/* 156/*
165 * The call to cgroup_lock() in the freezer.state write method prevents 157 * The call to cgroup_lock() in the freezer.state write method prevents
166 * a write to that file racing against an attach, and hence the 158 * a write to that file racing against an attach, and hence the
@@ -174,24 +166,25 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
174 166
175 /* 167 /*
176 * Anything frozen can't move or be moved to/from. 168 * Anything frozen can't move or be moved to/from.
177 *
178 * Since orig_freezer->state == FROZEN means that @task has been
179 * frozen, so it's sufficient to check the latter condition.
180 */ 169 */
181 170
182 if (is_task_frozen_enough(task)) 171 freezer = cgroup_freezer(new_cgroup);
172 if (freezer->state != CGROUP_THAWED)
183 return -EBUSY; 173 return -EBUSY;
184 174
185 freezer = cgroup_freezer(new_cgroup); 175 rcu_read_lock();
186 if (freezer->state == CGROUP_FROZEN) 176 if (__cgroup_freezing_or_frozen(task)) {
177 rcu_read_unlock();
187 return -EBUSY; 178 return -EBUSY;
179 }
180 rcu_read_unlock();
188 181
189 if (threadgroup) { 182 if (threadgroup) {
190 struct task_struct *c; 183 struct task_struct *c;
191 184
192 rcu_read_lock(); 185 rcu_read_lock();
193 list_for_each_entry_rcu(c, &task->thread_group, thread_group) { 186 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
194 if (is_task_frozen_enough(c)) { 187 if (__cgroup_freezing_or_frozen(c)) {
195 rcu_read_unlock(); 188 rcu_read_unlock();
196 return -EBUSY; 189 return -EBUSY;
197 } 190 }
@@ -236,31 +229,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
236/* 229/*
237 * caller must hold freezer->lock 230 * caller must hold freezer->lock
238 */ 231 */
239static void update_freezer_state(struct cgroup *cgroup, 232static void update_if_frozen(struct cgroup *cgroup,
240 struct freezer *freezer) 233 struct freezer *freezer)
241{ 234{
242 struct cgroup_iter it; 235 struct cgroup_iter it;
243 struct task_struct *task; 236 struct task_struct *task;
244 unsigned int nfrozen = 0, ntotal = 0; 237 unsigned int nfrozen = 0, ntotal = 0;
238 enum freezer_state old_state = freezer->state;
245 239
246 cgroup_iter_start(cgroup, &it); 240 cgroup_iter_start(cgroup, &it);
247 while ((task = cgroup_iter_next(cgroup, &it))) { 241 while ((task = cgroup_iter_next(cgroup, &it))) {
248 ntotal++; 242 ntotal++;
249 if (is_task_frozen_enough(task)) 243 if (frozen(task))
250 nfrozen++; 244 nfrozen++;
251 } 245 }
252 246
253 /* 247 if (old_state == CGROUP_THAWED) {
254 * Transition to FROZEN when no new tasks can be added ensures 248 BUG_ON(nfrozen > 0);
255 * that we never exist in the FROZEN state while there are unfrozen 249 } else if (old_state == CGROUP_FREEZING) {
256 * tasks. 250 if (nfrozen == ntotal)
257 */ 251 freezer->state = CGROUP_FROZEN;
258 if (nfrozen == ntotal) 252 } else { /* old_state == CGROUP_FROZEN */
259 freezer->state = CGROUP_FROZEN; 253 BUG_ON(nfrozen != ntotal);
260 else if (nfrozen > 0) 254 }
261 freezer->state = CGROUP_FREEZING; 255
262 else
263 freezer->state = CGROUP_THAWED;
264 cgroup_iter_end(cgroup, &it); 256 cgroup_iter_end(cgroup, &it);
265} 257}
266 258
@@ -279,7 +271,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
279 if (state == CGROUP_FREEZING) { 271 if (state == CGROUP_FREEZING) {
280 /* We change from FREEZING to FROZEN lazily if the cgroup was 272 /* We change from FREEZING to FROZEN lazily if the cgroup was
281 * only partially frozen when we exitted write. */ 273 * only partially frozen when we exitted write. */
282 update_freezer_state(cgroup, freezer); 274 update_if_frozen(cgroup, freezer);
283 state = freezer->state; 275 state = freezer->state;
284 } 276 }
285 spin_unlock_irq(&freezer->lock); 277 spin_unlock_irq(&freezer->lock);
@@ -301,7 +293,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
301 while ((task = cgroup_iter_next(cgroup, &it))) { 293 while ((task = cgroup_iter_next(cgroup, &it))) {
302 if (!freeze_task(task, true)) 294 if (!freeze_task(task, true))
303 continue; 295 continue;
304 if (is_task_frozen_enough(task)) 296 if (frozen(task))
305 continue; 297 continue;
306 if (!freezing(task) && !freezer_should_skip(task)) 298 if (!freezing(task) && !freezer_should_skip(task))
307 num_cant_freeze_now++; 299 num_cant_freeze_now++;
@@ -335,7 +327,7 @@ static int freezer_change_state(struct cgroup *cgroup,
335 327
336 spin_lock_irq(&freezer->lock); 328 spin_lock_irq(&freezer->lock);
337 329
338 update_freezer_state(cgroup, freezer); 330 update_if_frozen(cgroup, freezer);
339 if (goal_state == freezer->state) 331 if (goal_state == freezer->state)
340 goto out; 332 goto out;
341 333
diff --git a/kernel/compat.c b/kernel/compat.c
index e167efce8423..c9e2ec0b34a8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -1126,3 +1126,24 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1126 1126
1127 return 0; 1127 return 0;
1128} 1128}
1129
1130/*
1131 * Allocate user-space memory for the duration of a single system call,
1132 * in order to marshall parameters inside a compat thunk.
1133 */
1134void __user *compat_alloc_user_space(unsigned long len)
1135{
1136 void __user *ptr;
1137
1138 /* If len would occupy more than half of the entire compat space... */
1139 if (unlikely(len > (((compat_uptr_t)~0) >> 1)))
1140 return NULL;
1141
1142 ptr = arch_compat_alloc_user_space(len);
1143
1144 if (unlikely(!access_ok(VERIFY_WRITE, ptr, len)))
1145 return NULL;
1146
1147 return ptr;
1148}
1149EXPORT_SYMBOL_GPL(compat_alloc_user_space);
diff --git a/kernel/configs.c b/kernel/configs.c
index abaee684ecbf..b4066b44a99d 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -66,6 +66,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
66static const struct file_operations ikconfig_file_ops = { 66static const struct file_operations ikconfig_file_ops = {
67 .owner = THIS_MODULE, 67 .owner = THIS_MODULE,
68 .read = ikconfig_read_current, 68 .read = ikconfig_read_current,
69 .llseek = default_llseek,
69}; 70};
70 71
71static int __init ikconfig_init(void) 72static int __init ikconfig_init(void)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b23c0979bbe7..4349935c2ad8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -231,18 +231,17 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock);
231 * users. If someone tries to mount the "cpuset" filesystem, we 231 * users. If someone tries to mount the "cpuset" filesystem, we
232 * silently switch it to mount "cgroup" instead 232 * silently switch it to mount "cgroup" instead
233 */ 233 */
234static int cpuset_get_sb(struct file_system_type *fs_type, 234static struct dentry *cpuset_mount(struct file_system_type *fs_type,
235 int flags, const char *unused_dev_name, 235 int flags, const char *unused_dev_name, void *data)
236 void *data, struct vfsmount *mnt)
237{ 236{
238 struct file_system_type *cgroup_fs = get_fs_type("cgroup"); 237 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
239 int ret = -ENODEV; 238 struct dentry *ret = ERR_PTR(-ENODEV);
240 if (cgroup_fs) { 239 if (cgroup_fs) {
241 char mountopts[] = 240 char mountopts[] =
242 "cpuset,noprefix," 241 "cpuset,noprefix,"
243 "release_agent=/sbin/cpuset_release_agent"; 242 "release_agent=/sbin/cpuset_release_agent";
244 ret = cgroup_fs->get_sb(cgroup_fs, flags, 243 ret = cgroup_fs->mount(cgroup_fs, flags,
245 unused_dev_name, mountopts, mnt); 244 unused_dev_name, mountopts);
246 put_filesystem(cgroup_fs); 245 put_filesystem(cgroup_fs);
247 } 246 }
248 return ret; 247 return ret;
@@ -250,7 +249,7 @@ static int cpuset_get_sb(struct file_system_type *fs_type,
250 249
251static struct file_system_type cpuset_fs_type = { 250static struct file_system_type cpuset_fs_type = {
252 .name = "cpuset", 251 .name = "cpuset",
253 .get_sb = cpuset_get_sb, 252 .mount = cpuset_mount,
254}; 253};
255 254
256/* 255/*
@@ -1397,7 +1396,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1397 if (tsk->flags & PF_THREAD_BOUND) 1396 if (tsk->flags & PF_THREAD_BOUND)
1398 return -EINVAL; 1397 return -EINVAL;
1399 1398
1400 ret = security_task_setscheduler(tsk, 0, NULL); 1399 ret = security_task_setscheduler(tsk);
1401 if (ret) 1400 if (ret)
1402 return ret; 1401 return ret;
1403 if (threadgroup) { 1402 if (threadgroup) {
@@ -1405,7 +1404,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1405 1404
1406 rcu_read_lock(); 1405 rcu_read_lock();
1407 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1406 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1408 ret = security_task_setscheduler(c, 0, NULL); 1407 ret = security_task_setscheduler(c);
1409 if (ret) { 1408 if (ret) {
1410 rcu_read_unlock(); 1409 rcu_read_unlock();
1411 return ret; 1410 return ret;
diff --git a/kernel/cred.c b/kernel/cred.c
index 9a3e22641fe7..6a1aa004e376 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -325,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds);
325 325
326/* 326/*
327 * Prepare credentials for current to perform an execve() 327 * Prepare credentials for current to perform an execve()
328 * - The caller must hold current->cred_guard_mutex 328 * - The caller must hold ->cred_guard_mutex
329 */ 329 */
330struct cred *prepare_exec_creds(void) 330struct cred *prepare_exec_creds(void)
331{ 331{
@@ -384,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
384 struct cred *new; 384 struct cred *new;
385 int ret; 385 int ret;
386 386
387 mutex_init(&p->cred_guard_mutex);
388
389 if ( 387 if (
390#ifdef CONFIG_KEYS 388#ifdef CONFIG_KEYS
391 !p->cred->thread_keyring && 389 !p->cred->thread_keyring &&
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index de407c78178d..cefd4a11f6d9 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -47,6 +47,7 @@
47#include <linux/pid.h> 47#include <linux/pid.h>
48#include <linux/smp.h> 48#include <linux/smp.h>
49#include <linux/mm.h> 49#include <linux/mm.h>
50#include <linux/rcupdate.h>
50 51
51#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
52#include <asm/byteorder.h> 53#include <asm/byteorder.h>
@@ -109,13 +110,15 @@ static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
109 */ 110 */
110atomic_t kgdb_active = ATOMIC_INIT(-1); 111atomic_t kgdb_active = ATOMIC_INIT(-1);
111EXPORT_SYMBOL_GPL(kgdb_active); 112EXPORT_SYMBOL_GPL(kgdb_active);
113static DEFINE_RAW_SPINLOCK(dbg_master_lock);
114static DEFINE_RAW_SPINLOCK(dbg_slave_lock);
112 115
113/* 116/*
114 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early 117 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
115 * bootup code (which might not have percpu set up yet): 118 * bootup code (which might not have percpu set up yet):
116 */ 119 */
117static atomic_t passive_cpu_wait[NR_CPUS]; 120static atomic_t masters_in_kgdb;
118static atomic_t cpu_in_kgdb[NR_CPUS]; 121static atomic_t slaves_in_kgdb;
119static atomic_t kgdb_break_tasklet_var; 122static atomic_t kgdb_break_tasklet_var;
120atomic_t kgdb_setting_breakpoint; 123atomic_t kgdb_setting_breakpoint;
121 124
@@ -206,18 +209,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
206 return 0; 209 return 0;
207} 210}
208 211
209/**
210 * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
211 * @regs: Current &struct pt_regs.
212 *
213 * This function will be called if the particular architecture must
214 * disable hardware debugging while it is processing gdb packets or
215 * handling exception.
216 */
217void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
218{
219}
220
221/* 212/*
222 * Some architectures need cache flushes when we set/clear a 213 * Some architectures need cache flushes when we set/clear a
223 * breakpoint: 214 * breakpoint:
@@ -457,26 +448,34 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
457 return 1; 448 return 1;
458} 449}
459 450
460static void dbg_cpu_switch(int cpu, int next_cpu) 451static void dbg_touch_watchdogs(void)
461{ 452{
462 /* Mark the cpu we are switching away from as a slave when it 453 touch_softlockup_watchdog_sync();
463 * holds the kgdb_active token. This must be done so that the 454 clocksource_touch_watchdog();
464 * that all the cpus wait in for the debug core will not enter 455 rcu_cpu_stall_reset();
465 * again as the master. */
466 if (cpu == atomic_read(&kgdb_active)) {
467 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
468 kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER;
469 }
470 kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER;
471} 456}
472 457
473static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) 458static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
459 int exception_state)
474{ 460{
475 unsigned long flags; 461 unsigned long flags;
476 int sstep_tries = 100; 462 int sstep_tries = 100;
477 int error; 463 int error;
478 int i, cpu; 464 int cpu;
479 int trace_on = 0; 465 int trace_on = 0;
466 int online_cpus = num_online_cpus();
467
468 kgdb_info[ks->cpu].enter_kgdb++;
469 kgdb_info[ks->cpu].exception_state |= exception_state;
470
471 if (exception_state == DCPU_WANT_MASTER)
472 atomic_inc(&masters_in_kgdb);
473 else
474 atomic_inc(&slaves_in_kgdb);
475
476 if (arch_kgdb_ops.disable_hw_break)
477 arch_kgdb_ops.disable_hw_break(regs);
478
480acquirelock: 479acquirelock:
481 /* 480 /*
482 * Interrupts will be restored by the 'trap return' code, except when 481 * Interrupts will be restored by the 'trap return' code, except when
@@ -489,14 +488,15 @@ acquirelock:
489 kgdb_info[cpu].task = current; 488 kgdb_info[cpu].task = current;
490 kgdb_info[cpu].ret_state = 0; 489 kgdb_info[cpu].ret_state = 0;
491 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; 490 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
492 /*
493 * Make sure the above info reaches the primary CPU before
494 * our cpu_in_kgdb[] flag setting does:
495 */
496 atomic_inc(&cpu_in_kgdb[cpu]);
497 491
498 if (exception_level == 1) 492 /* Make sure the above info reaches the primary CPU */
493 smp_mb();
494
495 if (exception_level == 1) {
496 if (raw_spin_trylock(&dbg_master_lock))
497 atomic_xchg(&kgdb_active, cpu);
499 goto cpu_master_loop; 498 goto cpu_master_loop;
499 }
500 500
501 /* 501 /*
502 * CPU will loop if it is a slave or request to become a kgdb 502 * CPU will loop if it is a slave or request to become a kgdb
@@ -508,10 +508,12 @@ cpu_loop:
508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; 508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
509 goto cpu_master_loop; 509 goto cpu_master_loop;
510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { 510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
511 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) 511 if (raw_spin_trylock(&dbg_master_lock)) {
512 atomic_xchg(&kgdb_active, cpu);
512 break; 513 break;
514 }
513 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { 515 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
514 if (!atomic_read(&passive_cpu_wait[cpu])) 516 if (!raw_spin_is_locked(&dbg_slave_lock))
515 goto return_normal; 517 goto return_normal;
516 } else { 518 } else {
517return_normal: 519return_normal:
@@ -522,9 +524,12 @@ return_normal:
522 arch_kgdb_ops.correct_hw_break(); 524 arch_kgdb_ops.correct_hw_break();
523 if (trace_on) 525 if (trace_on)
524 tracing_on(); 526 tracing_on();
525 atomic_dec(&cpu_in_kgdb[cpu]); 527 kgdb_info[cpu].exception_state &=
526 touch_softlockup_watchdog_sync(); 528 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
527 clocksource_touch_watchdog(); 529 kgdb_info[cpu].enter_kgdb--;
530 smp_mb__before_atomic_dec();
531 atomic_dec(&slaves_in_kgdb);
532 dbg_touch_watchdogs();
528 local_irq_restore(flags); 533 local_irq_restore(flags);
529 return 0; 534 return 0;
530 } 535 }
@@ -541,8 +546,8 @@ return_normal:
541 (kgdb_info[cpu].task && 546 (kgdb_info[cpu].task &&
542 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { 547 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
543 atomic_set(&kgdb_active, -1); 548 atomic_set(&kgdb_active, -1);
544 touch_softlockup_watchdog_sync(); 549 raw_spin_unlock(&dbg_master_lock);
545 clocksource_touch_watchdog(); 550 dbg_touch_watchdogs();
546 local_irq_restore(flags); 551 local_irq_restore(flags);
547 552
548 goto acquirelock; 553 goto acquirelock;
@@ -563,16 +568,12 @@ return_normal:
563 if (dbg_io_ops->pre_exception) 568 if (dbg_io_ops->pre_exception)
564 dbg_io_ops->pre_exception(); 569 dbg_io_ops->pre_exception();
565 570
566 kgdb_disable_hw_debug(ks->linux_regs);
567
568 /* 571 /*
569 * Get the passive CPU lock which will hold all the non-primary 572 * Get the passive CPU lock which will hold all the non-primary
570 * CPU in a spin state while the debugger is active 573 * CPU in a spin state while the debugger is active
571 */ 574 */
572 if (!kgdb_single_step) { 575 if (!kgdb_single_step)
573 for (i = 0; i < NR_CPUS; i++) 576 raw_spin_lock(&dbg_slave_lock);
574 atomic_inc(&passive_cpu_wait[i]);
575 }
576 577
577#ifdef CONFIG_SMP 578#ifdef CONFIG_SMP
578 /* Signal the other CPUs to enter kgdb_wait() */ 579 /* Signal the other CPUs to enter kgdb_wait() */
@@ -583,10 +584,9 @@ return_normal:
583 /* 584 /*
584 * Wait for the other CPUs to be notified and be waiting for us: 585 * Wait for the other CPUs to be notified and be waiting for us:
585 */ 586 */
586 for_each_online_cpu(i) { 587 while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
587 while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i])) 588 atomic_read(&slaves_in_kgdb)) != online_cpus)
588 cpu_relax(); 589 cpu_relax();
589 }
590 590
591 /* 591 /*
592 * At this point the primary processor is completely 592 * At this point the primary processor is completely
@@ -615,7 +615,8 @@ cpu_master_loop:
615 if (error == DBG_PASS_EVENT) { 615 if (error == DBG_PASS_EVENT) {
616 dbg_kdb_mode = !dbg_kdb_mode; 616 dbg_kdb_mode = !dbg_kdb_mode;
617 } else if (error == DBG_SWITCH_CPU_EVENT) { 617 } else if (error == DBG_SWITCH_CPU_EVENT) {
618 dbg_cpu_switch(cpu, dbg_switch_cpu); 618 kgdb_info[dbg_switch_cpu].exception_state |=
619 DCPU_NEXT_MASTER;
619 goto cpu_loop; 620 goto cpu_loop;
620 } else { 621 } else {
621 kgdb_info[cpu].ret_state = error; 622 kgdb_info[cpu].ret_state = error;
@@ -627,24 +628,11 @@ cpu_master_loop:
627 if (dbg_io_ops->post_exception) 628 if (dbg_io_ops->post_exception)
628 dbg_io_ops->post_exception(); 629 dbg_io_ops->post_exception();
629 630
630 atomic_dec(&cpu_in_kgdb[ks->cpu]);
631
632 if (!kgdb_single_step) { 631 if (!kgdb_single_step) {
633 for (i = NR_CPUS-1; i >= 0; i--) 632 raw_spin_unlock(&dbg_slave_lock);
634 atomic_dec(&passive_cpu_wait[i]); 633 /* Wait till all the CPUs have quit from the debugger. */
635 /* 634 while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb))
636 * Wait till all the CPUs have quit from the debugger, 635 cpu_relax();
637 * but allow a CPU that hit an exception and is
638 * waiting to become the master to remain in the debug
639 * core.
640 */
641 for_each_online_cpu(i) {
642 while (kgdb_do_roundup &&
643 atomic_read(&cpu_in_kgdb[i]) &&
644 !(kgdb_info[i].exception_state &
645 DCPU_WANT_MASTER))
646 cpu_relax();
647 }
648 } 636 }
649 637
650kgdb_restore: 638kgdb_restore:
@@ -655,12 +643,20 @@ kgdb_restore:
655 else 643 else
656 kgdb_sstep_pid = 0; 644 kgdb_sstep_pid = 0;
657 } 645 }
646 if (arch_kgdb_ops.correct_hw_break)
647 arch_kgdb_ops.correct_hw_break();
658 if (trace_on) 648 if (trace_on)
659 tracing_on(); 649 tracing_on();
650
651 kgdb_info[cpu].exception_state &=
652 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
653 kgdb_info[cpu].enter_kgdb--;
654 smp_mb__before_atomic_dec();
655 atomic_dec(&masters_in_kgdb);
660 /* Free kgdb_active */ 656 /* Free kgdb_active */
661 atomic_set(&kgdb_active, -1); 657 atomic_set(&kgdb_active, -1);
662 touch_softlockup_watchdog_sync(); 658 raw_spin_unlock(&dbg_master_lock);
663 clocksource_touch_watchdog(); 659 dbg_touch_watchdogs();
664 local_irq_restore(flags); 660 local_irq_restore(flags);
665 661
666 return kgdb_info[cpu].ret_state; 662 return kgdb_info[cpu].ret_state;
@@ -678,7 +674,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
678{ 674{
679 struct kgdb_state kgdb_var; 675 struct kgdb_state kgdb_var;
680 struct kgdb_state *ks = &kgdb_var; 676 struct kgdb_state *ks = &kgdb_var;
681 int ret;
682 677
683 ks->cpu = raw_smp_processor_id(); 678 ks->cpu = raw_smp_processor_id();
684 ks->ex_vector = evector; 679 ks->ex_vector = evector;
@@ -689,11 +684,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
689 684
690 if (kgdb_reenter_check(ks)) 685 if (kgdb_reenter_check(ks))
691 return 0; /* Ouch, double exception ! */ 686 return 0; /* Ouch, double exception ! */
692 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; 687 if (kgdb_info[ks->cpu].enter_kgdb != 0)
693 ret = kgdb_cpu_enter(ks, regs); 688 return 0;
694 kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER | 689
695 DCPU_IS_SLAVE); 690 return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
696 return ret;
697} 691}
698 692
699int kgdb_nmicallback(int cpu, void *regs) 693int kgdb_nmicallback(int cpu, void *regs)
@@ -706,12 +700,9 @@ int kgdb_nmicallback(int cpu, void *regs)
706 ks->cpu = cpu; 700 ks->cpu = cpu;
707 ks->linux_regs = regs; 701 ks->linux_regs = regs;
708 702
709 if (!atomic_read(&cpu_in_kgdb[cpu]) && 703 if (kgdb_info[ks->cpu].enter_kgdb == 0 &&
710 atomic_read(&kgdb_active) != -1 && 704 raw_spin_is_locked(&dbg_master_lock)) {
711 atomic_read(&kgdb_active) != cpu) { 705 kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE);
712 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
713 kgdb_cpu_enter(ks, regs);
714 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
715 return 0; 706 return 0;
716 } 707 }
717#endif 708#endif
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index c5d753d80f67..3494c28a7e7a 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -40,6 +40,7 @@ struct debuggerinfo_struct {
40 int exception_state; 40 int exception_state;
41 int ret_state; 41 int ret_state;
42 int irq_depth; 42 int irq_depth;
43 int enter_kgdb;
43}; 44};
44 45
45extern struct debuggerinfo_struct kgdb_info[]; 46extern struct debuggerinfo_struct kgdb_info[];
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 75bd9b3ebbb7..20059ef4459a 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -274,7 +274,6 @@ static int kdb_bp(int argc, const char **argv)
274 int i, bpno; 274 int i, bpno;
275 kdb_bp_t *bp, *bp_check; 275 kdb_bp_t *bp, *bp_check;
276 int diag; 276 int diag;
277 int free;
278 char *symname = NULL; 277 char *symname = NULL;
279 long offset = 0ul; 278 long offset = 0ul;
280 int nextarg; 279 int nextarg;
@@ -305,7 +304,6 @@ static int kdb_bp(int argc, const char **argv)
305 /* 304 /*
306 * Find an empty bp structure to allocate 305 * Find an empty bp structure to allocate
307 */ 306 */
308 free = KDB_MAXBPT;
309 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { 307 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
310 if (bp->bp_free) 308 if (bp->bp_free)
311 break; 309 break;
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index bf6e8270e957..dd0b1b7dd02c 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -86,7 +86,7 @@ int kdb_stub(struct kgdb_state *ks)
86 } 86 }
87 /* Set initial kdb state variables */ 87 /* Set initial kdb state variables */
88 KDB_STATE_CLEAR(KGDB_TRANS); 88 KDB_STATE_CLEAR(KGDB_TRANS);
89 kdb_initial_cpu = ks->cpu; 89 kdb_initial_cpu = atomic_read(&kgdb_active);
90 kdb_current_task = kgdb_info[ks->cpu].task; 90 kdb_current_task = kgdb_info[ks->cpu].task;
91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; 91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
92 /* Remove any breakpoints as needed by kdb and clear single step */ 92 /* Remove any breakpoints as needed by kdb and clear single step */
@@ -105,7 +105,6 @@ int kdb_stub(struct kgdb_state *ks)
105 ks->pass_exception = 1; 105 ks->pass_exception = 1;
106 KDB_FLAG_SET(CATASTROPHIC); 106 KDB_FLAG_SET(CATASTROPHIC);
107 } 107 }
108 kdb_initial_cpu = ks->cpu;
109 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { 108 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
110 KDB_STATE_CLEAR(SSBPT); 109 KDB_STATE_CLEAR(SSBPT);
111 KDB_STATE_CLEAR(DOING_SS); 110 KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index c9b7f4f90bba..96fdaac46a80 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -823,4 +823,4 @@ int kdb_printf(const char *fmt, ...)
823 823
824 return r; 824 return r;
825} 825}
826 826EXPORT_SYMBOL_GPL(kdb_printf);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index caf057a3de0e..37755d621924 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1127,7 +1127,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1127 /* special case below */ 1127 /* special case below */
1128 } else { 1128 } else {
1129 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", 1129 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
1130 kdb_current, kdb_current->pid); 1130 kdb_current, kdb_current ? kdb_current->pid : 0);
1131#if defined(CONFIG_SMP) 1131#if defined(CONFIG_SMP)
1132 kdb_printf("on processor %d ", raw_smp_processor_id()); 1132 kdb_printf("on processor %d ", raw_smp_processor_id());
1133#endif 1133#endif
@@ -1749,13 +1749,13 @@ static int kdb_go(int argc, const char **argv)
1749 int nextarg; 1749 int nextarg;
1750 long offset; 1750 long offset;
1751 1751
1752 if (raw_smp_processor_id() != kdb_initial_cpu) {
1753 kdb_printf("go must execute on the entry cpu, "
1754 "please use \"cpu %d\" and then execute go\n",
1755 kdb_initial_cpu);
1756 return KDB_BADCPUNUM;
1757 }
1752 if (argc == 1) { 1758 if (argc == 1) {
1753 if (raw_smp_processor_id() != kdb_initial_cpu) {
1754 kdb_printf("go <address> must be issued from the "
1755 "initial cpu, do cpu %d first\n",
1756 kdb_initial_cpu);
1757 return KDB_ARGCOUNT;
1758 }
1759 nextarg = 1; 1759 nextarg = 1;
1760 diag = kdbgetaddrarg(argc, argv, &nextarg, 1760 diag = kdbgetaddrarg(argc, argv, &nextarg,
1761 &addr, &offset, NULL); 1761 &addr, &offset, NULL);
@@ -2603,20 +2603,17 @@ static int kdb_summary(int argc, const char **argv)
2603 */ 2603 */
2604static int kdb_per_cpu(int argc, const char **argv) 2604static int kdb_per_cpu(int argc, const char **argv)
2605{ 2605{
2606 char buf[256], fmtstr[64]; 2606 char fmtstr[64];
2607 kdb_symtab_t symtab; 2607 int cpu, diag, nextarg = 1;
2608 cpumask_t suppress = CPU_MASK_NONE; 2608 unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL;
2609 int cpu, diag;
2610 unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL;
2611 2609
2612 if (argc < 1 || argc > 3) 2610 if (argc < 1 || argc > 3)
2613 return KDB_ARGCOUNT; 2611 return KDB_ARGCOUNT;
2614 2612
2615 snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]); 2613 diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL);
2616 if (!kdbgetsymval(buf, &symtab)) { 2614 if (diag)
2617 kdb_printf("%s is not a per_cpu variable\n", argv[1]); 2615 return diag;
2618 return KDB_BADADDR; 2616
2619 }
2620 if (argc >= 2) { 2617 if (argc >= 2) {
2621 diag = kdbgetularg(argv[2], &bytesperword); 2618 diag = kdbgetularg(argv[2], &bytesperword);
2622 if (diag) 2619 if (diag)
@@ -2649,46 +2646,25 @@ static int kdb_per_cpu(int argc, const char **argv)
2649#define KDB_PCU(cpu) 0 2646#define KDB_PCU(cpu) 0
2650#endif 2647#endif
2651#endif 2648#endif
2652
2653 for_each_online_cpu(cpu) { 2649 for_each_online_cpu(cpu) {
2650 if (KDB_FLAG(CMD_INTERRUPT))
2651 return 0;
2652
2654 if (whichcpu != ~0UL && whichcpu != cpu) 2653 if (whichcpu != ~0UL && whichcpu != cpu)
2655 continue; 2654 continue;
2656 addr = symtab.sym_start + KDB_PCU(cpu); 2655 addr = symaddr + KDB_PCU(cpu);
2657 diag = kdb_getword(&val, addr, bytesperword); 2656 diag = kdb_getword(&val, addr, bytesperword);
2658 if (diag) { 2657 if (diag) {
2659 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " 2658 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
2660 "read, diag=%d\n", cpu, addr, diag); 2659 "read, diag=%d\n", cpu, addr, diag);
2661 continue; 2660 continue;
2662 } 2661 }
2663#ifdef CONFIG_SMP
2664 if (!val) {
2665 cpu_set(cpu, suppress);
2666 continue;
2667 }
2668#endif /* CONFIG_SMP */
2669 kdb_printf("%5d ", cpu); 2662 kdb_printf("%5d ", cpu);
2670 kdb_md_line(fmtstr, addr, 2663 kdb_md_line(fmtstr, addr,
2671 bytesperword == KDB_WORD_SIZE, 2664 bytesperword == KDB_WORD_SIZE,
2672 1, bytesperword, 1, 1, 0); 2665 1, bytesperword, 1, 1, 0);
2673 } 2666 }
2674 if (cpus_weight(suppress) == 0)
2675 return 0;
2676 kdb_printf("Zero suppressed cpu(s):");
2677 for (cpu = first_cpu(suppress); cpu < num_possible_cpus();
2678 cpu = next_cpu(cpu, suppress)) {
2679 kdb_printf(" %d", cpu);
2680 if (cpu == num_possible_cpus() - 1 ||
2681 next_cpu(cpu, suppress) != cpu + 1)
2682 continue;
2683 while (cpu < num_possible_cpus() &&
2684 next_cpu(cpu, suppress) == cpu + 1)
2685 ++cpu;
2686 kdb_printf("-%d", cpu);
2687 }
2688 kdb_printf("\n");
2689
2690#undef KDB_PCU 2667#undef KDB_PCU
2691
2692 return 0; 2668 return 0;
2693} 2669}
2694 2670
@@ -2783,6 +2759,8 @@ int kdb_register_repeat(char *cmd,
2783 2759
2784 return 0; 2760 return 0;
2785} 2761}
2762EXPORT_SYMBOL_GPL(kdb_register_repeat);
2763
2786 2764
2787/* 2765/*
2788 * kdb_register - Compatibility register function for commands that do 2766 * kdb_register - Compatibility register function for commands that do
@@ -2805,6 +2783,7 @@ int kdb_register(char *cmd,
2805 return kdb_register_repeat(cmd, func, usage, help, minlen, 2783 return kdb_register_repeat(cmd, func, usage, help, minlen,
2806 KDB_REPEAT_NONE); 2784 KDB_REPEAT_NONE);
2807} 2785}
2786EXPORT_SYMBOL_GPL(kdb_register);
2808 2787
2809/* 2788/*
2810 * kdb_unregister - This function is used to unregister a kernel 2789 * kdb_unregister - This function is used to unregister a kernel
@@ -2823,7 +2802,7 @@ int kdb_unregister(char *cmd)
2823 /* 2802 /*
2824 * find the command. 2803 * find the command.
2825 */ 2804 */
2826 for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) { 2805 for_each_kdbcmd(kp, i) {
2827 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { 2806 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2828 kp->cmd_name = NULL; 2807 kp->cmd_name = NULL;
2829 return 0; 2808 return 0;
@@ -2833,6 +2812,7 @@ int kdb_unregister(char *cmd)
2833 /* Couldn't find it. */ 2812 /* Couldn't find it. */
2834 return 1; 2813 return 1;
2835} 2814}
2815EXPORT_SYMBOL_GPL(kdb_unregister);
2836 2816
2837/* Initialize the kdb command table. */ 2817/* Initialize the kdb command table. */
2838static void __init kdb_inittab(void) 2818static void __init kdb_inittab(void)
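
The reworked kdb_per_cpu above resolves the symbol once via kdbgetaddrarg() and then reads symaddr + KDB_PCU(cpu) for each online CPU with kdb_getword(), instead of pasting a per_cpu__ prefix onto the name; the zero-suppression bookkeeping is dropped. A stand-alone sketch of that base-plus-offset read loop (offsets, data and helper names below are invented; only the access pattern mirrors the patch):

#include <stdio.h>
#include <stdint.h>

#define NCPUS 4

static unsigned long counters[NCPUS] = { 3, 1, 4, 1 };	/* fake per-cpu data */
static unsigned long percpu_offset[NCPUS];		/* KDB_PCU(cpu) stand-in */

static int read_word(unsigned long *val, uintptr_t addr)	/* kdb_getword() stand-in */
{
	*val = *(unsigned long *)addr;
	return 0;
}

int main(void)
{
	uintptr_t symaddr = (uintptr_t)&counters[0];	/* "resolved" symbol address */
	unsigned long val;
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		percpu_offset[cpu] = cpu * sizeof(unsigned long);

	for (cpu = 0; cpu < NCPUS; cpu++) {		/* for_each_online_cpu() */
		uintptr_t addr = symaddr + percpu_offset[cpu];

		if (read_word(&val, addr))
			continue;			/* unreadable: skip, as kdb does */
		printf("%5d %#lx = %lu\n", cpu, (unsigned long)addr, val);
	}
	return 0;
}
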
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index c438f545a321..35d69ed1dfb5 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -15,29 +15,6 @@
15#include <linux/kgdb.h> 15#include <linux/kgdb.h>
16#include "../debug_core.h" 16#include "../debug_core.h"
17 17
18/* Kernel Debugger Error codes. Must not overlap with command codes. */
19#define KDB_NOTFOUND (-1)
20#define KDB_ARGCOUNT (-2)
21#define KDB_BADWIDTH (-3)
22#define KDB_BADRADIX (-4)
23#define KDB_NOTENV (-5)
24#define KDB_NOENVVALUE (-6)
25#define KDB_NOTIMP (-7)
26#define KDB_ENVFULL (-8)
27#define KDB_ENVBUFFULL (-9)
28#define KDB_TOOMANYBPT (-10)
29#define KDB_TOOMANYDBREGS (-11)
30#define KDB_DUPBPT (-12)
31#define KDB_BPTNOTFOUND (-13)
32#define KDB_BADMODE (-14)
33#define KDB_BADINT (-15)
34#define KDB_INVADDRFMT (-16)
35#define KDB_BADREG (-17)
36#define KDB_BADCPUNUM (-18)
37#define KDB_BADLENGTH (-19)
38#define KDB_NOBP (-20)
39#define KDB_BADADDR (-21)
40
41/* Kernel Debugger Command codes. Must not overlap with error codes. */ 18/* Kernel Debugger Command codes. Must not overlap with error codes. */
42#define KDB_CMD_GO (-1001) 19#define KDB_CMD_GO (-1001)
43#define KDB_CMD_CPU (-1002) 20#define KDB_CMD_CPU (-1002)
@@ -93,17 +70,6 @@
93 */ 70 */
94#define KDB_MAXBPT 16 71#define KDB_MAXBPT 16
95 72
96/* Maximum number of arguments to a function */
97#define KDB_MAXARGS 16
98
99typedef enum {
100 KDB_REPEAT_NONE = 0, /* Do not repeat this command */
101 KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */
102 KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */
103} kdb_repeat_t;
104
105typedef int (*kdb_func_t)(int, const char **);
106
107/* Symbol table format returned by kallsyms. */ 73/* Symbol table format returned by kallsyms. */
108typedef struct __ksymtab { 74typedef struct __ksymtab {
109 unsigned long value; /* Address of symbol */ 75 unsigned long value; /* Address of symbol */
@@ -123,11 +89,6 @@ extern int kallsyms_symbol_next(char *prefix_name, int flag);
123extern int kallsyms_symbol_complete(char *prefix_name, int max_len); 89extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
124 90
125/* Exported Symbols for kernel loadable modules to use. */ 91/* Exported Symbols for kernel loadable modules to use. */
126extern int kdb_register(char *, kdb_func_t, char *, char *, short);
127extern int kdb_register_repeat(char *, kdb_func_t, char *, char *,
128 short, kdb_repeat_t);
129extern int kdb_unregister(char *);
130
131extern int kdb_getarea_size(void *, unsigned long, size_t); 92extern int kdb_getarea_size(void *, unsigned long, size_t);
132extern int kdb_putarea_size(unsigned long, void *, size_t); 93extern int kdb_putarea_size(unsigned long, void *, size_t);
133 94
@@ -144,6 +105,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t);
144extern int kdb_putword(unsigned long, unsigned long, size_t); 105extern int kdb_putword(unsigned long, unsigned long, size_t);
145 106
146extern int kdbgetularg(const char *, unsigned long *); 107extern int kdbgetularg(const char *, unsigned long *);
108extern int kdbgetu64arg(const char *, u64 *);
147extern char *kdbgetenv(const char *); 109extern char *kdbgetenv(const char *);
148extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, 110extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
149 long *, char **); 111 long *, char **);
@@ -255,7 +217,6 @@ extern void kdb_ps1(const struct task_struct *p);
255extern void kdb_print_nameval(const char *name, unsigned long val); 217extern void kdb_print_nameval(const char *name, unsigned long val);
256extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); 218extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
257extern void kdb_meminfo_proc_show(void); 219extern void kdb_meminfo_proc_show(void);
258extern const char *kdb_walk_kallsyms(loff_t *pos);
259extern char *kdb_getstr(char *, size_t, char *); 220extern char *kdb_getstr(char *, size_t, char *);
260 221
261/* Defines for kdb_symbol_print */ 222/* Defines for kdb_symbol_print */
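
With the prototypes for kdb_register(), kdb_register_repeat() and kdb_unregister() gone from this private header, and the matching EXPORT_SYMBOL_GPL lines added in kdb_main.c, loadable modules can presumably register their own kdb commands through a public header. The command table itself is a fixed array in which a free slot is one whose cmd_name is NULL, as the for_each_kdbcmd() loop in kdb_unregister() shows. A stand-alone sketch of that slot-reuse pattern (table size, types and the demo command are invented):

#include <stdio.h>
#include <string.h>

typedef int (*cmd_func_t)(int argc, const char **argv);

struct cmd {
	const char *name;
	cmd_func_t func;
};

#define MAX_CMDS 8
static struct cmd cmd_table[MAX_CMDS];

static int cmd_register(const char *name, cmd_func_t func)
{
	int i;

	for (i = 0; i < MAX_CMDS; i++) {
		if (!cmd_table[i].name) {		/* first free slot */
			cmd_table[i].name = name;
			cmd_table[i].func = func;
			return 0;
		}
	}
	return 1;					/* table full */
}

static int cmd_unregister(const char *name)
{
	int i;

	for (i = 0; i < MAX_CMDS; i++) {
		if (cmd_table[i].name && !strcmp(cmd_table[i].name, name)) {
			cmd_table[i].name = NULL;	/* slot is free again */
			return 0;
		}
	}
	return 1;					/* not found */
}

static int hello(int argc, const char **argv)
{
	(void)argc; (void)argv;
	printf("hello from a registered command\n");
	return 0;
}

int main(void)
{
	cmd_register("hello", hello);
	cmd_table[0].func(0, NULL);	/* table was empty, so "hello" is slot 0 */
	cmd_unregister("hello");
	return 0;
}
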
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 45344d5c53dd..6b2485dcb050 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -82,8 +82,8 @@ static char *kdb_name_table[100]; /* arbitrary size */
82int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) 82int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
83{ 83{
84 int ret = 0; 84 int ret = 0;
85 unsigned long symbolsize; 85 unsigned long symbolsize = 0;
86 unsigned long offset; 86 unsigned long offset = 0;
87#define knt1_size 128 /* must be >= kallsyms table size */ 87#define knt1_size 128 /* must be >= kallsyms table size */
88 char *knt1 = NULL; 88 char *knt1 = NULL;
89 89
diff --git a/kernel/early_res.c b/kernel/early_res.c
deleted file mode 100644
index 7bfae887f211..000000000000
--- a/kernel/early_res.c
+++ /dev/null
@@ -1,590 +0,0 @@
1/*
2 * early_res, could be used to replace bootmem
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/bootmem.h>
8#include <linux/mm.h>
9#include <linux/early_res.h>
10#include <linux/slab.h>
11#include <linux/kmemleak.h>
12
13/*
14 * Early reserved memory areas.
15 */
16/*
17 * need to make sure this one is big enough before
18 * find_fw_memmap_area can be used
19 */
20#define MAX_EARLY_RES_X 32
21
22struct early_res {
23 u64 start, end;
24 char name[15];
25 char overlap_ok;
26};
27static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
28
29static int max_early_res __initdata = MAX_EARLY_RES_X;
30static struct early_res *early_res __initdata = &early_res_x[0];
31static int early_res_count __initdata;
32
33static int __init find_overlapped_early(u64 start, u64 end)
34{
35 int i;
36 struct early_res *r;
37
38 for (i = 0; i < max_early_res && early_res[i].end; i++) {
39 r = &early_res[i];
40 if (end > r->start && start < r->end)
41 break;
42 }
43
44 return i;
45}
46
47/*
48 * Drop the i-th range from the early reservation map,
49 * by copying any higher ranges down one over it, and
50 * clearing what had been the last slot.
51 */
52static void __init drop_range(int i)
53{
54 int j;
55
56 for (j = i + 1; j < max_early_res && early_res[j].end; j++)
57 ;
58
59 memmove(&early_res[i], &early_res[i + 1],
60 (j - 1 - i) * sizeof(struct early_res));
61
62 early_res[j - 1].end = 0;
63 early_res_count--;
64}
65
66static void __init drop_range_partial(int i, u64 start, u64 end)
67{
68 u64 common_start, common_end;
69 u64 old_start, old_end;
70
71 old_start = early_res[i].start;
72 old_end = early_res[i].end;
73 common_start = max(old_start, start);
74 common_end = min(old_end, end);
75
76 /* no overlap ? */
77 if (common_start >= common_end)
78 return;
79
80 if (old_start < common_start) {
81 /* make head segment */
82 early_res[i].end = common_start;
83 if (old_end > common_end) {
84 char name[15];
85
86 /*
87 * Save a local copy of the name, since the
88 * early_res array could get resized inside
89 * reserve_early_without_check() ->
90 * __check_and_double_early_res(), which would
91 * make the current name pointer invalid.
92 */
93 strncpy(name, early_res[i].name,
94 sizeof(early_res[i].name) - 1);
95 /* add another for left over on tail */
96 reserve_early_without_check(common_end, old_end, name);
97 }
98 return;
99 } else {
100 if (old_end > common_end) {
101 /* reuse the entry for tail left */
102 early_res[i].start = common_end;
103 return;
104 }
105 /* all covered */
106 drop_range(i);
107 }
108}
109
110/*
111 * Split any existing ranges that:
112 * 1) are marked 'overlap_ok', and
113 * 2) overlap with the stated range [start, end)
114 * into whatever portion (if any) of the existing range is entirely
115 * below or entirely above the stated range. Drop the portion
116 * of the existing range that overlaps with the stated range,
117 * which will allow the caller of this routine to then add that
118 * stated range without conflicting with any existing range.
119 */
120static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
121{
122 int i;
123 struct early_res *r;
124 u64 lower_start, lower_end;
125 u64 upper_start, upper_end;
126 char name[15];
127
128 for (i = 0; i < max_early_res && early_res[i].end; i++) {
129 r = &early_res[i];
130
131 /* Continue past non-overlapping ranges */
132 if (end <= r->start || start >= r->end)
133 continue;
134
135 /*
136 * Leave non-ok overlaps as is; let caller
137 * panic "Overlapping early reservations"
138 * when it hits this overlap.
139 */
140 if (!r->overlap_ok)
141 return;
142
143 /*
144 * We have an ok overlap. We will drop it from the early
145 * reservation map, and add back in any non-overlapping
146 * portions (lower or upper) as separate, overlap_ok,
147 * non-overlapping ranges.
148 */
149
150 /* 1. Note any non-overlapping (lower or upper) ranges. */
151 strncpy(name, r->name, sizeof(name) - 1);
152
153 lower_start = lower_end = 0;
154 upper_start = upper_end = 0;
155 if (r->start < start) {
156 lower_start = r->start;
157 lower_end = start;
158 }
159 if (r->end > end) {
160 upper_start = end;
161 upper_end = r->end;
162 }
163
164 /* 2. Drop the original ok overlapping range */
165 drop_range(i);
166
167 i--; /* resume for-loop on copied down entry */
168
169 /* 3. Add back in any non-overlapping ranges. */
170 if (lower_end)
171 reserve_early_overlap_ok(lower_start, lower_end, name);
172 if (upper_end)
173 reserve_early_overlap_ok(upper_start, upper_end, name);
174 }
175}
176
177static void __init __reserve_early(u64 start, u64 end, char *name,
178 int overlap_ok)
179{
180 int i;
181 struct early_res *r;
182
183 i = find_overlapped_early(start, end);
184 if (i >= max_early_res)
185 panic("Too many early reservations");
186 r = &early_res[i];
187 if (r->end)
188 panic("Overlapping early reservations "
189 "%llx-%llx %s to %llx-%llx %s\n",
190 start, end - 1, name ? name : "", r->start,
191 r->end - 1, r->name);
192 r->start = start;
193 r->end = end;
194 r->overlap_ok = overlap_ok;
195 if (name)
196 strncpy(r->name, name, sizeof(r->name) - 1);
197 early_res_count++;
198}
199
200/*
201 * A few early reservations come here.
202 *
203 * The 'overlap_ok' in the name of this routine does -not- mean it
204 * is ok for these reservations to overlap an earlier reservation.
205 * Rather it means that it is ok for subsequent reservations to
206 * overlap this one.
207 *
208 * Use this entry point to reserve early ranges when you are doing
209 * so out of "Paranoia", reserving perhaps more memory than you need,
210 * just in case, and don't mind a subsequent overlapping reservation
211 * that is known to be needed.
212 *
213 * The drop_overlaps_that_are_ok() call here isn't really needed.
214 * It would be needed if we had two colliding 'overlap_ok'
215 * reservations, so that the second such would not panic on the
216 * overlap with the first. We don't have any such as of this
217 * writing, but might as well tolerate such if it happens in
218 * the future.
219 */
220void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
221{
222 drop_overlaps_that_are_ok(start, end);
223 __reserve_early(start, end, name, 1);
224}
225
226static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
227{
228 u64 start, end, size, mem;
229 struct early_res *new;
230
231 /* do we have enough slots left ? */
232 if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
233 return;
234
235 /* double it */
236 mem = -1ULL;
237 size = sizeof(struct early_res) * max_early_res * 2;
238 if (early_res == early_res_x)
239 start = 0;
240 else
241 start = early_res[0].end;
242 end = ex_start;
243 if (start + size < end)
244 mem = find_fw_memmap_area(start, end, size,
245 sizeof(struct early_res));
246 if (mem == -1ULL) {
247 start = ex_end;
248 end = get_max_mapped();
249 if (start + size < end)
250 mem = find_fw_memmap_area(start, end, size,
251 sizeof(struct early_res));
252 }
253 if (mem == -1ULL)
254 panic("can not find more space for early_res array");
255
256 new = __va(mem);
257 /* save the first one for own */
258 new[0].start = mem;
259 new[0].end = mem + size;
260 new[0].overlap_ok = 0;
261 /* copy old to new */
262 if (early_res == early_res_x) {
263 memcpy(&new[1], &early_res[0],
264 sizeof(struct early_res) * max_early_res);
265 memset(&new[max_early_res+1], 0,
266 sizeof(struct early_res) * (max_early_res - 1));
267 early_res_count++;
268 } else {
269 memcpy(&new[1], &early_res[1],
270 sizeof(struct early_res) * (max_early_res - 1));
271 memset(&new[max_early_res], 0,
272 sizeof(struct early_res) * max_early_res);
273 }
274 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
275 early_res = new;
276 max_early_res *= 2;
277 printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
278 max_early_res, mem, mem + size - 1);
279}
280
281/*
282 * Most early reservations come here.
283 *
284 * We first have drop_overlaps_that_are_ok() drop any pre-existing
285 * 'overlap_ok' ranges, so that we can then reserve this memory
286 * range without risk of panic'ing on an overlapping overlap_ok
287 * early reservation.
288 */
289void __init reserve_early(u64 start, u64 end, char *name)
290{
291 if (start >= end)
292 return;
293
294 __check_and_double_early_res(start, end);
295
296 drop_overlaps_that_are_ok(start, end);
297 __reserve_early(start, end, name, 0);
298}
299
300void __init reserve_early_without_check(u64 start, u64 end, char *name)
301{
302 struct early_res *r;
303
304 if (start >= end)
305 return;
306
307 __check_and_double_early_res(start, end);
308
309 r = &early_res[early_res_count];
310
311 r->start = start;
312 r->end = end;
313 r->overlap_ok = 0;
314 if (name)
315 strncpy(r->name, name, sizeof(r->name) - 1);
316 early_res_count++;
317}
318
319void __init free_early(u64 start, u64 end)
320{
321 struct early_res *r;
322 int i;
323
324 kmemleak_free_part(__va(start), end - start);
325
326 i = find_overlapped_early(start, end);
327 r = &early_res[i];
328 if (i >= max_early_res || r->end != end || r->start != start)
329 panic("free_early on not reserved area: %llx-%llx!",
330 start, end - 1);
331
332 drop_range(i);
333}
334
335void __init free_early_partial(u64 start, u64 end)
336{
337 struct early_res *r;
338 int i;
339
340 kmemleak_free_part(__va(start), end - start);
341
342 if (start == end)
343 return;
344
345 if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end))
346 return;
347
348try_next:
349 i = find_overlapped_early(start, end);
350 if (i >= max_early_res)
351 return;
352
353 r = &early_res[i];
354 /* hole ? */
355 if (r->end >= end && r->start <= start) {
356 drop_range_partial(i, start, end);
357 return;
358 }
359
360 drop_range_partial(i, start, end);
361 goto try_next;
362}
363
364#ifdef CONFIG_NO_BOOTMEM
365static void __init subtract_early_res(struct range *range, int az)
366{
367 int i, count;
368 u64 final_start, final_end;
369 int idx = 0;
370
371 count = 0;
372 for (i = 0; i < max_early_res && early_res[i].end; i++)
373 count++;
374
375 /* need to skip first one ?*/
376 if (early_res != early_res_x)
377 idx = 1;
378
379#define DEBUG_PRINT_EARLY_RES 1
380
381#if DEBUG_PRINT_EARLY_RES
382 printk(KERN_INFO "Subtract (%d early reservations)\n", count);
383#endif
384 for (i = idx; i < count; i++) {
385 struct early_res *r = &early_res[i];
386#if DEBUG_PRINT_EARLY_RES
387 printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
388 r->start, r->end, r->name);
389#endif
390 final_start = PFN_DOWN(r->start);
391 final_end = PFN_UP(r->end);
392 if (final_start >= final_end)
393 continue;
394 subtract_range(range, az, final_start, final_end);
395 }
396
397}
398
399int __init get_free_all_memory_range(struct range **rangep, int nodeid)
400{
401 int i, count;
402 u64 start = 0, end;
403 u64 size;
404 u64 mem;
405 struct range *range;
406 int nr_range;
407
408 count = 0;
409 for (i = 0; i < max_early_res && early_res[i].end; i++)
410 count++;
411
412 count *= 2;
413
414 size = sizeof(struct range) * count;
415 end = get_max_mapped();
416#ifdef MAX_DMA32_PFN
417 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
418 start = MAX_DMA32_PFN << PAGE_SHIFT;
419#endif
420 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
421 if (mem == -1ULL)
422 panic("can not find more space for range free");
423
424 range = __va(mem);
425 /* use early_node_map[] and early_res to get range array at first */
426 memset(range, 0, size);
427 nr_range = 0;
428
429 /* need to go over early_node_map to find out good range for node */
430 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
431#ifdef CONFIG_X86_32
432 subtract_range(range, count, max_low_pfn, -1ULL);
433#endif
434 subtract_early_res(range, count);
435 nr_range = clean_sort_range(range, count);
436
437 /* need to clear it ? */
438 if (nodeid == MAX_NUMNODES) {
439 memset(&early_res[0], 0,
440 sizeof(struct early_res) * max_early_res);
441 early_res = NULL;
442 max_early_res = 0;
443 }
444
445 *rangep = range;
446 return nr_range;
447}
448#else
449void __init early_res_to_bootmem(u64 start, u64 end)
450{
451 int i, count;
452 u64 final_start, final_end;
453 int idx = 0;
454
455 count = 0;
456 for (i = 0; i < max_early_res && early_res[i].end; i++)
457 count++;
458
459 /* need to skip first one ?*/
460 if (early_res != early_res_x)
461 idx = 1;
462
463 printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
464 count - idx, max_early_res, start, end);
465 for (i = idx; i < count; i++) {
466 struct early_res *r = &early_res[i];
467 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
468 r->start, r->end, r->name);
469 final_start = max(start, r->start);
470 final_end = min(end, r->end);
471 if (final_start >= final_end) {
472 printk(KERN_CONT "\n");
473 continue;
474 }
475 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
476 final_start, final_end);
477 reserve_bootmem_generic(final_start, final_end - final_start,
478 BOOTMEM_DEFAULT);
479 }
480 /* clear them */
481 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
482 early_res = NULL;
483 max_early_res = 0;
484 early_res_count = 0;
485}
486#endif
487
488/* Check for already reserved areas */
489static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
490{
491 int i;
492 u64 addr = *addrp;
493 int changed = 0;
494 struct early_res *r;
495again:
496 i = find_overlapped_early(addr, addr + size);
497 r = &early_res[i];
498 if (i < max_early_res && r->end) {
499 *addrp = addr = round_up(r->end, align);
500 changed = 1;
501 goto again;
502 }
503 return changed;
504}
505
506/* Check for already reserved areas */
507static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
508{
509 int i;
510 u64 addr = *addrp, last;
511 u64 size = *sizep;
512 int changed = 0;
513again:
514 last = addr + size;
515 for (i = 0; i < max_early_res && early_res[i].end; i++) {
516 struct early_res *r = &early_res[i];
517 if (last > r->start && addr < r->start) {
518 size = r->start - addr;
519 changed = 1;
520 goto again;
521 }
522 if (last > r->end && addr < r->end) {
523 addr = round_up(r->end, align);
524 size = last - addr;
525 changed = 1;
526 goto again;
527 }
528 if (last <= r->end && addr >= r->start) {
529 (*sizep)++;
530 return 0;
531 }
532 }
533 if (changed) {
534 *addrp = addr;
535 *sizep = size;
536 }
537 return changed;
538}
539
540/*
541 * Find a free area with specified alignment in a specific range.
542 * Only the area between start and end that is an active range in early_node_map
543 * is considered, so it is known-good RAM.
544 */
545u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
546 u64 size, u64 align)
547{
548 u64 addr, last;
549
550 addr = round_up(ei_start, align);
551 if (addr < start)
552 addr = round_up(start, align);
553 if (addr >= ei_last)
554 goto out;
555 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
556 ;
557 last = addr + size;
558 if (last > ei_last)
559 goto out;
560 if (last > end)
561 goto out;
562
563 return addr;
564
565out:
566 return -1ULL;
567}
568
569u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
570 u64 *sizep, u64 align)
571{
572 u64 addr, last;
573
574 addr = round_up(ei_start, align);
575 if (addr < start)
576 addr = round_up(start, align);
577 if (addr >= ei_last)
578 goto out;
579 *sizep = ei_last - addr;
580 while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
581 ;
582 last = addr + *sizep;
583 if (last > ei_last)
584 goto out;
585
586 return addr;
587
588out:
589 return -1ULL;
590}
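
kernel/early_res.c is removed wholesale. Its core bookkeeping treated every reservation as a half-open [start, end) range; drop_range_partial() carved a freed window out of an entry, keeping whatever head and tail pieces survive. A stand-alone sketch of that splitting rule (struct and function names here are invented; only the head/tail logic follows the deleted code):

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start, end; };		/* half-open [start, end) */

/* Remove [start, end) from *r; return how many pieces survive (0..2). */
static int split_range(const struct range *r, uint64_t start, uint64_t end,
		       struct range out[2])
{
	uint64_t cs = r->start > start ? r->start : start;	/* common start */
	uint64_t ce = r->end   < end   ? r->end   : end;	/* common end   */
	int n = 0;

	if (cs >= ce) {				/* no overlap: keep as-is */
		out[n++] = *r;
		return n;
	}
	if (r->start < cs)			/* head piece below the window */
		out[n++] = (struct range){ r->start, cs };
	if (r->end > ce)			/* tail piece above the window */
		out[n++] = (struct range){ ce, r->end };
	return n;
}

int main(void)
{
	struct range r = { 0x1000, 0x9000 };
	struct range out[2];
	int i, n = split_range(&r, 0x3000, 0x5000, out);

	for (i = 0; i < n; i++)
		printf("kept [%#llx - %#llx)\n",
		       (unsigned long long)out[i].start,
		       (unsigned long long)out[i].end);
	return 0;
}
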
diff --git a/kernel/exit.c b/kernel/exit.c
index 671ed56e0a49..b194febf5799 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -50,6 +50,7 @@
50#include <linux/perf_event.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/unistd.h> 56#include <asm/unistd.h>
@@ -149,9 +150,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
149{ 150{
150 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 151 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
151 152
152#ifdef CONFIG_PERF_EVENTS 153 perf_event_delayed_put(tsk);
153 WARN_ON_ONCE(tsk->perf_event_ctxp);
154#endif
155 trace_sched_process_free(tsk); 154 trace_sched_process_free(tsk);
156 put_task_struct(tsk); 155 put_task_struct(tsk);
157} 156}
@@ -689,6 +688,8 @@ static void exit_mm(struct task_struct * tsk)
689 enter_lazy_tlb(mm, current); 688 enter_lazy_tlb(mm, current);
690 /* We don't want this task to be frozen prematurely */ 689 /* We don't want this task to be frozen prematurely */
691 clear_freeze_flag(tsk); 690 clear_freeze_flag(tsk);
691 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
692 atomic_dec(&mm->oom_disable_count);
692 task_unlock(tsk); 693 task_unlock(tsk);
693 mm_update_next_owner(mm); 694 mm_update_next_owner(mm);
694 mmput(mm); 695 mmput(mm);
@@ -702,6 +703,8 @@ static void exit_mm(struct task_struct * tsk)
702 * space. 703 * space.
703 */ 704 */
704static struct task_struct *find_new_reaper(struct task_struct *father) 705static struct task_struct *find_new_reaper(struct task_struct *father)
706 __releases(&tasklist_lock)
707 __acquires(&tasklist_lock)
705{ 708{
706 struct pid_namespace *pid_ns = task_active_pid_ns(father); 709 struct pid_namespace *pid_ns = task_active_pid_ns(father);
707 struct task_struct *thread; 710 struct task_struct *thread;
@@ -1386,8 +1389,7 @@ static int wait_task_stopped(struct wait_opts *wo,
1386 if (!unlikely(wo->wo_flags & WNOWAIT)) 1389 if (!unlikely(wo->wo_flags & WNOWAIT))
1387 *p_code = 0; 1390 *p_code = 0;
1388 1391
1389 /* don't need the RCU readlock here as we're holding a spinlock */ 1392 uid = task_uid(p);
1390 uid = __task_cred(p)->uid;
1391unlock_sig: 1393unlock_sig:
1392 spin_unlock_irq(&p->sighand->siglock); 1394 spin_unlock_irq(&p->sighand->siglock);
1393 if (!exit_code) 1395 if (!exit_code)
@@ -1460,7 +1462,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1460 } 1462 }
1461 if (!unlikely(wo->wo_flags & WNOWAIT)) 1463 if (!unlikely(wo->wo_flags & WNOWAIT))
1462 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1464 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1463 uid = __task_cred(p)->uid; 1465 uid = task_uid(p);
1464 spin_unlock_irq(&p->sighand->siglock); 1466 spin_unlock_irq(&p->sighand->siglock);
1465 1467
1466 pid = task_pid_vnr(p); 1468 pid = task_pid_vnr(p);
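
The exit_mm() hunk pairs with the fork.c changes below: mm->oom_disable_count counts attached tasks whose oom_score_adj is pinned at OOM_SCORE_ADJ_MIN, incremented when such a task attaches to the mm and decremented when it lets go, so the OOM killer can presumably skip the mm with a single atomic read. A stand-alone sketch of that inc-on-attach / dec-on-detach rule (the struct layout and helpers are invented; the constant is defined locally for the demo):

#include <stdatomic.h>
#include <stdio.h>

#define OOM_SCORE_ADJ_MIN (-1000)

struct mm { atomic_int oom_disable_count; };
struct task { int oom_score_adj; struct mm *mm; };

static void attach_mm(struct task *t, struct mm *mm)
{
	t->mm = mm;
	if (t->oom_score_adj == OOM_SCORE_ADJ_MIN)
		atomic_fetch_add(&mm->oom_disable_count, 1);
}

static void detach_mm(struct task *t)
{
	if (t->oom_score_adj == OOM_SCORE_ADJ_MIN)
		atomic_fetch_sub(&t->mm->oom_disable_count, 1);
	t->mm = NULL;
}

int main(void)
{
	struct mm mm = { .oom_disable_count = 0 };
	struct task a = { .oom_score_adj = OOM_SCORE_ADJ_MIN };
	struct task b = { .oom_score_adj = 0 };

	attach_mm(&a, &mm);
	attach_mm(&b, &mm);
	printf("oom-disabled users: %d\n",
	       atomic_load(&mm.oom_disable_count));	/* 1 */
	detach_mm(&a);
	printf("oom-disabled users: %d\n",
	       atomic_load(&mm.oom_disable_count));	/* 0 */
	return 0;
}
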
diff --git a/kernel/fork.c b/kernel/fork.c
index 98b450876f93..3b159c5991b7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -65,6 +65,7 @@
65#include <linux/perf_event.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h> 67#include <linux/user-return-notifier.h>
68#include <linux/oom.h>
68 69
69#include <asm/pgtable.h> 70#include <asm/pgtable.h>
70#include <asm/pgalloc.h> 71#include <asm/pgalloc.h>
@@ -300,7 +301,7 @@ out:
300#ifdef CONFIG_MMU 301#ifdef CONFIG_MMU
301static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 302static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
302{ 303{
303 struct vm_area_struct *mpnt, *tmp, **pprev; 304 struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
304 struct rb_node **rb_link, *rb_parent; 305 struct rb_node **rb_link, *rb_parent;
305 int retval; 306 int retval;
306 unsigned long charge; 307 unsigned long charge;
@@ -328,6 +329,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
328 if (retval) 329 if (retval)
329 goto out; 330 goto out;
330 331
332 prev = NULL;
331 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 333 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
332 struct file *file; 334 struct file *file;
333 335
@@ -355,11 +357,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
355 if (IS_ERR(pol)) 357 if (IS_ERR(pol))
356 goto fail_nomem_policy; 358 goto fail_nomem_policy;
357 vma_set_policy(tmp, pol); 359 vma_set_policy(tmp, pol);
360 tmp->vm_mm = mm;
358 if (anon_vma_fork(tmp, mpnt)) 361 if (anon_vma_fork(tmp, mpnt))
359 goto fail_nomem_anon_vma_fork; 362 goto fail_nomem_anon_vma_fork;
360 tmp->vm_flags &= ~VM_LOCKED; 363 tmp->vm_flags &= ~VM_LOCKED;
361 tmp->vm_mm = mm; 364 tmp->vm_next = tmp->vm_prev = NULL;
362 tmp->vm_next = NULL;
363 file = tmp->vm_file; 365 file = tmp->vm_file;
364 if (file) { 366 if (file) {
365 struct inode *inode = file->f_path.dentry->d_inode; 367 struct inode *inode = file->f_path.dentry->d_inode;
@@ -392,6 +394,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
392 */ 394 */
393 *pprev = tmp; 395 *pprev = tmp;
394 pprev = &tmp->vm_next; 396 pprev = &tmp->vm_next;
397 tmp->vm_prev = prev;
398 prev = tmp;
395 399
396 __vma_link_rb(mm, tmp, rb_link, rb_parent); 400 __vma_link_rb(mm, tmp, rb_link, rb_parent);
397 rb_link = &tmp->vm_rb.rb_right; 401 rb_link = &tmp->vm_rb.rb_right;
@@ -485,6 +489,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
485 mm->cached_hole_size = ~0UL; 489 mm->cached_hole_size = ~0UL;
486 mm_init_aio(mm); 490 mm_init_aio(mm);
487 mm_init_owner(mm, p); 491 mm_init_owner(mm, p);
492 atomic_set(&mm->oom_disable_count, 0);
488 493
489 if (likely(!mm_alloc_pgd(mm))) { 494 if (likely(!mm_alloc_pgd(mm))) {
490 mm->def_flags = 0; 495 mm->def_flags = 0;
@@ -738,6 +743,8 @@ good_mm:
738 /* Initializing for Swap token stuff */ 743 /* Initializing for Swap token stuff */
739 mm->token_priority = 0; 744 mm->token_priority = 0;
740 mm->last_interval = 0; 745 mm->last_interval = 0;
746 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
747 atomic_inc(&mm->oom_disable_count);
741 748
742 tsk->mm = mm; 749 tsk->mm = mm;
743 tsk->active_mm = mm; 750 tsk->active_mm = mm;
@@ -752,13 +759,13 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
752 struct fs_struct *fs = current->fs; 759 struct fs_struct *fs = current->fs;
753 if (clone_flags & CLONE_FS) { 760 if (clone_flags & CLONE_FS) {
754 /* tsk->fs is already what we want */ 761 /* tsk->fs is already what we want */
755 write_lock(&fs->lock); 762 spin_lock(&fs->lock);
756 if (fs->in_exec) { 763 if (fs->in_exec) {
757 write_unlock(&fs->lock); 764 spin_unlock(&fs->lock);
758 return -EAGAIN; 765 return -EAGAIN;
759 } 766 }
760 fs->users++; 767 fs->users++;
761 write_unlock(&fs->lock); 768 spin_unlock(&fs->lock);
762 return 0; 769 return 0;
763 } 770 }
764 tsk->fs = copy_fs_struct(fs); 771 tsk->fs = copy_fs_struct(fs);
@@ -901,6 +908,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
901 sig->oom_adj = current->signal->oom_adj; 908 sig->oom_adj = current->signal->oom_adj;
902 sig->oom_score_adj = current->signal->oom_score_adj; 909 sig->oom_score_adj = current->signal->oom_score_adj;
903 910
911 mutex_init(&sig->cred_guard_mutex);
912
904 return 0; 913 return 0;
905} 914}
906 915
@@ -1296,8 +1305,13 @@ bad_fork_cleanup_io:
1296bad_fork_cleanup_namespaces: 1305bad_fork_cleanup_namespaces:
1297 exit_task_namespaces(p); 1306 exit_task_namespaces(p);
1298bad_fork_cleanup_mm: 1307bad_fork_cleanup_mm:
1299 if (p->mm) 1308 if (p->mm) {
1309 task_lock(p);
1310 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1311 atomic_dec(&p->mm->oom_disable_count);
1312 task_unlock(p);
1300 mmput(p->mm); 1313 mmput(p->mm);
1314 }
1301bad_fork_cleanup_signal: 1315bad_fork_cleanup_signal:
1302 if (!(clone_flags & CLONE_THREAD)) 1316 if (!(clone_flags & CLONE_THREAD))
1303 free_signal_struct(p->signal); 1317 free_signal_struct(p->signal);
@@ -1676,13 +1690,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1676 1690
1677 if (new_fs) { 1691 if (new_fs) {
1678 fs = current->fs; 1692 fs = current->fs;
1679 write_lock(&fs->lock); 1693 spin_lock(&fs->lock);
1680 current->fs = new_fs; 1694 current->fs = new_fs;
1681 if (--fs->users) 1695 if (--fs->users)
1682 new_fs = NULL; 1696 new_fs = NULL;
1683 else 1697 else
1684 new_fs = fs; 1698 new_fs = fs;
1685 write_unlock(&fs->lock); 1699 spin_unlock(&fs->lock);
1686 } 1700 }
1687 1701
1688 if (new_mm) { 1702 if (new_mm) {
@@ -1690,6 +1704,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1690 active_mm = current->active_mm; 1704 active_mm = current->active_mm;
1691 current->mm = new_mm; 1705 current->mm = new_mm;
1692 current->active_mm = new_mm; 1706 current->active_mm = new_mm;
1707 if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
1708 atomic_dec(&mm->oom_disable_count);
1709 atomic_inc(&new_mm->oom_disable_count);
1710 }
1693 activate_mm(active_mm, new_mm); 1711 activate_mm(active_mm, new_mm);
1694 new_mm = mm; 1712 new_mm = mm;
1695 } 1713 }
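
Besides the oom_disable_count updates, dup_mmap() now wires tmp->vm_prev while it copies the VMA list, so the duplicate is a well-formed doubly linked list from the first insertion rather than needing back-patching later. A stand-alone sketch of copying a forward-linked list while threading the prev pointers (node type and helpers are invented for the example):

#include <stdio.h>
#include <stdlib.h>

struct node {
	int val;
	struct node *next, *prev;
};

/* Duplicate a forward-linked list, wiring prev pointers as we go. */
static struct node *dup_list(const struct node *src)
{
	struct node *head = NULL, *prev = NULL, **pnext = &head;

	for (; src; src = src->next) {
		struct node *n = malloc(sizeof(*n));

		if (!n)
			exit(1);
		n->val = src->val;
		n->next = NULL;
		n->prev = prev;		/* the pointer the patch adds, in spirit */
		*pnext = n;
		pnext = &n->next;
		prev = n;
	}
	return head;
}

int main(void)
{
	struct node c = { 3, NULL, NULL };
	struct node b = { 2, &c, NULL };
	struct node a = { 1, &b, NULL };
	struct node *copy = dup_list(&a), *n, *last = NULL;

	for (n = copy; n; last = n, n = n->next)
		printf("fwd %d\n", n->val);
	for (n = last; n; n = n->prev)
		printf("rev %d\n", n->val);
	return 0;
}
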
diff --git a/kernel/futex.c b/kernel/futex.c
index 6a3a5fa1526d..6c683b37f2ce 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -91,6 +91,7 @@ struct futex_pi_state {
91 91
92/** 92/**
93 * struct futex_q - The hashed futex queue entry, one per waiting task 93 * struct futex_q - The hashed futex queue entry, one per waiting task
94 * @list: priority-sorted list of tasks waiting on this futex
94 * @task: the task waiting on the futex 95 * @task: the task waiting on the futex
95 * @lock_ptr: the hash bucket lock 96 * @lock_ptr: the hash bucket lock
96 * @key: the key the futex is hashed on 97 * @key: the key the futex is hashed on
@@ -104,7 +105,7 @@ struct futex_pi_state {
104 * 105 *
105 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 106 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
106 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 107 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
107 * The order of wakup is always to make the first condition true, then 108 * The order of wakeup is always to make the first condition true, then
108 * the second. 109 * the second.
109 * 110 *
110 * PI futexes are typically woken before they are removed from the hash list via 111 * PI futexes are typically woken before they are removed from the hash list via
@@ -168,7 +169,7 @@ static void get_futex_key_refs(union futex_key *key)
168 169
169 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 170 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
170 case FUT_OFF_INODE: 171 case FUT_OFF_INODE:
171 atomic_inc(&key->shared.inode->i_count); 172 ihold(key->shared.inode);
172 break; 173 break;
173 case FUT_OFF_MMSHARED: 174 case FUT_OFF_MMSHARED:
174 atomic_inc(&key->private.mm->mm_count); 175 atomic_inc(&key->private.mm->mm_count);
@@ -295,7 +296,7 @@ void put_futex_key(int fshared, union futex_key *key)
295 * Slow path to fixup the fault we just took in the atomic write 296 * Slow path to fixup the fault we just took in the atomic write
296 * access to @uaddr. 297 * access to @uaddr.
297 * 298 *
298 * We have no generic implementation of a non destructive write to the 299 * We have no generic implementation of a non-destructive write to the
299 * user address. We know that we faulted in the atomic pagefault 300 * user address. We know that we faulted in the atomic pagefault
300 * disabled section so we can as well avoid the #PF overhead by 301 * disabled section so we can as well avoid the #PF overhead by
301 * calling get_user_pages() right away. 302 * calling get_user_pages() right away.
@@ -515,7 +516,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
515 */ 516 */
516 pi_state = this->pi_state; 517 pi_state = this->pi_state;
517 /* 518 /*
518 * Userspace might have messed up non PI and PI futexes 519 * Userspace might have messed up non-PI and PI futexes
519 */ 520 */
520 if (unlikely(!pi_state)) 521 if (unlikely(!pi_state))
521 return -EINVAL; 522 return -EINVAL;
@@ -736,8 +737,8 @@ static void wake_futex(struct futex_q *q)
736 737
737 /* 738 /*
738 * We set q->lock_ptr = NULL _before_ we wake up the task. If 739 * We set q->lock_ptr = NULL _before_ we wake up the task. If
739 * a non futex wake up happens on another CPU then the task 740 * a non-futex wake up happens on another CPU then the task
740 * might exit and p would dereference a non existing task 741 * might exit and p would dereference a non-existing task
741 * struct. Prevent this by holding a reference on p across the 742 * struct. Prevent this by holding a reference on p across the
742 * wake up. 743 * wake up.
743 */ 744 */
@@ -1131,11 +1132,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1131 1132
1132/** 1133/**
1133 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1134 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1134 * uaddr1: source futex user address 1135 * @uaddr1: source futex user address
1135 * uaddr2: target futex user address 1136 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
1136 * nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1137 * @uaddr2: target futex user address
1137 * nr_requeue: number of waiters to requeue (0-INT_MAX) 1138 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1138 * requeue_pi: if we are attempting to requeue from a non-pi futex to a 1139 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1140 * @cmpval: @uaddr1 expected value (or %NULL)
1141 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1139 * pi futex (pi to pi requeue is not supported) 1142 * pi futex (pi to pi requeue is not supported)
1140 * 1143 *
1141 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1144 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
@@ -1360,10 +1363,10 @@ out:
1360 1363
1361/* The key must be already stored in q->key. */ 1364/* The key must be already stored in q->key. */
1362static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) 1365static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1366 __acquires(&hb->lock)
1363{ 1367{
1364 struct futex_hash_bucket *hb; 1368 struct futex_hash_bucket *hb;
1365 1369
1366 get_futex_key_refs(&q->key);
1367 hb = hash_futex(&q->key); 1370 hb = hash_futex(&q->key);
1368 q->lock_ptr = &hb->lock; 1371 q->lock_ptr = &hb->lock;
1369 1372
@@ -1373,9 +1376,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1373 1376
1374static inline void 1377static inline void
1375queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1378queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1379 __releases(&hb->lock)
1376{ 1380{
1377 spin_unlock(&hb->lock); 1381 spin_unlock(&hb->lock);
1378 drop_futex_key_refs(&q->key);
1379} 1382}
1380 1383
1381/** 1384/**
@@ -1391,6 +1394,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1391 * an example). 1394 * an example).
1392 */ 1395 */
1393static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1396static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1397 __releases(&hb->lock)
1394{ 1398{
1395 int prio; 1399 int prio;
1396 1400
@@ -1471,6 +1475,7 @@ retry:
1471 * and dropped here. 1475 * and dropped here.
1472 */ 1476 */
1473static void unqueue_me_pi(struct futex_q *q) 1477static void unqueue_me_pi(struct futex_q *q)
1478 __releases(q->lock_ptr)
1474{ 1479{
1475 WARN_ON(plist_node_empty(&q->list)); 1480 WARN_ON(plist_node_empty(&q->list));
1476 plist_del(&q->list, &q->list.plist); 1481 plist_del(&q->list, &q->list.plist);
@@ -1480,8 +1485,6 @@ static void unqueue_me_pi(struct futex_q *q)
1480 q->pi_state = NULL; 1485 q->pi_state = NULL;
1481 1486
1482 spin_unlock(q->lock_ptr); 1487 spin_unlock(q->lock_ptr);
1483
1484 drop_futex_key_refs(&q->key);
1485} 1488}
1486 1489
1487/* 1490/*
@@ -1812,7 +1815,10 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1812 } 1815 }
1813 1816
1814retry: 1817retry:
1815 /* Prepare to wait on uaddr. */ 1818 /*
1819 * Prepare to wait on uaddr. On success, holds hb lock and increments
1820 * q.key refs.
1821 */
1816 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1822 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
1817 if (ret) 1823 if (ret)
1818 goto out; 1824 goto out;
@@ -1822,28 +1828,27 @@ retry:
1822 1828
1823 /* If we were woken (and unqueued), we succeeded, whatever. */ 1829 /* If we were woken (and unqueued), we succeeded, whatever. */
1824 ret = 0; 1830 ret = 0;
1831 /* unqueue_me() drops q.key ref */
1825 if (!unqueue_me(&q)) 1832 if (!unqueue_me(&q))
1826 goto out_put_key; 1833 goto out;
1827 ret = -ETIMEDOUT; 1834 ret = -ETIMEDOUT;
1828 if (to && !to->task) 1835 if (to && !to->task)
1829 goto out_put_key; 1836 goto out;
1830 1837
1831 /* 1838 /*
1832 * We expect signal_pending(current), but we might be the 1839 * We expect signal_pending(current), but we might be the
1833 * victim of a spurious wakeup as well. 1840 * victim of a spurious wakeup as well.
1834 */ 1841 */
1835 if (!signal_pending(current)) { 1842 if (!signal_pending(current))
1836 put_futex_key(fshared, &q.key);
1837 goto retry; 1843 goto retry;
1838 }
1839 1844
1840 ret = -ERESTARTSYS; 1845 ret = -ERESTARTSYS;
1841 if (!abs_time) 1846 if (!abs_time)
1842 goto out_put_key; 1847 goto out;
1843 1848
1844 restart = &current_thread_info()->restart_block; 1849 restart = &current_thread_info()->restart_block;
1845 restart->fn = futex_wait_restart; 1850 restart->fn = futex_wait_restart;
1846 restart->futex.uaddr = (u32 *)uaddr; 1851 restart->futex.uaddr = uaddr;
1847 restart->futex.val = val; 1852 restart->futex.val = val;
1848 restart->futex.time = abs_time->tv64; 1853 restart->futex.time = abs_time->tv64;
1849 restart->futex.bitset = bitset; 1854 restart->futex.bitset = bitset;
@@ -1856,8 +1861,6 @@ retry:
1856 1861
1857 ret = -ERESTART_RESTARTBLOCK; 1862 ret = -ERESTART_RESTARTBLOCK;
1858 1863
1859out_put_key:
1860 put_futex_key(fshared, &q.key);
1861out: 1864out:
1862 if (to) { 1865 if (to) {
1863 hrtimer_cancel(&to->timer); 1866 hrtimer_cancel(&to->timer);
@@ -1869,7 +1872,7 @@ out:
1869 1872
1870static long futex_wait_restart(struct restart_block *restart) 1873static long futex_wait_restart(struct restart_block *restart)
1871{ 1874{
1872 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1875 u32 __user *uaddr = restart->futex.uaddr;
1873 int fshared = 0; 1876 int fshared = 0;
1874 ktime_t t, *tp = NULL; 1877 ktime_t t, *tp = NULL;
1875 1878
@@ -2236,7 +2239,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2236 q.rt_waiter = &rt_waiter; 2239 q.rt_waiter = &rt_waiter;
2237 q.requeue_pi_key = &key2; 2240 q.requeue_pi_key = &key2;
2238 2241
2239 /* Prepare to wait on uaddr. */ 2242 /*
2243 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2244 * count.
2245 */
2240 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2246 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2241 if (ret) 2247 if (ret)
2242 goto out_key2; 2248 goto out_key2;
@@ -2254,7 +2260,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2254 * In order for us to be here, we know our q.key == key2, and since 2260 * In order for us to be here, we know our q.key == key2, and since
2255 * we took the hb->lock above, we also know that futex_requeue() has 2261 * we took the hb->lock above, we also know that futex_requeue() has
2256 * completed and we no longer have to concern ourselves with a wakeup 2262 * completed and we no longer have to concern ourselves with a wakeup
2257 * race with the atomic proxy lock acquition by the requeue code. 2263 * race with the atomic proxy lock acquisition by the requeue code. The
2264 * futex_requeue dropped our key1 reference and incremented our key2
2265 * reference count.
2258 */ 2266 */
2259 2267
2260 /* Check if the requeue code acquired the second futex for us. */ 2268 /* Check if the requeue code acquired the second futex for us. */
@@ -2458,7 +2466,7 @@ retry:
2458 */ 2466 */
2459static inline int fetch_robust_entry(struct robust_list __user **entry, 2467static inline int fetch_robust_entry(struct robust_list __user **entry,
2460 struct robust_list __user * __user *head, 2468 struct robust_list __user * __user *head,
2461 int *pi) 2469 unsigned int *pi)
2462{ 2470{
2463 unsigned long uentry; 2471 unsigned long uentry;
2464 2472
@@ -2647,7 +2655,7 @@ static int __init futex_init(void)
2647 * of the complex code paths. Also we want to prevent 2655 * of the complex code paths. Also we want to prevent
2648 * registration of robust lists in that case. NULL is 2656 * registration of robust lists in that case. NULL is
2649 * guaranteed to fault and we get -EFAULT on functional 2657 * guaranteed to fault and we get -EFAULT on functional
2650 * implementation, the non functional ones will return 2658 * implementation, the non-functional ones will return
2651 * -ENOSYS. 2659 * -ENOSYS.
2652 */ 2660 */
2653 curval = cmpxchg_futex_value_locked(NULL, 0, 0); 2661 curval = cmpxchg_futex_value_locked(NULL, 0, 0);
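
The futex changes move the key reference counting out of queue_lock()/queue_unlock() and into futex_wait_setup()/unqueue_me(), as the added comments spell out, while the locking shape stays the same: a key hashes to one bucket, queue_lock() takes that bucket's lock, and the waiter sits on the bucket's list until woken. A stand-alone sketch of that bucket-per-key shape (hash, table size and the waiter counter are invented; only the queue_lock/queue_unlock pairing mirrors the kernel code):

/* Build with: cc -pthread futex_buckets.c */
#include <pthread.h>
#include <stdio.h>

#define NBUCKETS 16

struct bucket {
	pthread_mutex_t lock;
	int nr_waiters;
};

static struct bucket table[NBUCKETS];

static struct bucket *hash_key(unsigned long key)
{
	return &table[(key * 2654435761UL) % NBUCKETS];	/* toy hash */
}

static struct bucket *queue_lock(unsigned long key)
{
	struct bucket *b = hash_key(key);

	pthread_mutex_lock(&b->lock);	/* serialize wake vs. wait on this key */
	return b;
}

static void queue_unlock(struct bucket *b)
{
	pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	unsigned long key = 0x7f001000;
	struct bucket *b;
	int i;

	for (i = 0; i < NBUCKETS; i++)
		pthread_mutex_init(&table[i].lock, NULL);

	b = queue_lock(key);
	b->nr_waiters++;		/* "queue_me()" */
	queue_unlock(b);

	b = queue_lock(key);
	b->nr_waiters--;		/* "unqueue_me()" on wakeup */
	queue_unlock(b);

	printf("waiters left: %d\n", hash_key(key)->nr_waiters);
	return 0;
}
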
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d49afb2395e5..06da4dfc339b 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -19,7 +19,7 @@
19 */ 19 */
20static inline int 20static inline int
21fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, 21fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
22 compat_uptr_t __user *head, int *pi) 22 compat_uptr_t __user *head, unsigned int *pi)
23{ 23{
24 if (get_user(*uentry, head)) 24 if (get_user(*uentry, head))
25 return -EFAULT; 25 return -EFAULT;
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index ef3c3f88a7a3..9bd0934f6c33 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -33,10 +33,11 @@
33 * @children: child nodes 33 * @children: child nodes
34 * @all: list head for list of all nodes 34 * @all: list head for list of all nodes
35 * @parent: parent node 35 * @parent: parent node
36 * @info: associated profiling data structure if not a directory 36 * @loaded_info: array of pointers to profiling data sets for loaded object
37 * @ghost: when an object file containing profiling data is unloaded we keep a 37 * files.
38 * copy of the profiling data here to allow collecting coverage data 38 * @num_loaded: number of profiling data sets for loaded object files.
39 * for cleanup code. Such a node is called a "ghost". 39 * @unloaded_info: accumulated copy of profiling data sets for unloaded
40 * object files. Used only when gcov_persist=1.
40 * @dentry: main debugfs entry, either a directory or data file 41 * @dentry: main debugfs entry, either a directory or data file
41 * @links: associated symbolic links 42 * @links: associated symbolic links
42 * @name: data file basename 43 * @name: data file basename
@@ -51,10 +52,11 @@ struct gcov_node {
51 struct list_head children; 52 struct list_head children;
52 struct list_head all; 53 struct list_head all;
53 struct gcov_node *parent; 54 struct gcov_node *parent;
54 struct gcov_info *info; 55 struct gcov_info **loaded_info;
55 struct gcov_info *ghost; 56 struct gcov_info *unloaded_info;
56 struct dentry *dentry; 57 struct dentry *dentry;
57 struct dentry **links; 58 struct dentry **links;
59 int num_loaded;
58 char name[0]; 60 char name[0];
59}; 61};
60 62
@@ -136,16 +138,37 @@ static const struct seq_operations gcov_seq_ops = {
136}; 138};
137 139
138/* 140/*
139 * Return the profiling data set for a given node. This can either be the 141 * Return a profiling data set associated with the given node. This is
140 * original profiling data structure or a duplicate (also called "ghost") 142 * either a data set for a loaded object file or a data set copy in case
141 * in case the associated object file has been unloaded. 143 * all associated object files have been unloaded.
142 */ 144 */
143static struct gcov_info *get_node_info(struct gcov_node *node) 145static struct gcov_info *get_node_info(struct gcov_node *node)
144{ 146{
145 if (node->info) 147 if (node->num_loaded > 0)
146 return node->info; 148 return node->loaded_info[0];
147 149
148 return node->ghost; 150 return node->unloaded_info;
151}
152
153/*
154 * Return a newly allocated profiling data set which contains the sum of
155 * all profiling data associated with the given node.
156 */
157static struct gcov_info *get_accumulated_info(struct gcov_node *node)
158{
159 struct gcov_info *info;
160 int i = 0;
161
162 if (node->unloaded_info)
163 info = gcov_info_dup(node->unloaded_info);
164 else
165 info = gcov_info_dup(node->loaded_info[i++]);
166 if (!info)
167 return NULL;
168 for (; i < node->num_loaded; i++)
169 gcov_info_add(info, node->loaded_info[i]);
170
171 return info;
149} 172}
150 173
151/* 174/*
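
get_accumulated_info() above produces one read-out for a node by duplicating the first available data set and adding every further loaded set into it, which is why gcov_seq_open() below now reads through it. A stand-alone sketch of that dup-the-first, add-the-rest accumulation over plain counter arrays (gcov_info_dup()/gcov_info_add() stand-ins are invented):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NCOUNTERS 4

static unsigned long *dup_counters(const unsigned long *src)	/* gcov_info_dup() */
{
	unsigned long *d = malloc(NCOUNTERS * sizeof(*d));

	if (d)
		memcpy(d, src, NCOUNTERS * sizeof(*d));
	return d;
}

static void add_counters(unsigned long *dst, const unsigned long *src)	/* gcov_info_add() */
{
	int i;

	for (i = 0; i < NCOUNTERS; i++)
		dst[i] += src[i];
}

int main(void)
{
	unsigned long set_a[NCOUNTERS] = { 1, 2, 3, 4 };	/* loaded_info[0] */
	unsigned long set_b[NCOUNTERS] = { 10, 0, 5, 1 };	/* loaded_info[1] */
	unsigned long *sum = dup_counters(set_a);		/* dup the first  */
	int i;

	if (!sum)
		return 1;
	add_counters(sum, set_b);				/* add the rest   */
	for (i = 0; i < NCOUNTERS; i++)
		printf("counter %d = %lu\n", i, sum[i]);
	free(sum);
	return 0;
}
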
@@ -163,9 +186,10 @@ static int gcov_seq_open(struct inode *inode, struct file *file)
163 mutex_lock(&node_lock); 186 mutex_lock(&node_lock);
164 /* 187 /*
165 * Read from a profiling data copy to minimize reference tracking 188 * Read from a profiling data copy to minimize reference tracking
166 * complexity and concurrent access. 189 * complexity and concurrent access and to keep accumulating multiple
190 * profiling data sets associated with one node simple.
167 */ 191 */
168 info = gcov_info_dup(get_node_info(node)); 192 info = get_accumulated_info(node);
169 if (!info) 193 if (!info)
170 goto out_unlock; 194 goto out_unlock;
171 iter = gcov_iter_new(info); 195 iter = gcov_iter_new(info);
@@ -225,12 +249,25 @@ static struct gcov_node *get_node_by_name(const char *name)
225 return NULL; 249 return NULL;
226} 250}
227 251
252/*
253 * Reset all profiling data associated with the specified node.
254 */
255static void reset_node(struct gcov_node *node)
256{
257 int i;
258
259 if (node->unloaded_info)
260 gcov_info_reset(node->unloaded_info);
261 for (i = 0; i < node->num_loaded; i++)
262 gcov_info_reset(node->loaded_info[i]);
263}
264
228static void remove_node(struct gcov_node *node); 265static void remove_node(struct gcov_node *node);
229 266
230/* 267/*
231 * write() implementation for gcov data files. Reset profiling data for the 268 * write() implementation for gcov data files. Reset profiling data for the
232 * associated file. If the object file has been unloaded (i.e. this is 269 * corresponding file. If all associated object files have been unloaded,
233 * a "ghost" node), remove the debug fs node as well. 270 * remove the debug fs node as well.
234 */ 271 */
235static ssize_t gcov_seq_write(struct file *file, const char __user *addr, 272static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
236 size_t len, loff_t *pos) 273 size_t len, loff_t *pos)
@@ -245,10 +282,10 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
245 node = get_node_by_name(info->filename); 282 node = get_node_by_name(info->filename);
246 if (node) { 283 if (node) {
247 /* Reset counts or remove node for unloaded modules. */ 284 /* Reset counts or remove node for unloaded modules. */
248 if (node->ghost) 285 if (node->num_loaded == 0)
249 remove_node(node); 286 remove_node(node);
250 else 287 else
251 gcov_info_reset(node->info); 288 reset_node(node);
252 } 289 }
253 /* Reset counts for open file. */ 290 /* Reset counts for open file. */
254 gcov_info_reset(info); 291 gcov_info_reset(info);
@@ -378,7 +415,10 @@ static void init_node(struct gcov_node *node, struct gcov_info *info,
378 INIT_LIST_HEAD(&node->list); 415 INIT_LIST_HEAD(&node->list);
379 INIT_LIST_HEAD(&node->children); 416 INIT_LIST_HEAD(&node->children);
380 INIT_LIST_HEAD(&node->all); 417 INIT_LIST_HEAD(&node->all);
381 node->info = info; 418 if (node->loaded_info) {
419 node->loaded_info[0] = info;
420 node->num_loaded = 1;
421 }
382 node->parent = parent; 422 node->parent = parent;
383 if (name) 423 if (name)
384 strcpy(node->name, name); 424 strcpy(node->name, name);
@@ -394,9 +434,13 @@ static struct gcov_node *new_node(struct gcov_node *parent,
394 struct gcov_node *node; 434 struct gcov_node *node;
395 435
396 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); 436 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
397 if (!node) { 437 if (!node)
398 pr_warning("out of memory\n"); 438 goto err_nomem;
399 return NULL; 439 if (info) {
440 node->loaded_info = kcalloc(1, sizeof(struct gcov_info *),
441 GFP_KERNEL);
442 if (!node->loaded_info)
443 goto err_nomem;
400 } 444 }
401 init_node(node, info, name, parent); 445 init_node(node, info, name, parent);
402 /* Differentiate between gcov data file nodes and directory nodes. */ 446 /* Differentiate between gcov data file nodes and directory nodes. */
@@ -416,6 +460,11 @@ static struct gcov_node *new_node(struct gcov_node *parent,
416 list_add(&node->all, &all_head); 460 list_add(&node->all, &all_head);
417 461
418 return node; 462 return node;
463
464err_nomem:
465 kfree(node);
466 pr_warning("out of memory\n");
467 return NULL;
419} 468}
420 469
421/* Remove symbolic links associated with node. */ 470/* Remove symbolic links associated with node. */
@@ -441,8 +490,9 @@ static void release_node(struct gcov_node *node)
441 list_del(&node->all); 490 list_del(&node->all);
442 debugfs_remove(node->dentry); 491 debugfs_remove(node->dentry);
443 remove_links(node); 492 remove_links(node);
444 if (node->ghost) 493 kfree(node->loaded_info);
445 gcov_info_free(node->ghost); 494 if (node->unloaded_info)
495 gcov_info_free(node->unloaded_info);
446 kfree(node); 496 kfree(node);
447} 497}
448 498
@@ -477,7 +527,7 @@ static struct gcov_node *get_child_by_name(struct gcov_node *parent,
477 527
478/* 528/*
479 * write() implementation for reset file. Reset all profiling data to zero 529 * write() implementation for reset file. Reset all profiling data to zero
480 * and remove ghost nodes. 530 * and remove nodes for which all associated object files are unloaded.
481 */ 531 */
482static ssize_t reset_write(struct file *file, const char __user *addr, 532static ssize_t reset_write(struct file *file, const char __user *addr,
483 size_t len, loff_t *pos) 533 size_t len, loff_t *pos)
@@ -487,8 +537,8 @@ static ssize_t reset_write(struct file *file, const char __user *addr,
487 mutex_lock(&node_lock); 537 mutex_lock(&node_lock);
488restart: 538restart:
489 list_for_each_entry(node, &all_head, all) { 539 list_for_each_entry(node, &all_head, all) {
490 if (node->info) 540 if (node->num_loaded > 0)
491 gcov_info_reset(node->info); 541 reset_node(node);
492 else if (list_empty(&node->children)) { 542 else if (list_empty(&node->children)) {
493 remove_node(node); 543 remove_node(node);
494 /* Several nodes may have gone - restart loop. */ 544 /* Several nodes may have gone - restart loop. */
@@ -511,6 +561,7 @@ static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
511static const struct file_operations gcov_reset_fops = { 561static const struct file_operations gcov_reset_fops = {
512 .write = reset_write, 562 .write = reset_write,
513 .read = reset_read, 563 .read = reset_read,
564 .llseek = noop_llseek,
514}; 565};
515 566
516/* 567/*
@@ -564,37 +615,115 @@ err_remove:
564} 615}
565 616
566/* 617/*
567 * The profiling data set associated with this node is being unloaded. Store a 618 * Associate a profiling data set with an existing node. Needs to be called
568 * copy of the profiling data and turn this node into a "ghost". 619 * with node_lock held.
569 */ 620 */
570static int ghost_node(struct gcov_node *node) 621static void add_info(struct gcov_node *node, struct gcov_info *info)
571{ 622{
572 node->ghost = gcov_info_dup(node->info); 623 struct gcov_info **loaded_info;
573 if (!node->ghost) { 624 int num = node->num_loaded;
574 pr_warning("could not save data for '%s' (out of memory)\n", 625
575 node->info->filename); 626 /*
576 return -ENOMEM; 627 * Prepare new array. This is done first to simplify cleanup in
628 * case the new data set is incompatible, the node only contains
629 * unloaded data sets and there's not enough memory for the array.
630 */
631 loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);
632 if (!loaded_info) {
633 pr_warning("could not add '%s' (out of memory)\n",
634 info->filename);
635 return;
636 }
637 memcpy(loaded_info, node->loaded_info,
638 num * sizeof(struct gcov_info *));
639 loaded_info[num] = info;
640 /* Check if the new data set is compatible. */
641 if (num == 0) {
642 /*
643 * A module was unloaded, modified and reloaded. The new
644 * data set replaces the copy of the last one.
645 */
646 if (!gcov_info_is_compatible(node->unloaded_info, info)) {
647 pr_warning("discarding saved data for %s "
648 "(incompatible version)\n", info->filename);
649 gcov_info_free(node->unloaded_info);
650 node->unloaded_info = NULL;
651 }
652 } else {
653 /*
654 * Two different versions of the same object file are loaded.
655 * The initial one takes precedence.
656 */
657 if (!gcov_info_is_compatible(node->loaded_info[0], info)) {
658 pr_warning("could not add '%s' (incompatible "
659 "version)\n", info->filename);
660 kfree(loaded_info);
661 return;
662 }
577 } 663 }
578 node->info = NULL; 664 /* Overwrite previous array. */
665 kfree(node->loaded_info);
666 node->loaded_info = loaded_info;
667 node->num_loaded = num + 1;
668}
579 669
580 return 0; 670/*
671 * Return the index of a profiling data set associated with a node.
672 */
673static int get_info_index(struct gcov_node *node, struct gcov_info *info)
674{
675 int i;
676
677 for (i = 0; i < node->num_loaded; i++) {
678 if (node->loaded_info[i] == info)
679 return i;
680 }
681 return -ENOENT;
581} 682}
582 683
583/* 684/*
584 * Profiling data for this node has been loaded again. Add profiling data 685 * Save the data of a profiling data set which is being unloaded.
585 * from previous instantiation and turn this node into a regular node.
586 */ 686 */
587static void revive_node(struct gcov_node *node, struct gcov_info *info) 687static void save_info(struct gcov_node *node, struct gcov_info *info)
588{ 688{
589 if (gcov_info_is_compatible(node->ghost, info)) 689 if (node->unloaded_info)
590 gcov_info_add(info, node->ghost); 690 gcov_info_add(node->unloaded_info, info);
591 else { 691 else {
592 pr_warning("discarding saved data for '%s' (version changed)\n", 692 node->unloaded_info = gcov_info_dup(info);
693 if (!node->unloaded_info) {
694 pr_warning("could not save data for '%s' "
695 "(out of memory)\n", info->filename);
696 }
697 }
698}
699
700/*
701 * Disassociate a profiling data set from a node. Needs to be called with
702 * node_lock held.
703 */
704static void remove_info(struct gcov_node *node, struct gcov_info *info)
705{
706 int i;
707
708 i = get_info_index(node, info);
709 if (i < 0) {
710 pr_warning("could not remove '%s' (not found)\n",
593 info->filename); 711 info->filename);
712 return;
594 } 713 }
595 gcov_info_free(node->ghost); 714 if (gcov_persist)
596 node->ghost = NULL; 715 save_info(node, info);
597 node->info = info; 716 /* Shrink array. */
717 node->loaded_info[i] = node->loaded_info[node->num_loaded - 1];
718 node->num_loaded--;
719 if (node->num_loaded > 0)
720 return;
721 /* Last loaded data set was removed. */
722 kfree(node->loaded_info);
723 node->loaded_info = NULL;
724 node->num_loaded = 0;
725 if (!node->unloaded_info)
726 remove_node(node);
598} 727}
599 728
600/* 729/*
@@ -609,30 +738,18 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
609 node = get_node_by_name(info->filename); 738 node = get_node_by_name(info->filename);
610 switch (action) { 739 switch (action) {
611 case GCOV_ADD: 740 case GCOV_ADD:
612 /* Add new node or revive ghost. */ 741 if (node)
613 if (!node) { 742 add_info(node, info);
743 else
614 add_node(info); 744 add_node(info);
615 break;
616 }
617 if (gcov_persist)
618 revive_node(node, info);
619 else {
620 pr_warning("could not add '%s' (already exists)\n",
621 info->filename);
622 }
623 break; 745 break;
624 case GCOV_REMOVE: 746 case GCOV_REMOVE:
625 /* Remove node or turn into ghost. */ 747 if (node)
626 if (!node) { 748 remove_info(node, info);
749 else {
627 pr_warning("could not remove '%s' (not found)\n", 750 pr_warning("could not remove '%s' (not found)\n",
628 info->filename); 751 info->filename);
629 break;
630 } 752 }
631 if (gcov_persist) {
632 if (!ghost_node(node))
633 break;
634 }
635 remove_node(node);
636 break; 753 break;
637 } 754 }
638 mutex_unlock(&node_lock); 755 mutex_unlock(&node_lock);
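The gcov/fs.c hunks above replace the single info/ghost pointer pair with a loaded_info array plus one unloaded_info slot, so a node can represent several concurrently loaded copies of the same object file. A minimal user-space sketch (hypothetical names and types, not the kernel code) of the bookkeeping add_info()/remove_info() perform: grow the pointer array by one on load, plug the hole with the last entry on unload, and free the array once the final entry is gone.

/*
 * User-space sketch only; struct node and its fields are invented
 * stand-ins for the kernel's gcov_node bookkeeping.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node {
	void **loaded;		/* one entry per loaded data set */
	int num_loaded;
};

static int add_info(struct node *n, void *info)
{
	void **grown = calloc(n->num_loaded + 1, sizeof(*grown));

	if (!grown)
		return -1;	/* old array stays valid on failure */
	if (n->num_loaded)
		memcpy(grown, n->loaded, n->num_loaded * sizeof(*grown));
	grown[n->num_loaded] = info;
	free(n->loaded);	/* replace the old array only on success */
	n->loaded = grown;
	n->num_loaded++;
	return 0;
}

static void remove_info(struct node *n, int i)
{
	n->loaded[i] = n->loaded[n->num_loaded - 1];	/* plug the hole */
	if (--n->num_loaded == 0) {
		free(n->loaded);	/* last loaded data set removed */
		n->loaded = NULL;
	}
}

int main(void)
{
	struct node n = { NULL, 0 };
	int a = 1, b = 2;

	add_info(&n, &a);
	add_info(&n, &b);
	remove_info(&n, 0);	/* &b now occupies index 0 */
	printf("num_loaded = %d\n", n.num_loaded);
	remove_info(&n, 0);
	return 0;
}

As in the kernel version, the replacement array is built before anything is freed, so a failed allocation leaves the node in its previous state.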
diff --git a/kernel/groups.c b/kernel/groups.c
index 53b1916c9492..253dc0f35cf4 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -143,10 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp)
143 right = group_info->ngroups; 143 right = group_info->ngroups;
144 while (left < right) { 144 while (left < right) {
145 unsigned int mid = (left+right)/2; 145 unsigned int mid = (left+right)/2;
146 int cmp = grp - GROUP_AT(group_info, mid); 146 if (grp > GROUP_AT(group_info, mid))
147 if (cmp > 0)
148 left = mid + 1; 147 left = mid + 1;
149 else if (cmp < 0) 148 else if (grp < GROUP_AT(group_info, mid))
150 right = mid; 149 right = mid;
151 else 150 else
152 return 1; 151 return 1;
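The groups.c hunk drops the subtraction-based comparison because gid_t is unsigned: the difference wraps modulo 2^32 and, once narrowed into a signed int, can order a large gid below a small one and derail the binary search. A standalone demonstration with made-up gid values:

/*
 * Demonstration only; the gid values are invented and the behaviour of
 * the narrowing conversion is what typical two's-complement systems do.
 */
#include <stdio.h>

int main(void)
{
	unsigned int grp = 0xc0000000u;	/* a large gid */
	unsigned int mid = 1u;		/* a small gid */
	int cmp = grp - mid;		/* wraps to 0xbfffffff -> negative */

	printf("subtraction: grp %s mid (cmp = %d)\n",
	       cmp > 0 ? ">" : "<", cmp);
	printf("direct compare: grp %s mid\n", grp > mid ? ">" : "<");
	return 0;
}

The plain < / > comparisons in the new code never leave the unsigned domain, so the ordering is always correct.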
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ce669174f355..72206cf5c6cf 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -931,6 +931,7 @@ static inline int
931remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) 931remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
932{ 932{
933 if (hrtimer_is_queued(timer)) { 933 if (hrtimer_is_queued(timer)) {
934 unsigned long state;
934 int reprogram; 935 int reprogram;
935 936
936 /* 937 /*
@@ -944,8 +945,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
944 debug_deactivate(timer); 945 debug_deactivate(timer);
945 timer_stats_hrtimer_clear_start_info(timer); 946 timer_stats_hrtimer_clear_start_info(timer);
946 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 947 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
947 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 948 /*
948 reprogram); 949 * We must preserve the CALLBACK state flag here,
950 * otherwise we could move the timer base in
951 * switch_hrtimer_base.
952 */
953 state = timer->state & HRTIMER_STATE_CALLBACK;
954 __remove_hrtimer(timer, base, state, reprogram);
949 return 1; 955 return 1;
950 } 956 }
951 return 0; 957 return 0;
@@ -1091,11 +1097,10 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
1091 */ 1097 */
1092ktime_t hrtimer_get_remaining(const struct hrtimer *timer) 1098ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
1093{ 1099{
1094 struct hrtimer_clock_base *base;
1095 unsigned long flags; 1100 unsigned long flags;
1096 ktime_t rem; 1101 ktime_t rem;
1097 1102
1098 base = lock_hrtimer_base(timer, &flags); 1103 lock_hrtimer_base(timer, &flags);
1099 rem = hrtimer_expires_remaining(timer); 1104 rem = hrtimer_expires_remaining(timer);
1100 unlock_hrtimer_base(timer, &flags); 1105 unlock_hrtimer_base(timer, &flags);
1101 1106
@@ -1232,6 +1237,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1232 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); 1237 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1233 enqueue_hrtimer(timer, base); 1238 enqueue_hrtimer(timer, base);
1234 } 1239 }
1240
1241 WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
1242
1235 timer->state &= ~HRTIMER_STATE_CALLBACK; 1243 timer->state &= ~HRTIMER_STATE_CALLBACK;
1236} 1244}
1237 1245
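The first hrtimer.c hunk preserves HRTIMER_STATE_CALLBACK across __remove_hrtimer() so that a timer whose handler is still running cannot be migrated to another base by switch_hrtimer_base(). A reduced sketch of the masking idiom, with stand-in flag values rather than the kernel's:

/* Sketch only; flag names and values are illustrative stand-ins. */
#include <stdio.h>

#define STATE_INACTIVE	0x00UL
#define STATE_ENQUEUED	0x01UL
#define STATE_CALLBACK	0x02UL	/* handler currently running */

static unsigned long remove_timer(unsigned long state)
{
	/* old behaviour: return STATE_INACTIVE;  -- drops CALLBACK too */
	return state & STATE_CALLBACK;	/* new: keep CALLBACK, clear rest */
}

int main(void)
{
	unsigned long state = STATE_ENQUEUED | STATE_CALLBACK;

	printf("state after removal: %#lx\n", remove_timer(state));
	return 0;
}

The WARN_ON_ONCE() added to __run_hrtimer() then flags any path that loses the bit before the handler returns.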
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 0c642d51aac2..53ead174da2f 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
99 " disables this message.\n"); 99 " disables this message.\n");
100 sched_show_task(t); 100 sched_show_task(t);
101 __debug_show_held_locks(t); 101 debug_show_held_locks(t);
102 102
103 touch_nmi_watchdog(); 103 touch_nmi_watchdog();
104 104
@@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
111 * periodically exit the critical section and enter a new one. 111 * periodically exit the critical section and enter a new one.
112 * 112 *
113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order 113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
114 * exit the grace period. For classic RCU, a reschedule is required. 114 * to exit the grace period. For classic RCU, a reschedule is required.
115 */ 115 */
116static void rcu_lock_break(struct task_struct *g, struct task_struct *t) 116static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
117{ 117{
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index d71a987fd2bf..2c9120f0afca 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -113,12 +113,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
113 */ 113 */
114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) 114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
115{ 115{
116 struct perf_event_context *ctx = bp->ctx; 116 struct task_struct *tsk = bp->hw.bp_target;
117 struct perf_event *iter; 117 struct perf_event *iter;
118 int count = 0; 118 int count = 0;
119 119
120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) { 120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
121 if (iter->ctx == ctx && find_slot_idx(iter) == type) 121 if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
122 count += hw_breakpoint_weight(iter); 122 count += hw_breakpoint_weight(iter);
123 } 123 }
124 124
@@ -134,7 +134,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
134 enum bp_type_idx type) 134 enum bp_type_idx type)
135{ 135{
136 int cpu = bp->cpu; 136 int cpu = bp->cpu;
137 struct task_struct *tsk = bp->ctx->task; 137 struct task_struct *tsk = bp->hw.bp_target;
138 138
139 if (cpu >= 0) { 139 if (cpu >= 0) {
140 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); 140 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -213,7 +213,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
213 int weight) 213 int weight)
214{ 214{
215 int cpu = bp->cpu; 215 int cpu = bp->cpu;
216 struct task_struct *tsk = bp->ctx->task; 216 struct task_struct *tsk = bp->hw.bp_target;
217 217
218 /* Pinned counter cpu profiling */ 218 /* Pinned counter cpu profiling */
219 if (!tsk) { 219 if (!tsk) {
@@ -433,7 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
433 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
434 struct task_struct *tsk) 434 struct task_struct *tsk)
435{ 435{
436 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); 436 return perf_event_create_kernel_counter(attr, -1, tsk, triggered);
437} 437}
438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
439 439
@@ -515,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
515 get_online_cpus(); 515 get_online_cpus();
516 for_each_online_cpu(cpu) { 516 for_each_online_cpu(cpu) {
517 pevent = per_cpu_ptr(cpu_events, cpu); 517 pevent = per_cpu_ptr(cpu_events, cpu);
518 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); 518 bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered);
519 519
520 *pevent = bp; 520 *pevent = bp;
521 521
@@ -565,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
565 .priority = 0x7fffffff 565 .priority = 0x7fffffff
566}; 566};
567 567
568static void bp_perf_event_destroy(struct perf_event *event)
569{
570 release_bp_slot(event);
571}
572
573static int hw_breakpoint_event_init(struct perf_event *bp)
574{
575 int err;
576
577 if (bp->attr.type != PERF_TYPE_BREAKPOINT)
578 return -ENOENT;
579
580 err = register_perf_hw_breakpoint(bp);
581 if (err)
582 return err;
583
584 bp->destroy = bp_perf_event_destroy;
585
586 return 0;
587}
588
589static int hw_breakpoint_add(struct perf_event *bp, int flags)
590{
591 if (!(flags & PERF_EF_START))
592 bp->hw.state = PERF_HES_STOPPED;
593
594 return arch_install_hw_breakpoint(bp);
595}
596
597static void hw_breakpoint_del(struct perf_event *bp, int flags)
598{
599 arch_uninstall_hw_breakpoint(bp);
600}
601
602static void hw_breakpoint_start(struct perf_event *bp, int flags)
603{
604 bp->hw.state = 0;
605}
606
607static void hw_breakpoint_stop(struct perf_event *bp, int flags)
608{
609 bp->hw.state = PERF_HES_STOPPED;
610}
611
612static struct pmu perf_breakpoint = {
613 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
614
615 .event_init = hw_breakpoint_event_init,
616 .add = hw_breakpoint_add,
617 .del = hw_breakpoint_del,
618 .start = hw_breakpoint_start,
619 .stop = hw_breakpoint_stop,
620 .read = hw_breakpoint_pmu_read,
621};
622
568static int __init init_hw_breakpoint(void) 623static int __init init_hw_breakpoint(void)
569{ 624{
570 unsigned int **task_bp_pinned; 625 unsigned int **task_bp_pinned;
@@ -586,6 +641,8 @@ static int __init init_hw_breakpoint(void)
586 641
587 constraints_initialized = 1; 642 constraints_initialized = 1;
588 643
644 perf_pmu_register(&perf_breakpoint);
645
589 return register_die_notifier(&hw_breakpoint_exceptions_nb); 646 return register_die_notifier(&hw_breakpoint_exceptions_nb);
590 647
591 err_alloc: 648 err_alloc:
@@ -601,8 +658,3 @@ static int __init init_hw_breakpoint(void)
601core_initcall(init_hw_breakpoint); 658core_initcall(init_hw_breakpoint);
602 659
603 660
604struct pmu perf_ops_bp = {
605 .enable = arch_install_hw_breakpoint,
606 .disable = arch_uninstall_hw_breakpoint,
607 .read = hw_breakpoint_pmu_read,
608};
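The hw_breakpoint.c change retires the two-callback perf_ops_bp in favour of a full struct pmu registered from init_hw_breakpoint(). Conceptually, the old enable/disable pair splits into add/del (claim or release the slot) plus start/stop (gate the counting), with add() honouring a "start immediately" flag. A user-space sketch of that mapping, with illustrative names and flag values:

/* Sketch only; EF_START/HES_STOPPED are stand-ins, not the perf ABI. */
#include <stdio.h>

#define EF_START	0x01U
#define HES_STOPPED	0x01U

struct event {
	unsigned int state;
	int installed;
};

static int bp_add(struct event *e, int flags)
{
	if (!(flags & EF_START))
		e->state = HES_STOPPED;	/* scheduled in, not yet counting */
	e->installed = 1;		/* roughly the old "enable" */
	return 0;
}

static void bp_del(struct event *e, int flags)
{
	e->installed = 0;		/* roughly the old "disable" */
}

static void bp_start(struct event *e, int flags)
{
	e->state = 0;			/* resume counting */
}

static void bp_stop(struct event *e, int flags)
{
	e->state = HES_STOPPED;		/* pause without uninstalling */
}

int main(void)
{
	struct event e = { 0, 0 };

	bp_add(&e, 0);			/* added in the stopped state */
	bp_start(&e, 0);
	printf("installed=%d state=%u\n", e.installed, e.state);
	bp_stop(&e, 0);
	bp_del(&e, 0);
	return 0;
}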
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
new file mode 100644
index 000000000000..31d766bf5d2e
--- /dev/null
+++ b/kernel/irq/Kconfig
@@ -0,0 +1,53 @@
1config HAVE_GENERIC_HARDIRQS
2 def_bool n
3
4if HAVE_GENERIC_HARDIRQS
5menu "IRQ subsystem"
6#
7# Interrupt subsystem related configuration options
8#
9config GENERIC_HARDIRQS
10 def_bool y
11
12config GENERIC_HARDIRQS_NO__DO_IRQ
13 def_bool y
14
15# Select this to disable the deprecated stuff
16config GENERIC_HARDIRQS_NO_DEPRECATED
17 def_bool n
18
19# Options selectable by the architecture code
20config HAVE_SPARSE_IRQ
21 def_bool n
22
23config GENERIC_IRQ_PROBE
24 def_bool n
25
26config GENERIC_PENDING_IRQ
27 def_bool n
28
29config AUTO_IRQ_AFFINITY
30 def_bool n
31
32config IRQ_PER_CPU
33 def_bool n
34
35config HARDIRQS_SW_RESEND
36 def_bool n
37
38config SPARSE_IRQ
39 bool "Support sparse irq numbering"
40 depends on HAVE_SPARSE_IRQ
41 ---help---
42
43 Sparse irq numbering is useful for distro kernels that want
44 to define a high CONFIG_NR_CPUS value but still want to have
45 low kernel memory footprint on smaller machines.
46
47 ( Sparse irqs can also be beneficial on NUMA boxes, as they spread
48 out the interrupt descriptors in a more NUMA-friendly way. )
49
50 If you don't know what to do here, say N.
51
52endmenu
53endif
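The new SPARSE_IRQ option trades a statically sized descriptor table for on-demand allocation; the kernel side keys a radix tree by irq number, as the handle.c and irqdesc.c hunks below show. A simplified user-space sketch of the memory trade-off, using a flat pointer table in place of the radix tree and invented sizes:

/* Sketch only; NR_IRQS and struct demo_desc are illustrative. */
#include <stdio.h>
#include <stdlib.h>

#define NR_IRQS 4096

struct demo_desc {
	int irq;
	char state[200];		/* stand-in for the real descriptor */
};

static struct demo_desc dense[NR_IRQS];	/* dense: everything up front */
static struct demo_desc *sparse[NR_IRQS];	/* sparse: allocate on use */

static struct demo_desc *sparse_get(int irq)
{
	if (!sparse[irq]) {
		sparse[irq] = calloc(1, sizeof(*sparse[irq]));
		if (sparse[irq])
			sparse[irq]->irq = irq;
	}
	return sparse[irq];
}

int main(void)
{
	sparse_get(16);		/* only descriptors in use cost memory */
	printf("dense:  %zu bytes regardless of how many irqs exist\n",
	       sizeof(dense));
	printf("sparse: %zu bytes of slots + %zu per irq actually used\n",
	       sizeof(sparse), sizeof(struct demo_desc));
	return 0;
}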
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 7d047808419d..54329cd7b3ee 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,7 +1,6 @@
1 1
2obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o 2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
7obj-$(CONFIG_PM_SLEEP) += pm.o 6obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 2295a31ef110..505798f86c36 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -57,9 +57,10 @@ unsigned long probe_irq_on(void)
57 * Some chips need to know about probing in 57 * Some chips need to know about probing in
58 * progress: 58 * progress:
59 */ 59 */
60 if (desc->chip->set_type) 60 if (desc->irq_data.chip->irq_set_type)
61 desc->chip->set_type(i, IRQ_TYPE_PROBE); 61 desc->irq_data.chip->irq_set_type(&desc->irq_data,
62 desc->chip->startup(i); 62 IRQ_TYPE_PROBE);
63 desc->irq_data.chip->irq_startup(&desc->irq_data);
63 } 64 }
64 raw_spin_unlock_irq(&desc->lock); 65 raw_spin_unlock_irq(&desc->lock);
65 } 66 }
@@ -76,7 +77,7 @@ unsigned long probe_irq_on(void)
76 raw_spin_lock_irq(&desc->lock); 77 raw_spin_lock_irq(&desc->lock);
77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 78 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 79 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
79 if (desc->chip->startup(i)) 80 if (desc->irq_data.chip->irq_startup(&desc->irq_data))
80 desc->status |= IRQ_PENDING; 81 desc->status |= IRQ_PENDING;
81 } 82 }
82 raw_spin_unlock_irq(&desc->lock); 83 raw_spin_unlock_irq(&desc->lock);
@@ -98,7 +99,7 @@ unsigned long probe_irq_on(void)
98 /* It triggered already - consider it spurious. */ 99 /* It triggered already - consider it spurious. */
99 if (!(status & IRQ_WAITING)) { 100 if (!(status & IRQ_WAITING)) {
100 desc->status = status & ~IRQ_AUTODETECT; 101 desc->status = status & ~IRQ_AUTODETECT;
101 desc->chip->shutdown(i); 102 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
102 } else 103 } else
103 if (i < 32) 104 if (i < 32)
104 mask |= 1 << i; 105 mask |= 1 << i;
@@ -137,7 +138,7 @@ unsigned int probe_irq_mask(unsigned long val)
137 mask |= 1 << i; 138 mask |= 1 << i;
138 139
139 desc->status = status & ~IRQ_AUTODETECT; 140 desc->status = status & ~IRQ_AUTODETECT;
140 desc->chip->shutdown(i); 141 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
141 } 142 }
142 raw_spin_unlock_irq(&desc->lock); 143 raw_spin_unlock_irq(&desc->lock);
143 } 144 }
@@ -181,7 +182,7 @@ int probe_irq_off(unsigned long val)
181 nr_of_irqs++; 182 nr_of_irqs++;
182 } 183 }
183 desc->status = status & ~IRQ_AUTODETECT; 184 desc->status = status & ~IRQ_AUTODETECT;
184 desc->chip->shutdown(i); 185 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
185 } 186 }
186 raw_spin_unlock_irq(&desc->lock); 187 raw_spin_unlock_irq(&desc->lock);
187 } 188 }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b7091d5ca2f8..baa5c4acad83 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,108 +18,6 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22{
23 struct irq_desc *desc;
24 unsigned long flags;
25
26 desc = irq_to_desc(irq);
27 if (!desc) {
28 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
29 return;
30 }
31
32 /* Ensure we don't have left over values from a previous use of this irq */
33 raw_spin_lock_irqsave(&desc->lock, flags);
34 desc->status = IRQ_DISABLED;
35 desc->chip = &no_irq_chip;
36 desc->handle_irq = handle_bad_irq;
37 desc->depth = 1;
38 desc->msi_desc = NULL;
39 desc->handler_data = NULL;
40 if (!keep_chip_data)
41 desc->chip_data = NULL;
42 desc->action = NULL;
43 desc->irq_count = 0;
44 desc->irqs_unhandled = 0;
45#ifdef CONFIG_SMP
46 cpumask_setall(desc->affinity);
47#ifdef CONFIG_GENERIC_PENDING_IRQ
48 cpumask_clear(desc->pending_mask);
49#endif
50#endif
51 raw_spin_unlock_irqrestore(&desc->lock, flags);
52}
53
54/**
55 * dynamic_irq_init - initialize a dynamically allocated irq
56 * @irq: irq number to initialize
57 */
58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
75{
76 struct irq_desc *desc = irq_to_desc(irq);
77 unsigned long flags;
78
79 if (!desc) {
80 WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
81 return;
82 }
83
84 raw_spin_lock_irqsave(&desc->lock, flags);
85 if (desc->action) {
86 raw_spin_unlock_irqrestore(&desc->lock, flags);
87 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
88 irq);
89 return;
90 }
91 desc->msi_desc = NULL;
92 desc->handler_data = NULL;
93 if (!keep_chip_data)
94 desc->chip_data = NULL;
95 desc->handle_irq = handle_bad_irq;
96 desc->chip = &no_irq_chip;
97 desc->name = NULL;
98 clear_kstat_irqs(desc);
99 raw_spin_unlock_irqrestore(&desc->lock, flags);
100}
101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to initialize
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to initialize
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
120}
121
122
123/** 21/**
124 * set_irq_chip - set the irq chip for an irq 22 * set_irq_chip - set the irq chip for an irq
125 * @irq: irq number 23 * @irq: irq number
@@ -140,7 +38,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
140 38
141 raw_spin_lock_irqsave(&desc->lock, flags); 39 raw_spin_lock_irqsave(&desc->lock, flags);
142 irq_chip_set_defaults(chip); 40 irq_chip_set_defaults(chip);
143 desc->chip = chip; 41 desc->irq_data.chip = chip;
144 raw_spin_unlock_irqrestore(&desc->lock, flags); 42 raw_spin_unlock_irqrestore(&desc->lock, flags);
145 43
146 return 0; 44 return 0;
@@ -193,7 +91,7 @@ int set_irq_data(unsigned int irq, void *data)
193 } 91 }
194 92
195 raw_spin_lock_irqsave(&desc->lock, flags); 93 raw_spin_lock_irqsave(&desc->lock, flags);
196 desc->handler_data = data; 94 desc->irq_data.handler_data = data;
197 raw_spin_unlock_irqrestore(&desc->lock, flags); 95 raw_spin_unlock_irqrestore(&desc->lock, flags);
198 return 0; 96 return 0;
199} 97}
@@ -218,7 +116,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
218 } 116 }
219 117
220 raw_spin_lock_irqsave(&desc->lock, flags); 118 raw_spin_lock_irqsave(&desc->lock, flags);
221 desc->msi_desc = entry; 119 desc->irq_data.msi_desc = entry;
222 if (entry) 120 if (entry)
223 entry->irq = irq; 121 entry->irq = irq;
224 raw_spin_unlock_irqrestore(&desc->lock, flags); 122 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -243,19 +141,27 @@ int set_irq_chip_data(unsigned int irq, void *data)
243 return -EINVAL; 141 return -EINVAL;
244 } 142 }
245 143
246 if (!desc->chip) { 144 if (!desc->irq_data.chip) {
247 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); 145 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
248 return -EINVAL; 146 return -EINVAL;
249 } 147 }
250 148
251 raw_spin_lock_irqsave(&desc->lock, flags); 149 raw_spin_lock_irqsave(&desc->lock, flags);
252 desc->chip_data = data; 150 desc->irq_data.chip_data = data;
253 raw_spin_unlock_irqrestore(&desc->lock, flags); 151 raw_spin_unlock_irqrestore(&desc->lock, flags);
254 152
255 return 0; 153 return 0;
256} 154}
257EXPORT_SYMBOL(set_irq_chip_data); 155EXPORT_SYMBOL(set_irq_chip_data);
258 156
157struct irq_data *irq_get_irq_data(unsigned int irq)
158{
159 struct irq_desc *desc = irq_to_desc(irq);
160
161 return desc ? &desc->irq_data : NULL;
162}
163EXPORT_SYMBOL_GPL(irq_get_irq_data);
164
259/** 165/**
260 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq 166 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
261 * 167 *
@@ -287,93 +193,216 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread);
287/* 193/*
288 * default enable function 194 * default enable function
289 */ 195 */
290static void default_enable(unsigned int irq) 196static void default_enable(struct irq_data *data)
291{ 197{
292 struct irq_desc *desc = irq_to_desc(irq); 198 struct irq_desc *desc = irq_data_to_desc(data);
293 199
294 desc->chip->unmask(irq); 200 desc->irq_data.chip->irq_unmask(&desc->irq_data);
295 desc->status &= ~IRQ_MASKED; 201 desc->status &= ~IRQ_MASKED;
296} 202}
297 203
298/* 204/*
299 * default disable function 205 * default disable function
300 */ 206 */
301static void default_disable(unsigned int irq) 207static void default_disable(struct irq_data *data)
302{ 208{
303} 209}
304 210
305/* 211/*
306 * default startup function 212 * default startup function
307 */ 213 */
308static unsigned int default_startup(unsigned int irq) 214static unsigned int default_startup(struct irq_data *data)
309{ 215{
310 struct irq_desc *desc = irq_to_desc(irq); 216 struct irq_desc *desc = irq_data_to_desc(data);
311 217
312 desc->chip->enable(irq); 218 desc->irq_data.chip->irq_enable(data);
313 return 0; 219 return 0;
314} 220}
315 221
316/* 222/*
317 * default shutdown function 223 * default shutdown function
318 */ 224 */
319static void default_shutdown(unsigned int irq) 225static void default_shutdown(struct irq_data *data)
320{ 226{
321 struct irq_desc *desc = irq_to_desc(irq); 227 struct irq_desc *desc = irq_data_to_desc(data);
322 228
323 desc->chip->mask(irq); 229 desc->irq_data.chip->irq_mask(&desc->irq_data);
324 desc->status |= IRQ_MASKED; 230 desc->status |= IRQ_MASKED;
325} 231}
326 232
233#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
234/* Temporary migration helpers */
235static void compat_irq_mask(struct irq_data *data)
236{
237 data->chip->mask(data->irq);
238}
239
240static void compat_irq_unmask(struct irq_data *data)
241{
242 data->chip->unmask(data->irq);
243}
244
245static void compat_irq_ack(struct irq_data *data)
246{
247 data->chip->ack(data->irq);
248}
249
250static void compat_irq_mask_ack(struct irq_data *data)
251{
252 data->chip->mask_ack(data->irq);
253}
254
255static void compat_irq_eoi(struct irq_data *data)
256{
257 data->chip->eoi(data->irq);
258}
259
260static void compat_irq_enable(struct irq_data *data)
261{
262 data->chip->enable(data->irq);
263}
264
265static void compat_irq_disable(struct irq_data *data)
266{
267 data->chip->disable(data->irq);
268}
269
270static void compat_irq_shutdown(struct irq_data *data)
271{
272 data->chip->shutdown(data->irq);
273}
274
275static unsigned int compat_irq_startup(struct irq_data *data)
276{
277 return data->chip->startup(data->irq);
278}
279
280static int compat_irq_set_affinity(struct irq_data *data,
281 const struct cpumask *dest, bool force)
282{
283 return data->chip->set_affinity(data->irq, dest);
284}
285
286static int compat_irq_set_type(struct irq_data *data, unsigned int type)
287{
288 return data->chip->set_type(data->irq, type);
289}
290
291static int compat_irq_set_wake(struct irq_data *data, unsigned int on)
292{
293 return data->chip->set_wake(data->irq, on);
294}
295
296static int compat_irq_retrigger(struct irq_data *data)
297{
298 return data->chip->retrigger(data->irq);
299}
300
301static void compat_bus_lock(struct irq_data *data)
302{
303 data->chip->bus_lock(data->irq);
304}
305
306static void compat_bus_sync_unlock(struct irq_data *data)
307{
308 data->chip->bus_sync_unlock(data->irq);
309}
310#endif
311
327/* 312/*
328 * Fixup enable/disable function pointers 313 * Fixup enable/disable function pointers
329 */ 314 */
330void irq_chip_set_defaults(struct irq_chip *chip) 315void irq_chip_set_defaults(struct irq_chip *chip)
331{ 316{
332 if (!chip->enable) 317#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
333 chip->enable = default_enable;
334 if (!chip->disable)
335 chip->disable = default_disable;
336 if (!chip->startup)
337 chip->startup = default_startup;
338 /* 318 /*
339 * We use chip->disable, when the user provided its own. When 319 * Compat fixup functions need to be before we set the
340 * we have default_disable set for chip->disable, then we need 320 * defaults for enable/disable/startup/shutdown
321 */
322 if (chip->enable)
323 chip->irq_enable = compat_irq_enable;
324 if (chip->disable)
325 chip->irq_disable = compat_irq_disable;
326 if (chip->shutdown)
327 chip->irq_shutdown = compat_irq_shutdown;
328 if (chip->startup)
329 chip->irq_startup = compat_irq_startup;
330#endif
331 /*
332 * The real defaults
333 */
334 if (!chip->irq_enable)
335 chip->irq_enable = default_enable;
336 if (!chip->irq_disable)
337 chip->irq_disable = default_disable;
338 if (!chip->irq_startup)
339 chip->irq_startup = default_startup;
340 /*
341 * We use chip->irq_disable, when the user provided its own. When
342 * we have default_disable set for chip->irq_disable, then we need
341 * to use default_shutdown, otherwise the irq line is not 343 * to use default_shutdown, otherwise the irq line is not
342 * disabled on free_irq(): 344 * disabled on free_irq():
343 */ 345 */
344 if (!chip->shutdown) 346 if (!chip->irq_shutdown)
345 chip->shutdown = chip->disable != default_disable ? 347 chip->irq_shutdown = chip->irq_disable != default_disable ?
346 chip->disable : default_shutdown; 348 chip->irq_disable : default_shutdown;
347 if (!chip->name) 349
348 chip->name = chip->typename; 350#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
349 if (!chip->end) 351 if (!chip->end)
350 chip->end = dummy_irq_chip.end; 352 chip->end = dummy_irq_chip.end;
353
354 /*
355 * Now fix up the remaining compat handlers
356 */
357 if (chip->bus_lock)
358 chip->irq_bus_lock = compat_bus_lock;
359 if (chip->bus_sync_unlock)
360 chip->irq_bus_sync_unlock = compat_bus_sync_unlock;
361 if (chip->mask)
362 chip->irq_mask = compat_irq_mask;
363 if (chip->unmask)
364 chip->irq_unmask = compat_irq_unmask;
365 if (chip->ack)
366 chip->irq_ack = compat_irq_ack;
367 if (chip->mask_ack)
368 chip->irq_mask_ack = compat_irq_mask_ack;
369 if (chip->eoi)
370 chip->irq_eoi = compat_irq_eoi;
371 if (chip->set_affinity)
372 chip->irq_set_affinity = compat_irq_set_affinity;
373 if (chip->set_type)
374 chip->irq_set_type = compat_irq_set_type;
375 if (chip->set_wake)
376 chip->irq_set_wake = compat_irq_set_wake;
377 if (chip->retrigger)
378 chip->irq_retrigger = compat_irq_retrigger;
379#endif
351} 380}
352 381
353static inline void mask_ack_irq(struct irq_desc *desc, int irq) 382static inline void mask_ack_irq(struct irq_desc *desc)
354{ 383{
355 if (desc->chip->mask_ack) 384 if (desc->irq_data.chip->irq_mask_ack)
356 desc->chip->mask_ack(irq); 385 desc->irq_data.chip->irq_mask_ack(&desc->irq_data);
357 else { 386 else {
358 desc->chip->mask(irq); 387 desc->irq_data.chip->irq_mask(&desc->irq_data);
359 if (desc->chip->ack) 388 if (desc->irq_data.chip->irq_ack)
360 desc->chip->ack(irq); 389 desc->irq_data.chip->irq_ack(&desc->irq_data);
361 } 390 }
362 desc->status |= IRQ_MASKED; 391 desc->status |= IRQ_MASKED;
363} 392}
364 393
365static inline void mask_irq(struct irq_desc *desc, int irq) 394static inline void mask_irq(struct irq_desc *desc)
366{ 395{
367 if (desc->chip->mask) { 396 if (desc->irq_data.chip->irq_mask) {
368 desc->chip->mask(irq); 397 desc->irq_data.chip->irq_mask(&desc->irq_data);
369 desc->status |= IRQ_MASKED; 398 desc->status |= IRQ_MASKED;
370 } 399 }
371} 400}
372 401
373static inline void unmask_irq(struct irq_desc *desc, int irq) 402static inline void unmask_irq(struct irq_desc *desc)
374{ 403{
375 if (desc->chip->unmask) { 404 if (desc->irq_data.chip->irq_unmask) {
376 desc->chip->unmask(irq); 405 desc->irq_data.chip->irq_unmask(&desc->irq_data);
377 desc->status &= ~IRQ_MASKED; 406 desc->status &= ~IRQ_MASKED;
378 } 407 }
379} 408}
@@ -476,7 +505,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
476 irqreturn_t action_ret; 505 irqreturn_t action_ret;
477 506
478 raw_spin_lock(&desc->lock); 507 raw_spin_lock(&desc->lock);
479 mask_ack_irq(desc, irq); 508 mask_ack_irq(desc);
480 509
481 if (unlikely(desc->status & IRQ_INPROGRESS)) 510 if (unlikely(desc->status & IRQ_INPROGRESS))
482 goto out_unlock; 511 goto out_unlock;
@@ -502,7 +531,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
502 desc->status &= ~IRQ_INPROGRESS; 531 desc->status &= ~IRQ_INPROGRESS;
503 532
504 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) 533 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
505 unmask_irq(desc, irq); 534 unmask_irq(desc);
506out_unlock: 535out_unlock:
507 raw_spin_unlock(&desc->lock); 536 raw_spin_unlock(&desc->lock);
508} 537}
@@ -539,7 +568,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
539 action = desc->action; 568 action = desc->action;
540 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 569 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
541 desc->status |= IRQ_PENDING; 570 desc->status |= IRQ_PENDING;
542 mask_irq(desc, irq); 571 mask_irq(desc);
543 goto out; 572 goto out;
544 } 573 }
545 574
@@ -554,7 +583,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
554 raw_spin_lock(&desc->lock); 583 raw_spin_lock(&desc->lock);
555 desc->status &= ~IRQ_INPROGRESS; 584 desc->status &= ~IRQ_INPROGRESS;
556out: 585out:
557 desc->chip->eoi(irq); 586 desc->irq_data.chip->irq_eoi(&desc->irq_data);
558 587
559 raw_spin_unlock(&desc->lock); 588 raw_spin_unlock(&desc->lock);
560} 589}
@@ -590,14 +619,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
590 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || 619 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
591 !desc->action)) { 620 !desc->action)) {
592 desc->status |= (IRQ_PENDING | IRQ_MASKED); 621 desc->status |= (IRQ_PENDING | IRQ_MASKED);
593 mask_ack_irq(desc, irq); 622 mask_ack_irq(desc);
594 goto out_unlock; 623 goto out_unlock;
595 } 624 }
596 kstat_incr_irqs_this_cpu(irq, desc); 625 kstat_incr_irqs_this_cpu(irq, desc);
597 626
598 /* Start handling the irq */ 627 /* Start handling the irq */
599 if (desc->chip->ack) 628 desc->irq_data.chip->irq_ack(&desc->irq_data);
600 desc->chip->ack(irq);
601 629
602 /* Mark the IRQ currently in progress.*/ 630 /* Mark the IRQ currently in progress.*/
603 desc->status |= IRQ_INPROGRESS; 631 desc->status |= IRQ_INPROGRESS;
@@ -607,7 +635,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
607 irqreturn_t action_ret; 635 irqreturn_t action_ret;
608 636
609 if (unlikely(!action)) { 637 if (unlikely(!action)) {
610 mask_irq(desc, irq); 638 mask_irq(desc);
611 goto out_unlock; 639 goto out_unlock;
612 } 640 }
613 641
@@ -619,7 +647,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
619 if (unlikely((desc->status & 647 if (unlikely((desc->status &
620 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 648 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
621 (IRQ_PENDING | IRQ_MASKED))) { 649 (IRQ_PENDING | IRQ_MASKED))) {
622 unmask_irq(desc, irq); 650 unmask_irq(desc);
623 } 651 }
624 652
625 desc->status &= ~IRQ_PENDING; 653 desc->status &= ~IRQ_PENDING;
@@ -650,15 +678,15 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
650 678
651 kstat_incr_irqs_this_cpu(irq, desc); 679 kstat_incr_irqs_this_cpu(irq, desc);
652 680
653 if (desc->chip->ack) 681 if (desc->irq_data.chip->irq_ack)
654 desc->chip->ack(irq); 682 desc->irq_data.chip->irq_ack(&desc->irq_data);
655 683
656 action_ret = handle_IRQ_event(irq, desc->action); 684 action_ret = handle_IRQ_event(irq, desc->action);
657 if (!noirqdebug) 685 if (!noirqdebug)
658 note_interrupt(irq, desc, action_ret); 686 note_interrupt(irq, desc, action_ret);
659 687
660 if (desc->chip->eoi) 688 if (desc->irq_data.chip->irq_eoi)
661 desc->chip->eoi(irq); 689 desc->irq_data.chip->irq_eoi(&desc->irq_data);
662} 690}
663 691
664void 692void
@@ -676,7 +704,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
676 704
677 if (!handle) 705 if (!handle)
678 handle = handle_bad_irq; 706 handle = handle_bad_irq;
679 else if (desc->chip == &no_irq_chip) { 707 else if (desc->irq_data.chip == &no_irq_chip) {
680 printk(KERN_WARNING "Trying to install %sinterrupt handler " 708 printk(KERN_WARNING "Trying to install %sinterrupt handler "
681 "for IRQ%d\n", is_chained ? "chained " : "", irq); 709 "for IRQ%d\n", is_chained ? "chained " : "", irq);
682 /* 710 /*
@@ -686,16 +714,16 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
686 * prevent us to setup the interrupt at all. Switch it to 714 * prevent us to setup the interrupt at all. Switch it to
687 * dummy_irq_chip for easy transition. 715 * dummy_irq_chip for easy transition.
688 */ 716 */
689 desc->chip = &dummy_irq_chip; 717 desc->irq_data.chip = &dummy_irq_chip;
690 } 718 }
691 719
692 chip_bus_lock(irq, desc); 720 chip_bus_lock(desc);
693 raw_spin_lock_irqsave(&desc->lock, flags); 721 raw_spin_lock_irqsave(&desc->lock, flags);
694 722
695 /* Uninstall? */ 723 /* Uninstall? */
696 if (handle == handle_bad_irq) { 724 if (handle == handle_bad_irq) {
697 if (desc->chip != &no_irq_chip) 725 if (desc->irq_data.chip != &no_irq_chip)
698 mask_ack_irq(desc, irq); 726 mask_ack_irq(desc);
699 desc->status |= IRQ_DISABLED; 727 desc->status |= IRQ_DISABLED;
700 desc->depth = 1; 728 desc->depth = 1;
701 } 729 }
@@ -706,10 +734,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
706 desc->status &= ~IRQ_DISABLED; 734 desc->status &= ~IRQ_DISABLED;
707 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; 735 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
708 desc->depth = 0; 736 desc->depth = 0;
709 desc->chip->startup(irq); 737 desc->irq_data.chip->irq_startup(&desc->irq_data);
710 } 738 }
711 raw_spin_unlock_irqrestore(&desc->lock, flags); 739 raw_spin_unlock_irqrestore(&desc->lock, flags);
712 chip_bus_sync_unlock(irq, desc); 740 chip_bus_sync_unlock(desc);
713} 741}
714EXPORT_SYMBOL_GPL(__set_irq_handler); 742EXPORT_SYMBOL_GPL(__set_irq_handler);
715 743
@@ -729,32 +757,20 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
729 __set_irq_handler(irq, handle, 0, name); 757 __set_irq_handler(irq, handle, 0, name);
730} 758}
731 759
732void set_irq_noprobe(unsigned int irq) 760void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
733{ 761{
734 struct irq_desc *desc = irq_to_desc(irq); 762 struct irq_desc *desc = irq_to_desc(irq);
735 unsigned long flags; 763 unsigned long flags;
736 764
737 if (!desc) { 765 if (!desc)
738 printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
739 return; 766 return;
740 }
741
742 raw_spin_lock_irqsave(&desc->lock, flags);
743 desc->status |= IRQ_NOPROBE;
744 raw_spin_unlock_irqrestore(&desc->lock, flags);
745}
746
747void set_irq_probe(unsigned int irq)
748{
749 struct irq_desc *desc = irq_to_desc(irq);
750 unsigned long flags;
751 767
752 if (!desc) { 768 /* Sanitize flags */
753 printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); 769 set &= IRQF_MODIFY_MASK;
754 return; 770 clr &= IRQF_MODIFY_MASK;
755 }
756 771
757 raw_spin_lock_irqsave(&desc->lock, flags); 772 raw_spin_lock_irqsave(&desc->lock, flags);
758 desc->status &= ~IRQ_NOPROBE; 773 desc->status &= ~clr;
774 desc->status |= set;
759 raw_spin_unlock_irqrestore(&desc->lock, flags); 775 raw_spin_unlock_irqrestore(&desc->lock, flags);
760} 776}
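The bulk of the chip.c change moves every chip callback from taking a bare irq number to taking a struct irq_data, with compat_* trampolines keeping unconverted chips working until an architecture selects GENERIC_HARDIRQS_NO_DEPRECATED. A user-space model of that trampoline scheme, with invented names standing in for the kernel types:

/* Model only; struct data/struct chip are stand-ins for irq_data/irq_chip. */
#include <stdio.h>

struct chip;

struct data {
	int irq;
	struct chip *chip;
};

struct chip {
	void (*mask)(int irq);			/* old interface */
	void (*irq_mask)(struct data *d);	/* new interface */
};

static void legacy_mask(int irq)
{
	printf("legacy mask(%d)\n", irq);
}

static void compat_irq_mask(struct data *d)
{
	d->chip->mask(d->irq);		/* bounce back to the old form */
}

static void set_defaults(struct chip *c)
{
	if (c->mask && !c->irq_mask)
		c->irq_mask = compat_irq_mask;	/* keep old chips working */
}

int main(void)
{
	struct chip legacy = { .mask = legacy_mask };
	struct data d = { .irq = 9, .chip = &legacy };

	set_defaults(&legacy);
	d.chip->irq_mask(&d);		/* the core's only call form */
	return 0;
}

Once a chip provides the irq_* callbacks directly and the deprecated option is selected, the trampolines compile away and the core never sees the old signatures.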
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
new file mode 100644
index 000000000000..20dc5474947e
--- /dev/null
+++ b/kernel/irq/dummychip.c
@@ -0,0 +1,68 @@
1/*
2 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
3 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
4 *
5 * This file contains the dummy interrupt chip implementation
6 */
7#include <linux/interrupt.h>
8#include <linux/irq.h>
9
10#include "internals.h"
11
12/*
13 * What should we do if we get a hw irq event on an illegal vector?
14 * Each architecture has to answer this themself.
15 */
16static void ack_bad(struct irq_data *data)
17{
18 struct irq_desc *desc = irq_data_to_desc(data);
19
20 print_irq_desc(data->irq, desc);
21 ack_bad_irq(data->irq);
22}
23
24/*
25 * NOP functions
26 */
27static void noop(struct irq_data *data) { }
28
29static unsigned int noop_ret(struct irq_data *data)
30{
31 return 0;
32}
33
34#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
35static void compat_noop(unsigned int irq) { }
36#define END_INIT .end = compat_noop
37#else
38#define END_INIT
39#endif
40
41/*
42 * Generic no controller implementation
43 */
44struct irq_chip no_irq_chip = {
45 .name = "none",
46 .irq_startup = noop_ret,
47 .irq_shutdown = noop,
48 .irq_enable = noop,
49 .irq_disable = noop,
50 .irq_ack = ack_bad,
51 END_INIT
52};
53
54/*
55 * Generic dummy implementation which can be used for
56 * real dumb interrupt sources
57 */
58struct irq_chip dummy_irq_chip = {
59 .name = "dummy",
60 .irq_startup = noop_ret,
61 .irq_shutdown = noop,
62 .irq_enable = noop,
63 .irq_disable = noop,
64 .irq_ack = noop,
65 .irq_mask = noop,
66 .irq_unmask = noop,
67 END_INIT
68};
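dummychip.c carries the no_irq_chip/dummy_irq_chip definitions out of handle.c; the only subtlety is the END_INIT macro, which supplies a .end trampoline solely while the deprecated interface is still compiled in. The same conditional-initializer trick in a standalone form, with illustrative names:

/* Standalone illustration only; NO_DEPRECATED and struct ops are invented. */
#include <stdio.h>

/* #define NO_DEPRECATED */	/* define to drop the legacy hook */

struct ops {
	void (*run)(void);
#ifndef NO_DEPRECATED
	void (*end)(void);	/* legacy hook being phased out */
#endif
};

#ifndef NO_DEPRECATED
static void compat_noop(void) { }
#define END_INIT .end = compat_noop
#else
#define END_INIT
#endif

static void run(void) { printf("run\n"); }

static struct ops dummy = {
	.run = run,
	END_INIT		/* expands to nothing once deprecated code is gone */
};

int main(void)
{
	dummy.run();
	return 0;
}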
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 27e5c6911223..e2347eb63306 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,24 +11,15 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/sched.h>
15#include <linux/slab.h>
16#include <linux/module.h>
17#include <linux/random.h> 14#include <linux/random.h>
15#include <linux/sched.h>
18#include <linux/interrupt.h> 16#include <linux/interrupt.h>
19#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 18
21#include <linux/hash.h>
22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 19#include <trace/events/irq.h>
24 20
25#include "internals.h" 21#include "internals.h"
26 22
27/*
28 * lockdep: we want to handle all irq_desc locks as a single lock-class:
29 */
30struct lock_class_key irq_desc_lock_class;
31
32/** 23/**
33 * handle_bad_irq - handle spurious and unhandled irqs 24 * handle_bad_irq - handle spurious and unhandled irqs
34 * @irq: the interrupt number 25 * @irq: the interrupt number
@@ -43,304 +34,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
43 ack_bad_irq(irq); 34 ack_bad_irq(irq);
44} 35}
45 36
46#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
47static void __init init_irq_default_affinity(void)
48{
49 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
50 cpumask_setall(irq_default_affinity);
51}
52#else
53static void __init init_irq_default_affinity(void)
54{
55}
56#endif
57
58/*
59 * Linux has a controller-independent interrupt architecture.
60 * Every controller has a 'controller-template', that is used
61 * by the main code to do the right thing. Each driver-visible
62 * interrupt source is transparently wired to the appropriate
63 * controller. Thus drivers need not be aware of the
64 * interrupt-controller.
65 *
66 * The code is designed to be easily extended with new/different
67 * interrupt controllers, without having to do assembly magic or
68 * having to touch the generic code.
69 *
70 * Controller mappings for all interrupt sources:
71 */
72int nr_irqs = NR_IRQS;
73EXPORT_SYMBOL_GPL(nr_irqs);
74
75#ifdef CONFIG_SPARSE_IRQ
76
77static struct irq_desc irq_desc_init = {
78 .irq = -1,
79 .status = IRQ_DISABLED,
80 .chip = &no_irq_chip,
81 .handle_irq = handle_bad_irq,
82 .depth = 1,
83 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
84};
85
86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{
88 void *ptr;
89
90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 GFP_ATOMIC, node);
92
93 /*
94 * don't overwite if can not get new one
95 * init_copy_kstat_irqs() could still use old one
96 */
97 if (ptr) {
98 printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node);
99 desc->kstat_irqs = ptr;
100 }
101}
102
103static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
104{
105 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
106
107 raw_spin_lock_init(&desc->lock);
108 desc->irq = irq;
109#ifdef CONFIG_SMP
110 desc->node = node;
111#endif
112 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
113 init_kstat_irqs(desc, node, nr_cpu_ids);
114 if (!desc->kstat_irqs) {
115 printk(KERN_ERR "can not alloc kstat_irqs\n");
116 BUG_ON(1);
117 }
118 if (!alloc_desc_masks(desc, node, false)) {
119 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
120 BUG_ON(1);
121 }
122 init_desc_masks(desc);
123 arch_init_chip_data(desc, node);
124}
125
126/*
127 * Protect the sparse_irqs:
128 */
129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
130
131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
151
152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
153 [0 ... NR_IRQS_LEGACY-1] = {
154 .irq = -1,
155 .status = IRQ_DISABLED,
156 .chip = &no_irq_chip,
157 .handle_irq = handle_bad_irq,
158 .depth = 1,
159 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
160 }
161};
162
163static unsigned int *kstat_irqs_legacy;
164
165int __init early_irq_init(void)
166{
167 struct irq_desc *desc;
168 int legacy_count;
169 int node;
170 int i;
171
172 init_irq_default_affinity();
173
174 /* initialize nr_irqs based on nr_cpu_ids */
175 arch_probe_nr_irqs();
176 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
177
178 desc = irq_desc_legacy;
179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
180 node = first_online_node;
181
182 /* allocate based on nr_cpu_ids */
183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
184 sizeof(int), GFP_NOWAIT, node);
185
186 for (i = 0; i < legacy_count; i++) {
187 desc[i].irq = i;
188#ifdef CONFIG_SMP
189 desc[i].node = node;
190#endif
191 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
193 alloc_desc_masks(&desc[i], node, true);
194 init_desc_masks(&desc[i]);
195 set_irq_desc(i, &desc[i]);
196 }
197
198 return arch_early_irq_init();
199}
200
201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
202{
203 struct irq_desc *desc;
204 unsigned long flags;
205
206 if (irq >= nr_irqs) {
207 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
208 irq, nr_irqs);
209 return NULL;
210 }
211
212 desc = irq_to_desc(irq);
213 if (desc)
214 return desc;
215
216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
217
218 /* We have to check it to avoid races with another CPU */
219 desc = irq_to_desc(irq);
220 if (desc)
221 goto out_unlock;
222
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224
225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
226 if (!desc) {
227 printk(KERN_ERR "can not alloc irq_desc\n");
228 BUG_ON(1);
229 }
230 init_one_irq_desc(irq, desc, node);
231
232 set_irq_desc(irq, desc);
233
234out_unlock:
235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
236
237 return desc;
238}
239
240#else /* !CONFIG_SPARSE_IRQ */
241
242struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
243 [0 ... NR_IRQS-1] = {
244 .status = IRQ_DISABLED,
245 .chip = &no_irq_chip,
246 .handle_irq = handle_bad_irq,
247 .depth = 1,
248 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
249 }
250};
251
252static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
253int __init early_irq_init(void)
254{
255 struct irq_desc *desc;
256 int count;
257 int i;
258
259 init_irq_default_affinity();
260
261 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
262
263 desc = irq_desc;
264 count = ARRAY_SIZE(irq_desc);
265
266 for (i = 0; i < count; i++) {
267 desc[i].irq = i;
268 alloc_desc_masks(&desc[i], 0, true);
269 init_desc_masks(&desc[i]);
270 desc[i].kstat_irqs = kstat_irqs_all[i];
271 }
272 return arch_early_irq_init();
273}
274
275struct irq_desc *irq_to_desc(unsigned int irq)
276{
277 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
278}
279
280struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
281{
282 return irq_to_desc(irq);
283}
284#endif /* !CONFIG_SPARSE_IRQ */
285
286void clear_kstat_irqs(struct irq_desc *desc)
287{
288 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
289}
290
291/*
292 * What should we do if we get a hw irq event on an illegal vector?
293 * Each architecture has to answer this themself.
294 */
295static void ack_bad(unsigned int irq)
296{
297 struct irq_desc *desc = irq_to_desc(irq);
298
299 print_irq_desc(irq, desc);
300 ack_bad_irq(irq);
301}
302
303/*
304 * NOP functions
305 */
306static void noop(unsigned int irq)
307{
308}
309
310static unsigned int noop_ret(unsigned int irq)
311{
312 return 0;
313}
314
315/*
316 * Generic no controller implementation
317 */
318struct irq_chip no_irq_chip = {
319 .name = "none",
320 .startup = noop_ret,
321 .shutdown = noop,
322 .enable = noop,
323 .disable = noop,
324 .ack = ack_bad,
325 .end = noop,
326};
327
328/*
329 * Generic dummy implementation which can be used for
330 * real dumb interrupt sources
331 */
332struct irq_chip dummy_irq_chip = {
333 .name = "dummy",
334 .startup = noop_ret,
335 .shutdown = noop,
336 .enable = noop,
337 .disable = noop,
338 .ack = noop,
339 .mask = noop,
340 .unmask = noop,
341 .end = noop,
342};
343
344/* 37/*
345 * Special, empty irq handler: 38 * Special, empty irq handler:
346 */ 39 */
@@ -457,20 +150,20 @@ unsigned int __do_IRQ(unsigned int irq)
457 /* 150 /*
458 * No locking required for CPU-local interrupts: 151 * No locking required for CPU-local interrupts:
459 */ 152 */
460 if (desc->chip->ack) 153 if (desc->irq_data.chip->ack)
461 desc->chip->ack(irq); 154 desc->irq_data.chip->ack(irq);
462 if (likely(!(desc->status & IRQ_DISABLED))) { 155 if (likely(!(desc->status & IRQ_DISABLED))) {
463 action_ret = handle_IRQ_event(irq, desc->action); 156 action_ret = handle_IRQ_event(irq, desc->action);
464 if (!noirqdebug) 157 if (!noirqdebug)
465 note_interrupt(irq, desc, action_ret); 158 note_interrupt(irq, desc, action_ret);
466 } 159 }
467 desc->chip->end(irq); 160 desc->irq_data.chip->end(irq);
468 return 1; 161 return 1;
469 } 162 }
470 163
471 raw_spin_lock(&desc->lock); 164 raw_spin_lock(&desc->lock);
472 if (desc->chip->ack) 165 if (desc->irq_data.chip->ack)
473 desc->chip->ack(irq); 166 desc->irq_data.chip->ack(irq);
474 /* 167 /*
475 * REPLAY is when Linux resends an IRQ that was dropped earlier 168 * REPLAY is when Linux resends an IRQ that was dropped earlier
476 * WAITING is used by probe to mark irqs that are being tested 169 * WAITING is used by probe to mark irqs that are being tested
@@ -530,27 +223,9 @@ out:
530 * The ->end() handler has to deal with interrupts which got 223 * The ->end() handler has to deal with interrupts which got
531 * disabled while the handler was running. 224 * disabled while the handler was running.
532 */ 225 */
533 desc->chip->end(irq); 226 desc->irq_data.chip->end(irq);
534 raw_spin_unlock(&desc->lock); 227 raw_spin_unlock(&desc->lock);
535 228
536 return 1; 229 return 1;
537} 230}
538#endif 231#endif
539
540void early_init_irq_lock_class(void)
541{
542 struct irq_desc *desc;
543 int i;
544
545 for_each_irq_desc(i, desc) {
546 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
547 }
548}
549
550unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
551{
552 struct irq_desc *desc = irq_to_desc(irq);
553 return desc ? desc->kstat_irqs[cpu] : 0;
554}
555EXPORT_SYMBOL(kstat_irqs_cpu);
556
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index c63f3bc88f0b..4571ae7e085a 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,9 +1,12 @@
1/* 1/*
2 * IRQ subsystem internal functions and variables: 2 * IRQ subsystem internal functions and variables:
3 */ 3 */
4#include <linux/irqdesc.h>
4 5
5extern int noirqdebug; 6extern int noirqdebug;
6 7
8#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
9
7/* Set default functions for irq_chip structures: */ 10/* Set default functions for irq_chip structures: */
8extern void irq_chip_set_defaults(struct irq_chip *chip); 11extern void irq_chip_set_defaults(struct irq_chip *chip);
9 12
@@ -15,21 +18,19 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
15extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 18extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 19extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
17 20
18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 21extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23/* Resending of interrupts :*/
24void replace_irq_desc(unsigned int irq, struct irq_desc *desc); 24void check_irq_resend(struct irq_desc *desc, unsigned int irq);
25#endif
26 25
27#ifdef CONFIG_PROC_FS 26#ifdef CONFIG_PROC_FS
28extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 27extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
28extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
29extern void register_handler_proc(unsigned int irq, struct irqaction *action); 29extern void register_handler_proc(unsigned int irq, struct irqaction *action);
30extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); 30extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
31#else 31#else
32static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } 32static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
33static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { }
33static inline void register_handler_proc(unsigned int irq, 34static inline void register_handler_proc(unsigned int irq,
34 struct irqaction *action) { } 35 struct irqaction *action) { }
35static inline void unregister_handler_proc(unsigned int irq, 36static inline void unregister_handler_proc(unsigned int irq,
@@ -40,17 +41,27 @@ extern int irq_select_affinity_usr(unsigned int irq);
40 41
41extern void irq_set_thread_affinity(struct irq_desc *desc); 42extern void irq_set_thread_affinity(struct irq_desc *desc);
42 43
44#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
45static inline void irq_end(unsigned int irq, struct irq_desc *desc)
46{
47 if (desc->irq_data.chip && desc->irq_data.chip->end)
48 desc->irq_data.chip->end(irq);
49}
50#else
51static inline void irq_end(unsigned int irq, struct irq_desc *desc) { }
52#endif
53
43/* Inline functions for support of irq chips on slow busses */ 54/* Inline functions for support of irq chips on slow busses */
44static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) 55static inline void chip_bus_lock(struct irq_desc *desc)
45{ 56{
46 if (unlikely(desc->chip->bus_lock)) 57 if (unlikely(desc->irq_data.chip->irq_bus_lock))
47 desc->chip->bus_lock(irq); 58 desc->irq_data.chip->irq_bus_lock(&desc->irq_data);
48} 59}
49 60
50static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) 61static inline void chip_bus_sync_unlock(struct irq_desc *desc)
51{ 62{
52 if (unlikely(desc->chip->bus_sync_unlock)) 63 if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock))
53 desc->chip->bus_sync_unlock(irq); 64 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
54} 65}
55 66
56/* 67/*
@@ -67,8 +78,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
67 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); 78 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
68 printk("->handle_irq(): %p, ", desc->handle_irq); 79 printk("->handle_irq(): %p, ", desc->handle_irq);
69 print_symbol("%s\n", (unsigned long)desc->handle_irq); 80 print_symbol("%s\n", (unsigned long)desc->handle_irq);
70 printk("->chip(): %p, ", desc->chip); 81 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
71 print_symbol("%s\n", (unsigned long)desc->chip); 82 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
72 printk("->action(): %p\n", desc->action); 83 printk("->action(): %p\n", desc->action);
73 if (desc->action) { 84 if (desc->action) {
74 printk("->action->handler(): %p, ", desc->action->handler); 85 printk("->action->handler(): %p, ", desc->action->handler);
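The chip_bus_lock()/chip_bus_sync_unlock() helpers above now take only the descriptor and reach the chip through struct irq_data. A minimal sketch of the other side of that contract follows, i.e. what an irq_chip for a controller sitting behind a slow bus (an I2C GPIO expander, say) might register; the my_expander structure, its register cache and the set_irq_chip_data() wiring are illustrative assumptions, not part of this patch.

#include <linux/irq.h>
#include <linux/mutex.h>

struct my_expander {
	struct mutex	bus_lock;	/* serializes slow-bus transfers */
	u8		irq_mask_cache;	/* flushed in irq_bus_sync_unlock */
	bool		mask_dirty;
};

static void my_expander_irq_bus_lock(struct irq_data *d)
{
	struct my_expander *chip = d->chip_data;	/* assumed set up via set_irq_chip_data() */

	mutex_lock(&chip->bus_lock);	/* may sleep: runs before desc->lock is taken */
}

static void my_expander_irq_bus_sync_unlock(struct irq_data *d)
{
	struct my_expander *chip = d->chip_data;

	if (chip->mask_dirty) {
		/* ... issue the slow-bus write of irq_mask_cache here ... */
		chip->mask_dirty = false;
	}
	mutex_unlock(&chip->bus_lock);
}

static struct irq_chip my_expander_irq_chip = {
	.name			= "my-expander",
	.irq_bus_lock		= my_expander_irq_bus_lock,
	.irq_bus_sync_unlock	= my_expander_irq_bus_sync_unlock,
	/* .irq_mask/.irq_unmask would only touch irq_mask_cache/mask_dirty */
};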
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
new file mode 100644
index 000000000000..9988d03797f5
--- /dev/null
+++ b/kernel/irq/irqdesc.c
@@ -0,0 +1,410 @@
1/*
2 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
3 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
4 *
5 * This file contains the interrupt descriptor management code
6 *
7 * Detailed information is available in Documentation/DocBook/genericirq
8 *
9 */
10#include <linux/irq.h>
11#include <linux/slab.h>
12#include <linux/module.h>
13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h>
15#include <linux/radix-tree.h>
16#include <linux/bitmap.h>
17
18#include "internals.h"
19
20/*
21 * lockdep: we want to handle all irq_desc locks as a single lock-class:
22 */
23static struct lock_class_key irq_desc_lock_class;
24
25#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
26static void __init init_irq_default_affinity(void)
27{
28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
29 cpumask_setall(irq_default_affinity);
30}
31#else
32static void __init init_irq_default_affinity(void)
33{
34}
35#endif
36
37#ifdef CONFIG_SMP
38static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
39{
40 if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node))
41 return -ENOMEM;
42
43#ifdef CONFIG_GENERIC_PENDING_IRQ
44 if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
45 free_cpumask_var(desc->irq_data.affinity);
46 return -ENOMEM;
47 }
48#endif
49 return 0;
50}
51
52static void desc_smp_init(struct irq_desc *desc, int node)
53{
54 desc->irq_data.node = node;
55 cpumask_copy(desc->irq_data.affinity, irq_default_affinity);
56#ifdef CONFIG_GENERIC_PENDING_IRQ
57 cpumask_clear(desc->pending_mask);
58#endif
59}
60
61static inline int desc_node(struct irq_desc *desc)
62{
63 return desc->irq_data.node;
64}
65
66#else
67static inline int
68alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
69static inline void desc_smp_init(struct irq_desc *desc, int node) { }
70static inline int desc_node(struct irq_desc *desc) { return 0; }
71#endif
72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
74{
75 desc->irq_data.irq = irq;
76 desc->irq_data.chip = &no_irq_chip;
77 desc->irq_data.chip_data = NULL;
78 desc->irq_data.handler_data = NULL;
79 desc->irq_data.msi_desc = NULL;
80 desc->status = IRQ_DEFAULT_INIT_FLAGS;
81 desc->handle_irq = handle_bad_irq;
82 desc->depth = 1;
83 desc->irq_count = 0;
84 desc->irqs_unhandled = 0;
85 desc->name = NULL;
86 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
87 desc_smp_init(desc, node);
88}
89
90int nr_irqs = NR_IRQS;
91EXPORT_SYMBOL_GPL(nr_irqs);
92
93static DEFINE_MUTEX(sparse_irq_lock);
94static DECLARE_BITMAP(allocated_irqs, NR_IRQS);
95
96#ifdef CONFIG_SPARSE_IRQ
97
98static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
99
100static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
101{
102 radix_tree_insert(&irq_desc_tree, irq, desc);
103}
104
105struct irq_desc *irq_to_desc(unsigned int irq)
106{
107 return radix_tree_lookup(&irq_desc_tree, irq);
108}
109
110static void delete_irq_desc(unsigned int irq)
111{
112 radix_tree_delete(&irq_desc_tree, irq);
113}
114
115#ifdef CONFIG_SMP
116static void free_masks(struct irq_desc *desc)
117{
118#ifdef CONFIG_GENERIC_PENDING_IRQ
119 free_cpumask_var(desc->pending_mask);
120#endif
121 free_cpumask_var(desc->irq_data.affinity);
122}
123#else
124static inline void free_masks(struct irq_desc *desc) { }
125#endif
126
127static struct irq_desc *alloc_desc(int irq, int node)
128{
129 struct irq_desc *desc;
130 gfp_t gfp = GFP_KERNEL;
131
132 desc = kzalloc_node(sizeof(*desc), gfp, node);
133 if (!desc)
134 return NULL;
135 /* allocate based on nr_cpu_ids */
136 desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs),
137 gfp, node);
138 if (!desc->kstat_irqs)
139 goto err_desc;
140
141 if (alloc_masks(desc, gfp, node))
142 goto err_kstat;
143
144 raw_spin_lock_init(&desc->lock);
145 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
146
147 desc_set_defaults(irq, desc, node);
148
149 return desc;
150
151err_kstat:
152 kfree(desc->kstat_irqs);
153err_desc:
154 kfree(desc);
155 return NULL;
156}
157
158static void free_desc(unsigned int irq)
159{
160 struct irq_desc *desc = irq_to_desc(irq);
161
162 unregister_irq_proc(irq, desc);
163
164 mutex_lock(&sparse_irq_lock);
165 delete_irq_desc(irq);
166 mutex_unlock(&sparse_irq_lock);
167
168 free_masks(desc);
169 kfree(desc->kstat_irqs);
170 kfree(desc);
171}
172
173static int alloc_descs(unsigned int start, unsigned int cnt, int node)
174{
175 struct irq_desc *desc;
176 int i;
177
178 for (i = 0; i < cnt; i++) {
179 desc = alloc_desc(start + i, node);
180 if (!desc)
181 goto err;
182 mutex_lock(&sparse_irq_lock);
183 irq_insert_desc(start + i, desc);
184 mutex_unlock(&sparse_irq_lock);
185 }
186 return start;
187
188err:
189 for (i--; i >= 0; i--)
190 free_desc(start + i);
191
192 mutex_lock(&sparse_irq_lock);
193 bitmap_clear(allocated_irqs, start, cnt);
194 mutex_unlock(&sparse_irq_lock);
195 return -ENOMEM;
196}
197
198struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
199{
200 int res = irq_alloc_descs(irq, irq, 1, node);
201
202 if (res == -EEXIST || res == irq)
203 return irq_to_desc(irq);
204 return NULL;
205}
206
207int __init early_irq_init(void)
208{
209 int i, initcnt, node = first_online_node;
210 struct irq_desc *desc;
211
212 init_irq_default_affinity();
213
214 /* Let arch update nr_irqs and return the nr of preallocated irqs */
215 initcnt = arch_probe_nr_irqs();
216 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
217
218 for (i = 0; i < initcnt; i++) {
219 desc = alloc_desc(i, node);
220 set_bit(i, allocated_irqs);
221 irq_insert_desc(i, desc);
222 }
223 return arch_early_irq_init();
224}
225
226#else /* !CONFIG_SPARSE_IRQ */
227
228struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
229 [0 ... NR_IRQS-1] = {
230 .status = IRQ_DEFAULT_INIT_FLAGS,
231 .handle_irq = handle_bad_irq,
232 .depth = 1,
233 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
234 }
235};
236
237static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
238int __init early_irq_init(void)
239{
240 int count, i, node = first_online_node;
241 struct irq_desc *desc;
242
243 init_irq_default_affinity();
244
245 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
246
247 desc = irq_desc;
248 count = ARRAY_SIZE(irq_desc);
249
250 for (i = 0; i < count; i++) {
251 desc[i].irq_data.irq = i;
252 desc[i].irq_data.chip = &no_irq_chip;
253 desc[i].kstat_irqs = kstat_irqs_all[i];
254 alloc_masks(desc + i, GFP_KERNEL, node);
255 desc_smp_init(desc + i, node);
256 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
257 }
258 return arch_early_irq_init();
259}
260
261struct irq_desc *irq_to_desc(unsigned int irq)
262{
263 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
264}
265
266struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
267{
268 return irq_to_desc(irq);
269}
270
271static void free_desc(unsigned int irq)
272{
273 dynamic_irq_cleanup(irq);
274}
275
276static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
277{
278 return start;
279}
280#endif /* !CONFIG_SPARSE_IRQ */
281
282/* Dynamic interrupt handling */
283
284/**
285 * irq_free_descs - free irq descriptors
286 * @from: Start of descriptor range
287 * @cnt: Number of consecutive irqs to free
288 */
289void irq_free_descs(unsigned int from, unsigned int cnt)
290{
291 int i;
292
293 if (from >= nr_irqs || (from + cnt) > nr_irqs)
294 return;
295
296 for (i = 0; i < cnt; i++)
297 free_desc(from + i);
298
299 mutex_lock(&sparse_irq_lock);
300 bitmap_clear(allocated_irqs, from, cnt);
301 mutex_unlock(&sparse_irq_lock);
302}
303
304/**
305 * irq_alloc_descs - allocate and initialize a range of irq descriptors
306 * @irq: Allocate for specific irq number if irq >= 0
307 * @from: Start the search from this irq number
308 * @cnt: Number of consecutive irqs to allocate.
309 * @node: Preferred node on which the irq descriptor should be allocated
310 *
311 * Returns the first irq number or error code
312 */
313int __ref
314irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
315{
316 int start, ret;
317
318 if (!cnt)
319 return -EINVAL;
320
321 mutex_lock(&sparse_irq_lock);
322
323 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
324 ret = -EEXIST;
325	if (irq >= 0 && start != irq)
326 goto err;
327
328 ret = -ENOMEM;
329 if (start >= nr_irqs)
330 goto err;
331
332 bitmap_set(allocated_irqs, start, cnt);
333 mutex_unlock(&sparse_irq_lock);
334 return alloc_descs(start, cnt, node);
335
336err:
337 mutex_unlock(&sparse_irq_lock);
338 return ret;
339}
340
341/**
342 * irq_reserve_irqs - mark irqs allocated
343 * @from: mark from irq number
344 * @cnt: number of irqs to mark
345 *
346 * Returns 0 on success or an appropriate error code
347 */
348int irq_reserve_irqs(unsigned int from, unsigned int cnt)
349{
350 unsigned int start;
351 int ret = 0;
352
353 if (!cnt || (from + cnt) > nr_irqs)
354 return -EINVAL;
355
356 mutex_lock(&sparse_irq_lock);
357 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
358 if (start == from)
359 bitmap_set(allocated_irqs, start, cnt);
360 else
361 ret = -EEXIST;
362 mutex_unlock(&sparse_irq_lock);
363 return ret;
364}
365
366/**
367 * irq_get_next_irq - get next allocated irq number
368 * @offset: where to start the search
369 *
370 * Returns next irq number after offset or nr_irqs if none is found.
371 */
372unsigned int irq_get_next_irq(unsigned int offset)
373{
374 return find_next_bit(allocated_irqs, nr_irqs, offset);
375}
376
377/**
378 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
379 * @irq: irq number to initialize
380 */
381void dynamic_irq_cleanup(unsigned int irq)
382{
383 struct irq_desc *desc = irq_to_desc(irq);
384 unsigned long flags;
385
386 raw_spin_lock_irqsave(&desc->lock, flags);
387 desc_set_defaults(irq, desc, desc_node(desc));
388 raw_spin_unlock_irqrestore(&desc->lock, flags);
389}
390
391unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
392{
393 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0;
395}
396
397#ifdef CONFIG_GENERIC_HARDIRQS
398unsigned int kstat_irqs(unsigned int irq)
399{
400 struct irq_desc *desc = irq_to_desc(irq);
401 int cpu;
402 int sum = 0;
403
404 if (!desc)
405 return 0;
406 for_each_possible_cpu(cpu)
407 sum += desc->kstat_irqs[cpu];
408 return sum;
409}
410#endif /* CONFIG_GENERIC_HARDIRQS */
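A hedged usage sketch for the allocator interface introduced above, not part of this patch: the controller name, the base hint of 64 and the count of 8 are made up, and set_irq_chip_and_handler() in the comment stands in for whatever the caller does to give each freshly allocated descriptor a real chip and flow handler.

#include <linux/irq.h>

static int my_controller_map_irqs(int node)
{
	int base, i;

	/* find 8 consecutive unused irq numbers, searching upward from 64 */
	base = irq_alloc_descs(-1, 64, 8, node);
	if (base < 0)
		return base;	/* -EEXIST or -ENOMEM from the bitmap/descriptor path */

	for (i = 0; i < 8; i++) {
		/*
		 * Each descriptor starts out with the defaults set by
		 * desc_set_defaults(): no_irq_chip and handle_bad_irq.
		 */
		/* set_irq_chip_and_handler(base + i, &my_chip, handle_level_irq); */
	}
	return base;
}

static void my_controller_unmap_irqs(int base)
{
	/* releases the descriptors and clears their bits in allocated_irqs */
	irq_free_descs(base, 8);
}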
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c3003e9d91a3..5f92acc5f952 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq)
73{ 73{
74 struct irq_desc *desc = irq_to_desc(irq); 74 struct irq_desc *desc = irq_to_desc(irq);
75 75
76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || 76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip ||
77 !desc->chip->set_affinity) 77 !desc->irq_data.chip->irq_set_affinity)
78 return 0; 78 return 0;
79 79
80 return 1; 80 return 1;
@@ -109,17 +109,18 @@ void irq_set_thread_affinity(struct irq_desc *desc)
109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) 109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
110{ 110{
111 struct irq_desc *desc = irq_to_desc(irq); 111 struct irq_desc *desc = irq_to_desc(irq);
112 struct irq_chip *chip = desc->irq_data.chip;
112 unsigned long flags; 113 unsigned long flags;
113 114
114 if (!desc->chip->set_affinity) 115 if (!chip->irq_set_affinity)
115 return -EINVAL; 116 return -EINVAL;
116 117
117 raw_spin_lock_irqsave(&desc->lock, flags); 118 raw_spin_lock_irqsave(&desc->lock, flags);
118 119
119#ifdef CONFIG_GENERIC_PENDING_IRQ 120#ifdef CONFIG_GENERIC_PENDING_IRQ
120 if (desc->status & IRQ_MOVE_PCNTXT) { 121 if (desc->status & IRQ_MOVE_PCNTXT) {
121 if (!desc->chip->set_affinity(irq, cpumask)) { 122 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
122 cpumask_copy(desc->affinity, cpumask); 123 cpumask_copy(desc->irq_data.affinity, cpumask);
123 irq_set_thread_affinity(desc); 124 irq_set_thread_affinity(desc);
124 } 125 }
125 } 126 }
@@ -128,8 +129,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
128 cpumask_copy(desc->pending_mask, cpumask); 129 cpumask_copy(desc->pending_mask, cpumask);
129 } 130 }
130#else 131#else
131 if (!desc->chip->set_affinity(irq, cpumask)) { 132 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
132 cpumask_copy(desc->affinity, cpumask); 133 cpumask_copy(desc->irq_data.affinity, cpumask);
133 irq_set_thread_affinity(desc); 134 irq_set_thread_affinity(desc);
134 } 135 }
135#endif 136#endif
@@ -168,16 +169,16 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
168 * one of the targets is online. 169 * one of the targets is online.
169 */ 170 */
170 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 171 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
171 if (cpumask_any_and(desc->affinity, cpu_online_mask) 172 if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask)
172 < nr_cpu_ids) 173 < nr_cpu_ids)
173 goto set_affinity; 174 goto set_affinity;
174 else 175 else
175 desc->status &= ~IRQ_AFFINITY_SET; 176 desc->status &= ~IRQ_AFFINITY_SET;
176 } 177 }
177 178
178 cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); 179 cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity);
179set_affinity: 180set_affinity:
180 desc->chip->set_affinity(irq, desc->affinity); 181 desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false);
181 182
182 return 0; 183 return 0;
183} 184}
@@ -223,7 +224,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
223 224
224 if (!desc->depth++) { 225 if (!desc->depth++) {
225 desc->status |= IRQ_DISABLED; 226 desc->status |= IRQ_DISABLED;
226 desc->chip->disable(irq); 227 desc->irq_data.chip->irq_disable(&desc->irq_data);
227 } 228 }
228} 229}
229 230
@@ -246,11 +247,11 @@ void disable_irq_nosync(unsigned int irq)
246 if (!desc) 247 if (!desc)
247 return; 248 return;
248 249
249 chip_bus_lock(irq, desc); 250 chip_bus_lock(desc);
250 raw_spin_lock_irqsave(&desc->lock, flags); 251 raw_spin_lock_irqsave(&desc->lock, flags);
251 __disable_irq(desc, irq, false); 252 __disable_irq(desc, irq, false);
252 raw_spin_unlock_irqrestore(&desc->lock, flags); 253 raw_spin_unlock_irqrestore(&desc->lock, flags);
253 chip_bus_sync_unlock(irq, desc); 254 chip_bus_sync_unlock(desc);
254} 255}
255EXPORT_SYMBOL(disable_irq_nosync); 256EXPORT_SYMBOL(disable_irq_nosync);
256 257
@@ -313,7 +314,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
313 * IRQ line is re-enabled. 314 * IRQ line is re-enabled.
314 * 315 *
315 * This function may be called from IRQ context only when 316 * This function may be called from IRQ context only when
316 * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! 317 * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
317 */ 318 */
318void enable_irq(unsigned int irq) 319void enable_irq(unsigned int irq)
319{ 320{
@@ -323,11 +324,15 @@ void enable_irq(unsigned int irq)
323 if (!desc) 324 if (!desc)
324 return; 325 return;
325 326
326 chip_bus_lock(irq, desc); 327 if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable,
328 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
329 return;
330
331 chip_bus_lock(desc);
327 raw_spin_lock_irqsave(&desc->lock, flags); 332 raw_spin_lock_irqsave(&desc->lock, flags);
328 __enable_irq(desc, irq, false); 333 __enable_irq(desc, irq, false);
329 raw_spin_unlock_irqrestore(&desc->lock, flags); 334 raw_spin_unlock_irqrestore(&desc->lock, flags);
330 chip_bus_sync_unlock(irq, desc); 335 chip_bus_sync_unlock(desc);
331} 336}
332EXPORT_SYMBOL(enable_irq); 337EXPORT_SYMBOL(enable_irq);
333 338
@@ -336,8 +341,8 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
336 struct irq_desc *desc = irq_to_desc(irq); 341 struct irq_desc *desc = irq_to_desc(irq);
337 int ret = -ENXIO; 342 int ret = -ENXIO;
338 343
339 if (desc->chip->set_wake) 344 if (desc->irq_data.chip->irq_set_wake)
340 ret = desc->chip->set_wake(irq, on); 345 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
341 346
342 return ret; 347 return ret;
343} 348}
@@ -429,12 +434,12 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc)
429} 434}
430 435
431int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 436int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
432 unsigned long flags) 437 unsigned long flags)
433{ 438{
434 int ret; 439 int ret;
435 struct irq_chip *chip = desc->chip; 440 struct irq_chip *chip = desc->irq_data.chip;
436 441
437 if (!chip || !chip->set_type) { 442 if (!chip || !chip->irq_set_type) {
438 /* 443 /*
439 * IRQF_TRIGGER_* but the PIC does not support multiple 444 * IRQF_TRIGGER_* but the PIC does not support multiple
440 * flow-types? 445 * flow-types?
@@ -445,11 +450,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
445 } 450 }
446 451
447 /* caller masked out all except trigger mode flags */ 452 /* caller masked out all except trigger mode flags */
448 ret = chip->set_type(irq, flags); 453 ret = chip->irq_set_type(&desc->irq_data, flags);
449 454
450 if (ret) 455 if (ret)
451 pr_err("setting trigger mode %d for irq %u failed (%pF)\n", 456 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
452 (int)flags, irq, chip->set_type); 457 flags, irq, chip->irq_set_type);
453 else { 458 else {
454 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) 459 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
455 flags |= IRQ_LEVEL; 460 flags |= IRQ_LEVEL;
@@ -457,8 +462,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
457 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); 462 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
458 desc->status |= flags; 463 desc->status |= flags;
459 464
460 if (chip != desc->chip) 465 if (chip != desc->irq_data.chip)
461 irq_chip_set_defaults(desc->chip); 466 irq_chip_set_defaults(desc->irq_data.chip);
462 } 467 }
463 468
464 return ret; 469 return ret;
@@ -507,7 +512,7 @@ static int irq_wait_for_interrupt(struct irqaction *action)
507static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 512static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
508{ 513{
509again: 514again:
510 chip_bus_lock(irq, desc); 515 chip_bus_lock(desc);
511 raw_spin_lock_irq(&desc->lock); 516 raw_spin_lock_irq(&desc->lock);
512 517
513 /* 518 /*
@@ -521,17 +526,17 @@ again:
521 */ 526 */
522 if (unlikely(desc->status & IRQ_INPROGRESS)) { 527 if (unlikely(desc->status & IRQ_INPROGRESS)) {
523 raw_spin_unlock_irq(&desc->lock); 528 raw_spin_unlock_irq(&desc->lock);
524 chip_bus_sync_unlock(irq, desc); 529 chip_bus_sync_unlock(desc);
525 cpu_relax(); 530 cpu_relax();
526 goto again; 531 goto again;
527 } 532 }
528 533
529 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 534 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
530 desc->status &= ~IRQ_MASKED; 535 desc->status &= ~IRQ_MASKED;
531 desc->chip->unmask(irq); 536 desc->irq_data.chip->irq_unmask(&desc->irq_data);
532 } 537 }
533 raw_spin_unlock_irq(&desc->lock); 538 raw_spin_unlock_irq(&desc->lock);
534 chip_bus_sync_unlock(irq, desc); 539 chip_bus_sync_unlock(desc);
535} 540}
536 541
537#ifdef CONFIG_SMP 542#ifdef CONFIG_SMP
@@ -556,7 +561,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
556 } 561 }
557 562
558 raw_spin_lock_irq(&desc->lock); 563 raw_spin_lock_irq(&desc->lock);
559 cpumask_copy(mask, desc->affinity); 564 cpumask_copy(mask, desc->irq_data.affinity);
560 raw_spin_unlock_irq(&desc->lock); 565 raw_spin_unlock_irq(&desc->lock);
561 566
562 set_cpus_allowed_ptr(current, mask); 567 set_cpus_allowed_ptr(current, mask);
@@ -657,7 +662,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
657 if (!desc) 662 if (!desc)
658 return -EINVAL; 663 return -EINVAL;
659 664
660 if (desc->chip == &no_irq_chip) 665 if (desc->irq_data.chip == &no_irq_chip)
661 return -ENOSYS; 666 return -ENOSYS;
662 /* 667 /*
663 * Some drivers like serial.c use request_irq() heavily, 668 * Some drivers like serial.c use request_irq() heavily,
@@ -752,7 +757,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
752 } 757 }
753 758
754 if (!shared) { 759 if (!shared) {
755 irq_chip_set_defaults(desc->chip); 760 irq_chip_set_defaults(desc->irq_data.chip);
756 761
757 init_waitqueue_head(&desc->wait_for_threads); 762 init_waitqueue_head(&desc->wait_for_threads);
758 763
@@ -779,7 +784,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
779 if (!(desc->status & IRQ_NOAUTOEN)) { 784 if (!(desc->status & IRQ_NOAUTOEN)) {
780 desc->depth = 0; 785 desc->depth = 0;
781 desc->status &= ~IRQ_DISABLED; 786 desc->status &= ~IRQ_DISABLED;
782 desc->chip->startup(irq); 787 desc->irq_data.chip->irq_startup(&desc->irq_data);
783 } else 788 } else
784 /* Undo nested disables: */ 789 /* Undo nested disables: */
785 desc->depth = 1; 790 desc->depth = 1;
@@ -912,17 +917,17 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
912 917
913 /* Currently used only by UML, might disappear one day: */ 918 /* Currently used only by UML, might disappear one day: */
914#ifdef CONFIG_IRQ_RELEASE_METHOD 919#ifdef CONFIG_IRQ_RELEASE_METHOD
915 if (desc->chip->release) 920 if (desc->irq_data.chip->release)
916 desc->chip->release(irq, dev_id); 921 desc->irq_data.chip->release(irq, dev_id);
917#endif 922#endif
918 923
919 /* If this was the last handler, shut down the IRQ line: */ 924 /* If this was the last handler, shut down the IRQ line: */
920 if (!desc->action) { 925 if (!desc->action) {
921 desc->status |= IRQ_DISABLED; 926 desc->status |= IRQ_DISABLED;
922 if (desc->chip->shutdown) 927 if (desc->irq_data.chip->irq_shutdown)
923 desc->chip->shutdown(irq); 928 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
924 else 929 else
925 desc->chip->disable(irq); 930 desc->irq_data.chip->irq_disable(&desc->irq_data);
926 } 931 }
927 932
928#ifdef CONFIG_SMP 933#ifdef CONFIG_SMP
@@ -997,9 +1002,9 @@ void free_irq(unsigned int irq, void *dev_id)
997 if (!desc) 1002 if (!desc)
998 return; 1003 return;
999 1004
1000 chip_bus_lock(irq, desc); 1005 chip_bus_lock(desc);
1001 kfree(__free_irq(irq, dev_id)); 1006 kfree(__free_irq(irq, dev_id));
1002 chip_bus_sync_unlock(irq, desc); 1007 chip_bus_sync_unlock(desc);
1003} 1008}
1004EXPORT_SYMBOL(free_irq); 1009EXPORT_SYMBOL(free_irq);
1005 1010
@@ -1086,9 +1091,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1086 action->name = devname; 1091 action->name = devname;
1087 action->dev_id = dev_id; 1092 action->dev_id = dev_id;
1088 1093
1089 chip_bus_lock(irq, desc); 1094 chip_bus_lock(desc);
1090 retval = __setup_irq(irq, desc, action); 1095 retval = __setup_irq(irq, desc, action);
1091 chip_bus_sync_unlock(irq, desc); 1096 chip_bus_sync_unlock(desc);
1092 1097
1093 if (retval) 1098 if (retval)
1094 kfree(action); 1099 kfree(action);
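The manage.c conversion above calls the chip through the new struct irq_data based methods. A hedged sketch of the matching chip-driver side follows: the old int (*set_affinity)(unsigned int irq, const struct cpumask *) method becomes an irq_data callback, with my_chip and the routing-register write as placeholders rather than anything defined by this patch.

#include <linux/irq.h>
#include <linux/cpumask.h>

static int my_chip_irq_set_affinity(struct irq_data *data,
				    const struct cpumask *dest, bool force)
{
	/* ... program the routing hardware for data->irq towards 'dest' ... */

	/*
	 * Returning 0 tells the core the move succeeded; irq_set_affinity()
	 * and move_masked_irq() then copy the new mask into data->affinity
	 * and update any threaded-handler affinity themselves.
	 */
	return 0;
}

static struct irq_chip my_chip = {
	.name			= "my-chip",
	.irq_set_affinity	= my_chip_irq_set_affinity,
};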
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 241962280836..1d2541940480 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,6 +7,7 @@
7void move_masked_irq(int irq) 7void move_masked_irq(int irq)
8{ 8{
9 struct irq_desc *desc = irq_to_desc(irq); 9 struct irq_desc *desc = irq_to_desc(irq);
10 struct irq_chip *chip = desc->irq_data.chip;
10 11
11 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 12 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
12 return; 13 return;
@@ -24,7 +25,7 @@ void move_masked_irq(int irq)
24 if (unlikely(cpumask_empty(desc->pending_mask))) 25 if (unlikely(cpumask_empty(desc->pending_mask)))
25 return; 26 return;
26 27
27 if (!desc->chip->set_affinity) 28 if (!chip->irq_set_affinity)
28 return; 29 return;
29 30
30 assert_raw_spin_locked(&desc->lock); 31 assert_raw_spin_locked(&desc->lock);
@@ -43,8 +44,9 @@ void move_masked_irq(int irq)
43 */ 44 */
44 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
45 < nr_cpu_ids)) 46 < nr_cpu_ids))
46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) { 47 if (!chip->irq_set_affinity(&desc->irq_data,
47 cpumask_copy(desc->affinity, desc->pending_mask); 48 desc->pending_mask, false)) {
49 cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
48 irq_set_thread_affinity(desc); 50 irq_set_thread_affinity(desc);
49 } 51 }
50 52
@@ -61,8 +63,8 @@ void move_native_irq(int irq)
61 if (unlikely(desc->status & IRQ_DISABLED)) 63 if (unlikely(desc->status & IRQ_DISABLED))
62 return; 64 return;
63 65
64 desc->chip->mask(irq); 66 desc->irq_data.chip->irq_mask(&desc->irq_data);
65 move_masked_irq(irq); 67 move_masked_irq(irq);
66 desc->chip->unmask(irq); 68 desc->irq_data.chip->irq_unmask(&desc->irq_data);
67} 69}
68 70
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
deleted file mode 100644
index 65d3845665ac..000000000000
--- a/kernel/irq/numa_migrate.c
+++ /dev/null
@@ -1,120 +0,0 @@
1/*
2 * NUMA irq-desc migration code
3 *
4 * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to
5 * the new "home node" of the IRQ.
6 */
7
8#include <linux/irq.h>
9#include <linux/slab.h>
10#include <linux/module.h>
11#include <linux/random.h>
12#include <linux/interrupt.h>
13#include <linux/kernel_stat.h>
14
15#include "internals.h"
16
17static void init_copy_kstat_irqs(struct irq_desc *old_desc,
18 struct irq_desc *desc,
19 int node, int nr)
20{
21 init_kstat_irqs(desc, node, nr);
22
23 if (desc->kstat_irqs != old_desc->kstat_irqs)
24 memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
25 nr * sizeof(*desc->kstat_irqs));
26}
27
28static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
29{
30 if (old_desc->kstat_irqs == desc->kstat_irqs)
31 return;
32
33 kfree(old_desc->kstat_irqs);
34 old_desc->kstat_irqs = NULL;
35}
36
37static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
38 struct irq_desc *desc, int node)
39{
40 memcpy(desc, old_desc, sizeof(struct irq_desc));
41 if (!alloc_desc_masks(desc, node, false)) {
42 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
43 "for migration.\n", irq);
44 return false;
45 }
46 raw_spin_lock_init(&desc->lock);
47 desc->node = node;
48 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
49 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
50 init_copy_desc_masks(old_desc, desc);
51 arch_init_copy_chip_data(old_desc, desc, node);
52 return true;
53}
54
55static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
56{
57 free_kstat_irqs(old_desc, desc);
58 free_desc_masks(old_desc, desc);
59 arch_free_chip_data(old_desc, desc);
60}
61
62static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
63 int node)
64{
65 struct irq_desc *desc;
66 unsigned int irq;
67 unsigned long flags;
68
69 irq = old_desc->irq;
70
71 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
72
73 /* We have to check it to avoid races with another CPU */
74 desc = irq_to_desc(irq);
75
76 if (desc && old_desc != desc)
77 goto out_unlock;
78
79 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
80 if (!desc) {
81 printk(KERN_ERR "irq %d: can not get new irq_desc "
82 "for migration.\n", irq);
83 /* still use old one */
84 desc = old_desc;
85 goto out_unlock;
86 }
87 if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
88 /* still use old one */
89 kfree(desc);
90 desc = old_desc;
91 goto out_unlock;
92 }
93
94 replace_irq_desc(irq, desc);
95 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
96
97 /* free the old one */
98 free_one_irq_desc(old_desc, desc);
99 kfree(old_desc);
100
101 return desc;
102
103out_unlock:
104 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
105
106 return desc;
107}
108
109struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
110{
111 /* those static or target node is -1, do not move them */
112 if (desc->irq < NR_IRQS_LEGACY || node == -1)
113 return desc;
114
115 if (desc->node != node)
116 desc = __real_move_irq_desc(desc, node);
117
118 return desc;
119}
120
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 09a2ee540bd2..01b1d3a88983 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -21,7 +21,7 @@ static struct proc_dir_entry *root_irq_dir;
21static int irq_affinity_proc_show(struct seq_file *m, void *v) 21static int irq_affinity_proc_show(struct seq_file *m, void *v)
22{ 22{
23 struct irq_desc *desc = irq_to_desc((long)m->private); 23 struct irq_desc *desc = irq_to_desc((long)m->private);
24 const struct cpumask *mask = desc->affinity; 24 const struct cpumask *mask = desc->irq_data.affinity;
25 25
26#ifdef CONFIG_GENERIC_PENDING_IRQ 26#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING) 27 if (desc->status & IRQ_MOVE_PENDING)
@@ -65,7 +65,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
65 cpumask_var_t new_value; 65 cpumask_var_t new_value;
66 int err; 66 int err;
67 67
68 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || 68 if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity ||
69 irq_balancing_disabled(irq)) 69 irq_balancing_disabled(irq))
70 return -EIO; 70 return -EIO;
71 71
@@ -185,7 +185,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
185{ 185{
186 struct irq_desc *desc = irq_to_desc((long) m->private); 186 struct irq_desc *desc = irq_to_desc((long) m->private);
187 187
188 seq_printf(m, "%d\n", desc->node); 188 seq_printf(m, "%d\n", desc->irq_data.node);
189 return 0; 189 return 0;
190} 190}
191 191
@@ -269,7 +269,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
269{ 269{
270 char name [MAX_NAMELEN]; 270 char name [MAX_NAMELEN];
271 271
272 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) 272 if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
273 return; 273 return;
274 274
275 memset(name, 0, MAX_NAMELEN); 275 memset(name, 0, MAX_NAMELEN);
@@ -297,6 +297,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
297 &irq_spurious_proc_fops, (void *)(long)irq); 297 &irq_spurious_proc_fops, (void *)(long)irq);
298} 298}
299 299
300void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
301{
302 char name [MAX_NAMELEN];
303
304 if (!root_irq_dir || !desc->dir)
305 return;
306#ifdef CONFIG_SMP
307 remove_proc_entry("smp_affinity", desc->dir);
308 remove_proc_entry("affinity_hint", desc->dir);
309 remove_proc_entry("node", desc->dir);
310#endif
311 remove_proc_entry("spurious", desc->dir);
312
313 memset(name, 0, MAX_NAMELEN);
314 sprintf(name, "%u", irq);
315 remove_proc_entry(name, root_irq_dir);
316}
317
300#undef MAX_NAMELEN 318#undef MAX_NAMELEN
301 319
302void unregister_handler_proc(unsigned int irq, struct irqaction *action) 320void unregister_handler_proc(unsigned int irq, struct irqaction *action)
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 090c3763f3a2..891115a929aa 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
60 /* 60 /*
61 * Make sure the interrupt is enabled, before resending it: 61 * Make sure the interrupt is enabled, before resending it:
62 */ 62 */
63 desc->chip->enable(irq); 63 desc->irq_data.chip->irq_enable(&desc->irq_data);
64 64
65 /* 65 /*
66 * We do not resend level type interrupts. Level type 66 * We do not resend level type interrupts. Level type
@@ -70,7 +70,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
72 72
73 if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) { 73 if (!desc->irq_data.chip->irq_retrigger ||
74 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
74#ifdef CONFIG_HARDIRQS_SW_RESEND 75#ifdef CONFIG_HARDIRQS_SW_RESEND
75 /* Set it pending and activate the softirq: */ 76 /* Set it pending and activate the softirq: */
76 set_bit(irq, irqs_resend); 77 set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 89fb90ae534f..3089d3b9d5f3 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -14,6 +14,8 @@
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16 16
17#include "internals.h"
18
17static int irqfixup __read_mostly; 19static int irqfixup __read_mostly;
18 20
19#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) 21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
@@ -78,8 +80,8 @@ static int try_one_irq(int irq, struct irq_desc *desc)
78 * If we did actual work for the real IRQ line we must let the 80 * If we did actual work for the real IRQ line we must let the
79 * IRQ controller clean up too 81 * IRQ controller clean up too
80 */ 82 */
81 if (work && desc->chip && desc->chip->end) 83 if (work)
82 desc->chip->end(irq); 84 irq_end(irq, desc);
83 raw_spin_unlock(&desc->lock); 85 raw_spin_unlock(&desc->lock);
84 86
85 return ok; 87 return ok;
@@ -254,7 +256,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
254 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 256 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
255 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; 257 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
256 desc->depth++; 258 desc->depth++;
257 desc->chip->disable(irq); 259 desc->irq_data.chip->irq_disable(&desc->irq_data);
258 260
259 mod_timer(&poll_spurious_irq_timer, 261 mod_timer(&poll_spurious_irq_timer,
260 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 262 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
new file mode 100644
index 000000000000..f16763ff8481
--- /dev/null
+++ b/kernel/irq_work.c
@@ -0,0 +1,164 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
3 *
4 * Provides a framework for enqueueing and running callbacks from hardirq
5 * context. The enqueueing is NMI-safe.
6 */
7
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/irq_work.h>
11#include <linux/hardirq.h>
12
13/*
14 * An entry can be in one of four states:
15 *
16 * free NULL, 0 -> {claimed} : free to be used
17 * claimed NULL, 3 -> {pending} : claimed to be enqueued
18 * pending next, 3 -> {busy} : queued, pending callback
19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
20 *
21 * We use the lower two bits of the next pointer to keep PENDING and BUSY
22 * flags.
23 */
24
25#define IRQ_WORK_PENDING 1UL
26#define IRQ_WORK_BUSY 2UL
27#define IRQ_WORK_FLAGS 3UL
28
29static inline bool irq_work_is_set(struct irq_work *entry, int flags)
30{
31 return (unsigned long)entry->next & flags;
32}
33
34static inline struct irq_work *irq_work_next(struct irq_work *entry)
35{
36 unsigned long next = (unsigned long)entry->next;
37 next &= ~IRQ_WORK_FLAGS;
38 return (struct irq_work *)next;
39}
40
41static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
42{
43 unsigned long next = (unsigned long)entry;
44 next |= flags;
45 return (struct irq_work *)next;
46}
47
48static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
49
50/*
51 * Claim the entry so that no one else will poke at it.
52 */
53static bool irq_work_claim(struct irq_work *entry)
54{
55 struct irq_work *next, *nflags;
56
57 do {
58 next = entry->next;
59 if ((unsigned long)next & IRQ_WORK_PENDING)
60 return false;
61 nflags = next_flags(next, IRQ_WORK_FLAGS);
62 } while (cmpxchg(&entry->next, next, nflags) != next);
63
64 return true;
65}
66
67
68void __weak arch_irq_work_raise(void)
69{
70 /*
71 * Lame architectures will get the timer tick callback
72 */
73}
74
75/*
76 * Queue the entry and raise the IPI if needed.
77 */
78static void __irq_work_queue(struct irq_work *entry)
79{
80 struct irq_work **head, *next;
81
82 head = &get_cpu_var(irq_work_list);
83
84 do {
85 next = *head;
86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (cmpxchg(head, next, entry) != next);
89
90 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry))
92 arch_irq_work_raise();
93
94 put_cpu_var(irq_work_list);
95}
96
97/*
98 * Enqueue the irq_work @entry, returns true on success, failure when the
99 * @entry was already enqueued by someone else.
100 *
101 * Can be re-enqueued while the callback is still in progress.
102 */
103bool irq_work_queue(struct irq_work *entry)
104{
105 if (!irq_work_claim(entry)) {
106 /*
107 * Already enqueued, can't do!
108 */
109 return false;
110 }
111
112 __irq_work_queue(entry);
113 return true;
114}
115EXPORT_SYMBOL_GPL(irq_work_queue);
116
117/*
118 * Run the irq_work entries on this cpu. Must be run from hardirq
119 * context with local IRQs disabled.
120 */
121void irq_work_run(void)
122{
123 struct irq_work *list, **head;
124
125 head = &__get_cpu_var(irq_work_list);
126 if (*head == NULL)
127 return;
128
129 BUG_ON(!in_irq());
130 BUG_ON(!irqs_disabled());
131
132 list = xchg(head, NULL);
133 while (list != NULL) {
134 struct irq_work *entry = list;
135
136 list = irq_work_next(list);
137
138 /*
139 * Clear the PENDING bit, after this point the @entry
140 * can be re-used.
141 */
142 entry->next = next_flags(NULL, IRQ_WORK_BUSY);
143 entry->func(entry);
144 /*
145 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile.
147 */
148 cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);
149 }
150}
151EXPORT_SYMBOL_GPL(irq_work_run);
152
153/*
154 * Synchronize against the irq_work @entry, ensures the entry is not
155 * currently in use.
156 */
157void irq_work_sync(struct irq_work *entry)
158{
159 WARN_ON_ONCE(irqs_disabled());
160
161 while (irq_work_is_set(entry, IRQ_WORK_BUSY))
162 cpu_relax();
163}
164EXPORT_SYMBOL_GPL(irq_work_sync);
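A hedged usage sketch for the facility above, not part of this patch: it relies only on what this file exposes (an entry with a func callback, irq_work_queue(), irq_work_sync()); the NMI wakeup scenario and all names are illustrative.

#include <linux/irq_work.h>

static void my_deferred_func(struct irq_work *entry)
{
	/*
	 * Runs from irq_work_run(), i.e. hardirq context with IRQs disabled,
	 * where things that are unsafe in NMI context (waking a task, ...)
	 * are allowed again.
	 */
}

static struct irq_work my_deferred = {
	.func = my_deferred_func,	/* .next stays NULL: the "free" state */
};

static void my_nmi_handler_tail(void)
{
	/*
	 * NMI-safe: claiming flips the flag bits in ->next with cmpxchg(),
	 * insertion cmpxchg()es the per-cpu list head. Returns false if the
	 * entry is already pending, which is fine - it will still run once.
	 */
	irq_work_queue(&my_deferred);
}

static void my_teardown(void)
{
	irq_work_sync(&my_deferred);	/* wait for a callback still in flight */
}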
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
new file mode 100644
index 000000000000..3b79bd938330
--- /dev/null
+++ b/kernel/jump_label.c
@@ -0,0 +1,484 @@
1/*
2 * jump label support
3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 *
6 */
7#include <linux/jump_label.h>
8#include <linux/memory.h>
9#include <linux/uaccess.h>
10#include <linux/module.h>
11#include <linux/list.h>
12#include <linux/jhash.h>
13#include <linux/slab.h>
14#include <linux/sort.h>
15#include <linux/err.h>
16
17#ifdef HAVE_JUMP_LABEL
18
19#define JUMP_LABEL_HASH_BITS 6
20#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
21static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
22
23/* mutex to protect coming/going of the jump_label table */
24static DEFINE_MUTEX(jump_label_mutex);
25
26struct jump_label_entry {
27 struct hlist_node hlist;
28 struct jump_entry *table;
29 int nr_entries;
30 /* hang modules off here */
31 struct hlist_head modules;
32 unsigned long key;
33};
34
35struct jump_label_module_entry {
36 struct hlist_node hlist;
37 struct jump_entry *table;
38 int nr_entries;
39 struct module *mod;
40};
41
42void jump_label_lock(void)
43{
44 mutex_lock(&jump_label_mutex);
45}
46
47void jump_label_unlock(void)
48{
49 mutex_unlock(&jump_label_mutex);
50}
51
52static int jump_label_cmp(const void *a, const void *b)
53{
54 const struct jump_entry *jea = a;
55 const struct jump_entry *jeb = b;
56
57 if (jea->key < jeb->key)
58 return -1;
59
60 if (jea->key > jeb->key)
61 return 1;
62
63 return 0;
64}
65
66static void
67sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
68{
69 unsigned long size;
70
71 size = (((unsigned long)stop - (unsigned long)start)
72 / sizeof(struct jump_entry));
73 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
74}
75
76static struct jump_label_entry *get_jump_label_entry(jump_label_t key)
77{
78 struct hlist_head *head;
79 struct hlist_node *node;
80 struct jump_label_entry *e;
81 u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
82
83 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
84 hlist_for_each_entry(e, node, head, hlist) {
85 if (key == e->key)
86 return e;
87 }
88 return NULL;
89}
90
91static struct jump_label_entry *
92add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
93{
94 struct hlist_head *head;
95 struct jump_label_entry *e;
96 u32 hash;
97
98 e = get_jump_label_entry(key);
99 if (e)
100 return ERR_PTR(-EEXIST);
101
102 e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
103 if (!e)
104 return ERR_PTR(-ENOMEM);
105
106 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
107 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
108 e->key = key;
109 e->table = table;
110 e->nr_entries = nr_entries;
111 INIT_HLIST_HEAD(&(e->modules));
112 hlist_add_head(&e->hlist, head);
113 return e;
114}
115
116static int
117build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop)
118{
119 struct jump_entry *iter, *iter_begin;
120 struct jump_label_entry *entry;
121 int count;
122
123 sort_jump_label_entries(start, stop);
124 iter = start;
125 while (iter < stop) {
126 entry = get_jump_label_entry(iter->key);
127 if (!entry) {
128 iter_begin = iter;
129 count = 0;
130 while ((iter < stop) &&
131 (iter->key == iter_begin->key)) {
132 iter++;
133 count++;
134 }
135 entry = add_jump_label_entry(iter_begin->key,
136 count, iter_begin);
137 if (IS_ERR(entry))
138 return PTR_ERR(entry);
139 } else {
140 WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
141 return -1;
142 }
143 }
144 return 0;
145}
146
147/***
148 * jump_label_update - update jump label text
149 * @key - key value associated with a jump label
150 * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
151 *
152 * Will enable/disable the jump for jump label @key, depending on the
153 * value of @type.
154 *
155 */
156
157void jump_label_update(unsigned long key, enum jump_label_type type)
158{
159 struct jump_entry *iter;
160 struct jump_label_entry *entry;
161 struct hlist_node *module_node;
162 struct jump_label_module_entry *e_module;
163 int count;
164
165 jump_label_lock();
166 entry = get_jump_label_entry((jump_label_t)key);
167 if (entry) {
168 count = entry->nr_entries;
169 iter = entry->table;
170 while (count--) {
171 if (kernel_text_address(iter->code))
172 arch_jump_label_transform(iter, type);
173 iter++;
174 }
175 /* enable/disable jump labels in modules */
176 hlist_for_each_entry(e_module, module_node, &(entry->modules),
177 hlist) {
178 count = e_module->nr_entries;
179 iter = e_module->table;
180 while (count--) {
181 if (iter->key &&
182 kernel_text_address(iter->code))
183 arch_jump_label_transform(iter, type);
184 iter++;
185 }
186 }
187 }
188 jump_label_unlock();
189}
190
191static int addr_conflict(struct jump_entry *entry, void *start, void *end)
192{
193 if (entry->code <= (unsigned long)end &&
194 entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
195 return 1;
196
197 return 0;
198}
199
200#ifdef CONFIG_MODULES
201
202static int module_conflict(void *start, void *end)
203{
204 struct hlist_head *head;
205 struct hlist_node *node, *node_next, *module_node, *module_node_next;
206 struct jump_label_entry *e;
207 struct jump_label_module_entry *e_module;
208 struct jump_entry *iter;
209 int i, count;
210 int conflict = 0;
211
212 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
213 head = &jump_label_table[i];
214 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
215 hlist_for_each_entry_safe(e_module, module_node,
216 module_node_next,
217 &(e->modules), hlist) {
218 count = e_module->nr_entries;
219 iter = e_module->table;
220 while (count--) {
221 if (addr_conflict(iter, start, end)) {
222 conflict = 1;
223 goto out;
224 }
225 iter++;
226 }
227 }
228 }
229 }
230out:
231 return conflict;
232}
233
234#endif
235
236/***
237 * jump_label_text_reserved - check if addr range is reserved
238 * @start: start text addr
239 * @end: end text addr
240 *
241 * checks if the text addr located between @start and @end
242 * overlaps with any of the jump label patch addresses. Code
243 * that wants to modify kernel text should first verify that
244 * it does not overlap with any of the jump label addresses.
245 * Caller must hold jump_label_mutex.
246 *
247 * returns 1 if there is an overlap, 0 otherwise
248 */
249int jump_label_text_reserved(void *start, void *end)
250{
251 struct jump_entry *iter;
252 struct jump_entry *iter_start = __start___jump_table;
253	struct jump_entry *iter_stop = __stop___jump_table;
254 int conflict = 0;
255
256 iter = iter_start;
257 while (iter < iter_stop) {
258 if (addr_conflict(iter, start, end)) {
259 conflict = 1;
260 goto out;
261 }
262 iter++;
263 }
264
265 /* now check modules */
266#ifdef CONFIG_MODULES
267 conflict = module_conflict(start, end);
268#endif
269out:
270 return conflict;
271}
272
273/*
274 * Not all archs need this.
275 */
276void __weak arch_jump_label_text_poke_early(jump_label_t addr)
277{
278}
279
280static __init int init_jump_label(void)
281{
282 int ret;
283 struct jump_entry *iter_start = __start___jump_table;
284 struct jump_entry *iter_stop = __stop___jump_table;
285 struct jump_entry *iter;
286
287 jump_label_lock();
288 ret = build_jump_label_hashtable(__start___jump_table,
289 __stop___jump_table);
290 iter = iter_start;
291 while (iter < iter_stop) {
292 arch_jump_label_text_poke_early(iter->code);
293 iter++;
294 }
295 jump_label_unlock();
296 return ret;
297}
298early_initcall(init_jump_label);
299
300#ifdef CONFIG_MODULES
301
302static struct jump_label_module_entry *
303add_jump_label_module_entry(struct jump_label_entry *entry,
304 struct jump_entry *iter_begin,
305 int count, struct module *mod)
306{
307 struct jump_label_module_entry *e;
308
309 e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL);
310 if (!e)
311 return ERR_PTR(-ENOMEM);
312 e->mod = mod;
313 e->nr_entries = count;
314 e->table = iter_begin;
315 hlist_add_head(&e->hlist, &entry->modules);
316 return e;
317}
318
319static int add_jump_label_module(struct module *mod)
320{
321 struct jump_entry *iter, *iter_begin;
322 struct jump_label_entry *entry;
323 struct jump_label_module_entry *module_entry;
324 int count;
325
326 /* if the module doesn't have jump label entries, just return */
327 if (!mod->num_jump_entries)
328 return 0;
329
330 sort_jump_label_entries(mod->jump_entries,
331 mod->jump_entries + mod->num_jump_entries);
332 iter = mod->jump_entries;
333 while (iter < mod->jump_entries + mod->num_jump_entries) {
334 entry = get_jump_label_entry(iter->key);
335 iter_begin = iter;
336 count = 0;
337 while ((iter < mod->jump_entries + mod->num_jump_entries) &&
338 (iter->key == iter_begin->key)) {
339 iter++;
340 count++;
341 }
342 if (!entry) {
343 entry = add_jump_label_entry(iter_begin->key, 0, NULL);
344 if (IS_ERR(entry))
345 return PTR_ERR(entry);
346 }
347 module_entry = add_jump_label_module_entry(entry, iter_begin,
348 count, mod);
349 if (IS_ERR(module_entry))
350 return PTR_ERR(module_entry);
351 }
352 return 0;
353}
354
355static void remove_jump_label_module(struct module *mod)
356{
357 struct hlist_head *head;
358 struct hlist_node *node, *node_next, *module_node, *module_node_next;
359 struct jump_label_entry *e;
360 struct jump_label_module_entry *e_module;
361 int i;
362
363 /* if the module doesn't have jump label entries, just return */
364 if (!mod->num_jump_entries)
365 return;
366
367 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
368 head = &jump_label_table[i];
369 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
370 hlist_for_each_entry_safe(e_module, module_node,
371 module_node_next,
372 &(e->modules), hlist) {
373 if (e_module->mod == mod) {
374 hlist_del(&e_module->hlist);
375 kfree(e_module);
376 }
377 }
378 if (hlist_empty(&e->modules) && (e->nr_entries == 0)) {
379 hlist_del(&e->hlist);
380 kfree(e);
381 }
382 }
383 }
384}
385
386static void remove_jump_label_module_init(struct module *mod)
387{
388 struct hlist_head *head;
389 struct hlist_node *node, *node_next, *module_node, *module_node_next;
390 struct jump_label_entry *e;
391 struct jump_label_module_entry *e_module;
392 struct jump_entry *iter;
393 int i, count;
394
395 /* if the module doesn't have jump label entries, just return */
396 if (!mod->num_jump_entries)
397 return;
398
399 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
400 head = &jump_label_table[i];
401 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
402 hlist_for_each_entry_safe(e_module, module_node,
403 module_node_next,
404 &(e->modules), hlist) {
405 if (e_module->mod != mod)
406 continue;
407 count = e_module->nr_entries;
408 iter = e_module->table;
409 while (count--) {
410 if (within_module_init(iter->code, mod))
411 iter->key = 0;
412 iter++;
413 }
414 }
415 }
416 }
417}
418
419static int
420jump_label_module_notify(struct notifier_block *self, unsigned long val,
421 void *data)
422{
423 struct module *mod = data;
424 int ret = 0;
425
426 switch (val) {
427 case MODULE_STATE_COMING:
428 jump_label_lock();
429 ret = add_jump_label_module(mod);
430 if (ret)
431 remove_jump_label_module(mod);
432 jump_label_unlock();
433 break;
434 case MODULE_STATE_GOING:
435 jump_label_lock();
436 remove_jump_label_module(mod);
437 jump_label_unlock();
438 break;
439 case MODULE_STATE_LIVE:
440 jump_label_lock();
441 remove_jump_label_module_init(mod);
442 jump_label_unlock();
443 break;
444 }
445 return ret;
446}
447
448/***
449 * jump_label_apply_nops - patch module jump labels with arch_get_jump_label_nop()
450 * @mod: module to patch
451 *
452 * Allow for run-time selection of the optimal nops. Before the module
453 * loads patch these with arch_get_jump_label_nop(), which is specified by
454 * the arch specific jump label code.
455 */
456void jump_label_apply_nops(struct module *mod)
457{
458 struct jump_entry *iter;
459
460 /* if the module doesn't have jump label entries, just return */
461 if (!mod->num_jump_entries)
462 return;
463
464 iter = mod->jump_entries;
465 while (iter < mod->jump_entries + mod->num_jump_entries) {
466 arch_jump_label_text_poke_early(iter->code);
467 iter++;
468 }
469}
470
471struct notifier_block jump_label_module_nb = {
472 .notifier_call = jump_label_module_notify,
473 .priority = 0,
474};
475
476static __init int init_jump_label_module(void)
477{
478 return register_module_notifier(&jump_label_module_nb);
479}
480early_initcall(init_jump_label_module);
481
482#endif /* CONFIG_MODULES */
483
484#endif
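To tie the pieces above together, a hedged sketch of how a user of this facility flips a key at run time; my_feature_key and my_feature_set() are illustrative only, and the branch-site side (the arch JUMP_LABEL() construct or an unlikely() fallback) is left to a comment because it is arch-specific and not defined in this file.

#include <linux/jump_label.h>
#include <linux/types.h>

static int my_feature_key;	/* its address is the jump_label_t key used above */

static void my_feature_set(bool on)
{
	/*
	 * Walks the hash bucket for &my_feature_key and patches every
	 * registered site: the built-in ones from the __jump_table section
	 * plus any module sites hanging off the per-key modules hlist.
	 */
	jump_label_update((unsigned long)&my_feature_key,
			  on ? JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
}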
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c0613f7d6730..b55045bc7563 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -816,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image,
816 816
817 ptr = kmap(page); 817 ptr = kmap(page);
818 /* Start with a clear page */ 818 /* Start with a clear page */
819 memset(ptr, 0, PAGE_SIZE); 819 clear_page(ptr);
820 ptr += maddr & ~PAGE_MASK; 820 ptr += maddr & ~PAGE_MASK;
821 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 821 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
822 if (mchunk > mbytes) 822 if (mchunk > mbytes)
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 4502604ecadf..01a0700e873f 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -365,8 +365,6 @@ static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
365 n = setup_sgl_buf(sgl, fifo->data + off, nents, l); 365 n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
366 n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); 366 n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
367 367
368 if (n)
369 sg_mark_end(sgl + n - 1);
370 return n; 368 return n;
371} 369}
372 370
@@ -503,6 +501,15 @@ unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
503} 501}
504EXPORT_SYMBOL(__kfifo_out_r); 502EXPORT_SYMBOL(__kfifo_out_r);
505 503
504void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize)
505{
506 unsigned int n;
507
508 n = __kfifo_peek_n(fifo, recsize);
509 fifo->out += n + recsize;
510}
511EXPORT_SYMBOL(__kfifo_skip_r);
512
506int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, 513int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
507 unsigned long len, unsigned int *copied, size_t recsize) 514 unsigned long len, unsigned int *copied, size_t recsize)
508{ 515{
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6e9b19667a8d..9cd0591c96a2 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -153,7 +153,9 @@ static int ____call_usermodehelper(void *data)
153 goto fail; 153 goto fail;
154 } 154 }
155 155
156 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); 156 retval = kernel_execve(sub_info->path,
157 (const char *const *)sub_info->argv,
158 (const char *const *)sub_info->envp);
157 159
158 /* Exec failed? */ 160 /* Exec failed? */
159fail: 161fail:
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 282035f3ae96..9737a76e106f 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,6 +47,7 @@
47#include <linux/memory.h> 47#include <linux/memory.h>
48#include <linux/ftrace.h> 48#include <linux/ftrace.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/jump_label.h>
50 51
51#include <asm-generic/sections.h> 52#include <asm-generic/sections.h>
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
@@ -73,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
73/* NOTE: change this value only with kprobe_mutex held */ 74/* NOTE: change this value only with kprobe_mutex held */
74static bool kprobes_all_disarmed; 75static bool kprobes_all_disarmed;
75 76
76static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 77/* This protects kprobe_table and optimizing_list */
78static DEFINE_MUTEX(kprobe_mutex);
77static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
78static struct { 80static struct {
79 spinlock_t lock ____cacheline_aligned_in_smp; 81 spinlock_t lock ____cacheline_aligned_in_smp;
@@ -399,7 +401,7 @@ static inline int kprobe_optready(struct kprobe *p)
399 * Return an optimized kprobe whose optimizing code replaces 401 * Return an optimized kprobe whose optimizing code replaces
400 * instructions including addr (exclude breakpoint). 402 * instructions including addr (exclude breakpoint).
401 */ 403 */
402struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) 404static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
403{ 405{
404 int i; 406 int i;
405 struct kprobe *p = NULL; 407 struct kprobe *p = NULL;
@@ -594,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
594} 596}
595 597
596#ifdef CONFIG_SYSCTL 598#ifdef CONFIG_SYSCTL
599/* This should be called with kprobe_mutex locked */
597static void __kprobes optimize_all_kprobes(void) 600static void __kprobes optimize_all_kprobes(void)
598{ 601{
599 struct hlist_head *head; 602 struct hlist_head *head;
@@ -606,17 +609,16 @@ static void __kprobes optimize_all_kprobes(void)
606 return; 609 return;
607 610
608 kprobes_allow_optimization = true; 611 kprobes_allow_optimization = true;
609 mutex_lock(&text_mutex);
610 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 612 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
611 head = &kprobe_table[i]; 613 head = &kprobe_table[i];
612 hlist_for_each_entry_rcu(p, node, head, hlist) 614 hlist_for_each_entry_rcu(p, node, head, hlist)
613 if (!kprobe_disabled(p)) 615 if (!kprobe_disabled(p))
614 optimize_kprobe(p); 616 optimize_kprobe(p);
615 } 617 }
616 mutex_unlock(&text_mutex);
617 printk(KERN_INFO "Kprobes globally optimized\n"); 618 printk(KERN_INFO "Kprobes globally optimized\n");
618} 619}
619 620
621/* This should be called with kprobe_mutex locked */
620static void __kprobes unoptimize_all_kprobes(void) 622static void __kprobes unoptimize_all_kprobes(void)
621{ 623{
622 struct hlist_head *head; 624 struct hlist_head *head;
@@ -831,6 +833,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
831 833
832void __kprobes kretprobe_hash_lock(struct task_struct *tsk, 834void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
833 struct hlist_head **head, unsigned long *flags) 835 struct hlist_head **head, unsigned long *flags)
836__acquires(hlist_lock)
834{ 837{
835 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 838 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
836 spinlock_t *hlist_lock; 839 spinlock_t *hlist_lock;
@@ -842,6 +845,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
842 845
843static void __kprobes kretprobe_table_lock(unsigned long hash, 846static void __kprobes kretprobe_table_lock(unsigned long hash,
844 unsigned long *flags) 847 unsigned long *flags)
848__acquires(hlist_lock)
845{ 849{
846 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 850 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
847 spin_lock_irqsave(hlist_lock, *flags); 851 spin_lock_irqsave(hlist_lock, *flags);
@@ -849,6 +853,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash,
849 853
850void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 854void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
851 unsigned long *flags) 855 unsigned long *flags)
856__releases(hlist_lock)
852{ 857{
853 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 858 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
854 spinlock_t *hlist_lock; 859 spinlock_t *hlist_lock;
@@ -857,7 +862,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
857 spin_unlock_irqrestore(hlist_lock, *flags); 862 spin_unlock_irqrestore(hlist_lock, *flags);
858} 863}
859 864
860void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) 865static void __kprobes kretprobe_table_unlock(unsigned long hash,
866 unsigned long *flags)
867__releases(hlist_lock)
861{ 868{
862 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 869 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
863 spin_unlock_irqrestore(hlist_lock, *flags); 870 spin_unlock_irqrestore(hlist_lock, *flags);
@@ -1138,13 +1145,13 @@ int __kprobes register_kprobe(struct kprobe *p)
1138 if (ret) 1145 if (ret)
1139 return ret; 1146 return ret;
1140 1147
1148 jump_label_lock();
1141 preempt_disable(); 1149 preempt_disable();
1142 if (!kernel_text_address((unsigned long) p->addr) || 1150 if (!kernel_text_address((unsigned long) p->addr) ||
1143 in_kprobes_functions((unsigned long) p->addr) || 1151 in_kprobes_functions((unsigned long) p->addr) ||
1144 ftrace_text_reserved(p->addr, p->addr)) { 1152 ftrace_text_reserved(p->addr, p->addr) ||
1145 preempt_enable(); 1153 jump_label_text_reserved(p->addr, p->addr))
1146 return -EINVAL; 1154 goto fail_with_jump_label;
1147 }
1148 1155
1149 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ 1156 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
1150 p->flags &= KPROBE_FLAG_DISABLED; 1157 p->flags &= KPROBE_FLAG_DISABLED;
@@ -1158,10 +1165,9 @@ int __kprobes register_kprobe(struct kprobe *p)
1158 * We must hold a refcount of the probed module while updating 1165 * We must hold a refcount of the probed module while updating
1159 * its code to prohibit unexpected unloading. 1166 * its code to prohibit unexpected unloading.
1160 */ 1167 */
1161 if (unlikely(!try_module_get(probed_mod))) { 1168 if (unlikely(!try_module_get(probed_mod)))
1162 preempt_enable(); 1169 goto fail_with_jump_label;
1163 return -EINVAL; 1170
1164 }
1165 /* 1171 /*
1166 * If the module freed .init.text, we couldn't insert 1172 * If the module freed .init.text, we couldn't insert
1167 * kprobes in there. 1173 * kprobes in there.
@@ -1169,16 +1175,18 @@ int __kprobes register_kprobe(struct kprobe *p)
1169 if (within_module_init((unsigned long)p->addr, probed_mod) && 1175 if (within_module_init((unsigned long)p->addr, probed_mod) &&
1170 probed_mod->state != MODULE_STATE_COMING) { 1176 probed_mod->state != MODULE_STATE_COMING) {
1171 module_put(probed_mod); 1177 module_put(probed_mod);
1172 preempt_enable(); 1178 goto fail_with_jump_label;
1173 return -EINVAL;
1174 } 1179 }
1175 } 1180 }
1176 preempt_enable(); 1181 preempt_enable();
1182 jump_label_unlock();
1177 1183
1178 p->nmissed = 0; 1184 p->nmissed = 0;
1179 INIT_LIST_HEAD(&p->list); 1185 INIT_LIST_HEAD(&p->list);
1180 mutex_lock(&kprobe_mutex); 1186 mutex_lock(&kprobe_mutex);
1181 1187
1188 jump_label_lock(); /* needed to call jump_label_text_reserved() */
1189
1182 get_online_cpus(); /* For avoiding text_mutex deadlock. */ 1190 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1183 mutex_lock(&text_mutex); 1191 mutex_lock(&text_mutex);
1184 1192
@@ -1206,12 +1214,18 @@ int __kprobes register_kprobe(struct kprobe *p)
1206out: 1214out:
1207 mutex_unlock(&text_mutex); 1215 mutex_unlock(&text_mutex);
1208 put_online_cpus(); 1216 put_online_cpus();
1217 jump_label_unlock();
1209 mutex_unlock(&kprobe_mutex); 1218 mutex_unlock(&kprobe_mutex);
1210 1219
1211 if (probed_mod) 1220 if (probed_mod)
1212 module_put(probed_mod); 1221 module_put(probed_mod);
1213 1222
1214 return ret; 1223 return ret;
1224
1225fail_with_jump_label:
1226 preempt_enable();
1227 jump_label_unlock();
1228 return -EINVAL;
1215} 1229}
1216EXPORT_SYMBOL_GPL(register_kprobe); 1230EXPORT_SYMBOL_GPL(register_kprobe);
1217 1231
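
Note on the register_kprobe() hunk: every early failure now funnels through the single fail_with_jump_label label so preempt_enable() and jump_label_unlock() are run exactly once, in reverse order of acquisition. A stripped-down model of that unwind shape (the lock_a/lock_b names are placeholders, not the kprobes API):

#include <errno.h>

/* Placeholders standing in for jump_label_lock()/preempt_disable()
 * and their counterparts. */
static void lock_a(void)   { }
static void unlock_a(void) { }
static void lock_b(void)   { }
static void unlock_b(void) { }

static int addr_is_valid(const void *addr)    { return addr != 0; }
static int addr_is_reserved(const void *addr) { (void)addr; return 0; }

static int register_probe_model(const void *addr)
{
        lock_a();               /* jump_label_lock() in the real code */
        lock_b();               /* preempt_disable() in the real code */

        if (!addr_is_valid(addr) || addr_is_reserved(addr))
                goto fail;      /* one exit path undoes both */

        unlock_b();
        unlock_a();
        return 0;

fail:
        unlock_b();
        unlock_a();
        return -EINVAL;
}
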
@@ -1339,18 +1353,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
1339 if (num <= 0) 1353 if (num <= 0)
1340 return -EINVAL; 1354 return -EINVAL;
1341 for (i = 0; i < num; i++) { 1355 for (i = 0; i < num; i++) {
1342 unsigned long addr; 1356 unsigned long addr, offset;
1343 jp = jps[i]; 1357 jp = jps[i];
1344 addr = arch_deref_entry_point(jp->entry); 1358 addr = arch_deref_entry_point(jp->entry);
1345 1359
1346 if (!kernel_text_address(addr)) 1360 /* Verify probepoint is a function entry point */
1347 ret = -EINVAL; 1361 if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
1348 else { 1362 offset == 0) {
1349 /* Todo: Verify probepoint is a function entry point */
1350 jp->kp.pre_handler = setjmp_pre_handler; 1363 jp->kp.pre_handler = setjmp_pre_handler;
1351 jp->kp.break_handler = longjmp_break_handler; 1364 jp->kp.break_handler = longjmp_break_handler;
1352 ret = register_kprobe(&jp->kp); 1365 ret = register_kprobe(&jp->kp);
1353 } 1366 } else
1367 ret = -EINVAL;
1368
1354 if (ret < 0) { 1369 if (ret < 0) {
1355 if (i > 0) 1370 if (i > 0)
1356 unregister_jprobes(jps, i); 1371 unregister_jprobes(jps, i);
@@ -1992,6 +2007,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
1992static const struct file_operations fops_kp = { 2007static const struct file_operations fops_kp = {
1993 .read = read_enabled_file_bool, 2008 .read = read_enabled_file_bool,
1994 .write = write_enabled_file_bool, 2009 .write = write_enabled_file_bool,
2010 .llseek = default_llseek,
1995}; 2011};
1996 2012
1997static int __kprobes debugfs_kprobe_init(void) 2013static int __kprobes debugfs_kprobe_init(void)
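
Note on the register_jprobes() hunk: an address is now accepted only when kallsyms resolves it with offset 0 inside a symbol, i.e. a real function entry point, replacing the old bare kernel_text_address() test. A small model of that gate over a toy symbol table (lookup_size_offset() is a stand-in for kallsyms_lookup_size_offset()):

#include <stddef.h>

struct sym { unsigned long start; size_t size; };

/* Toy symbol table standing in for kallsyms. */
static const struct sym syms[] = {
        { 0x1000, 0x40 },
        { 0x1040, 0x80 },
};

/* Model: nonzero if addr falls inside a known symbol, with its offset
 * from the symbol start returned in *offset. */
static int lookup_size_offset(unsigned long addr, size_t *size, size_t *offset)
{
        size_t i;

        for (i = 0; i < sizeof(syms) / sizeof(syms[0]); i++) {
                if (addr >= syms[i].start &&
                    addr < syms[i].start + syms[i].size) {
                        if (size)
                                *size = syms[i].size;
                        if (offset)
                                *offset = addr - syms[i].start;
                        return 1;
                }
        }
        return 0;
}

/* The register_jprobes() gate: only a function entry (offset 0) passes. */
static int is_function_entry(unsigned long addr)
{
        size_t offset;

        return lookup_size_offset(addr, NULL, &offset) && offset == 0;
}
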
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f2852a510232..42ba65dff7d9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
639 } 639 }
640#endif 640#endif
641 641
642 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
643 debug_locks_off();
644 printk(KERN_ERR
645 "BUG: looking up invalid subclass: %u\n", subclass);
646 printk(KERN_ERR
647 "turning off the locking correctness validator.\n");
648 dump_stack();
649 return NULL;
650 }
651
642 /* 652 /*
643 * Static locks do not have their class-keys yet - for them the key 653 * Static locks do not have their class-keys yet - for them the key
644 * is the lock object itself: 654 * is the lock object itself:
@@ -774,7 +784,9 @@ out_unlock_set:
774 raw_local_irq_restore(flags); 784 raw_local_irq_restore(flags);
775 785
776 if (!subclass || force) 786 if (!subclass || force)
777 lock->class_cache = class; 787 lock->class_cache[0] = class;
788 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
789 lock->class_cache[subclass] = class;
778 790
779 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) 791 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
780 return NULL; 792 return NULL;
@@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2679void lockdep_init_map(struct lockdep_map *lock, const char *name, 2691void lockdep_init_map(struct lockdep_map *lock, const char *name,
2680 struct lock_class_key *key, int subclass) 2692 struct lock_class_key *key, int subclass)
2681{ 2693{
2682 lock->class_cache = NULL; 2694 int i;
2695
2696 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
2697 lock->class_cache[i] = NULL;
2698
2683#ifdef CONFIG_LOCK_STAT 2699#ifdef CONFIG_LOCK_STAT
2684 lock->cpu = raw_smp_processor_id(); 2700 lock->cpu = raw_smp_processor_id();
2685#endif 2701#endif
@@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2739 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2755 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2740 return 0; 2756 return 0;
2741 2757
2742 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
2743 debug_locks_off();
2744 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
2745 printk("turning off the locking correctness validator.\n");
2746 dump_stack();
2747 return 0;
2748 }
2749
2750 if (lock->key == &__lockdep_no_validate__) 2758 if (lock->key == &__lockdep_no_validate__)
2751 check = 1; 2759 check = 1;
2752 2760
2753 if (!subclass) 2761 if (subclass < NR_LOCKDEP_CACHING_CLASSES)
2754 class = lock->class_cache; 2762 class = lock->class_cache[subclass];
2755 /* 2763 /*
2756 * Not cached yet or subclass? 2764 * Not cached?
2757 */ 2765 */
2758 if (unlikely(!class)) { 2766 if (unlikely(!class)) {
2759 class = register_lock_class(lock, subclass, 0); 2767 class = register_lock_class(lock, subclass, 0);
@@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
2918 return 1; 2926 return 1;
2919 2927
2920 if (hlock->references) { 2928 if (hlock->references) {
2921 struct lock_class *class = lock->class_cache; 2929 struct lock_class *class = lock->class_cache[0];
2922 2930
2923 if (!class) 2931 if (!class)
2924 class = look_up_lock_class(lock, 0); 2932 class = look_up_lock_class(lock, 0);
@@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
3559 if (list_empty(head)) 3567 if (list_empty(head))
3560 continue; 3568 continue;
3561 list_for_each_entry_safe(class, next, head, hash_entry) { 3569 list_for_each_entry_safe(class, next, head, hash_entry) {
3562 if (unlikely(class == lock->class_cache)) { 3570 int match = 0;
3571
3572 for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
3573 match |= class == lock->class_cache[j];
3574
3575 if (unlikely(match)) {
3563 if (debug_locks_off_graph_unlock()) 3576 if (debug_locks_off_graph_unlock())
3564 WARN_ON(1); 3577 WARN_ON(1);
3565 goto out_restore; 3578 goto out_restore;
@@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks);
3775 * Careful: only use this function if you are sure that 3788 * Careful: only use this function if you are sure that
3776 * the task cannot run in parallel! 3789 * the task cannot run in parallel!
3777 */ 3790 */
3778void __debug_show_held_locks(struct task_struct *task) 3791void debug_show_held_locks(struct task_struct *task)
3779{ 3792{
3780 if (unlikely(!debug_locks)) { 3793 if (unlikely(!debug_locks)) {
3781 printk("INFO: lockdep is turned off.\n"); 3794 printk("INFO: lockdep is turned off.\n");
@@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task)
3783 } 3796 }
3784 lockdep_print_held_locks(task); 3797 lockdep_print_held_locks(task);
3785} 3798}
3786EXPORT_SYMBOL_GPL(__debug_show_held_locks);
3787
3788void debug_show_held_locks(struct task_struct *task)
3789{
3790 __debug_show_held_locks(task);
3791}
3792EXPORT_SYMBOL_GPL(debug_show_held_locks); 3799EXPORT_SYMBOL_GPL(debug_show_held_locks);
3793 3800
3794void lockdep_sys_exit(void) 3801void lockdep_sys_exit(void)
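
Note on the lockdep.c hunks: lock->class_cache grows from a single pointer to a small per-subclass array, so lookups can hit the cache for any subclass that fits, and lockdep_reset_lock() must scan every slot. A compact user-space model of the cache shape (NR_CACHES and the helper names are stand-ins, not the lockdep definitions):

#include <stddef.h>

#define NR_CACHES 2             /* stand-in for NR_LOCKDEP_CACHING_CLASSES */

struct lock_class;              /* opaque for this sketch */

struct lock_map {
        struct lock_class *class_cache[NR_CACHES];
};

/* Slow path elided in this sketch; the real code registers a class. */
static struct lock_class *slow_lookup(struct lock_map *m, unsigned int subclass)
{
        (void)m; (void)subclass;
        return NULL;
}

/* Mirrors the __lock_acquire() change: try the per-subclass slot first,
 * fall back to the slow registration path on a miss. */
static struct lock_class *get_class(struct lock_map *m, unsigned int subclass)
{
        struct lock_class *class = NULL;

        if (subclass < NR_CACHES)
                class = m->class_cache[subclass];
        if (!class)
                class = slow_lookup(m, subclass);
        return class;
}

/* Mirrors lockdep_reset_lock(): a class matches if it sits in any slot. */
static int class_is_cached(struct lock_map *m, struct lock_class *class)
{
        int j, match = 0;

        for (j = 0; j < NR_CACHES; j++)
                match |= (m->class_cache[j] == class);
        return match;
}
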
diff --git a/kernel/module.c b/kernel/module.c
index d0b5f8db11b4..437a74a7524a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -55,6 +55,7 @@
55#include <linux/async.h> 55#include <linux/async.h>
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h>
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 61#include <trace/events/module.h>
@@ -1537,6 +1538,7 @@ static int __unlink_module(void *_mod)
1537{ 1538{
1538 struct module *mod = _mod; 1539 struct module *mod = _mod;
1539 list_del(&mod->list); 1540 list_del(&mod->list);
1541 module_bug_cleanup(mod);
1540 return 0; 1542 return 0;
1541} 1543}
1542 1544
@@ -2035,7 +2037,7 @@ static inline void layout_symtab(struct module *mod, struct load_info *info)
2035{ 2037{
2036} 2038}
2037 2039
2038static void add_kallsyms(struct module *mod, struct load_info *info) 2040static void add_kallsyms(struct module *mod, const struct load_info *info)
2039{ 2041{
2040} 2042}
2041#endif /* CONFIG_KALLSYMS */ 2043#endif /* CONFIG_KALLSYMS */
@@ -2308,6 +2310,11 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2308 sizeof(*mod->tracepoints), 2310 sizeof(*mod->tracepoints),
2309 &mod->num_tracepoints); 2311 &mod->num_tracepoints);
2310#endif 2312#endif
2313#ifdef HAVE_JUMP_LABEL
2314 mod->jump_entries = section_objs(info, "__jump_table",
2315 sizeof(*mod->jump_entries),
2316 &mod->num_jump_entries);
2317#endif
2311#ifdef CONFIG_EVENT_TRACING 2318#ifdef CONFIG_EVENT_TRACING
2312 mod->trace_events = section_objs(info, "_ftrace_events", 2319 mod->trace_events = section_objs(info, "_ftrace_events",
2313 sizeof(*mod->trace_events), 2320 sizeof(*mod->trace_events),
@@ -2625,6 +2632,7 @@ static struct module *load_module(void __user *umod,
2625 if (err < 0) 2632 if (err < 0)
2626 goto ddebug; 2633 goto ddebug;
2627 2634
2635 module_bug_finalize(info.hdr, info.sechdrs, mod);
2628 list_add_rcu(&mod->list, &modules); 2636 list_add_rcu(&mod->list, &modules);
2629 mutex_unlock(&module_mutex); 2637 mutex_unlock(&module_mutex);
2630 2638
@@ -2650,6 +2658,8 @@ static struct module *load_module(void __user *umod,
2650 mutex_lock(&module_mutex); 2658 mutex_lock(&module_mutex);
2651 /* Unlink carefully: kallsyms could be walking list. */ 2659 /* Unlink carefully: kallsyms could be walking list. */
2652 list_del_rcu(&mod->list); 2660 list_del_rcu(&mod->list);
2661 module_bug_cleanup(mod);
2662
2653 ddebug: 2663 ddebug:
2654 if (!mod->taints) 2664 if (!mod->taints)
2655 dynamic_debug_remove(info.debug); 2665 dynamic_debug_remove(info.debug);
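
Note on the module.c hunks: the loader now picks up a per-module "__jump_table" section with the same section_objs() pattern used for tracepoints, treating the named section as an array of fixed-size records whose count is the section size divided by the record size. A simplified model of that parsing over an in-memory section list (the struct and function names here are illustrative, not the module loader's):

#include <stddef.h>
#include <string.h>

struct section {
        const char *name;
        void *addr;
        size_t size;
};

/* Model of section_objs(): locate a named section and expose it as an
 * array of object_size-byte records, returning the element count. */
static void *section_objs_model(struct section *secs, size_t nsecs,
                                const char *name, size_t object_size,
                                size_t *num)
{
        size_t i;

        for (i = 0; i < nsecs; i++) {
                if (strcmp(secs[i].name, name) == 0) {
                        *num = secs[i].size / object_size;
                        return secs[i].addr;
                }
        }
        *num = 0;
        return NULL;
}
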
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 4c0b7b3e6d2e..200407c1502f 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -36,15 +36,6 @@
36# include <asm/mutex.h> 36# include <asm/mutex.h>
37#endif 37#endif
38 38
39/***
40 * mutex_init - initialize the mutex
41 * @lock: the mutex to be initialized
42 * @key: the lock_class_key for the class; used by mutex lock debugging
43 *
44 * Initialize the mutex to unlocked state.
45 *
46 * It is not allowed to initialize an already locked mutex.
47 */
48void 39void
49__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) 40__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
50{ 41{
@@ -68,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init);
68static __used noinline void __sched 59static __used noinline void __sched
69__mutex_lock_slowpath(atomic_t *lock_count); 60__mutex_lock_slowpath(atomic_t *lock_count);
70 61
71/*** 62/**
72 * mutex_lock - acquire the mutex 63 * mutex_lock - acquire the mutex
73 * @lock: the mutex to be acquired 64 * @lock: the mutex to be acquired
74 * 65 *
@@ -105,7 +96,7 @@ EXPORT_SYMBOL(mutex_lock);
105 96
106static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 97static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
107 98
108/*** 99/**
109 * mutex_unlock - release the mutex 100 * mutex_unlock - release the mutex
110 * @lock: the mutex to be released 101 * @lock: the mutex to be released
111 * 102 *
@@ -364,8 +355,8 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count);
364static noinline int __sched 355static noinline int __sched
365__mutex_lock_interruptible_slowpath(atomic_t *lock_count); 356__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
366 357
367/*** 358/**
368 * mutex_lock_interruptible - acquire the mutex, interruptable 359 * mutex_lock_interruptible - acquire the mutex, interruptible
369 * @lock: the mutex to be acquired 360 * @lock: the mutex to be acquired
370 * 361 *
371 * Lock the mutex like mutex_lock(), and return 0 if the mutex has 362 * Lock the mutex like mutex_lock(), and return 0 if the mutex has
@@ -456,15 +447,15 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
456 return prev == 1; 447 return prev == 1;
457} 448}
458 449
459/*** 450/**
460 * mutex_trylock - try acquire the mutex, without waiting 451 * mutex_trylock - try to acquire the mutex, without waiting
461 * @lock: the mutex to be acquired 452 * @lock: the mutex to be acquired
462 * 453 *
463 * Try to acquire the mutex atomically. Returns 1 if the mutex 454 * Try to acquire the mutex atomically. Returns 1 if the mutex
464 * has been acquired successfully, and 0 on contention. 455 * has been acquired successfully, and 0 on contention.
465 * 456 *
466 * NOTE: this function follows the spin_trylock() convention, so 457 * NOTE: this function follows the spin_trylock() convention, so
467 * it is negated to the down_trylock() return values! Be careful 458 * it is negated from the down_trylock() return values! Be careful
468 * about this when converting semaphore users to mutexes. 459 * about this when converting semaphore users to mutexes.
469 * 460 *
470 * This function must not be used in interrupt context. The 461 * This function must not be used in interrupt context. The
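
Note on the mutex.c hunks: these are documentation-only fixes. "/***" is not a kernel-doc opener, so the comments are normalized to "/**" (plus two wording fixes), which is the form the kernel-doc tooling extracts. A minimal, purely illustrative example of the expected comment shape (sample_trylock is not a real API and ignores atomicity):

struct sample_lock { int locked; };

/**
 * sample_trylock - try to acquire the sample lock, without waiting
 * @lock: the lock to be acquired
 *
 * Returns 1 if the lock was acquired, 0 on contention. Follows the
 * spin_trylock() convention, not the down_trylock() one.
 */
static int sample_trylock(struct sample_lock *lock)
{
        if (lock->locked)
                return 0;
        lock->locked = 1;
        return 1;
}
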
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 2a5dfec8efe0..2c98ad94ba0e 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -85,6 +85,14 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
85 return ERR_PTR(-EPERM); 85 return ERR_PTR(-EPERM);
86 if (!cgroup_is_descendant(cgroup, current)) 86 if (!cgroup_is_descendant(cgroup, current))
87 return ERR_PTR(-EPERM); 87 return ERR_PTR(-EPERM);
88 if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
89 printk("ns_cgroup can't be created with parent "
90 "'clone_children' set.\n");
91 return ERR_PTR(-EINVAL);
92 }
93
94 printk_once("ns_cgroup deprecated: consider using the "
95 "'clone_children' flag without the ns_cgroup.\n");
88 96
89 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); 97 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
90 if (!ns_cgroup) 98 if (!ns_cgroup)
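
Note on the ns_cgroup.c hunk: creation is refused under a parent with clone_children set, and a deprecation message is printed only once via printk_once(), which latches on a static flag. A tiny user-space model of that one-shot behavior (print_once and create_ns_group are placeholders):

#include <stdio.h>

/* Model of printk_once(): each expansion carries its own static flag,
 * so the message is emitted only the first time the path is hit. */
#define print_once(...)                          \
        do {                                     \
                static int __printed;            \
                if (!__printed) {                \
                        __printed = 1;           \
                        printf(__VA_ARGS__);     \
                }                                \
        } while (0)

static void create_ns_group(void)
{
        print_once("ns_cgroup deprecated: consider using the "
                   "'clone_children' flag without the ns_cgroup.\n");
}
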
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 403d1804b198..517d827f4982 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,24 +31,18 @@
31#include <linux/kernel_stat.h> 31#include <linux/kernel_stat.h>
32#include <linux/perf_event.h> 32#include <linux/perf_event.h>
33#include <linux/ftrace_event.h> 33#include <linux/ftrace_event.h>
34#include <linux/hw_breakpoint.h>
35 34
36#include <asm/irq_regs.h> 35#include <asm/irq_regs.h>
37 36
38/* 37atomic_t perf_task_events __read_mostly;
39 * Each CPU has a list of per CPU events:
40 */
41static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
42
43int perf_max_events __read_mostly = 1;
44static int perf_reserved_percpu __read_mostly;
45static int perf_overcommit __read_mostly = 1;
46
47static atomic_t nr_events __read_mostly;
48static atomic_t nr_mmap_events __read_mostly; 38static atomic_t nr_mmap_events __read_mostly;
49static atomic_t nr_comm_events __read_mostly; 39static atomic_t nr_comm_events __read_mostly;
50static atomic_t nr_task_events __read_mostly; 40static atomic_t nr_task_events __read_mostly;
51 41
42static LIST_HEAD(pmus);
43static DEFINE_MUTEX(pmus_lock);
44static struct srcu_struct pmus_srcu;
45
52/* 46/*
53 * perf event paranoia level: 47 * perf event paranoia level:
54 * -1 - not paranoid at all 48 * -1 - not paranoid at all
@@ -67,36 +61,43 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
67 61
68static atomic64_t perf_event_id; 62static atomic64_t perf_event_id;
69 63
70/* 64void __weak perf_event_print_debug(void) { }
71 * Lock for (sysadmin-configurable) event reservations:
72 */
73static DEFINE_SPINLOCK(perf_resource_lock);
74 65
75/* 66extern __weak const char *perf_pmu_name(void)
76 * Architecture provided APIs - weak aliases:
77 */
78extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
79{ 67{
80 return NULL; 68 return "pmu";
81} 69}
82 70
83void __weak hw_perf_disable(void) { barrier(); } 71void perf_pmu_disable(struct pmu *pmu)
84void __weak hw_perf_enable(void) { barrier(); } 72{
85 73 int *count = this_cpu_ptr(pmu->pmu_disable_count);
86void __weak perf_event_print_debug(void) { } 74 if (!(*count)++)
87 75 pmu->pmu_disable(pmu);
88static DEFINE_PER_CPU(int, perf_disable_count); 76}
89 77
90void perf_disable(void) 78void perf_pmu_enable(struct pmu *pmu)
91{ 79{
92 if (!__get_cpu_var(perf_disable_count)++) 80 int *count = this_cpu_ptr(pmu->pmu_disable_count);
93 hw_perf_disable(); 81 if (!--(*count))
82 pmu->pmu_enable(pmu);
94} 83}
95 84
96void perf_enable(void) 85static DEFINE_PER_CPU(struct list_head, rotation_list);
86
87/*
88 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
89 * because they're strictly cpu affine and rotate_start is called with IRQs
90 * disabled, while rotate_context is called from IRQ context.
91 */
92static void perf_pmu_rotate_start(struct pmu *pmu)
97{ 93{
98 if (!--__get_cpu_var(perf_disable_count)) 94 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
99 hw_perf_enable(); 95 struct list_head *head = &__get_cpu_var(rotation_list);
96
97 WARN_ON(!irqs_disabled());
98
99 if (list_empty(&cpuctx->rotation_list))
100 list_add(&cpuctx->rotation_list, head);
100} 101}
101 102
102static void get_ctx(struct perf_event_context *ctx) 103static void get_ctx(struct perf_event_context *ctx)
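
Note on this perf_event.c hunk: the global hw_perf_disable()/hw_perf_enable() pair is replaced by per-PMU perf_pmu_disable()/perf_pmu_enable() with a nesting counter, so only the outermost disable and the matching outermost enable touch the hardware. A single-threaded user-space model of the counting (the per-cpu aspect is elided; names are placeholders):

struct pmu_model {
        int disable_count;
        int hw_enabled;
};

/* Only the 0 -> 1 transition actually disables the hardware. */
static void pmu_disable(struct pmu_model *pmu)
{
        if (!pmu->disable_count++)
                pmu->hw_enabled = 0;
}

/* Only the matching 1 -> 0 transition re-enables it, so nested
 * disable/enable pairs are free. */
static void pmu_enable(struct pmu_model *pmu)
{
        if (!--pmu->disable_count)
                pmu->hw_enabled = 1;
}
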
@@ -151,13 +152,13 @@ static u64 primary_event_id(struct perf_event *event)
151 * the context could get moved to another task. 152 * the context could get moved to another task.
152 */ 153 */
153static struct perf_event_context * 154static struct perf_event_context *
154perf_lock_task_context(struct task_struct *task, unsigned long *flags) 155perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
155{ 156{
156 struct perf_event_context *ctx; 157 struct perf_event_context *ctx;
157 158
158 rcu_read_lock(); 159 rcu_read_lock();
159 retry: 160retry:
160 ctx = rcu_dereference(task->perf_event_ctxp); 161 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
161 if (ctx) { 162 if (ctx) {
162 /* 163 /*
163 * If this context is a clone of another, it might 164 * If this context is a clone of another, it might
@@ -170,7 +171,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
170 * can't get swapped on us any more. 171 * can't get swapped on us any more.
171 */ 172 */
172 raw_spin_lock_irqsave(&ctx->lock, *flags); 173 raw_spin_lock_irqsave(&ctx->lock, *flags);
173 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 174 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
174 raw_spin_unlock_irqrestore(&ctx->lock, *flags); 175 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
175 goto retry; 176 goto retry;
176 } 177 }
@@ -189,12 +190,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
189 * can't get swapped to another task. This also increments its 190 * can't get swapped to another task. This also increments its
190 * reference count so that the context can't get freed. 191 * reference count so that the context can't get freed.
191 */ 192 */
192static struct perf_event_context *perf_pin_task_context(struct task_struct *task) 193static struct perf_event_context *
194perf_pin_task_context(struct task_struct *task, int ctxn)
193{ 195{
194 struct perf_event_context *ctx; 196 struct perf_event_context *ctx;
195 unsigned long flags; 197 unsigned long flags;
196 198
197 ctx = perf_lock_task_context(task, &flags); 199 ctx = perf_lock_task_context(task, ctxn, &flags);
198 if (ctx) { 200 if (ctx) {
199 ++ctx->pin_count; 201 ++ctx->pin_count;
200 raw_spin_unlock_irqrestore(&ctx->lock, flags); 202 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -302,6 +304,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
302 } 304 }
303 305
304 list_add_rcu(&event->event_entry, &ctx->event_list); 306 list_add_rcu(&event->event_entry, &ctx->event_list);
307 if (!ctx->nr_events)
308 perf_pmu_rotate_start(ctx->pmu);
305 ctx->nr_events++; 309 ctx->nr_events++;
306 if (event->attr.inherit_stat) 310 if (event->attr.inherit_stat)
307 ctx->nr_stat++; 311 ctx->nr_stat++;
@@ -311,7 +315,12 @@ static void perf_group_attach(struct perf_event *event)
311{ 315{
312 struct perf_event *group_leader = event->group_leader; 316 struct perf_event *group_leader = event->group_leader;
313 317
314 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); 318 /*
319 * We can have double attach due to group movement in perf_event_open.
320 */
321 if (event->attach_state & PERF_ATTACH_GROUP)
322 return;
323
315 event->attach_state |= PERF_ATTACH_GROUP; 324 event->attach_state |= PERF_ATTACH_GROUP;
316 325
317 if (group_leader == event) 326 if (group_leader == event)
@@ -402,11 +411,31 @@ static void perf_group_detach(struct perf_event *event)
402 } 411 }
403} 412}
404 413
414static inline int
415event_filter_match(struct perf_event *event)
416{
417 return event->cpu == -1 || event->cpu == smp_processor_id();
418}
419
405static void 420static void
406event_sched_out(struct perf_event *event, 421event_sched_out(struct perf_event *event,
407 struct perf_cpu_context *cpuctx, 422 struct perf_cpu_context *cpuctx,
408 struct perf_event_context *ctx) 423 struct perf_event_context *ctx)
409{ 424{
425 u64 delta;
426 /*
427 * An event which could not be activated because of
428 * filter mismatch still needs to have its timings
429 * maintained, otherwise bogus information is return
430 * via read() for time_enabled, time_running:
431 */
432 if (event->state == PERF_EVENT_STATE_INACTIVE
433 && !event_filter_match(event)) {
434 delta = ctx->time - event->tstamp_stopped;
435 event->tstamp_running += delta;
436 event->tstamp_stopped = ctx->time;
437 }
438
410 if (event->state != PERF_EVENT_STATE_ACTIVE) 439 if (event->state != PERF_EVENT_STATE_ACTIVE)
411 return; 440 return;
412 441
@@ -416,7 +445,7 @@ event_sched_out(struct perf_event *event,
416 event->state = PERF_EVENT_STATE_OFF; 445 event->state = PERF_EVENT_STATE_OFF;
417 } 446 }
418 event->tstamp_stopped = ctx->time; 447 event->tstamp_stopped = ctx->time;
419 event->pmu->disable(event); 448 event->pmu->del(event, 0);
420 event->oncpu = -1; 449 event->oncpu = -1;
421 450
422 if (!is_software_event(event)) 451 if (!is_software_event(event))
@@ -432,9 +461,7 @@ group_sched_out(struct perf_event *group_event,
432 struct perf_event_context *ctx) 461 struct perf_event_context *ctx)
433{ 462{
434 struct perf_event *event; 463 struct perf_event *event;
435 464 int state = group_event->state;
436 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
437 return;
438 465
439 event_sched_out(group_event, cpuctx, ctx); 466 event_sched_out(group_event, cpuctx, ctx);
440 467
@@ -444,10 +471,16 @@ group_sched_out(struct perf_event *group_event,
444 list_for_each_entry(event, &group_event->sibling_list, group_entry) 471 list_for_each_entry(event, &group_event->sibling_list, group_entry)
445 event_sched_out(event, cpuctx, ctx); 472 event_sched_out(event, cpuctx, ctx);
446 473
447 if (group_event->attr.exclusive) 474 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
448 cpuctx->exclusive = 0; 475 cpuctx->exclusive = 0;
449} 476}
450 477
478static inline struct perf_cpu_context *
479__get_cpu_context(struct perf_event_context *ctx)
480{
481 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
482}
483
451/* 484/*
452 * Cross CPU call to remove a performance event 485 * Cross CPU call to remove a performance event
453 * 486 *
@@ -456,9 +489,9 @@ group_sched_out(struct perf_event *group_event,
456 */ 489 */
457static void __perf_event_remove_from_context(void *info) 490static void __perf_event_remove_from_context(void *info)
458{ 491{
459 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
460 struct perf_event *event = info; 492 struct perf_event *event = info;
461 struct perf_event_context *ctx = event->ctx; 493 struct perf_event_context *ctx = event->ctx;
494 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
462 495
463 /* 496 /*
464 * If this is a task context, we need to check whether it is 497 * If this is a task context, we need to check whether it is
@@ -469,27 +502,11 @@ static void __perf_event_remove_from_context(void *info)
469 return; 502 return;
470 503
471 raw_spin_lock(&ctx->lock); 504 raw_spin_lock(&ctx->lock);
472 /*
473 * Protect the list operation against NMI by disabling the
474 * events on a global level.
475 */
476 perf_disable();
477 505
478 event_sched_out(event, cpuctx, ctx); 506 event_sched_out(event, cpuctx, ctx);
479 507
480 list_del_event(event, ctx); 508 list_del_event(event, ctx);
481 509
482 if (!ctx->task) {
483 /*
484 * Allow more per task events with respect to the
485 * reservation:
486 */
487 cpuctx->max_pertask =
488 min(perf_max_events - ctx->nr_events,
489 perf_max_events - perf_reserved_percpu);
490 }
491
492 perf_enable();
493 raw_spin_unlock(&ctx->lock); 510 raw_spin_unlock(&ctx->lock);
494} 511}
495 512
@@ -554,8 +571,8 @@ retry:
554static void __perf_event_disable(void *info) 571static void __perf_event_disable(void *info)
555{ 572{
556 struct perf_event *event = info; 573 struct perf_event *event = info;
557 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
558 struct perf_event_context *ctx = event->ctx; 574 struct perf_event_context *ctx = event->ctx;
575 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
559 576
560 /* 577 /*
561 * If this is a per-task event, need to check whether this 578 * If this is a per-task event, need to check whether this
@@ -610,7 +627,7 @@ void perf_event_disable(struct perf_event *event)
610 return; 627 return;
611 } 628 }
612 629
613 retry: 630retry:
614 task_oncpu_function_call(task, __perf_event_disable, event); 631 task_oncpu_function_call(task, __perf_event_disable, event);
615 632
616 raw_spin_lock_irq(&ctx->lock); 633 raw_spin_lock_irq(&ctx->lock);
@@ -649,7 +666,7 @@ event_sched_in(struct perf_event *event,
649 */ 666 */
650 smp_wmb(); 667 smp_wmb();
651 668
652 if (event->pmu->enable(event)) { 669 if (event->pmu->add(event, PERF_EF_START)) {
653 event->state = PERF_EVENT_STATE_INACTIVE; 670 event->state = PERF_EVENT_STATE_INACTIVE;
654 event->oncpu = -1; 671 event->oncpu = -1;
655 return -EAGAIN; 672 return -EAGAIN;
@@ -673,22 +690,17 @@ group_sched_in(struct perf_event *group_event,
673 struct perf_event_context *ctx) 690 struct perf_event_context *ctx)
674{ 691{
675 struct perf_event *event, *partial_group = NULL; 692 struct perf_event *event, *partial_group = NULL;
676 const struct pmu *pmu = group_event->pmu; 693 struct pmu *pmu = group_event->pmu;
677 bool txn = false; 694 u64 now = ctx->time;
695 bool simulate = false;
678 696
679 if (group_event->state == PERF_EVENT_STATE_OFF) 697 if (group_event->state == PERF_EVENT_STATE_OFF)
680 return 0; 698 return 0;
681 699
682 /* Check if group transaction availabe */ 700 pmu->start_txn(pmu);
683 if (pmu->start_txn)
684 txn = true;
685
686 if (txn)
687 pmu->start_txn(pmu);
688 701
689 if (event_sched_in(group_event, cpuctx, ctx)) { 702 if (event_sched_in(group_event, cpuctx, ctx)) {
690 if (txn) 703 pmu->cancel_txn(pmu);
691 pmu->cancel_txn(pmu);
692 return -EAGAIN; 704 return -EAGAIN;
693 } 705 }
694 706
@@ -702,23 +714,38 @@ group_sched_in(struct perf_event *group_event,
702 } 714 }
703 } 715 }
704 716
705 if (!txn || !pmu->commit_txn(pmu)) 717 if (!pmu->commit_txn(pmu))
706 return 0; 718 return 0;
707 719
708group_error: 720group_error:
709 /* 721 /*
710 * Groups can be scheduled in as one unit only, so undo any 722 * Groups can be scheduled in as one unit only, so undo any
711 * partial group before returning: 723 * partial group before returning:
724 * The events up to the failed event are scheduled out normally,
725 * tstamp_stopped will be updated.
726 *
727 * The failed events and the remaining siblings need to have
728 * their timings updated as if they had gone thru event_sched_in()
729 * and event_sched_out(). This is required to get consistent timings
730 * across the group. This also takes care of the case where the group
731 * could never be scheduled by ensuring tstamp_stopped is set to mark
732 * the time the event was actually stopped, such that time delta
733 * calculation in update_event_times() is correct.
712 */ 734 */
713 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 735 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
714 if (event == partial_group) 736 if (event == partial_group)
715 break; 737 simulate = true;
716 event_sched_out(event, cpuctx, ctx); 738
739 if (simulate) {
740 event->tstamp_running += now - event->tstamp_stopped;
741 event->tstamp_stopped = now;
742 } else {
743 event_sched_out(event, cpuctx, ctx);
744 }
717 } 745 }
718 event_sched_out(group_event, cpuctx, ctx); 746 event_sched_out(group_event, cpuctx, ctx);
719 747
720 if (txn) 748 pmu->cancel_txn(pmu);
721 pmu->cancel_txn(pmu);
722 749
723 return -EAGAIN; 750 return -EAGAIN;
724} 751}
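
Note on the group_sched_in() hunks: every pmu is now assumed to provide start_txn/commit_txn/cancel_txn (the old "transaction available?" test is gone). The leader and siblings are added inside one transaction; commit either accepts the whole group or the members already added are backed out and the transaction is cancelled. The timing "simulate" bookkeeping is omitted in this condensed user-space model of the control flow (all callbacks here are placeholders):

#include <errno.h>

struct txn_pmu {
        void (*start_txn)(struct txn_pmu *);
        int  (*commit_txn)(struct txn_pmu *);   /* 0 on success */
        void (*cancel_txn)(struct txn_pmu *);
};

/* add_one() stands in for event_sched_in(); returns 0 on success. */
static int group_sched_in_model(struct txn_pmu *pmu,
                                int (*add_one)(int idx),
                                void (*back_out)(int idx),
                                int nr_events)
{
        int i;

        pmu->start_txn(pmu);

        for (i = 0; i < nr_events; i++) {
                if (add_one(i))
                        goto fail;
        }

        if (!pmu->commit_txn(pmu))
                return 0;               /* all counters fit */

fail:
        /* Undo the members that made it in, then abort the transaction. */
        while (i-- > 0)
                back_out(i);
        pmu->cancel_txn(pmu);
        return -EAGAIN;
}
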
@@ -771,10 +798,10 @@ static void add_event_to_ctx(struct perf_event *event,
771 */ 798 */
772static void __perf_install_in_context(void *info) 799static void __perf_install_in_context(void *info)
773{ 800{
774 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
775 struct perf_event *event = info; 801 struct perf_event *event = info;
776 struct perf_event_context *ctx = event->ctx; 802 struct perf_event_context *ctx = event->ctx;
777 struct perf_event *leader = event->group_leader; 803 struct perf_event *leader = event->group_leader;
804 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
778 int err; 805 int err;
779 806
780 /* 807 /*
@@ -794,12 +821,6 @@ static void __perf_install_in_context(void *info)
794 ctx->is_active = 1; 821 ctx->is_active = 1;
795 update_context_time(ctx); 822 update_context_time(ctx);
796 823
797 /*
798 * Protect the list operation against NMI by disabling the
799 * events on a global level. NOP for non NMI based events.
800 */
801 perf_disable();
802
803 add_event_to_ctx(event, ctx); 824 add_event_to_ctx(event, ctx);
804 825
805 if (event->cpu != -1 && event->cpu != smp_processor_id()) 826 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -837,12 +858,7 @@ static void __perf_install_in_context(void *info)
837 } 858 }
838 } 859 }
839 860
840 if (!err && !ctx->task && cpuctx->max_pertask) 861unlock:
841 cpuctx->max_pertask--;
842
843 unlock:
844 perf_enable();
845
846 raw_spin_unlock(&ctx->lock); 862 raw_spin_unlock(&ctx->lock);
847} 863}
848 864
@@ -865,6 +881,8 @@ perf_install_in_context(struct perf_event_context *ctx,
865{ 881{
866 struct task_struct *task = ctx->task; 882 struct task_struct *task = ctx->task;
867 883
884 event->ctx = ctx;
885
868 if (!task) { 886 if (!task) {
869 /* 887 /*
870 * Per cpu events are installed via an smp call and 888 * Per cpu events are installed via an smp call and
@@ -913,10 +931,12 @@ static void __perf_event_mark_enabled(struct perf_event *event,
913 931
914 event->state = PERF_EVENT_STATE_INACTIVE; 932 event->state = PERF_EVENT_STATE_INACTIVE;
915 event->tstamp_enabled = ctx->time - event->total_time_enabled; 933 event->tstamp_enabled = ctx->time - event->total_time_enabled;
916 list_for_each_entry(sub, &event->sibling_list, group_entry) 934 list_for_each_entry(sub, &event->sibling_list, group_entry) {
917 if (sub->state >= PERF_EVENT_STATE_INACTIVE) 935 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
918 sub->tstamp_enabled = 936 sub->tstamp_enabled =
919 ctx->time - sub->total_time_enabled; 937 ctx->time - sub->total_time_enabled;
938 }
939 }
920} 940}
921 941
922/* 942/*
@@ -925,9 +945,9 @@ static void __perf_event_mark_enabled(struct perf_event *event,
925static void __perf_event_enable(void *info) 945static void __perf_event_enable(void *info)
926{ 946{
927 struct perf_event *event = info; 947 struct perf_event *event = info;
928 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
929 struct perf_event_context *ctx = event->ctx; 948 struct perf_event_context *ctx = event->ctx;
930 struct perf_event *leader = event->group_leader; 949 struct perf_event *leader = event->group_leader;
950 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
931 int err; 951 int err;
932 952
933 /* 953 /*
@@ -961,12 +981,10 @@ static void __perf_event_enable(void *info)
961 if (!group_can_go_on(event, cpuctx, 1)) { 981 if (!group_can_go_on(event, cpuctx, 1)) {
962 err = -EEXIST; 982 err = -EEXIST;
963 } else { 983 } else {
964 perf_disable();
965 if (event == leader) 984 if (event == leader)
966 err = group_sched_in(event, cpuctx, ctx); 985 err = group_sched_in(event, cpuctx, ctx);
967 else 986 else
968 err = event_sched_in(event, cpuctx, ctx); 987 err = event_sched_in(event, cpuctx, ctx);
969 perf_enable();
970 } 988 }
971 989
972 if (err) { 990 if (err) {
@@ -982,7 +1000,7 @@ static void __perf_event_enable(void *info)
982 } 1000 }
983 } 1001 }
984 1002
985 unlock: 1003unlock:
986 raw_spin_unlock(&ctx->lock); 1004 raw_spin_unlock(&ctx->lock);
987} 1005}
988 1006
@@ -1023,7 +1041,7 @@ void perf_event_enable(struct perf_event *event)
1023 if (event->state == PERF_EVENT_STATE_ERROR) 1041 if (event->state == PERF_EVENT_STATE_ERROR)
1024 event->state = PERF_EVENT_STATE_OFF; 1042 event->state = PERF_EVENT_STATE_OFF;
1025 1043
1026 retry: 1044retry:
1027 raw_spin_unlock_irq(&ctx->lock); 1045 raw_spin_unlock_irq(&ctx->lock);
1028 task_oncpu_function_call(task, __perf_event_enable, event); 1046 task_oncpu_function_call(task, __perf_event_enable, event);
1029 1047
@@ -1043,7 +1061,7 @@ void perf_event_enable(struct perf_event *event)
1043 if (event->state == PERF_EVENT_STATE_OFF) 1061 if (event->state == PERF_EVENT_STATE_OFF)
1044 __perf_event_mark_enabled(event, ctx); 1062 __perf_event_mark_enabled(event, ctx);
1045 1063
1046 out: 1064out:
1047 raw_spin_unlock_irq(&ctx->lock); 1065 raw_spin_unlock_irq(&ctx->lock);
1048} 1066}
1049 1067
@@ -1074,26 +1092,26 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1074 struct perf_event *event; 1092 struct perf_event *event;
1075 1093
1076 raw_spin_lock(&ctx->lock); 1094 raw_spin_lock(&ctx->lock);
1095 perf_pmu_disable(ctx->pmu);
1077 ctx->is_active = 0; 1096 ctx->is_active = 0;
1078 if (likely(!ctx->nr_events)) 1097 if (likely(!ctx->nr_events))
1079 goto out; 1098 goto out;
1080 update_context_time(ctx); 1099 update_context_time(ctx);
1081 1100
1082 perf_disable();
1083 if (!ctx->nr_active) 1101 if (!ctx->nr_active)
1084 goto out_enable; 1102 goto out;
1085 1103
1086 if (event_type & EVENT_PINNED) 1104 if (event_type & EVENT_PINNED) {
1087 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 1105 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1088 group_sched_out(event, cpuctx, ctx); 1106 group_sched_out(event, cpuctx, ctx);
1107 }
1089 1108
1090 if (event_type & EVENT_FLEXIBLE) 1109 if (event_type & EVENT_FLEXIBLE) {
1091 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 1110 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1092 group_sched_out(event, cpuctx, ctx); 1111 group_sched_out(event, cpuctx, ctx);
1093 1112 }
1094 out_enable: 1113out:
1095 perf_enable(); 1114 perf_pmu_enable(ctx->pmu);
1096 out:
1097 raw_spin_unlock(&ctx->lock); 1115 raw_spin_unlock(&ctx->lock);
1098} 1116}
1099 1117
@@ -1191,34 +1209,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1191 } 1209 }
1192} 1210}
1193 1211
1194/* 1212void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1195 * Called from scheduler to remove the events of the current task, 1213 struct task_struct *next)
1196 * with interrupts disabled.
1197 *
1198 * We stop each event and update the event value in event->count.
1199 *
1200 * This does not protect us against NMI, but disable()
1201 * sets the disabled bit in the control field of event _before_
1202 * accessing the event control register. If a NMI hits, then it will
1203 * not restart the event.
1204 */
1205void perf_event_task_sched_out(struct task_struct *task,
1206 struct task_struct *next)
1207{ 1214{
1208 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1215 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1209 struct perf_event_context *ctx = task->perf_event_ctxp;
1210 struct perf_event_context *next_ctx; 1216 struct perf_event_context *next_ctx;
1211 struct perf_event_context *parent; 1217 struct perf_event_context *parent;
1218 struct perf_cpu_context *cpuctx;
1212 int do_switch = 1; 1219 int do_switch = 1;
1213 1220
1214 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); 1221 if (likely(!ctx))
1222 return;
1215 1223
1216 if (likely(!ctx || !cpuctx->task_ctx)) 1224 cpuctx = __get_cpu_context(ctx);
1225 if (!cpuctx->task_ctx)
1217 return; 1226 return;
1218 1227
1219 rcu_read_lock(); 1228 rcu_read_lock();
1220 parent = rcu_dereference(ctx->parent_ctx); 1229 parent = rcu_dereference(ctx->parent_ctx);
1221 next_ctx = next->perf_event_ctxp; 1230 next_ctx = next->perf_event_ctxp[ctxn];
1222 if (parent && next_ctx && 1231 if (parent && next_ctx &&
1223 rcu_dereference(next_ctx->parent_ctx) == parent) { 1232 rcu_dereference(next_ctx->parent_ctx) == parent) {
1224 /* 1233 /*
@@ -1237,8 +1246,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1237 * XXX do we need a memory barrier of sorts 1246 * XXX do we need a memory barrier of sorts
1238 * wrt to rcu_dereference() of perf_event_ctxp 1247 * wrt to rcu_dereference() of perf_event_ctxp
1239 */ 1248 */
1240 task->perf_event_ctxp = next_ctx; 1249 task->perf_event_ctxp[ctxn] = next_ctx;
1241 next->perf_event_ctxp = ctx; 1250 next->perf_event_ctxp[ctxn] = ctx;
1242 ctx->task = next; 1251 ctx->task = next;
1243 next_ctx->task = task; 1252 next_ctx->task = task;
1244 do_switch = 0; 1253 do_switch = 0;
@@ -1256,10 +1265,35 @@ void perf_event_task_sched_out(struct task_struct *task,
1256 } 1265 }
1257} 1266}
1258 1267
1268#define for_each_task_context_nr(ctxn) \
1269 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
1270
1271/*
1272 * Called from scheduler to remove the events of the current task,
1273 * with interrupts disabled.
1274 *
1275 * We stop each event and update the event value in event->count.
1276 *
1277 * This does not protect us against NMI, but disable()
1278 * sets the disabled bit in the control field of event _before_
1279 * accessing the event control register. If a NMI hits, then it will
1280 * not restart the event.
1281 */
1282void __perf_event_task_sched_out(struct task_struct *task,
1283 struct task_struct *next)
1284{
1285 int ctxn;
1286
1287 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1288
1289 for_each_task_context_nr(ctxn)
1290 perf_event_context_sched_out(task, ctxn, next);
1291}
1292
1259static void task_ctx_sched_out(struct perf_event_context *ctx, 1293static void task_ctx_sched_out(struct perf_event_context *ctx,
1260 enum event_type_t event_type) 1294 enum event_type_t event_type)
1261{ 1295{
1262 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1296 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1263 1297
1264 if (!cpuctx->task_ctx) 1298 if (!cpuctx->task_ctx)
1265 return; 1299 return;
@@ -1274,14 +1308,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
1274/* 1308/*
1275 * Called with IRQs disabled 1309 * Called with IRQs disabled
1276 */ 1310 */
1277static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1278{
1279 task_ctx_sched_out(ctx, EVENT_ALL);
1280}
1281
1282/*
1283 * Called with IRQs disabled
1284 */
1285static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 1311static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1286 enum event_type_t event_type) 1312 enum event_type_t event_type)
1287{ 1313{
@@ -1332,9 +1358,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1332 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1358 if (event->cpu != -1 && event->cpu != smp_processor_id())
1333 continue; 1359 continue;
1334 1360
1335 if (group_can_go_on(event, cpuctx, can_add_hw)) 1361 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1336 if (group_sched_in(event, cpuctx, ctx)) 1362 if (group_sched_in(event, cpuctx, ctx))
1337 can_add_hw = 0; 1363 can_add_hw = 0;
1364 }
1338 } 1365 }
1339} 1366}
1340 1367
@@ -1350,8 +1377,6 @@ ctx_sched_in(struct perf_event_context *ctx,
1350 1377
1351 ctx->timestamp = perf_clock(); 1378 ctx->timestamp = perf_clock();
1352 1379
1353 perf_disable();
1354
1355 /* 1380 /*
1356 * First go through the list and put on any pinned groups 1381 * First go through the list and put on any pinned groups
1357 * in order to give them the best chance of going on. 1382 * in order to give them the best chance of going on.
@@ -1363,8 +1388,7 @@ ctx_sched_in(struct perf_event_context *ctx,
1363 if (event_type & EVENT_FLEXIBLE) 1388 if (event_type & EVENT_FLEXIBLE)
1364 ctx_flexible_sched_in(ctx, cpuctx); 1389 ctx_flexible_sched_in(ctx, cpuctx);
1365 1390
1366 perf_enable(); 1391out:
1367 out:
1368 raw_spin_unlock(&ctx->lock); 1392 raw_spin_unlock(&ctx->lock);
1369} 1393}
1370 1394
@@ -1376,43 +1400,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1376 ctx_sched_in(ctx, cpuctx, event_type); 1400 ctx_sched_in(ctx, cpuctx, event_type);
1377} 1401}
1378 1402
1379static void task_ctx_sched_in(struct task_struct *task, 1403static void task_ctx_sched_in(struct perf_event_context *ctx,
1380 enum event_type_t event_type) 1404 enum event_type_t event_type)
1381{ 1405{
1382 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1406 struct perf_cpu_context *cpuctx;
1383 struct perf_event_context *ctx = task->perf_event_ctxp;
1384 1407
1385 if (likely(!ctx)) 1408 cpuctx = __get_cpu_context(ctx);
1386 return;
1387 if (cpuctx->task_ctx == ctx) 1409 if (cpuctx->task_ctx == ctx)
1388 return; 1410 return;
1411
1389 ctx_sched_in(ctx, cpuctx, event_type); 1412 ctx_sched_in(ctx, cpuctx, event_type);
1390 cpuctx->task_ctx = ctx; 1413 cpuctx->task_ctx = ctx;
1391} 1414}
1392/*
1393 * Called from scheduler to add the events of the current task
1394 * with interrupts disabled.
1395 *
1396 * We restore the event value and then enable it.
1397 *
1398 * This does not protect us against NMI, but enable()
1399 * sets the enabled bit in the control field of event _before_
1400 * accessing the event control register. If a NMI hits, then it will
1401 * keep the event running.
1402 */
1403void perf_event_task_sched_in(struct task_struct *task)
1404{
1405 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1406 struct perf_event_context *ctx = task->perf_event_ctxp;
1407 1415
1408 if (likely(!ctx)) 1416void perf_event_context_sched_in(struct perf_event_context *ctx)
1409 return; 1417{
1418 struct perf_cpu_context *cpuctx;
1410 1419
1420 cpuctx = __get_cpu_context(ctx);
1411 if (cpuctx->task_ctx == ctx) 1421 if (cpuctx->task_ctx == ctx)
1412 return; 1422 return;
1413 1423
1414 perf_disable(); 1424 perf_pmu_disable(ctx->pmu);
1415
1416 /* 1425 /*
1417 * We want to keep the following priority order: 1426 * We want to keep the following priority order:
1418 * cpu pinned (that don't need to move), task pinned, 1427 * cpu pinned (that don't need to move), task pinned,
@@ -1426,7 +1435,37 @@ void perf_event_task_sched_in(struct task_struct *task)
1426 1435
1427 cpuctx->task_ctx = ctx; 1436 cpuctx->task_ctx = ctx;
1428 1437
1429 perf_enable(); 1438 /*
1439 * Since these rotations are per-cpu, we need to ensure the
1440 * cpu-context we got scheduled on is actually rotating.
1441 */
1442 perf_pmu_rotate_start(ctx->pmu);
1443 perf_pmu_enable(ctx->pmu);
1444}
1445
1446/*
1447 * Called from scheduler to add the events of the current task
1448 * with interrupts disabled.
1449 *
1450 * We restore the event value and then enable it.
1451 *
1452 * This does not protect us against NMI, but enable()
1453 * sets the enabled bit in the control field of event _before_
1454 * accessing the event control register. If a NMI hits, then it will
1455 * keep the event running.
1456 */
1457void __perf_event_task_sched_in(struct task_struct *task)
1458{
1459 struct perf_event_context *ctx;
1460 int ctxn;
1461
1462 for_each_task_context_nr(ctxn) {
1463 ctx = task->perf_event_ctxp[ctxn];
1464 if (likely(!ctx))
1465 continue;
1466
1467 perf_event_context_sched_in(ctx);
1468 }
1430} 1469}
1431 1470
1432#define MAX_INTERRUPTS (~0ULL) 1471#define MAX_INTERRUPTS (~0ULL)
@@ -1506,22 +1545,6 @@ do { \
1506 return div64_u64(dividend, divisor); 1545 return div64_u64(dividend, divisor);
1507} 1546}
1508 1547
1509static void perf_event_stop(struct perf_event *event)
1510{
1511 if (!event->pmu->stop)
1512 return event->pmu->disable(event);
1513
1514 return event->pmu->stop(event);
1515}
1516
1517static int perf_event_start(struct perf_event *event)
1518{
1519 if (!event->pmu->start)
1520 return event->pmu->enable(event);
1521
1522 return event->pmu->start(event);
1523}
1524
1525static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 1548static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1526{ 1549{
1527 struct hw_perf_event *hwc = &event->hw; 1550 struct hw_perf_event *hwc = &event->hw;
@@ -1541,15 +1564,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1541 hwc->sample_period = sample_period; 1564 hwc->sample_period = sample_period;
1542 1565
1543 if (local64_read(&hwc->period_left) > 8*sample_period) { 1566 if (local64_read(&hwc->period_left) > 8*sample_period) {
1544 perf_disable(); 1567 event->pmu->stop(event, PERF_EF_UPDATE);
1545 perf_event_stop(event);
1546 local64_set(&hwc->period_left, 0); 1568 local64_set(&hwc->period_left, 0);
1547 perf_event_start(event); 1569 event->pmu->start(event, PERF_EF_RELOAD);
1548 perf_enable();
1549 } 1570 }
1550} 1571}
1551 1572
1552static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1573static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1553{ 1574{
1554 struct perf_event *event; 1575 struct perf_event *event;
1555 struct hw_perf_event *hwc; 1576 struct hw_perf_event *hwc;
@@ -1574,23 +1595,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1574 */ 1595 */
1575 if (interrupts == MAX_INTERRUPTS) { 1596 if (interrupts == MAX_INTERRUPTS) {
1576 perf_log_throttle(event, 1); 1597 perf_log_throttle(event, 1);
1577 perf_disable(); 1598 event->pmu->start(event, 0);
1578 event->pmu->unthrottle(event);
1579 perf_enable();
1580 } 1599 }
1581 1600
1582 if (!event->attr.freq || !event->attr.sample_freq) 1601 if (!event->attr.freq || !event->attr.sample_freq)
1583 continue; 1602 continue;
1584 1603
1585 perf_disable();
1586 event->pmu->read(event); 1604 event->pmu->read(event);
1587 now = local64_read(&event->count); 1605 now = local64_read(&event->count);
1588 delta = now - hwc->freq_count_stamp; 1606 delta = now - hwc->freq_count_stamp;
1589 hwc->freq_count_stamp = now; 1607 hwc->freq_count_stamp = now;
1590 1608
1591 if (delta > 0) 1609 if (delta > 0)
1592 perf_adjust_period(event, TICK_NSEC, delta); 1610 perf_adjust_period(event, period, delta);
1593 perf_enable();
1594 } 1611 }
1595 raw_spin_unlock(&ctx->lock); 1612 raw_spin_unlock(&ctx->lock);
1596} 1613}
@@ -1608,32 +1625,38 @@ static void rotate_ctx(struct perf_event_context *ctx)
1608 raw_spin_unlock(&ctx->lock); 1625 raw_spin_unlock(&ctx->lock);
1609} 1626}
1610 1627
1611void perf_event_task_tick(struct task_struct *curr) 1628/*
1629 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
1630 * because they're strictly cpu affine and rotate_start is called with IRQs
1631 * disabled, while rotate_context is called from IRQ context.
1632 */
1633static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1612{ 1634{
1613 struct perf_cpu_context *cpuctx; 1635 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
1614 struct perf_event_context *ctx; 1636 struct perf_event_context *ctx = NULL;
1615 int rotate = 0; 1637 int rotate = 0, remove = 1;
1616
1617 if (!atomic_read(&nr_events))
1618 return;
1619 1638
1620 cpuctx = &__get_cpu_var(perf_cpu_context); 1639 if (cpuctx->ctx.nr_events) {
1621 if (cpuctx->ctx.nr_events && 1640 remove = 0;
1622 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 1641 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1623 rotate = 1; 1642 rotate = 1;
1643 }
1624 1644
1625 ctx = curr->perf_event_ctxp; 1645 ctx = cpuctx->task_ctx;
1626 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) 1646 if (ctx && ctx->nr_events) {
1627 rotate = 1; 1647 remove = 0;
1648 if (ctx->nr_events != ctx->nr_active)
1649 rotate = 1;
1650 }
1628 1651
1629 perf_ctx_adjust_freq(&cpuctx->ctx); 1652 perf_pmu_disable(cpuctx->ctx.pmu);
1653 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
1630 if (ctx) 1654 if (ctx)
1631 perf_ctx_adjust_freq(ctx); 1655 perf_ctx_adjust_freq(ctx, interval);
1632 1656
1633 if (!rotate) 1657 if (!rotate)
1634 return; 1658 goto done;
1635 1659
1636 perf_disable();
1637 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 1660 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1638 if (ctx) 1661 if (ctx)
1639 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 1662 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1644,8 +1667,27 @@ void perf_event_task_tick(struct task_struct *curr)
1644 1667
1645 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 1668 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1646 if (ctx) 1669 if (ctx)
1647 task_ctx_sched_in(curr, EVENT_FLEXIBLE); 1670 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1648 perf_enable(); 1671
1672done:
1673 if (remove)
1674 list_del_init(&cpuctx->rotation_list);
1675
1676 perf_pmu_enable(cpuctx->ctx.pmu);
1677}
1678
1679void perf_event_task_tick(void)
1680{
1681 struct list_head *head = &__get_cpu_var(rotation_list);
1682 struct perf_cpu_context *cpuctx, *tmp;
1683
1684 WARN_ON(!irqs_disabled());
1685
1686 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
1687 if (cpuctx->jiffies_interval == 1 ||
1688 !(jiffies % cpuctx->jiffies_interval))
1689 perf_rotate_context(cpuctx);
1690 }
1649} 1691}
1650 1692
1651static int event_enable_on_exec(struct perf_event *event, 1693static int event_enable_on_exec(struct perf_event *event,
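[editor's note] perf_event_task_tick() no longer takes the task and a single global per-cpu context; it walks a per-cpu rotation_list of perf_cpu_contexts, rotating each one only on ticks that match its jiffies_interval, and perf_rotate_context() can unlink an entry from that very list (list_del_init() once both contexts are empty), which is why the _safe iterator is needed. A user-space sketch of the same walk, with struct rot_entry and tick() as invented stand-ins:

#include <stdio.h>
#include <stdlib.h>

struct rot_entry {
	struct rot_entry *next;
	unsigned int jiffies_interval;	/* rotate every N ticks */
	int nr_events;			/* 0 -> drop from the list */
};

/*
 * The loop body may unlink the current entry, so its successor is sampled
 * first (the kernel uses list_for_each_entry_safe() for the same reason),
 * and an entry is only rotated on ticks matching its interval.
 */
static void tick(struct rot_entry **head, unsigned long jiffies)
{
	struct rot_entry **link = head;
	struct rot_entry *e, *next;

	for (e = *head; e; e = next) {
		next = e->next;

		if (e->jiffies_interval == 1 || !(jiffies % e->jiffies_interval))
			printf("tick %lu: rotate entry (interval %u)\n",
			       jiffies, e->jiffies_interval);

		if (!e->nr_events) {		/* empty context: unlink it */
			*link = next;
			free(e);
			continue;
		}
		link = &e->next;
	}
}

int main(void)
{
	struct rot_entry *head = NULL, *e;
	unsigned long j;

	for (j = 1; j <= 2; j++) {		/* two entries: intervals 1 and 2 */
		e = calloc(1, sizeof(*e));
		e->jiffies_interval = (unsigned int)j;
		e->nr_events = 1;
		e->next = head;
		head = e;
	}

	for (j = 1; j <= 4; j++)
		tick(&head, j);

	while (head) {				/* cleanup */
		e = head->next;
		free(head);
		head = e;
	}
	return 0;
}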
@@ -1667,20 +1709,18 @@ static int event_enable_on_exec(struct perf_event *event,
1667 * Enable all of a task's events that have been marked enable-on-exec. 1709 * Enable all of a task's events that have been marked enable-on-exec.
1668 * This expects task == current. 1710 * This expects task == current.
1669 */ 1711 */
1670static void perf_event_enable_on_exec(struct task_struct *task) 1712static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1671{ 1713{
1672 struct perf_event_context *ctx;
1673 struct perf_event *event; 1714 struct perf_event *event;
1674 unsigned long flags; 1715 unsigned long flags;
1675 int enabled = 0; 1716 int enabled = 0;
1676 int ret; 1717 int ret;
1677 1718
1678 local_irq_save(flags); 1719 local_irq_save(flags);
1679 ctx = task->perf_event_ctxp;
1680 if (!ctx || !ctx->nr_events) 1720 if (!ctx || !ctx->nr_events)
1681 goto out; 1721 goto out;
1682 1722
1683 __perf_event_task_sched_out(ctx); 1723 task_ctx_sched_out(ctx, EVENT_ALL);
1684 1724
1685 raw_spin_lock(&ctx->lock); 1725 raw_spin_lock(&ctx->lock);
1686 1726
@@ -1704,8 +1744,8 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1704 1744
1705 raw_spin_unlock(&ctx->lock); 1745 raw_spin_unlock(&ctx->lock);
1706 1746
1707 perf_event_task_sched_in(task); 1747 perf_event_context_sched_in(ctx);
1708 out: 1748out:
1709 local_irq_restore(flags); 1749 local_irq_restore(flags);
1710} 1750}
1711 1751
@@ -1714,9 +1754,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1714 */ 1754 */
1715static void __perf_event_read(void *info) 1755static void __perf_event_read(void *info)
1716{ 1756{
1717 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1718 struct perf_event *event = info; 1757 struct perf_event *event = info;
1719 struct perf_event_context *ctx = event->ctx; 1758 struct perf_event_context *ctx = event->ctx;
1759 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1720 1760
1721 /* 1761 /*
1722 * If this is a task context, we need to check whether it is 1762 * If this is a task context, we need to check whether it is
@@ -1755,7 +1795,13 @@ static u64 perf_event_read(struct perf_event *event)
1755 unsigned long flags; 1795 unsigned long flags;
1756 1796
1757 raw_spin_lock_irqsave(&ctx->lock, flags); 1797 raw_spin_lock_irqsave(&ctx->lock, flags);
1758 update_context_time(ctx); 1798 /*
1799 * may read while context is not active
1800 * (e.g., thread is blocked), in that case
1801 * we cannot update context time
1802 */
1803 if (ctx->is_active)
1804 update_context_time(ctx);
1759 update_event_times(event); 1805 update_event_times(event);
1760 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1806 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1761 } 1807 }
@@ -1764,11 +1810,219 @@ static u64 perf_event_read(struct perf_event *event)
1764} 1810}
1765 1811
1766/* 1812/*
1767 * Initialize the perf_event context in a task_struct: 1813 * Callchain support
1768 */ 1814 */
1815
1816struct callchain_cpus_entries {
1817 struct rcu_head rcu_head;
1818 struct perf_callchain_entry *cpu_entries[0];
1819};
1820
1821static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
1822static atomic_t nr_callchain_events;
1823static DEFINE_MUTEX(callchain_mutex);
1824struct callchain_cpus_entries *callchain_cpus_entries;
1825
1826
1827__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
1828 struct pt_regs *regs)
1829{
1830}
1831
1832__weak void perf_callchain_user(struct perf_callchain_entry *entry,
1833 struct pt_regs *regs)
1834{
1835}
1836
1837static void release_callchain_buffers_rcu(struct rcu_head *head)
1838{
1839 struct callchain_cpus_entries *entries;
1840 int cpu;
1841
1842 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
1843
1844 for_each_possible_cpu(cpu)
1845 kfree(entries->cpu_entries[cpu]);
1846
1847 kfree(entries);
1848}
1849
1850static void release_callchain_buffers(void)
1851{
1852 struct callchain_cpus_entries *entries;
1853
1854 entries = callchain_cpus_entries;
1855 rcu_assign_pointer(callchain_cpus_entries, NULL);
1856 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
1857}
1858
1859static int alloc_callchain_buffers(void)
1860{
1861 int cpu;
1862 int size;
1863 struct callchain_cpus_entries *entries;
1864
1865 /*
1866 * We can't use the percpu allocation API for data that can be
1867 * accessed from NMI. Use a temporary manual per cpu allocation
1868 * until that gets sorted out.
1869 */
1870 size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
1871 num_possible_cpus();
1872
1873 entries = kzalloc(size, GFP_KERNEL);
1874 if (!entries)
1875 return -ENOMEM;
1876
1877 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
1878
1879 for_each_possible_cpu(cpu) {
1880 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
1881 cpu_to_node(cpu));
1882 if (!entries->cpu_entries[cpu])
1883 goto fail;
1884 }
1885
1886 rcu_assign_pointer(callchain_cpus_entries, entries);
1887
1888 return 0;
1889
1890fail:
1891 for_each_possible_cpu(cpu)
1892 kfree(entries->cpu_entries[cpu]);
1893 kfree(entries);
1894
1895 return -ENOMEM;
1896}
1897
1898static int get_callchain_buffers(void)
1899{
1900 int err = 0;
1901 int count;
1902
1903 mutex_lock(&callchain_mutex);
1904
1905 count = atomic_inc_return(&nr_callchain_events);
1906 if (WARN_ON_ONCE(count < 1)) {
1907 err = -EINVAL;
1908 goto exit;
1909 }
1910
1911 if (count > 1) {
1912 /* If the allocation failed, give up */
1913 if (!callchain_cpus_entries)
1914 err = -ENOMEM;
1915 goto exit;
1916 }
1917
1918 err = alloc_callchain_buffers();
1919 if (err)
1920 release_callchain_buffers();
1921exit:
1922 mutex_unlock(&callchain_mutex);
1923
1924 return err;
1925}
1926
1927static void put_callchain_buffers(void)
1928{
1929 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
1930 release_callchain_buffers();
1931 mutex_unlock(&callchain_mutex);
1932 }
1933}
1934
1935static int get_recursion_context(int *recursion)
1936{
1937 int rctx;
1938
1939 if (in_nmi())
1940 rctx = 3;
1941 else if (in_irq())
1942 rctx = 2;
1943 else if (in_softirq())
1944 rctx = 1;
1945 else
1946 rctx = 0;
1947
1948 if (recursion[rctx])
1949 return -1;
1950
1951 recursion[rctx]++;
1952 barrier();
1953
1954 return rctx;
1955}
1956
1957static inline void put_recursion_context(int *recursion, int rctx)
1958{
1959 barrier();
1960 recursion[rctx]--;
1961}
1962
1963static struct perf_callchain_entry *get_callchain_entry(int *rctx)
1964{
1965 int cpu;
1966 struct callchain_cpus_entries *entries;
1967
1968 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
1969 if (*rctx == -1)
1970 return NULL;
1971
1972 entries = rcu_dereference(callchain_cpus_entries);
1973 if (!entries)
1974 return NULL;
1975
1976 cpu = smp_processor_id();
1977
1978 return &entries->cpu_entries[cpu][*rctx];
1979}
1980
1769static void 1981static void
1770__perf_event_init_context(struct perf_event_context *ctx, 1982put_callchain_entry(int rctx)
1771 struct task_struct *task) 1983{
1984 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
1985}
1986
1987static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1988{
1989 int rctx;
1990 struct perf_callchain_entry *entry;
1991
1992
1993 entry = get_callchain_entry(&rctx);
1994 if (rctx == -1)
1995 return NULL;
1996
1997 if (!entry)
1998 goto exit_put;
1999
2000 entry->nr = 0;
2001
2002 if (!user_mode(regs)) {
2003 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2004 perf_callchain_kernel(entry, regs);
2005 if (current->mm)
2006 regs = task_pt_regs(current);
2007 else
2008 regs = NULL;
2009 }
2010
2011 if (regs) {
2012 perf_callchain_store(entry, PERF_CONTEXT_USER);
2013 perf_callchain_user(entry, regs);
2014 }
2015
2016exit_put:
2017 put_callchain_entry(rctx);
2018
2019 return entry;
2020}
2021
2022/*
2023 * Initialize the perf_event context in a task_struct:
2024 */
2025static void __perf_event_init_context(struct perf_event_context *ctx)
1772{ 2026{
1773 raw_spin_lock_init(&ctx->lock); 2027 raw_spin_lock_init(&ctx->lock);
1774 mutex_init(&ctx->mutex); 2028 mutex_init(&ctx->mutex);
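[editor's note] The callchain support added above keeps one perf_callchain_entry per cpu and per execution context, and guards against re-entry with a small per-cpu counter array indexed by context (task, softirq, hardirq, NMI). A user-space sketch of that recursion guard; since in_nmi()/in_irq()/in_softirq() have no user-space equivalent, the context is passed in explicitly here, and the per-cpu aspect is dropped:

#include <stdio.h>

enum exec_ctx { CTX_TASK, CTX_SOFTIRQ, CTX_HARDIRQ, CTX_NMI, NR_CTX };

/* One counter per context; the kernel additionally keeps this per cpu. */
static int recursion[NR_CTX];

/* Returns the context index on success, -1 if we are already nested here. */
static int get_recursion_context(enum exec_ctx ctx)
{
	if (recursion[ctx])
		return -1;
	recursion[ctx]++;
	/* the kernel places a barrier() here so the store is not reordered */
	return ctx;
}

static void put_recursion_context(int rctx)
{
	recursion[rctx]--;
}

int main(void)
{
	int rctx = get_recursion_context(CTX_TASK);

	if (rctx >= 0) {
		/* ... take the callchain snapshot ... */
		if (get_recursion_context(CTX_TASK) < 0)
			puts("nested entry in the same context is rejected");
		put_recursion_context(rctx);
	}
	return 0;
}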
@@ -1776,45 +2030,38 @@ __perf_event_init_context(struct perf_event_context *ctx,
1776 INIT_LIST_HEAD(&ctx->flexible_groups); 2030 INIT_LIST_HEAD(&ctx->flexible_groups);
1777 INIT_LIST_HEAD(&ctx->event_list); 2031 INIT_LIST_HEAD(&ctx->event_list);
1778 atomic_set(&ctx->refcount, 1); 2032 atomic_set(&ctx->refcount, 1);
1779 ctx->task = task;
1780} 2033}
1781 2034
1782static struct perf_event_context *find_get_context(pid_t pid, int cpu) 2035static struct perf_event_context *
2036alloc_perf_context(struct pmu *pmu, struct task_struct *task)
1783{ 2037{
1784 struct perf_event_context *ctx; 2038 struct perf_event_context *ctx;
1785 struct perf_cpu_context *cpuctx;
1786 struct task_struct *task;
1787 unsigned long flags;
1788 int err;
1789 2039
1790 if (pid == -1 && cpu != -1) { 2040 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1791 /* Must be root to operate on a CPU event: */ 2041 if (!ctx)
1792 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 2042 return NULL;
1793 return ERR_PTR(-EACCES);
1794
1795 if (cpu < 0 || cpu >= nr_cpumask_bits)
1796 return ERR_PTR(-EINVAL);
1797 2043
1798 /* 2044 __perf_event_init_context(ctx);
1799 * We could be clever and allow to attach a event to an 2045 if (task) {
1800 * offline CPU and activate it when the CPU comes up, but 2046 ctx->task = task;
1801 * that's for later. 2047 get_task_struct(task);
1802 */ 2048 }
1803 if (!cpu_online(cpu)) 2049 ctx->pmu = pmu;
1804 return ERR_PTR(-ENODEV);
1805 2050
1806 cpuctx = &per_cpu(perf_cpu_context, cpu); 2051 return ctx;
1807 ctx = &cpuctx->ctx; 2052}
1808 get_ctx(ctx);
1809 2053
1810 return ctx; 2054static struct task_struct *
1811 } 2055find_lively_task_by_vpid(pid_t vpid)
2056{
2057 struct task_struct *task;
2058 int err;
1812 2059
1813 rcu_read_lock(); 2060 rcu_read_lock();
1814 if (!pid) 2061 if (!vpid)
1815 task = current; 2062 task = current;
1816 else 2063 else
1817 task = find_task_by_vpid(pid); 2064 task = find_task_by_vpid(vpid);
1818 if (task) 2065 if (task)
1819 get_task_struct(task); 2066 get_task_struct(task);
1820 rcu_read_unlock(); 2067 rcu_read_unlock();
@@ -1834,36 +2081,78 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1834 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2081 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1835 goto errout; 2082 goto errout;
1836 2083
1837 retry: 2084 return task;
1838 ctx = perf_lock_task_context(task, &flags); 2085errout:
2086 put_task_struct(task);
2087 return ERR_PTR(err);
2088
2089}
2090
2091static struct perf_event_context *
2092find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2093{
2094 struct perf_event_context *ctx;
2095 struct perf_cpu_context *cpuctx;
2096 unsigned long flags;
2097 int ctxn, err;
2098
2099 if (!task && cpu != -1) {
2100 /* Must be root to operate on a CPU event: */
2101 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2102 return ERR_PTR(-EACCES);
2103
2104 if (cpu < 0 || cpu >= nr_cpumask_bits)
2105 return ERR_PTR(-EINVAL);
2106
2107 /*
2108 * We could be clever and allow to attach a event to an
2109 * offline CPU and activate it when the CPU comes up, but
2110 * that's for later.
2111 */
2112 if (!cpu_online(cpu))
2113 return ERR_PTR(-ENODEV);
2114
2115 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2116 ctx = &cpuctx->ctx;
2117 get_ctx(ctx);
2118
2119 return ctx;
2120 }
2121
2122 err = -EINVAL;
2123 ctxn = pmu->task_ctx_nr;
2124 if (ctxn < 0)
2125 goto errout;
2126
2127retry:
2128 ctx = perf_lock_task_context(task, ctxn, &flags);
1839 if (ctx) { 2129 if (ctx) {
1840 unclone_ctx(ctx); 2130 unclone_ctx(ctx);
1841 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2131 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1842 } 2132 }
1843 2133
1844 if (!ctx) { 2134 if (!ctx) {
1845 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); 2135 ctx = alloc_perf_context(pmu, task);
1846 err = -ENOMEM; 2136 err = -ENOMEM;
1847 if (!ctx) 2137 if (!ctx)
1848 goto errout; 2138 goto errout;
1849 __perf_event_init_context(ctx, task); 2139
1850 get_ctx(ctx); 2140 get_ctx(ctx);
1851 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { 2141
2142 if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
1852 /* 2143 /*
1853 * We raced with some other task; use 2144 * We raced with some other task; use
1854 * the context they set. 2145 * the context they set.
1855 */ 2146 */
2147 put_task_struct(task);
1856 kfree(ctx); 2148 kfree(ctx);
1857 goto retry; 2149 goto retry;
1858 } 2150 }
1859 get_task_struct(task);
1860 } 2151 }
1861 2152
1862 put_task_struct(task);
1863 return ctx; 2153 return ctx;
1864 2154
1865 errout: 2155errout:
1866 put_task_struct(task);
1867 return ERR_PTR(err); 2156 return ERR_PTR(err);
1868} 2157}
1869 2158
@@ -1880,21 +2169,23 @@ static void free_event_rcu(struct rcu_head *head)
1880 kfree(event); 2169 kfree(event);
1881} 2170}
1882 2171
1883static void perf_pending_sync(struct perf_event *event);
1884static void perf_buffer_put(struct perf_buffer *buffer); 2172static void perf_buffer_put(struct perf_buffer *buffer);
1885 2173
1886static void free_event(struct perf_event *event) 2174static void free_event(struct perf_event *event)
1887{ 2175{
1888 perf_pending_sync(event); 2176 irq_work_sync(&event->pending);
1889 2177
1890 if (!event->parent) { 2178 if (!event->parent) {
1891 atomic_dec(&nr_events); 2179 if (event->attach_state & PERF_ATTACH_TASK)
2180 jump_label_dec(&perf_task_events);
1892 if (event->attr.mmap || event->attr.mmap_data) 2181 if (event->attr.mmap || event->attr.mmap_data)
1893 atomic_dec(&nr_mmap_events); 2182 atomic_dec(&nr_mmap_events);
1894 if (event->attr.comm) 2183 if (event->attr.comm)
1895 atomic_dec(&nr_comm_events); 2184 atomic_dec(&nr_comm_events);
1896 if (event->attr.task) 2185 if (event->attr.task)
1897 atomic_dec(&nr_task_events); 2186 atomic_dec(&nr_task_events);
2187 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2188 put_callchain_buffers();
1898 } 2189 }
1899 2190
1900 if (event->buffer) { 2191 if (event->buffer) {
@@ -1905,7 +2196,9 @@ static void free_event(struct perf_event *event)
1905 if (event->destroy) 2196 if (event->destroy)
1906 event->destroy(event); 2197 event->destroy(event);
1907 2198
1908 put_ctx(event->ctx); 2199 if (event->ctx)
2200 put_ctx(event->ctx);
2201
1909 call_rcu(&event->rcu_head, free_event_rcu); 2202 call_rcu(&event->rcu_head, free_event_rcu);
1910} 2203}
1911 2204
@@ -2184,15 +2477,13 @@ static void perf_event_for_each(struct perf_event *event,
2184static int perf_event_period(struct perf_event *event, u64 __user *arg) 2477static int perf_event_period(struct perf_event *event, u64 __user *arg)
2185{ 2478{
2186 struct perf_event_context *ctx = event->ctx; 2479 struct perf_event_context *ctx = event->ctx;
2187 unsigned long size;
2188 int ret = 0; 2480 int ret = 0;
2189 u64 value; 2481 u64 value;
2190 2482
2191 if (!event->attr.sample_period) 2483 if (!event->attr.sample_period)
2192 return -EINVAL; 2484 return -EINVAL;
2193 2485
2194 size = copy_from_user(&value, arg, sizeof(value)); 2486 if (copy_from_user(&value, arg, sizeof(value)))
2195 if (size != sizeof(value))
2196 return -EFAULT; 2487 return -EFAULT;
2197 2488
2198 if (!value) 2489 if (!value)
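[editor's note] The change above also cleans up a copy_from_user() misuse: copy_from_user() returns the number of bytes it could not copy (0 on success), so requiring the return value to equal sizeof(value) tests the wrong convention. Side by side, the old and the idiomatic check, as kernel-style fragments:

	/* old: treats the return value as the number of bytes copied */
	size = copy_from_user(&value, arg, sizeof(value));
	if (size != sizeof(value))
		return -EFAULT;

	/* new: any nonzero return means some bytes were left uncopied */
	if (copy_from_user(&value, arg, sizeof(value)))
		return -EFAULT;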
@@ -2326,6 +2617,9 @@ int perf_event_task_disable(void)
2326 2617
2327static int perf_event_index(struct perf_event *event) 2618static int perf_event_index(struct perf_event *event)
2328{ 2619{
2620 if (event->hw.state & PERF_HES_STOPPED)
2621 return 0;
2622
2329 if (event->state != PERF_EVENT_STATE_ACTIVE) 2623 if (event->state != PERF_EVENT_STATE_ACTIVE)
2330 return 0; 2624 return 0;
2331 2625
@@ -2829,16 +3123,7 @@ void perf_event_wakeup(struct perf_event *event)
2829 } 3123 }
2830} 3124}
2831 3125
2832/* 3126static void perf_pending_event(struct irq_work *entry)
2833 * Pending wakeups
2834 *
2835 * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
2836 *
2837 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2838 * single linked list and use cmpxchg() to add entries lockless.
2839 */
2840
2841static void perf_pending_event(struct perf_pending_entry *entry)
2842{ 3127{
2843 struct perf_event *event = container_of(entry, 3128 struct perf_event *event = container_of(entry,
2844 struct perf_event, pending); 3129 struct perf_event, pending);
@@ -2854,99 +3139,6 @@ static void perf_pending_event(struct perf_pending_entry *entry)
2854 } 3139 }
2855} 3140}
2856 3141
2857#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2858
2859static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2860 PENDING_TAIL,
2861};
2862
2863static void perf_pending_queue(struct perf_pending_entry *entry,
2864 void (*func)(struct perf_pending_entry *))
2865{
2866 struct perf_pending_entry **head;
2867
2868 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2869 return;
2870
2871 entry->func = func;
2872
2873 head = &get_cpu_var(perf_pending_head);
2874
2875 do {
2876 entry->next = *head;
2877 } while (cmpxchg(head, entry->next, entry) != entry->next);
2878
2879 set_perf_event_pending();
2880
2881 put_cpu_var(perf_pending_head);
2882}
2883
2884static int __perf_pending_run(void)
2885{
2886 struct perf_pending_entry *list;
2887 int nr = 0;
2888
2889 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2890 while (list != PENDING_TAIL) {
2891 void (*func)(struct perf_pending_entry *);
2892 struct perf_pending_entry *entry = list;
2893
2894 list = list->next;
2895
2896 func = entry->func;
2897 entry->next = NULL;
2898 /*
2899 * Ensure we observe the unqueue before we issue the wakeup,
2900 * so that we won't be waiting forever.
2901 * -- see perf_not_pending().
2902 */
2903 smp_wmb();
2904
2905 func(entry);
2906 nr++;
2907 }
2908
2909 return nr;
2910}
2911
2912static inline int perf_not_pending(struct perf_event *event)
2913{
2914 /*
2915 * If we flush on whatever cpu we run, there is a chance we don't
2916 * need to wait.
2917 */
2918 get_cpu();
2919 __perf_pending_run();
2920 put_cpu();
2921
2922 /*
2923 * Ensure we see the proper queue state before going to sleep
2924 * so that we do not miss the wakeup. -- see perf_pending_handle()
2925 */
2926 smp_rmb();
2927 return event->pending.next == NULL;
2928}
2929
2930static void perf_pending_sync(struct perf_event *event)
2931{
2932 wait_event(event->waitq, perf_not_pending(event));
2933}
2934
2935void perf_event_do_pending(void)
2936{
2937 __perf_pending_run();
2938}
2939
2940/*
2941 * Callchain support -- arch specific
2942 */
2943
2944__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2945{
2946 return NULL;
2947}
2948
2949
2950/* 3142/*
2951 * We assume there is only KVM supporting the callbacks. 3143 * We assume there is only KVM supporting the callbacks.
2952 * Later on, we might change it to a list if there is 3144 * Later on, we might change it to a list if there is
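[editor's note] The block removed above -- perf_pending_queue(), __perf_pending_run(), perf_pending_sync() and friends -- was a hand-rolled lockless "call me back from IRQ context" list; it is superseded by the generic irq_work layer added in this same series (kernel/irq_work.c in the diffstat), which perf now uses via irq_work_queue() and irq_work_sync(). A minimal sketch of a user of that API, with my_event/my_handler as invented names and only the calls this diff itself relies on:

#include <linux/irq_work.h>
#include <linux/kernel.h>

struct my_event {
	struct irq_work pending;
	/* ... */
};

static void my_handler(struct irq_work *entry)
{
	struct my_event *e = container_of(entry, struct my_event, pending);

	/* runs from IRQ context, where it is safe to take locks and wake tasks */
	(void)e;
}

static void my_event_setup(struct my_event *e)
{
	init_irq_work(&e->pending, my_handler);
}

static void my_event_nmi_path(struct my_event *e)
{
	/* lock-free and NMI-safe; already-queued entries are coalesced */
	irq_work_queue(&e->pending);
}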
@@ -2996,8 +3188,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2996 3188
2997 if (handle->nmi) { 3189 if (handle->nmi) {
2998 handle->event->pending_wakeup = 1; 3190 handle->event->pending_wakeup = 1;
2999 perf_pending_queue(&handle->event->pending, 3191 irq_work_queue(&handle->event->pending);
3000 perf_pending_event);
3001 } else 3192 } else
3002 perf_event_wakeup(handle->event); 3193 perf_event_wakeup(handle->event);
3003} 3194}
@@ -3053,7 +3244,7 @@ again:
3053 if (handle->wakeup != local_read(&buffer->wakeup)) 3244 if (handle->wakeup != local_read(&buffer->wakeup))
3054 perf_output_wakeup(handle); 3245 perf_output_wakeup(handle);
3055 3246
3056 out: 3247out:
3057 preempt_enable(); 3248 preempt_enable();
3058} 3249}
3059 3250
@@ -3441,14 +3632,20 @@ static void perf_event_output(struct perf_event *event, int nmi,
3441 struct perf_output_handle handle; 3632 struct perf_output_handle handle;
3442 struct perf_event_header header; 3633 struct perf_event_header header;
3443 3634
3635 /* protect the callchain buffers */
3636 rcu_read_lock();
3637
3444 perf_prepare_sample(&header, data, event, regs); 3638 perf_prepare_sample(&header, data, event, regs);
3445 3639
3446 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 3640 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3447 return; 3641 goto exit;
3448 3642
3449 perf_output_sample(&handle, &header, data, event); 3643 perf_output_sample(&handle, &header, data, event);
3450 3644
3451 perf_output_end(&handle); 3645 perf_output_end(&handle);
3646
3647exit:
3648 rcu_read_unlock();
3452} 3649}
3453 3650
3454/* 3651/*
@@ -3562,16 +3759,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3562static void perf_event_task_event(struct perf_task_event *task_event) 3759static void perf_event_task_event(struct perf_task_event *task_event)
3563{ 3760{
3564 struct perf_cpu_context *cpuctx; 3761 struct perf_cpu_context *cpuctx;
3565 struct perf_event_context *ctx = task_event->task_ctx; 3762 struct perf_event_context *ctx;
3763 struct pmu *pmu;
3764 int ctxn;
3566 3765
3567 rcu_read_lock(); 3766 rcu_read_lock();
3568 cpuctx = &get_cpu_var(perf_cpu_context); 3767 list_for_each_entry_rcu(pmu, &pmus, entry) {
3569 perf_event_task_ctx(&cpuctx->ctx, task_event); 3768 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3570 if (!ctx) 3769 perf_event_task_ctx(&cpuctx->ctx, task_event);
3571 ctx = rcu_dereference(current->perf_event_ctxp); 3770
3572 if (ctx) 3771 ctx = task_event->task_ctx;
3573 perf_event_task_ctx(ctx, task_event); 3772 if (!ctx) {
3574 put_cpu_var(perf_cpu_context); 3773 ctxn = pmu->task_ctx_nr;
3774 if (ctxn < 0)
3775 goto next;
3776 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
3777 }
3778 if (ctx)
3779 perf_event_task_ctx(ctx, task_event);
3780next:
3781 put_cpu_ptr(pmu->pmu_cpu_context);
3782 }
3575 rcu_read_unlock(); 3783 rcu_read_unlock();
3576} 3784}
3577 3785
@@ -3676,8 +3884,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3676{ 3884{
3677 struct perf_cpu_context *cpuctx; 3885 struct perf_cpu_context *cpuctx;
3678 struct perf_event_context *ctx; 3886 struct perf_event_context *ctx;
3679 unsigned int size;
3680 char comm[TASK_COMM_LEN]; 3887 char comm[TASK_COMM_LEN];
3888 unsigned int size;
3889 struct pmu *pmu;
3890 int ctxn;
3681 3891
3682 memset(comm, 0, sizeof(comm)); 3892 memset(comm, 0, sizeof(comm));
3683 strlcpy(comm, comm_event->task->comm, sizeof(comm)); 3893 strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -3689,21 +3899,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3689 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3899 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3690 3900
3691 rcu_read_lock(); 3901 rcu_read_lock();
3692 cpuctx = &get_cpu_var(perf_cpu_context); 3902 list_for_each_entry_rcu(pmu, &pmus, entry) {
3693 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3903 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3694 ctx = rcu_dereference(current->perf_event_ctxp); 3904 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3695 if (ctx) 3905
3696 perf_event_comm_ctx(ctx, comm_event); 3906 ctxn = pmu->task_ctx_nr;
3697 put_cpu_var(perf_cpu_context); 3907 if (ctxn < 0)
3908 goto next;
3909
3910 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
3911 if (ctx)
3912 perf_event_comm_ctx(ctx, comm_event);
3913next:
3914 put_cpu_ptr(pmu->pmu_cpu_context);
3915 }
3698 rcu_read_unlock(); 3916 rcu_read_unlock();
3699} 3917}
3700 3918
3701void perf_event_comm(struct task_struct *task) 3919void perf_event_comm(struct task_struct *task)
3702{ 3920{
3703 struct perf_comm_event comm_event; 3921 struct perf_comm_event comm_event;
3922 struct perf_event_context *ctx;
3923 int ctxn;
3924
3925 for_each_task_context_nr(ctxn) {
3926 ctx = task->perf_event_ctxp[ctxn];
3927 if (!ctx)
3928 continue;
3704 3929
3705 if (task->perf_event_ctxp) 3930 perf_event_enable_on_exec(ctx);
3706 perf_event_enable_on_exec(task); 3931 }
3707 3932
3708 if (!atomic_read(&nr_comm_events)) 3933 if (!atomic_read(&nr_comm_events))
3709 return; 3934 return;
@@ -3805,6 +4030,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3805 char tmp[16]; 4030 char tmp[16];
3806 char *buf = NULL; 4031 char *buf = NULL;
3807 const char *name; 4032 const char *name;
4033 struct pmu *pmu;
4034 int ctxn;
3808 4035
3809 memset(tmp, 0, sizeof(tmp)); 4036 memset(tmp, 0, sizeof(tmp));
3810 4037
@@ -3857,12 +4084,23 @@ got_name:
3857 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 4084 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3858 4085
3859 rcu_read_lock(); 4086 rcu_read_lock();
3860 cpuctx = &get_cpu_var(perf_cpu_context); 4087 list_for_each_entry_rcu(pmu, &pmus, entry) {
3861 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); 4088 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3862 ctx = rcu_dereference(current->perf_event_ctxp); 4089 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
3863 if (ctx) 4090 vma->vm_flags & VM_EXEC);
3864 perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); 4091
3865 put_cpu_var(perf_cpu_context); 4092 ctxn = pmu->task_ctx_nr;
4093 if (ctxn < 0)
4094 goto next;
4095
4096 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4097 if (ctx) {
4098 perf_event_mmap_ctx(ctx, mmap_event,
4099 vma->vm_flags & VM_EXEC);
4100 }
4101next:
4102 put_cpu_ptr(pmu->pmu_cpu_context);
4103 }
3866 rcu_read_unlock(); 4104 rcu_read_unlock();
3867 4105
3868 kfree(buf); 4106 kfree(buf);
@@ -3944,8 +4182,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3944 struct hw_perf_event *hwc = &event->hw; 4182 struct hw_perf_event *hwc = &event->hw;
3945 int ret = 0; 4183 int ret = 0;
3946 4184
3947 throttle = (throttle && event->pmu->unthrottle != NULL);
3948
3949 if (!throttle) { 4185 if (!throttle) {
3950 hwc->interrupts++; 4186 hwc->interrupts++;
3951 } else { 4187 } else {
@@ -3988,8 +4224,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3988 event->pending_kill = POLL_HUP; 4224 event->pending_kill = POLL_HUP;
3989 if (nmi) { 4225 if (nmi) {
3990 event->pending_disable = 1; 4226 event->pending_disable = 1;
3991 perf_pending_queue(&event->pending, 4227 irq_work_queue(&event->pending);
3992 perf_pending_event);
3993 } else 4228 } else
3994 perf_event_disable(event); 4229 perf_event_disable(event);
3995 } 4230 }
@@ -4013,6 +4248,17 @@ int perf_event_overflow(struct perf_event *event, int nmi,
4013 * Generic software event infrastructure 4248 * Generic software event infrastructure
4014 */ 4249 */
4015 4250
4251struct swevent_htable {
4252 struct swevent_hlist *swevent_hlist;
4253 struct mutex hlist_mutex;
4254 int hlist_refcount;
4255
4256 /* Recursion avoidance in each contexts */
4257 int recursion[PERF_NR_CONTEXTS];
4258};
4259
4260static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
4261
4016/* 4262/*
4017 * We directly increment event->count and keep a second value in 4263 * We directly increment event->count and keep a second value in
4018 * event->hw.period_left to count intervals. This period event 4264 * event->hw.period_left to count intervals. This period event
@@ -4070,7 +4316,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4070 } 4316 }
4071} 4317}
4072 4318
4073static void perf_swevent_add(struct perf_event *event, u64 nr, 4319static void perf_swevent_event(struct perf_event *event, u64 nr,
4074 int nmi, struct perf_sample_data *data, 4320 int nmi, struct perf_sample_data *data,
4075 struct pt_regs *regs) 4321 struct pt_regs *regs)
4076{ 4322{
@@ -4096,6 +4342,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4096static int perf_exclude_event(struct perf_event *event, 4342static int perf_exclude_event(struct perf_event *event,
4097 struct pt_regs *regs) 4343 struct pt_regs *regs)
4098{ 4344{
4345 if (event->hw.state & PERF_HES_STOPPED)
4346 return 0;
4347
4099 if (regs) { 4348 if (regs) {
4100 if (event->attr.exclude_user && user_mode(regs)) 4349 if (event->attr.exclude_user && user_mode(regs))
4101 return 1; 4350 return 1;
@@ -4142,11 +4391,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4142 4391
4143/* For the read side: events when they trigger */ 4392/* For the read side: events when they trigger */
4144static inline struct hlist_head * 4393static inline struct hlist_head *
4145find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) 4394find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
4146{ 4395{
4147 struct swevent_hlist *hlist; 4396 struct swevent_hlist *hlist;
4148 4397
4149 hlist = rcu_dereference(ctx->swevent_hlist); 4398 hlist = rcu_dereference(swhash->swevent_hlist);
4150 if (!hlist) 4399 if (!hlist)
4151 return NULL; 4400 return NULL;
4152 4401
@@ -4155,7 +4404,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4155 4404
4156/* For the event head insertion and removal in the hlist */ 4405/* For the event head insertion and removal in the hlist */
4157static inline struct hlist_head * 4406static inline struct hlist_head *
4158find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) 4407find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
4159{ 4408{
4160 struct swevent_hlist *hlist; 4409 struct swevent_hlist *hlist;
4161 u32 event_id = event->attr.config; 4410 u32 event_id = event->attr.config;
@@ -4166,7 +4415,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4166 * and release. Which makes the protected version suitable here. 4415 * and release. Which makes the protected version suitable here.
4167 * The context lock guarantees that. 4416 * The context lock guarantees that.
4168 */ 4417 */
4169 hlist = rcu_dereference_protected(ctx->swevent_hlist, 4418 hlist = rcu_dereference_protected(swhash->swevent_hlist,
4170 lockdep_is_held(&event->ctx->lock)); 4419 lockdep_is_held(&event->ctx->lock));
4171 if (!hlist) 4420 if (!hlist)
4172 return NULL; 4421 return NULL;
@@ -4179,23 +4428,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4179 struct perf_sample_data *data, 4428 struct perf_sample_data *data,
4180 struct pt_regs *regs) 4429 struct pt_regs *regs)
4181{ 4430{
4182 struct perf_cpu_context *cpuctx; 4431 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4183 struct perf_event *event; 4432 struct perf_event *event;
4184 struct hlist_node *node; 4433 struct hlist_node *node;
4185 struct hlist_head *head; 4434 struct hlist_head *head;
4186 4435
4187 cpuctx = &__get_cpu_var(perf_cpu_context);
4188
4189 rcu_read_lock(); 4436 rcu_read_lock();
4190 4437 head = find_swevent_head_rcu(swhash, type, event_id);
4191 head = find_swevent_head_rcu(cpuctx, type, event_id);
4192
4193 if (!head) 4438 if (!head)
4194 goto end; 4439 goto end;
4195 4440
4196 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4441 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4197 if (perf_swevent_match(event, type, event_id, data, regs)) 4442 if (perf_swevent_match(event, type, event_id, data, regs))
4198 perf_swevent_add(event, nr, nmi, data, regs); 4443 perf_swevent_event(event, nr, nmi, data, regs);
4199 } 4444 }
4200end: 4445end:
4201 rcu_read_unlock(); 4446 rcu_read_unlock();
@@ -4203,33 +4448,17 @@ end:
4203 4448
4204int perf_swevent_get_recursion_context(void) 4449int perf_swevent_get_recursion_context(void)
4205{ 4450{
4206 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4451 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4207 int rctx;
4208
4209 if (in_nmi())
4210 rctx = 3;
4211 else if (in_irq())
4212 rctx = 2;
4213 else if (in_softirq())
4214 rctx = 1;
4215 else
4216 rctx = 0;
4217
4218 if (cpuctx->recursion[rctx])
4219 return -1;
4220 4452
4221 cpuctx->recursion[rctx]++; 4453 return get_recursion_context(swhash->recursion);
4222 barrier();
4223
4224 return rctx;
4225} 4454}
4226EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4455EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4227 4456
4228void inline perf_swevent_put_recursion_context(int rctx) 4457void inline perf_swevent_put_recursion_context(int rctx)
4229{ 4458{
4230 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4459 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4231 barrier(); 4460
4232 cpuctx->recursion[rctx]--; 4461 put_recursion_context(swhash->recursion, rctx);
4233} 4462}
4234 4463
4235void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4464void __perf_sw_event(u32 event_id, u64 nr, int nmi,
@@ -4255,20 +4484,20 @@ static void perf_swevent_read(struct perf_event *event)
4255{ 4484{
4256} 4485}
4257 4486
4258static int perf_swevent_enable(struct perf_event *event) 4487static int perf_swevent_add(struct perf_event *event, int flags)
4259{ 4488{
4489 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4260 struct hw_perf_event *hwc = &event->hw; 4490 struct hw_perf_event *hwc = &event->hw;
4261 struct perf_cpu_context *cpuctx;
4262 struct hlist_head *head; 4491 struct hlist_head *head;
4263 4492
4264 cpuctx = &__get_cpu_var(perf_cpu_context);
4265
4266 if (hwc->sample_period) { 4493 if (hwc->sample_period) {
4267 hwc->last_period = hwc->sample_period; 4494 hwc->last_period = hwc->sample_period;
4268 perf_swevent_set_period(event); 4495 perf_swevent_set_period(event);
4269 } 4496 }
4270 4497
4271 head = find_swevent_head(cpuctx, event); 4498 hwc->state = !(flags & PERF_EF_START);
4499
4500 head = find_swevent_head(swhash, event);
4272 if (WARN_ON_ONCE(!head)) 4501 if (WARN_ON_ONCE(!head))
4273 return -EINVAL; 4502 return -EINVAL;
4274 4503
@@ -4277,202 +4506,27 @@ static int perf_swevent_enable(struct perf_event *event)
4277 return 0; 4506 return 0;
4278} 4507}
4279 4508
4280static void perf_swevent_disable(struct perf_event *event) 4509static void perf_swevent_del(struct perf_event *event, int flags)
4281{ 4510{
4282 hlist_del_rcu(&event->hlist_entry); 4511 hlist_del_rcu(&event->hlist_entry);
4283} 4512}
4284 4513
4285static void perf_swevent_void(struct perf_event *event) 4514static void perf_swevent_start(struct perf_event *event, int flags)
4286{
4287}
4288
4289static int perf_swevent_int(struct perf_event *event)
4290{
4291 return 0;
4292}
4293
4294static const struct pmu perf_ops_generic = {
4295 .enable = perf_swevent_enable,
4296 .disable = perf_swevent_disable,
4297 .start = perf_swevent_int,
4298 .stop = perf_swevent_void,
4299 .read = perf_swevent_read,
4300 .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
4301};
4302
4303/*
4304 * hrtimer based swevent callback
4305 */
4306
4307static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4308{
4309 enum hrtimer_restart ret = HRTIMER_RESTART;
4310 struct perf_sample_data data;
4311 struct pt_regs *regs;
4312 struct perf_event *event;
4313 u64 period;
4314
4315 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4316 event->pmu->read(event);
4317
4318 perf_sample_data_init(&data, 0);
4319 data.period = event->hw.last_period;
4320 regs = get_irq_regs();
4321
4322 if (regs && !perf_exclude_event(event, regs)) {
4323 if (!(event->attr.exclude_idle && current->pid == 0))
4324 if (perf_event_overflow(event, 0, &data, regs))
4325 ret = HRTIMER_NORESTART;
4326 }
4327
4328 period = max_t(u64, 10000, event->hw.sample_period);
4329 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4330
4331 return ret;
4332}
4333
4334static void perf_swevent_start_hrtimer(struct perf_event *event)
4335{ 4515{
4336 struct hw_perf_event *hwc = &event->hw; 4516 event->hw.state = 0;
4337
4338 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4339 hwc->hrtimer.function = perf_swevent_hrtimer;
4340 if (hwc->sample_period) {
4341 u64 period;
4342
4343 if (hwc->remaining) {
4344 if (hwc->remaining < 0)
4345 period = 10000;
4346 else
4347 period = hwc->remaining;
4348 hwc->remaining = 0;
4349 } else {
4350 period = max_t(u64, 10000, hwc->sample_period);
4351 }
4352 __hrtimer_start_range_ns(&hwc->hrtimer,
4353 ns_to_ktime(period), 0,
4354 HRTIMER_MODE_REL, 0);
4355 }
4356} 4517}
4357 4518
4358static void perf_swevent_cancel_hrtimer(struct perf_event *event) 4519static void perf_swevent_stop(struct perf_event *event, int flags)
4359{ 4520{
4360 struct hw_perf_event *hwc = &event->hw; 4521 event->hw.state = PERF_HES_STOPPED;
4361
4362 if (hwc->sample_period) {
4363 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4364 hwc->remaining = ktime_to_ns(remaining);
4365
4366 hrtimer_cancel(&hwc->hrtimer);
4367 }
4368} 4522}
4369 4523
4370/*
4371 * Software event: cpu wall time clock
4372 */
4373
4374static void cpu_clock_perf_event_update(struct perf_event *event)
4375{
4376 int cpu = raw_smp_processor_id();
4377 s64 prev;
4378 u64 now;
4379
4380 now = cpu_clock(cpu);
4381 prev = local64_xchg(&event->hw.prev_count, now);
4382 local64_add(now - prev, &event->count);
4383}
4384
4385static int cpu_clock_perf_event_enable(struct perf_event *event)
4386{
4387 struct hw_perf_event *hwc = &event->hw;
4388 int cpu = raw_smp_processor_id();
4389
4390 local64_set(&hwc->prev_count, cpu_clock(cpu));
4391 perf_swevent_start_hrtimer(event);
4392
4393 return 0;
4394}
4395
4396static void cpu_clock_perf_event_disable(struct perf_event *event)
4397{
4398 perf_swevent_cancel_hrtimer(event);
4399 cpu_clock_perf_event_update(event);
4400}
4401
4402static void cpu_clock_perf_event_read(struct perf_event *event)
4403{
4404 cpu_clock_perf_event_update(event);
4405}
4406
4407static const struct pmu perf_ops_cpu_clock = {
4408 .enable = cpu_clock_perf_event_enable,
4409 .disable = cpu_clock_perf_event_disable,
4410 .read = cpu_clock_perf_event_read,
4411};
4412
4413/*
4414 * Software event: task time clock
4415 */
4416
4417static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4418{
4419 u64 prev;
4420 s64 delta;
4421
4422 prev = local64_xchg(&event->hw.prev_count, now);
4423 delta = now - prev;
4424 local64_add(delta, &event->count);
4425}
4426
4427static int task_clock_perf_event_enable(struct perf_event *event)
4428{
4429 struct hw_perf_event *hwc = &event->hw;
4430 u64 now;
4431
4432 now = event->ctx->time;
4433
4434 local64_set(&hwc->prev_count, now);
4435
4436 perf_swevent_start_hrtimer(event);
4437
4438 return 0;
4439}
4440
4441static void task_clock_perf_event_disable(struct perf_event *event)
4442{
4443 perf_swevent_cancel_hrtimer(event);
4444 task_clock_perf_event_update(event, event->ctx->time);
4445
4446}
4447
4448static void task_clock_perf_event_read(struct perf_event *event)
4449{
4450 u64 time;
4451
4452 if (!in_nmi()) {
4453 update_context_time(event->ctx);
4454 time = event->ctx->time;
4455 } else {
4456 u64 now = perf_clock();
4457 u64 delta = now - event->ctx->timestamp;
4458 time = event->ctx->time + delta;
4459 }
4460
4461 task_clock_perf_event_update(event, time);
4462}
4463
4464static const struct pmu perf_ops_task_clock = {
4465 .enable = task_clock_perf_event_enable,
4466 .disable = task_clock_perf_event_disable,
4467 .read = task_clock_perf_event_read,
4468};
4469
4470/* Deref the hlist from the update side */ 4524/* Deref the hlist from the update side */
4471static inline struct swevent_hlist * 4525static inline struct swevent_hlist *
4472swevent_hlist_deref(struct perf_cpu_context *cpuctx) 4526swevent_hlist_deref(struct swevent_htable *swhash)
4473{ 4527{
4474 return rcu_dereference_protected(cpuctx->swevent_hlist, 4528 return rcu_dereference_protected(swhash->swevent_hlist,
4475 lockdep_is_held(&cpuctx->hlist_mutex)); 4529 lockdep_is_held(&swhash->hlist_mutex));
4476} 4530}
4477 4531
4478static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) 4532static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
@@ -4483,27 +4537,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4483 kfree(hlist); 4537 kfree(hlist);
4484} 4538}
4485 4539
4486static void swevent_hlist_release(struct perf_cpu_context *cpuctx) 4540static void swevent_hlist_release(struct swevent_htable *swhash)
4487{ 4541{
4488 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); 4542 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
4489 4543
4490 if (!hlist) 4544 if (!hlist)
4491 return; 4545 return;
4492 4546
4493 rcu_assign_pointer(cpuctx->swevent_hlist, NULL); 4547 rcu_assign_pointer(swhash->swevent_hlist, NULL);
4494 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 4548 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4495} 4549}
4496 4550
4497static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 4551static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4498{ 4552{
4499 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4553 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4500 4554
4501 mutex_lock(&cpuctx->hlist_mutex); 4555 mutex_lock(&swhash->hlist_mutex);
4502 4556
4503 if (!--cpuctx->hlist_refcount) 4557 if (!--swhash->hlist_refcount)
4504 swevent_hlist_release(cpuctx); 4558 swevent_hlist_release(swhash);
4505 4559
4506 mutex_unlock(&cpuctx->hlist_mutex); 4560 mutex_unlock(&swhash->hlist_mutex);
4507} 4561}
4508 4562
4509static void swevent_hlist_put(struct perf_event *event) 4563static void swevent_hlist_put(struct perf_event *event)
@@ -4521,12 +4575,12 @@ static void swevent_hlist_put(struct perf_event *event)
4521 4575
4522static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) 4576static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4523{ 4577{
4524 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4578 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4525 int err = 0; 4579 int err = 0;
4526 4580
4527 mutex_lock(&cpuctx->hlist_mutex); 4581 mutex_lock(&swhash->hlist_mutex);
4528 4582
4529 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { 4583 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
4530 struct swevent_hlist *hlist; 4584 struct swevent_hlist *hlist;
4531 4585
4532 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 4586 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4534,11 +4588,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4534 err = -ENOMEM; 4588 err = -ENOMEM;
4535 goto exit; 4589 goto exit;
4536 } 4590 }
4537 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 4591 rcu_assign_pointer(swhash->swevent_hlist, hlist);
4538 } 4592 }
4539 cpuctx->hlist_refcount++; 4593 swhash->hlist_refcount++;
4540 exit: 4594exit:
4541 mutex_unlock(&cpuctx->hlist_mutex); 4595 mutex_unlock(&swhash->hlist_mutex);
4542 4596
4543 return err; 4597 return err;
4544} 4598}
@@ -4562,7 +4616,7 @@ static int swevent_hlist_get(struct perf_event *event)
4562 put_online_cpus(); 4616 put_online_cpus();
4563 4617
4564 return 0; 4618 return 0;
4565 fail: 4619fail:
4566 for_each_possible_cpu(cpu) { 4620 for_each_possible_cpu(cpu) {
4567 if (cpu == failed_cpu) 4621 if (cpu == failed_cpu)
4568 break; 4622 break;
@@ -4573,17 +4627,64 @@ static int swevent_hlist_get(struct perf_event *event)
4573 return err; 4627 return err;
4574} 4628}
4575 4629
4576#ifdef CONFIG_EVENT_TRACING 4630atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4631
4632static void sw_perf_event_destroy(struct perf_event *event)
4633{
4634 u64 event_id = event->attr.config;
4635
4636 WARN_ON(event->parent);
4637
4638 jump_label_dec(&perf_swevent_enabled[event_id]);
4639 swevent_hlist_put(event);
4640}
4641
4642static int perf_swevent_init(struct perf_event *event)
4643{
4644 int event_id = event->attr.config;
4645
4646 if (event->attr.type != PERF_TYPE_SOFTWARE)
4647 return -ENOENT;
4577 4648
4578static const struct pmu perf_ops_tracepoint = { 4649 switch (event_id) {
4579 .enable = perf_trace_enable, 4650 case PERF_COUNT_SW_CPU_CLOCK:
4580 .disable = perf_trace_disable, 4651 case PERF_COUNT_SW_TASK_CLOCK:
4581 .start = perf_swevent_int, 4652 return -ENOENT;
4582 .stop = perf_swevent_void, 4653
4654 default:
4655 break;
4656 }
4657
4658 if (event_id > PERF_COUNT_SW_MAX)
4659 return -ENOENT;
4660
4661 if (!event->parent) {
4662 int err;
4663
4664 err = swevent_hlist_get(event);
4665 if (err)
4666 return err;
4667
4668 jump_label_inc(&perf_swevent_enabled[event_id]);
4669 event->destroy = sw_perf_event_destroy;
4670 }
4671
4672 return 0;
4673}
4674
4675static struct pmu perf_swevent = {
4676 .task_ctx_nr = perf_sw_context,
4677
4678 .event_init = perf_swevent_init,
4679 .add = perf_swevent_add,
4680 .del = perf_swevent_del,
4681 .start = perf_swevent_start,
4682 .stop = perf_swevent_stop,
4583 .read = perf_swevent_read, 4683 .read = perf_swevent_read,
4584 .unthrottle = perf_swevent_void,
4585}; 4684};
4586 4685
4686#ifdef CONFIG_EVENT_TRACING
4687
4587static int perf_tp_filter_match(struct perf_event *event, 4688static int perf_tp_filter_match(struct perf_event *event,
4588 struct perf_sample_data *data) 4689 struct perf_sample_data *data)
4589{ 4690{
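[editor's note] perf_swevent_init() above illustrates the new dispatch model: perf_init_event() (the loop at the very end of this diff) offers the event to each registered PMU in turn; a PMU claims it by returning 0 from event_init(), declines with -ENOENT so the next PMU gets a look, and any other error is treated as a hard failure. A minimal event_init() following that convention for a hypothetical PMU that accepts a single raw config value; MY_FAKE_CONFIG and my_event_init are invented:

#define MY_FAKE_CONFIG	0x42	/* illustrative only */

static int my_event_init(struct perf_event *event)
{
	if (event->attr.type != PERF_TYPE_RAW)
		return -ENOENT;		/* not ours: let the next pmu look at it */

	if (event->attr.config != MY_FAKE_CONFIG)
		return -ENOENT;

	if (event->attr.sample_period && event->attr.sample_period < 16)
		return -EINVAL;		/* ours, but the request itself is bad */

	return 0;
}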
@@ -4627,7 +4728,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4627 4728
4628 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4729 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4629 if (perf_tp_event_match(event, &data, regs)) 4730 if (perf_tp_event_match(event, &data, regs))
4630 perf_swevent_add(event, count, 1, &data, regs); 4731 perf_swevent_event(event, count, 1, &data, regs);
4631 } 4732 }
4632 4733
4633 perf_swevent_put_recursion_context(rctx); 4734 perf_swevent_put_recursion_context(rctx);
@@ -4639,10 +4740,13 @@ static void tp_perf_event_destroy(struct perf_event *event)
4639 perf_trace_destroy(event); 4740 perf_trace_destroy(event);
4640} 4741}
4641 4742
4642static const struct pmu *tp_perf_event_init(struct perf_event *event) 4743static int perf_tp_event_init(struct perf_event *event)
4643{ 4744{
4644 int err; 4745 int err;
4645 4746
4747 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4748 return -ENOENT;
4749
4646 /* 4750 /*
4647 * Raw tracepoint data is a severe data leak, only allow root to 4751 * Raw tracepoint data is a severe data leak, only allow root to
4648 * have these. 4752 * have these.
@@ -4650,15 +4754,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4650 if ((event->attr.sample_type & PERF_SAMPLE_RAW) && 4754 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4651 perf_paranoid_tracepoint_raw() && 4755 perf_paranoid_tracepoint_raw() &&
4652 !capable(CAP_SYS_ADMIN)) 4756 !capable(CAP_SYS_ADMIN))
4653 return ERR_PTR(-EPERM); 4757 return -EPERM;
4654 4758
4655 err = perf_trace_init(event); 4759 err = perf_trace_init(event);
4656 if (err) 4760 if (err)
4657 return NULL; 4761 return err;
4658 4762
4659 event->destroy = tp_perf_event_destroy; 4763 event->destroy = tp_perf_event_destroy;
4660 4764
4661 return &perf_ops_tracepoint; 4765 return 0;
4766}
4767
4768static struct pmu perf_tracepoint = {
4769 .task_ctx_nr = perf_sw_context,
4770
4771 .event_init = perf_tp_event_init,
4772 .add = perf_trace_add,
4773 .del = perf_trace_del,
4774 .start = perf_swevent_start,
4775 .stop = perf_swevent_stop,
4776 .read = perf_swevent_read,
4777};
4778
4779static inline void perf_tp_register(void)
4780{
4781 perf_pmu_register(&perf_tracepoint);
4662} 4782}
4663 4783
4664static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4784static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4686,9 +4806,8 @@ static void perf_event_free_filter(struct perf_event *event)
4686 4806
4687#else 4807#else
4688 4808
4689static const struct pmu *tp_perf_event_init(struct perf_event *event) 4809static inline void perf_tp_register(void)
4690{ 4810{
4691 return NULL;
4692} 4811}
4693 4812
4694static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4813static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4703,106 +4822,390 @@ static void perf_event_free_filter(struct perf_event *event)
4703#endif /* CONFIG_EVENT_TRACING */ 4822#endif /* CONFIG_EVENT_TRACING */
4704 4823
4705#ifdef CONFIG_HAVE_HW_BREAKPOINT 4824#ifdef CONFIG_HAVE_HW_BREAKPOINT
4706static void bp_perf_event_destroy(struct perf_event *event) 4825void perf_bp_event(struct perf_event *bp, void *data)
4707{ 4826{
4708 release_bp_slot(event); 4827 struct perf_sample_data sample;
4828 struct pt_regs *regs = data;
4829
4830 perf_sample_data_init(&sample, bp->attr.bp_addr);
4831
4832 if (!bp->hw.state && !perf_exclude_event(bp, regs))
4833 perf_swevent_event(bp, 1, 1, &sample, regs);
4709} 4834}
4835#endif
4710 4836
4711static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4837/*
4838 * hrtimer based swevent callback
4839 */
4840
4841static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4712{ 4842{
4713 int err; 4843 enum hrtimer_restart ret = HRTIMER_RESTART;
4844 struct perf_sample_data data;
4845 struct pt_regs *regs;
4846 struct perf_event *event;
4847 u64 period;
4714 4848
4715 err = register_perf_hw_breakpoint(bp); 4849 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4716 if (err) 4850 event->pmu->read(event);
4717 return ERR_PTR(err);
4718 4851
4719 bp->destroy = bp_perf_event_destroy; 4852 perf_sample_data_init(&data, 0);
4853 data.period = event->hw.last_period;
4854 regs = get_irq_regs();
4720 4855
4721 return &perf_ops_bp; 4856 if (regs && !perf_exclude_event(event, regs)) {
4857 if (!(event->attr.exclude_idle && current->pid == 0))
4858 if (perf_event_overflow(event, 0, &data, regs))
4859 ret = HRTIMER_NORESTART;
4860 }
4861
4862 period = max_t(u64, 10000, event->hw.sample_period);
4863 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4864
4865 return ret;
4722} 4866}
4723 4867
4724void perf_bp_event(struct perf_event *bp, void *data) 4868static void perf_swevent_start_hrtimer(struct perf_event *event)
4725{ 4869{
4726 struct perf_sample_data sample; 4870 struct hw_perf_event *hwc = &event->hw;
4727 struct pt_regs *regs = data;
4728 4871
4729 perf_sample_data_init(&sample, bp->attr.bp_addr); 4872 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4873 hwc->hrtimer.function = perf_swevent_hrtimer;
4874 if (hwc->sample_period) {
4875 s64 period = local64_read(&hwc->period_left);
4876
4877 if (period) {
4878 if (period < 0)
4879 period = 10000;
4730 4880
4731 if (!perf_exclude_event(bp, regs)) 4881 local64_set(&hwc->period_left, 0);
4732 perf_swevent_add(bp, 1, 1, &sample, regs); 4882 } else {
4883 period = max_t(u64, 10000, hwc->sample_period);
4884 }
4885 __hrtimer_start_range_ns(&hwc->hrtimer,
4886 ns_to_ktime(period), 0,
4887 HRTIMER_MODE_REL_PINNED, 0);
4888 }
4733} 4889}
4734#else 4890
4735static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4891static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4736{ 4892{
4737 return NULL; 4893 struct hw_perf_event *hwc = &event->hw;
4894
4895 if (hwc->sample_period) {
4896 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4897 local64_set(&hwc->period_left, ktime_to_ns(remaining));
4898
4899 hrtimer_cancel(&hwc->hrtimer);
4900 }
4738} 4901}
4739 4902
4740void perf_bp_event(struct perf_event *bp, void *regs) 4903/*
4904 * Software event: cpu wall time clock
4905 */
4906
4907static void cpu_clock_event_update(struct perf_event *event)
4741{ 4908{
4909 s64 prev;
4910 u64 now;
4911
4912 now = local_clock();
4913 prev = local64_xchg(&event->hw.prev_count, now);
4914 local64_add(now - prev, &event->count);
4742} 4915}
4743#endif
4744 4916
4745atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4917static void cpu_clock_event_start(struct perf_event *event, int flags)
4918{
4919 local64_set(&event->hw.prev_count, local_clock());
4920 perf_swevent_start_hrtimer(event);
4921}
4746 4922
4747static void sw_perf_event_destroy(struct perf_event *event) 4923static void cpu_clock_event_stop(struct perf_event *event, int flags)
4748{ 4924{
4749 u64 event_id = event->attr.config; 4925 perf_swevent_cancel_hrtimer(event);
4926 cpu_clock_event_update(event);
4927}
4750 4928
4751 WARN_ON(event->parent); 4929static int cpu_clock_event_add(struct perf_event *event, int flags)
4930{
4931 if (flags & PERF_EF_START)
4932 cpu_clock_event_start(event, flags);
4752 4933
4753 atomic_dec(&perf_swevent_enabled[event_id]); 4934 return 0;
4754 swevent_hlist_put(event);
4755} 4935}
4756 4936
4757static const struct pmu *sw_perf_event_init(struct perf_event *event) 4937static void cpu_clock_event_del(struct perf_event *event, int flags)
4758{ 4938{
4759 const struct pmu *pmu = NULL; 4939 cpu_clock_event_stop(event, flags);
4760 u64 event_id = event->attr.config; 4940}
4941
4942static void cpu_clock_event_read(struct perf_event *event)
4943{
4944 cpu_clock_event_update(event);
4945}
4946
4947static int cpu_clock_event_init(struct perf_event *event)
4948{
4949 if (event->attr.type != PERF_TYPE_SOFTWARE)
4950 return -ENOENT;
4951
4952 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
4953 return -ENOENT;
4954
4955 return 0;
4956}
4957
4958static struct pmu perf_cpu_clock = {
4959 .task_ctx_nr = perf_sw_context,
4960
4961 .event_init = cpu_clock_event_init,
4962 .add = cpu_clock_event_add,
4963 .del = cpu_clock_event_del,
4964 .start = cpu_clock_event_start,
4965 .stop = cpu_clock_event_stop,
4966 .read = cpu_clock_event_read,
4967};
4968
4969/*
4970 * Software event: task time clock
4971 */
4972
4973static void task_clock_event_update(struct perf_event *event, u64 now)
4974{
4975 u64 prev;
4976 s64 delta;
4977
4978 prev = local64_xchg(&event->hw.prev_count, now);
4979 delta = now - prev;
4980 local64_add(delta, &event->count);
4981}
4982
4983static void task_clock_event_start(struct perf_event *event, int flags)
4984{
4985 local64_set(&event->hw.prev_count, event->ctx->time);
4986 perf_swevent_start_hrtimer(event);
4987}
4988
4989static void task_clock_event_stop(struct perf_event *event, int flags)
4990{
4991 perf_swevent_cancel_hrtimer(event);
4992 task_clock_event_update(event, event->ctx->time);
4993}
4994
4995static int task_clock_event_add(struct perf_event *event, int flags)
4996{
4997 if (flags & PERF_EF_START)
4998 task_clock_event_start(event, flags);
4999
5000 return 0;
5001}
5002
5003static void task_clock_event_del(struct perf_event *event, int flags)
5004{
5005 task_clock_event_stop(event, PERF_EF_UPDATE);
5006}
5007
5008static void task_clock_event_read(struct perf_event *event)
5009{
5010 u64 time;
5011
5012 if (!in_nmi()) {
5013 update_context_time(event->ctx);
5014 time = event->ctx->time;
5015 } else {
5016 u64 now = perf_clock();
5017 u64 delta = now - event->ctx->timestamp;
5018 time = event->ctx->time + delta;
5019 }
5020
5021 task_clock_event_update(event, time);
5022}
5023
5024static int task_clock_event_init(struct perf_event *event)
5025{
5026 if (event->attr.type != PERF_TYPE_SOFTWARE)
5027 return -ENOENT;
5028
5029 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5030 return -ENOENT;
5031
5032 return 0;
5033}
5034
5035static struct pmu perf_task_clock = {
5036 .task_ctx_nr = perf_sw_context,
5037
5038 .event_init = task_clock_event_init,
5039 .add = task_clock_event_add,
5040 .del = task_clock_event_del,
5041 .start = task_clock_event_start,
5042 .stop = task_clock_event_stop,
5043 .read = task_clock_event_read,
5044};
5045
5046static void perf_pmu_nop_void(struct pmu *pmu)
5047{
5048}
5049
5050static int perf_pmu_nop_int(struct pmu *pmu)
5051{
5052 return 0;
5053}
5054
5055static void perf_pmu_start_txn(struct pmu *pmu)
5056{
5057 perf_pmu_disable(pmu);
5058}
5059
5060static int perf_pmu_commit_txn(struct pmu *pmu)
5061{
5062 perf_pmu_enable(pmu);
5063 return 0;
5064}
5065
5066static void perf_pmu_cancel_txn(struct pmu *pmu)
5067{
5068 perf_pmu_enable(pmu);
5069}
5070
5071/*
5072 * Ensures all contexts with the same task_ctx_nr have the same
5073 * pmu_cpu_context too.
5074 */
5075static void *find_pmu_context(int ctxn)
5076{
5077 struct pmu *pmu;
5078
5079 if (ctxn < 0)
5080 return NULL;
5081
5082 list_for_each_entry(pmu, &pmus, entry) {
5083 if (pmu->task_ctx_nr == ctxn)
5084 return pmu->pmu_cpu_context;
5085 }
5086
5087 return NULL;
5088}
5089
5090static void free_pmu_context(void * __percpu cpu_context)
5091{
5092 struct pmu *pmu;
4761 5093
5094 mutex_lock(&pmus_lock);
4762 /* 5095 /*
4763 * Software events (currently) can't in general distinguish 5096 * Like a real lame refcount.
4764 * between user, kernel and hypervisor events.
4765 * However, context switches and cpu migrations are considered
4766 * to be kernel events, and page faults are never hypervisor
4767 * events.
4768 */ 5097 */
4769 switch (event_id) { 5098 list_for_each_entry(pmu, &pmus, entry) {
4770 case PERF_COUNT_SW_CPU_CLOCK: 5099 if (pmu->pmu_cpu_context == cpu_context)
4771 pmu = &perf_ops_cpu_clock; 5100 goto out;
5101 }
4772 5102
4773 break; 5103 free_percpu(cpu_context);
4774 case PERF_COUNT_SW_TASK_CLOCK: 5104out:
4775 /* 5105 mutex_unlock(&pmus_lock);
4776 * If the user instantiates this as a per-cpu event, 5106}
4777 * use the cpu_clock event instead.
4778 */
4779 if (event->ctx->task)
4780 pmu = &perf_ops_task_clock;
4781 else
4782 pmu = &perf_ops_cpu_clock;
4783 5107
4784 break; 5108int perf_pmu_register(struct pmu *pmu)
4785 case PERF_COUNT_SW_PAGE_FAULTS: 5109{
4786 case PERF_COUNT_SW_PAGE_FAULTS_MIN: 5110 int cpu, ret;
4787 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 5111
4788 case PERF_COUNT_SW_CONTEXT_SWITCHES: 5112 mutex_lock(&pmus_lock);
4789 case PERF_COUNT_SW_CPU_MIGRATIONS: 5113 ret = -ENOMEM;
4790 case PERF_COUNT_SW_ALIGNMENT_FAULTS: 5114 pmu->pmu_disable_count = alloc_percpu(int);
4791 case PERF_COUNT_SW_EMULATION_FAULTS: 5115 if (!pmu->pmu_disable_count)
4792 if (!event->parent) { 5116 goto unlock;
4793 int err; 5117
4794 5118 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
4795 err = swevent_hlist_get(event); 5119 if (pmu->pmu_cpu_context)
4796 if (err) 5120 goto got_cpu_context;
4797 return ERR_PTR(err); 5121
5122 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5123 if (!pmu->pmu_cpu_context)
5124 goto free_pdc;
5125
5126 for_each_possible_cpu(cpu) {
5127 struct perf_cpu_context *cpuctx;
5128
5129 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5130 __perf_event_init_context(&cpuctx->ctx);
5131 cpuctx->ctx.type = cpu_context;
5132 cpuctx->ctx.pmu = pmu;
5133 cpuctx->jiffies_interval = 1;
5134 INIT_LIST_HEAD(&cpuctx->rotation_list);
5135 }
4798 5136
4799 atomic_inc(&perf_swevent_enabled[event_id]); 5137got_cpu_context:
4800 event->destroy = sw_perf_event_destroy; 5138 if (!pmu->start_txn) {
5139 if (pmu->pmu_enable) {
5140 /*
5141 * If we have pmu_enable/pmu_disable calls, install
5142 * transaction stubs that use that to try and batch
5143 * hardware accesses.
5144 */
5145 pmu->start_txn = perf_pmu_start_txn;
5146 pmu->commit_txn = perf_pmu_commit_txn;
5147 pmu->cancel_txn = perf_pmu_cancel_txn;
5148 } else {
5149 pmu->start_txn = perf_pmu_nop_void;
5150 pmu->commit_txn = perf_pmu_nop_int;
5151 pmu->cancel_txn = perf_pmu_nop_void;
4801 } 5152 }
4802 pmu = &perf_ops_generic;
4803 break;
4804 } 5153 }
4805 5154
5155 if (!pmu->pmu_enable) {
5156 pmu->pmu_enable = perf_pmu_nop_void;
5157 pmu->pmu_disable = perf_pmu_nop_void;
5158 }
5159
5160 list_add_rcu(&pmu->entry, &pmus);
5161 ret = 0;
5162unlock:
5163 mutex_unlock(&pmus_lock);
5164
5165 return ret;
5166
5167free_pdc:
5168 free_percpu(pmu->pmu_disable_count);
5169 goto unlock;
5170}
5171
5172void perf_pmu_unregister(struct pmu *pmu)
5173{
5174 mutex_lock(&pmus_lock);
5175 list_del_rcu(&pmu->entry);
5176 mutex_unlock(&pmus_lock);
5177
5178 /*
5179 * We dereference the pmu list under both SRCU and regular RCU, so
5180 * synchronize against both of those.
5181 */
5182 synchronize_srcu(&pmus_srcu);
5183 synchronize_rcu();
5184
5185 free_percpu(pmu->pmu_disable_count);
5186 free_pmu_context(pmu->pmu_cpu_context);
5187}
5188
5189struct pmu *perf_init_event(struct perf_event *event)
5190{
5191 struct pmu *pmu = NULL;
5192 int idx;
5193
5194 idx = srcu_read_lock(&pmus_srcu);
5195 list_for_each_entry_rcu(pmu, &pmus, entry) {
5196 int ret = pmu->event_init(event);
5197 if (!ret)
5198 goto unlock;
5199
5200 if (ret != -ENOENT) {
5201 pmu = ERR_PTR(ret);
5202 goto unlock;
5203 }
5204 }
5205 pmu = ERR_PTR(-ENOENT);
5206unlock:
5207 srcu_read_unlock(&pmus_srcu, idx);
5208
4806 return pmu; 5209 return pmu;
4807} 5210}
4808 5211
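Taken together, perf_pmu_register(), perf_pmu_unregister() and perf_init_event() define the contract a PMU driver has to meet. Below is a hypothetical out-of-tree sketch written against the struct pmu layout and the perf_pmu_register() signature introduced by this patch (later kernels extend that signature); all names and the attr.type value are made up:

/* Hypothetical sketch of the registration contract above: event_init
 * returns -ENOENT for events the PMU does not recognise, so
 * perf_init_event() keeps walking the pmus list; any other error stops
 * the search.  start_txn/commit_txn/cancel_txn and pmu_enable/pmu_disable
 * may be left NULL, perf_pmu_register() fills in nop stubs or the
 * disable-based transaction helpers as shown above. */
#include <linux/perf_event.h>
#include <linux/module.h>

#define TOY_PMU_TYPE 0x1000        /* made-up attr.type, for illustration */

static int toy_event_init(struct perf_event *event)
{
        if (event->attr.type != TOY_PMU_TYPE)
                return -ENOENT;        /* not ours, try the next PMU */
        return 0;
}

static int toy_event_add(struct perf_event *event, int flags) { return 0; }
static void toy_event_del(struct perf_event *event, int flags) { }
static void toy_event_start(struct perf_event *event, int flags) { }
static void toy_event_stop(struct perf_event *event, int flags) { }
static void toy_event_read(struct perf_event *event) { }

static struct pmu toy_pmu = {
        .task_ctx_nr    = perf_sw_context,        /* share the software context */
        .event_init     = toy_event_init,
        .add            = toy_event_add,
        .del            = toy_event_del,
        .start          = toy_event_start,
        .stop           = toy_event_stop,
        .read           = toy_event_read,
};

static int __init toy_pmu_init(void)
{
        return perf_pmu_register(&toy_pmu);
}
module_init(toy_pmu_init);

static void __exit toy_pmu_exit(void)
{
        perf_pmu_unregister(&toy_pmu);        /* waits out SRCU/RCU readers */
}
module_exit(toy_pmu_exit);
MODULE_LICENSE("GPL");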
@@ -4810,20 +5213,18 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4810 * Allocate and initialize a event structure 5213 * Allocate and initialize a event structure
4811 */ 5214 */
4812static struct perf_event * 5215static struct perf_event *
4813perf_event_alloc(struct perf_event_attr *attr, 5216perf_event_alloc(struct perf_event_attr *attr, int cpu,
4814 int cpu, 5217 struct task_struct *task,
4815 struct perf_event_context *ctx, 5218 struct perf_event *group_leader,
4816 struct perf_event *group_leader, 5219 struct perf_event *parent_event,
4817 struct perf_event *parent_event, 5220 perf_overflow_handler_t overflow_handler)
4818 perf_overflow_handler_t overflow_handler, 5221{
4819 gfp_t gfpflags) 5222 struct pmu *pmu;
4820{
4821 const struct pmu *pmu;
4822 struct perf_event *event; 5223 struct perf_event *event;
4823 struct hw_perf_event *hwc; 5224 struct hw_perf_event *hwc;
4824 long err; 5225 long err;
4825 5226
4826 event = kzalloc(sizeof(*event), gfpflags); 5227 event = kzalloc(sizeof(*event), GFP_KERNEL);
4827 if (!event) 5228 if (!event)
4828 return ERR_PTR(-ENOMEM); 5229 return ERR_PTR(-ENOMEM);
4829 5230
@@ -4841,6 +5242,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4841 INIT_LIST_HEAD(&event->event_entry); 5242 INIT_LIST_HEAD(&event->event_entry);
4842 INIT_LIST_HEAD(&event->sibling_list); 5243 INIT_LIST_HEAD(&event->sibling_list);
4843 init_waitqueue_head(&event->waitq); 5244 init_waitqueue_head(&event->waitq);
5245 init_irq_work(&event->pending, perf_pending_event);
4844 5246
4845 mutex_init(&event->mmap_mutex); 5247 mutex_init(&event->mmap_mutex);
4846 5248
@@ -4848,7 +5250,6 @@ perf_event_alloc(struct perf_event_attr *attr,
4848 event->attr = *attr; 5250 event->attr = *attr;
4849 event->group_leader = group_leader; 5251 event->group_leader = group_leader;
4850 event->pmu = NULL; 5252 event->pmu = NULL;
4851 event->ctx = ctx;
4852 event->oncpu = -1; 5253 event->oncpu = -1;
4853 5254
4854 event->parent = parent_event; 5255 event->parent = parent_event;
@@ -4858,6 +5259,17 @@ perf_event_alloc(struct perf_event_attr *attr,
4858 5259
4859 event->state = PERF_EVENT_STATE_INACTIVE; 5260 event->state = PERF_EVENT_STATE_INACTIVE;
4860 5261
5262 if (task) {
5263 event->attach_state = PERF_ATTACH_TASK;
5264#ifdef CONFIG_HAVE_HW_BREAKPOINT
5265 /*
5266 * hw_breakpoint is a bit difficult here..
5267 */
5268 if (attr->type == PERF_TYPE_BREAKPOINT)
5269 event->hw.bp_target = task;
5270#endif
5271 }
5272
4861 if (!overflow_handler && parent_event) 5273 if (!overflow_handler && parent_event)
4862 overflow_handler = parent_event->overflow_handler; 5274 overflow_handler = parent_event->overflow_handler;
4863 5275
@@ -4882,29 +5294,8 @@ perf_event_alloc(struct perf_event_attr *attr,
4882 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 5294 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4883 goto done; 5295 goto done;
4884 5296
4885 switch (attr->type) { 5297 pmu = perf_init_event(event);
4886 case PERF_TYPE_RAW:
4887 case PERF_TYPE_HARDWARE:
4888 case PERF_TYPE_HW_CACHE:
4889 pmu = hw_perf_event_init(event);
4890 break;
4891
4892 case PERF_TYPE_SOFTWARE:
4893 pmu = sw_perf_event_init(event);
4894 break;
4895
4896 case PERF_TYPE_TRACEPOINT:
4897 pmu = tp_perf_event_init(event);
4898 break;
4899
4900 case PERF_TYPE_BREAKPOINT:
4901 pmu = bp_perf_event_init(event);
4902 break;
4903 5298
4904
4905 default:
4906 break;
4907 }
4908done: 5299done:
4909 err = 0; 5300 err = 0;
4910 if (!pmu) 5301 if (!pmu)
@@ -4922,13 +5313,21 @@ done:
4922 event->pmu = pmu; 5313 event->pmu = pmu;
4923 5314
4924 if (!event->parent) { 5315 if (!event->parent) {
4925 atomic_inc(&nr_events); 5316 if (event->attach_state & PERF_ATTACH_TASK)
5317 jump_label_inc(&perf_task_events);
4926 if (event->attr.mmap || event->attr.mmap_data) 5318 if (event->attr.mmap || event->attr.mmap_data)
4927 atomic_inc(&nr_mmap_events); 5319 atomic_inc(&nr_mmap_events);
4928 if (event->attr.comm) 5320 if (event->attr.comm)
4929 atomic_inc(&nr_comm_events); 5321 atomic_inc(&nr_comm_events);
4930 if (event->attr.task) 5322 if (event->attr.task)
4931 atomic_inc(&nr_task_events); 5323 atomic_inc(&nr_task_events);
5324 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
5325 err = get_callchain_buffers();
5326 if (err) {
5327 free_event(event);
5328 return ERR_PTR(err);
5329 }
5330 }
4932 } 5331 }
4933 5332
4934 return event; 5333 return event;
@@ -5076,12 +5475,16 @@ SYSCALL_DEFINE5(perf_event_open,
5076 struct perf_event_attr __user *, attr_uptr, 5475 struct perf_event_attr __user *, attr_uptr,
5077 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 5476 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
5078{ 5477{
5079 struct perf_event *event, *group_leader = NULL, *output_event = NULL; 5478 struct perf_event *group_leader = NULL, *output_event = NULL;
5479 struct perf_event *event, *sibling;
5080 struct perf_event_attr attr; 5480 struct perf_event_attr attr;
5081 struct perf_event_context *ctx; 5481 struct perf_event_context *ctx;
5082 struct file *event_file = NULL; 5482 struct file *event_file = NULL;
5083 struct file *group_file = NULL; 5483 struct file *group_file = NULL;
5484 struct task_struct *task = NULL;
5485 struct pmu *pmu;
5084 int event_fd; 5486 int event_fd;
5487 int move_group = 0;
5085 int fput_needed = 0; 5488 int fput_needed = 0;
5086 int err; 5489 int err;
5087 5490
@@ -5107,20 +5510,11 @@ SYSCALL_DEFINE5(perf_event_open,
5107 if (event_fd < 0) 5510 if (event_fd < 0)
5108 return event_fd; 5511 return event_fd;
5109 5512
5110 /*
5111 * Get the target context (task or percpu):
5112 */
5113 ctx = find_get_context(pid, cpu);
5114 if (IS_ERR(ctx)) {
5115 err = PTR_ERR(ctx);
5116 goto err_fd;
5117 }
5118
5119 if (group_fd != -1) { 5513 if (group_fd != -1) {
5120 group_leader = perf_fget_light(group_fd, &fput_needed); 5514 group_leader = perf_fget_light(group_fd, &fput_needed);
5121 if (IS_ERR(group_leader)) { 5515 if (IS_ERR(group_leader)) {
5122 err = PTR_ERR(group_leader); 5516 err = PTR_ERR(group_leader);
5123 goto err_put_context; 5517 goto err_fd;
5124 } 5518 }
5125 group_file = group_leader->filp; 5519 group_file = group_leader->filp;
5126 if (flags & PERF_FLAG_FD_OUTPUT) 5520 if (flags & PERF_FLAG_FD_OUTPUT)
@@ -5129,6 +5523,58 @@ SYSCALL_DEFINE5(perf_event_open,
5129 group_leader = NULL; 5523 group_leader = NULL;
5130 } 5524 }
5131 5525
5526 if (pid != -1) {
5527 task = find_lively_task_by_vpid(pid);
5528 if (IS_ERR(task)) {
5529 err = PTR_ERR(task);
5530 goto err_group_fd;
5531 }
5532 }
5533
5534 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
5535 if (IS_ERR(event)) {
5536 err = PTR_ERR(event);
5537 goto err_task;
5538 }
5539
5540 /*
5541 * Special case software events and allow them to be part of
5542 * any hardware group.
5543 */
5544 pmu = event->pmu;
5545
5546 if (group_leader &&
5547 (is_software_event(event) != is_software_event(group_leader))) {
5548 if (is_software_event(event)) {
5549 /*
5550 * If event and group_leader are not both a software
5551 * event, and event is, then group leader is not.
5552 *
5553 * Allow the addition of software events to !software
5554 * groups, this is safe because software events never
5555 * fail to schedule.
5556 */
5557 pmu = group_leader->pmu;
5558 } else if (is_software_event(group_leader) &&
5559 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
5560 /*
5561 * In case the group is a pure software group, and we
5562 * try to add a hardware event, move the whole group to
5563 * the hardware context.
5564 */
5565 move_group = 1;
5566 }
5567 }
5568
5569 /*
5570 * Get the target context (task or percpu):
5571 */
5572 ctx = find_get_context(pmu, task, cpu);
5573 if (IS_ERR(ctx)) {
5574 err = PTR_ERR(ctx);
5575 goto err_alloc;
5576 }
5577
5132 /* 5578 /*
5133 * Look up the group leader (we will attach this event to it): 5579 * Look up the group leader (we will attach this event to it):
5134 */ 5580 */
@@ -5140,42 +5586,66 @@ SYSCALL_DEFINE5(perf_event_open,
5140 * becoming part of another group-sibling): 5586 * becoming part of another group-sibling):
5141 */ 5587 */
5142 if (group_leader->group_leader != group_leader) 5588 if (group_leader->group_leader != group_leader)
5143 goto err_put_context; 5589 goto err_context;
5144 /* 5590 /*
5145 * Do not allow to attach to a group in a different 5591 * Do not allow to attach to a group in a different
5146 * task or CPU context: 5592 * task or CPU context:
5147 */ 5593 */
5148 if (group_leader->ctx != ctx) 5594 if (move_group) {
5149 goto err_put_context; 5595 if (group_leader->ctx->type != ctx->type)
5596 goto err_context;
5597 } else {
5598 if (group_leader->ctx != ctx)
5599 goto err_context;
5600 }
5601
5150 /* 5602 /*
5151 * Only a group leader can be exclusive or pinned 5603 * Only a group leader can be exclusive or pinned
5152 */ 5604 */
5153 if (attr.exclusive || attr.pinned) 5605 if (attr.exclusive || attr.pinned)
5154 goto err_put_context; 5606 goto err_context;
5155 }
5156
5157 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
5158 NULL, NULL, GFP_KERNEL);
5159 if (IS_ERR(event)) {
5160 err = PTR_ERR(event);
5161 goto err_put_context;
5162 } 5607 }
5163 5608
5164 if (output_event) { 5609 if (output_event) {
5165 err = perf_event_set_output(event, output_event); 5610 err = perf_event_set_output(event, output_event);
5166 if (err) 5611 if (err)
5167 goto err_free_put_context; 5612 goto err_context;
5168 } 5613 }
5169 5614
5170 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); 5615 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
5171 if (IS_ERR(event_file)) { 5616 if (IS_ERR(event_file)) {
5172 err = PTR_ERR(event_file); 5617 err = PTR_ERR(event_file);
5173 goto err_free_put_context; 5618 goto err_context;
5619 }
5620
5621 if (move_group) {
5622 struct perf_event_context *gctx = group_leader->ctx;
5623
5624 mutex_lock(&gctx->mutex);
5625 perf_event_remove_from_context(group_leader);
5626 list_for_each_entry(sibling, &group_leader->sibling_list,
5627 group_entry) {
5628 perf_event_remove_from_context(sibling);
5629 put_ctx(gctx);
5630 }
5631 mutex_unlock(&gctx->mutex);
5632 put_ctx(gctx);
5174 } 5633 }
5175 5634
5176 event->filp = event_file; 5635 event->filp = event_file;
5177 WARN_ON_ONCE(ctx->parent_ctx); 5636 WARN_ON_ONCE(ctx->parent_ctx);
5178 mutex_lock(&ctx->mutex); 5637 mutex_lock(&ctx->mutex);
5638
5639 if (move_group) {
5640 perf_install_in_context(ctx, group_leader, cpu);
5641 get_ctx(ctx);
5642 list_for_each_entry(sibling, &group_leader->sibling_list,
5643 group_entry) {
5644 perf_install_in_context(ctx, sibling, cpu);
5645 get_ctx(ctx);
5646 }
5647 }
5648
5179 perf_install_in_context(ctx, event, cpu); 5649 perf_install_in_context(ctx, event, cpu);
5180 ++ctx->generation; 5650 ++ctx->generation;
5181 mutex_unlock(&ctx->mutex); 5651 mutex_unlock(&ctx->mutex);
@@ -5196,11 +5666,15 @@ SYSCALL_DEFINE5(perf_event_open,
5196 fd_install(event_fd, event_file); 5666 fd_install(event_fd, event_file);
5197 return event_fd; 5667 return event_fd;
5198 5668
5199err_free_put_context: 5669err_context:
5670 put_ctx(ctx);
5671err_alloc:
5200 free_event(event); 5672 free_event(event);
5201err_put_context: 5673err_task:
5674 if (task)
5675 put_task_struct(task);
5676err_group_fd:
5202 fput_light(group_file, fput_needed); 5677 fput_light(group_file, fput_needed);
5203 put_ctx(ctx);
5204err_fd: 5678err_fd:
5205 put_unused_fd(event_fd); 5679 put_unused_fd(event_fd);
5206 return err; 5680 return err;
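For illustration, a small user-space sketch (not part of this patch) of the grouping rules the syscall now enforces: a hardware cycles event leads the group and a software page-fault event joins it through group_fd, which is always allowed because software events never fail to schedule; in the opposite direction, adding a hardware event to a pure-software group takes the move_group path above and migrates the whole group into the hardware context.

/* Hypothetical user-space sketch of hardware/software event grouping. */
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr hw, sw;
        struct { uint64_t nr, vals[2]; } buf;        /* PERF_FORMAT_GROUP layout */
        int leader, member;

        memset(&hw, 0, sizeof(hw));
        hw.size = sizeof(hw);
        hw.type = PERF_TYPE_HARDWARE;
        hw.config = PERF_COUNT_HW_CPU_CYCLES;
        hw.read_format = PERF_FORMAT_GROUP;        /* one read() returns the group */
        hw.disabled = 1;                           /* enable the whole group below */

        memset(&sw, 0, sizeof(sw));
        sw.size = sizeof(sw);
        sw.type = PERF_TYPE_SOFTWARE;
        sw.config = PERF_COUNT_SW_PAGE_FAULTS;

        leader = perf_event_open(&hw, 0, -1, -1, 0);
        member = perf_event_open(&sw, 0, -1, leader, 0);
        if (leader < 0 || member < 0) {
                perror("perf_event_open");
                return 1;
        }

        ioctl(leader, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
        /* ... workload under measurement ... */
        ioctl(leader, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);

        if (read(leader, &buf, sizeof(buf)) > 0)
                printf("%llu counters: cycles=%llu faults=%llu\n",
                       (unsigned long long)buf.nr,
                       (unsigned long long)buf.vals[0],
                       (unsigned long long)buf.vals[1]);
        return 0;
}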
@@ -5211,32 +5685,31 @@ err_fd:
5211 * 5685 *
5212 * @attr: attributes of the counter to create 5686 * @attr: attributes of the counter to create
5213 * @cpu: cpu in which the counter is bound 5687 * @cpu: cpu in which the counter is bound
5214 * @pid: task to profile 5688 * @task: task to profile (NULL for percpu)
5215 */ 5689 */
5216struct perf_event * 5690struct perf_event *
5217perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 5691perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5218 pid_t pid, 5692 struct task_struct *task,
5219 perf_overflow_handler_t overflow_handler) 5693 perf_overflow_handler_t overflow_handler)
5220{ 5694{
5221 struct perf_event *event;
5222 struct perf_event_context *ctx; 5695 struct perf_event_context *ctx;
5696 struct perf_event *event;
5223 int err; 5697 int err;
5224 5698
5225 /* 5699 /*
5226 * Get the target context (task or percpu): 5700 * Get the target context (task or percpu):
5227 */ 5701 */
5228 5702
5229 ctx = find_get_context(pid, cpu); 5703 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
5230 if (IS_ERR(ctx)) {
5231 err = PTR_ERR(ctx);
5232 goto err_exit;
5233 }
5234
5235 event = perf_event_alloc(attr, cpu, ctx, NULL,
5236 NULL, overflow_handler, GFP_KERNEL);
5237 if (IS_ERR(event)) { 5704 if (IS_ERR(event)) {
5238 err = PTR_ERR(event); 5705 err = PTR_ERR(event);
5239 goto err_put_context; 5706 goto err;
5707 }
5708
5709 ctx = find_get_context(event->pmu, task, cpu);
5710 if (IS_ERR(ctx)) {
5711 err = PTR_ERR(ctx);
5712 goto err_free;
5240 } 5713 }
5241 5714
5242 event->filp = NULL; 5715 event->filp = NULL;
@@ -5254,112 +5727,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5254 5727
5255 return event; 5728 return event;
5256 5729
5257 err_put_context: 5730err_free:
5258 put_ctx(ctx); 5731 free_event(event);
5259 err_exit: 5732err:
5260 return ERR_PTR(err); 5733 return ERR_PTR(err);
5261} 5734}
5262EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 5735EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
5263 5736
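A hypothetical in-kernel sketch of the reworked API: callers now hand in a task_struct (or NULL for a per-CPU counter) instead of a pid, and the context is looked up from the event's PMU after allocation. The helper names below are made up, and perf_event_read_value()/perf_event_release_kernel() are assumed to be available to the caller:

/* Hypothetical sketch, not part of this patch. */
#include <linux/perf_event.h>
#include <linux/sched.h>

static struct perf_event *watch_task_cycles(struct task_struct *task)
{
        struct perf_event_attr attr = {
                .type   = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
                .size   = sizeof(attr),
                .pinned = 1,
        };

        /* cpu == -1: follow the task wherever it runs; no overflow handler */
        return perf_event_create_kernel_counter(&attr, -1, task, NULL);
}

static u64 read_and_release(struct perf_event *event)
{
        u64 enabled, running;
        u64 count = perf_event_read_value(event, &enabled, &running);

        perf_event_release_kernel(event);
        return count;
}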
5264/*
5265 * inherit a event from parent task to child task:
5266 */
5267static struct perf_event *
5268inherit_event(struct perf_event *parent_event,
5269 struct task_struct *parent,
5270 struct perf_event_context *parent_ctx,
5271 struct task_struct *child,
5272 struct perf_event *group_leader,
5273 struct perf_event_context *child_ctx)
5274{
5275 struct perf_event *child_event;
5276
5277 /*
5278 * Instead of creating recursive hierarchies of events,
5279 * we link inherited events back to the original parent,
5280 * which has a filp for sure, which we use as the reference
5281 * count:
5282 */
5283 if (parent_event->parent)
5284 parent_event = parent_event->parent;
5285
5286 child_event = perf_event_alloc(&parent_event->attr,
5287 parent_event->cpu, child_ctx,
5288 group_leader, parent_event,
5289 NULL, GFP_KERNEL);
5290 if (IS_ERR(child_event))
5291 return child_event;
5292 get_ctx(child_ctx);
5293
5294 /*
5295 * Make the child state follow the state of the parent event,
5296 * not its attr.disabled bit. We hold the parent's mutex,
5297 * so we won't race with perf_event_{en, dis}able_family.
5298 */
5299 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
5300 child_event->state = PERF_EVENT_STATE_INACTIVE;
5301 else
5302 child_event->state = PERF_EVENT_STATE_OFF;
5303
5304 if (parent_event->attr.freq) {
5305 u64 sample_period = parent_event->hw.sample_period;
5306 struct hw_perf_event *hwc = &child_event->hw;
5307
5308 hwc->sample_period = sample_period;
5309 hwc->last_period = sample_period;
5310
5311 local64_set(&hwc->period_left, sample_period);
5312 }
5313
5314 child_event->overflow_handler = parent_event->overflow_handler;
5315
5316 /*
5317 * Link it up in the child's context:
5318 */
5319 add_event_to_ctx(child_event, child_ctx);
5320
5321 /*
5322 * Get a reference to the parent filp - we will fput it
5323 * when the child event exits. This is safe to do because
5324 * we are in the parent and we know that the filp still
5325 * exists and has a nonzero count:
5326 */
5327 atomic_long_inc(&parent_event->filp->f_count);
5328
5329 /*
5330 * Link this into the parent event's child list
5331 */
5332 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5333 mutex_lock(&parent_event->child_mutex);
5334 list_add_tail(&child_event->child_list, &parent_event->child_list);
5335 mutex_unlock(&parent_event->child_mutex);
5336
5337 return child_event;
5338}
5339
5340static int inherit_group(struct perf_event *parent_event,
5341 struct task_struct *parent,
5342 struct perf_event_context *parent_ctx,
5343 struct task_struct *child,
5344 struct perf_event_context *child_ctx)
5345{
5346 struct perf_event *leader;
5347 struct perf_event *sub;
5348 struct perf_event *child_ctr;
5349
5350 leader = inherit_event(parent_event, parent, parent_ctx,
5351 child, NULL, child_ctx);
5352 if (IS_ERR(leader))
5353 return PTR_ERR(leader);
5354 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
5355 child_ctr = inherit_event(sub, parent, parent_ctx,
5356 child, leader, child_ctx);
5357 if (IS_ERR(child_ctr))
5358 return PTR_ERR(child_ctr);
5359 }
5360 return 0;
5361}
5362
5363static void sync_child_event(struct perf_event *child_event, 5737static void sync_child_event(struct perf_event *child_event,
5364 struct task_struct *child) 5738 struct task_struct *child)
5365{ 5739{
@@ -5416,16 +5790,13 @@ __perf_event_exit_task(struct perf_event *child_event,
5416 } 5790 }
5417} 5791}
5418 5792
5419/* 5793static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
5420 * When a child task exits, feed back event values to parent events.
5421 */
5422void perf_event_exit_task(struct task_struct *child)
5423{ 5794{
5424 struct perf_event *child_event, *tmp; 5795 struct perf_event *child_event, *tmp;
5425 struct perf_event_context *child_ctx; 5796 struct perf_event_context *child_ctx;
5426 unsigned long flags; 5797 unsigned long flags;
5427 5798
5428 if (likely(!child->perf_event_ctxp)) { 5799 if (likely(!child->perf_event_ctxp[ctxn])) {
5429 perf_event_task(child, NULL, 0); 5800 perf_event_task(child, NULL, 0);
5430 return; 5801 return;
5431 } 5802 }
@@ -5437,8 +5808,8 @@ void perf_event_exit_task(struct task_struct *child)
5437 * scheduled, so we are now safe from rescheduling changing 5808 * scheduled, so we are now safe from rescheduling changing
5438 * our context. 5809 * our context.
5439 */ 5810 */
5440 child_ctx = child->perf_event_ctxp; 5811 child_ctx = child->perf_event_ctxp[ctxn];
5441 __perf_event_task_sched_out(child_ctx); 5812 task_ctx_sched_out(child_ctx, EVENT_ALL);
5442 5813
5443 /* 5814 /*
5444 * Take the context lock here so that if find_get_context is 5815 * Take the context lock here so that if find_get_context is
@@ -5446,7 +5817,7 @@ void perf_event_exit_task(struct task_struct *child)
5446 * incremented the context's refcount before we do put_ctx below. 5817 * incremented the context's refcount before we do put_ctx below.
5447 */ 5818 */
5448 raw_spin_lock(&child_ctx->lock); 5819 raw_spin_lock(&child_ctx->lock);
5449 child->perf_event_ctxp = NULL; 5820 child->perf_event_ctxp[ctxn] = NULL;
5450 /* 5821 /*
5451 * If this context is a clone; unclone it so it can't get 5822 * If this context is a clone; unclone it so it can't get
5452 * swapped to another process while we're removing all 5823 * swapped to another process while we're removing all
@@ -5499,6 +5870,17 @@ again:
5499 put_ctx(child_ctx); 5870 put_ctx(child_ctx);
5500} 5871}
5501 5872
5873/*
5874 * When a child task exits, feed back event values to parent events.
5875 */
5876void perf_event_exit_task(struct task_struct *child)
5877{
5878 int ctxn;
5879
5880 for_each_task_context_nr(ctxn)
5881 perf_event_exit_task_context(child, ctxn);
5882}
5883
5502static void perf_free_event(struct perf_event *event, 5884static void perf_free_event(struct perf_event *event,
5503 struct perf_event_context *ctx) 5885 struct perf_event_context *ctx)
5504{ 5886{
@@ -5520,48 +5902,166 @@ static void perf_free_event(struct perf_event *event,
5520 5902
5521/* 5903/*
5522 * free an unexposed, unused context as created by inheritance by 5904 * free an unexposed, unused context as created by inheritance by
5523 * init_task below, used by fork() in case of fail. 5905 * perf_event_init_task below, used by fork() in case of fail.
5524 */ 5906 */
5525void perf_event_free_task(struct task_struct *task) 5907void perf_event_free_task(struct task_struct *task)
5526{ 5908{
5527 struct perf_event_context *ctx = task->perf_event_ctxp; 5909 struct perf_event_context *ctx;
5528 struct perf_event *event, *tmp; 5910 struct perf_event *event, *tmp;
5911 int ctxn;
5529 5912
5530 if (!ctx) 5913 for_each_task_context_nr(ctxn) {
5531 return; 5914 ctx = task->perf_event_ctxp[ctxn];
5915 if (!ctx)
5916 continue;
5532 5917
5533 mutex_lock(&ctx->mutex); 5918 mutex_lock(&ctx->mutex);
5534again: 5919again:
5535 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 5920 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
5536 perf_free_event(event, ctx); 5921 group_entry)
5922 perf_free_event(event, ctx);
5537 5923
5538 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, 5924 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5539 group_entry) 5925 group_entry)
5540 perf_free_event(event, ctx); 5926 perf_free_event(event, ctx);
5541 5927
5542 if (!list_empty(&ctx->pinned_groups) || 5928 if (!list_empty(&ctx->pinned_groups) ||
5543 !list_empty(&ctx->flexible_groups)) 5929 !list_empty(&ctx->flexible_groups))
5544 goto again; 5930 goto again;
5545 5931
5546 mutex_unlock(&ctx->mutex); 5932 mutex_unlock(&ctx->mutex);
5547 5933
5548 put_ctx(ctx); 5934 put_ctx(ctx);
5935 }
5936}
5937
5938void perf_event_delayed_put(struct task_struct *task)
5939{
5940 int ctxn;
5941
5942 for_each_task_context_nr(ctxn)
5943 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
5944}
5945
5946/*
5947 * inherit a event from parent task to child task:
5948 */
5949static struct perf_event *
5950inherit_event(struct perf_event *parent_event,
5951 struct task_struct *parent,
5952 struct perf_event_context *parent_ctx,
5953 struct task_struct *child,
5954 struct perf_event *group_leader,
5955 struct perf_event_context *child_ctx)
5956{
5957 struct perf_event *child_event;
5958 unsigned long flags;
5959
5960 /*
5961 * Instead of creating recursive hierarchies of events,
5962 * we link inherited events back to the original parent,
5963 * which has a filp for sure, which we use as the reference
5964 * count:
5965 */
5966 if (parent_event->parent)
5967 parent_event = parent_event->parent;
5968
5969 child_event = perf_event_alloc(&parent_event->attr,
5970 parent_event->cpu,
5971 child,
5972 group_leader, parent_event,
5973 NULL);
5974 if (IS_ERR(child_event))
5975 return child_event;
5976 get_ctx(child_ctx);
5977
5978 /*
5979 * Make the child state follow the state of the parent event,
5980 * not its attr.disabled bit. We hold the parent's mutex,
5981 * so we won't race with perf_event_{en, dis}able_family.
5982 */
5983 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
5984 child_event->state = PERF_EVENT_STATE_INACTIVE;
5985 else
5986 child_event->state = PERF_EVENT_STATE_OFF;
5987
5988 if (parent_event->attr.freq) {
5989 u64 sample_period = parent_event->hw.sample_period;
5990 struct hw_perf_event *hwc = &child_event->hw;
5991
5992 hwc->sample_period = sample_period;
5993 hwc->last_period = sample_period;
5994
5995 local64_set(&hwc->period_left, sample_period);
5996 }
5997
5998 child_event->ctx = child_ctx;
5999 child_event->overflow_handler = parent_event->overflow_handler;
6000
6001 /*
6002 * Link it up in the child's context:
6003 */
6004 raw_spin_lock_irqsave(&child_ctx->lock, flags);
6005 add_event_to_ctx(child_event, child_ctx);
6006 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
6007
6008 /*
6009 * Get a reference to the parent filp - we will fput it
6010 * when the child event exits. This is safe to do because
6011 * we are in the parent and we know that the filp still
6012 * exists and has a nonzero count:
6013 */
6014 atomic_long_inc(&parent_event->filp->f_count);
6015
6016 /*
6017 * Link this into the parent event's child list
6018 */
6019 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6020 mutex_lock(&parent_event->child_mutex);
6021 list_add_tail(&child_event->child_list, &parent_event->child_list);
6022 mutex_unlock(&parent_event->child_mutex);
6023
6024 return child_event;
6025}
6026
6027static int inherit_group(struct perf_event *parent_event,
6028 struct task_struct *parent,
6029 struct perf_event_context *parent_ctx,
6030 struct task_struct *child,
6031 struct perf_event_context *child_ctx)
6032{
6033 struct perf_event *leader;
6034 struct perf_event *sub;
6035 struct perf_event *child_ctr;
6036
6037 leader = inherit_event(parent_event, parent, parent_ctx,
6038 child, NULL, child_ctx);
6039 if (IS_ERR(leader))
6040 return PTR_ERR(leader);
6041 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
6042 child_ctr = inherit_event(sub, parent, parent_ctx,
6043 child, leader, child_ctx);
6044 if (IS_ERR(child_ctr))
6045 return PTR_ERR(child_ctr);
6046 }
6047 return 0;
5549} 6048}
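The inheritance path above is what services attr.inherit. For illustration, a user-space sketch (not part of this patch): a task-clock counter opened with inherit set before fork() also accumulates the child's time, which is folded back into the parent event when the child exits. Note that perf_event_alloc() above rejects inherit combined with PERF_FORMAT_GROUP, so inherited counters are read as a single summed value.

/* Hypothetical user-space sketch of attr.inherit. */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        struct perf_event_attr attr;
        uint64_t count;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_TASK_CLOCK;
        attr.inherit = 1;                       /* children are counted too */

        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0)
                return 1;

        if (fork() == 0) {                      /* child inherits the counter */
                for (volatile long i = 0; i < 5000000; i++)
                        ;
                _exit(0);
        }
        wait(NULL);

        /* parent time plus the exited child's, synced at child exit */
        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("task clock incl. children: %llu ns\n",
                       (unsigned long long)count);
        return 0;
}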
5550 6049
5551static int 6050static int
5552inherit_task_group(struct perf_event *event, struct task_struct *parent, 6051inherit_task_group(struct perf_event *event, struct task_struct *parent,
5553 struct perf_event_context *parent_ctx, 6052 struct perf_event_context *parent_ctx,
5554 struct task_struct *child, 6053 struct task_struct *child, int ctxn,
5555 int *inherited_all) 6054 int *inherited_all)
5556{ 6055{
5557 int ret; 6056 int ret;
5558 struct perf_event_context *child_ctx = child->perf_event_ctxp; 6057 struct perf_event_context *child_ctx;
5559 6058
5560 if (!event->attr.inherit) { 6059 if (!event->attr.inherit) {
5561 *inherited_all = 0; 6060 *inherited_all = 0;
5562 return 0; 6061 return 0;
5563 } 6062 }
5564 6063
6064 child_ctx = child->perf_event_ctxp[ctxn];
5565 if (!child_ctx) { 6065 if (!child_ctx) {
5566 /* 6066 /*
5567 * This is executed from the parent task context, so 6067 * This is executed from the parent task context, so
@@ -5570,14 +6070,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5570 * child. 6070 * child.
5571 */ 6071 */
5572 6072
5573 child_ctx = kzalloc(sizeof(struct perf_event_context), 6073 child_ctx = alloc_perf_context(event->pmu, child);
5574 GFP_KERNEL);
5575 if (!child_ctx) 6074 if (!child_ctx)
5576 return -ENOMEM; 6075 return -ENOMEM;
5577 6076
5578 __perf_event_init_context(child_ctx, child); 6077 child->perf_event_ctxp[ctxn] = child_ctx;
5579 child->perf_event_ctxp = child_ctx;
5580 get_task_struct(child);
5581 } 6078 }
5582 6079
5583 ret = inherit_group(event, parent, parent_ctx, 6080 ret = inherit_group(event, parent, parent_ctx,
@@ -5589,11 +6086,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5589 return ret; 6086 return ret;
5590} 6087}
5591 6088
5592
5593/* 6089/*
5594 * Initialize the perf_event context in task_struct 6090 * Initialize the perf_event context in task_struct
5595 */ 6091 */
5596int perf_event_init_task(struct task_struct *child) 6092int perf_event_init_context(struct task_struct *child, int ctxn)
5597{ 6093{
5598 struct perf_event_context *child_ctx, *parent_ctx; 6094 struct perf_event_context *child_ctx, *parent_ctx;
5599 struct perf_event_context *cloned_ctx; 6095 struct perf_event_context *cloned_ctx;
@@ -5602,19 +6098,19 @@ int perf_event_init_task(struct task_struct *child)
5602 int inherited_all = 1; 6098 int inherited_all = 1;
5603 int ret = 0; 6099 int ret = 0;
5604 6100
5605 child->perf_event_ctxp = NULL; 6101 child->perf_event_ctxp[ctxn] = NULL;
5606 6102
5607 mutex_init(&child->perf_event_mutex); 6103 mutex_init(&child->perf_event_mutex);
5608 INIT_LIST_HEAD(&child->perf_event_list); 6104 INIT_LIST_HEAD(&child->perf_event_list);
5609 6105
5610 if (likely(!parent->perf_event_ctxp)) 6106 if (likely(!parent->perf_event_ctxp[ctxn]))
5611 return 0; 6107 return 0;
5612 6108
5613 /* 6109 /*
5614 * If the parent's context is a clone, pin it so it won't get 6110 * If the parent's context is a clone, pin it so it won't get
5615 * swapped under us. 6111 * swapped under us.
5616 */ 6112 */
5617 parent_ctx = perf_pin_task_context(parent); 6113 parent_ctx = perf_pin_task_context(parent, ctxn);
5618 6114
5619 /* 6115 /*
5620 * No need to check if parent_ctx != NULL here; since we saw 6116 * No need to check if parent_ctx != NULL here; since we saw
@@ -5634,20 +6130,20 @@ int perf_event_init_task(struct task_struct *child)
5634 * the list, not manipulating it: 6130 * the list, not manipulating it:
5635 */ 6131 */
5636 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { 6132 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5637 ret = inherit_task_group(event, parent, parent_ctx, child, 6133 ret = inherit_task_group(event, parent, parent_ctx,
5638 &inherited_all); 6134 child, ctxn, &inherited_all);
5639 if (ret) 6135 if (ret)
5640 break; 6136 break;
5641 } 6137 }
5642 6138
5643 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 6139 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5644 ret = inherit_task_group(event, parent, parent_ctx, child, 6140 ret = inherit_task_group(event, parent, parent_ctx,
5645 &inherited_all); 6141 child, ctxn, &inherited_all);
5646 if (ret) 6142 if (ret)
5647 break; 6143 break;
5648 } 6144 }
5649 6145
5650 child_ctx = child->perf_event_ctxp; 6146 child_ctx = child->perf_event_ctxp[ctxn];
5651 6147
5652 if (child_ctx && inherited_all) { 6148 if (child_ctx && inherited_all) {
5653 /* 6149 /*
@@ -5676,63 +6172,98 @@ int perf_event_init_task(struct task_struct *child)
5676 return ret; 6172 return ret;
5677} 6173}
5678 6174
6175/*
6176 * Initialize the perf_event context in task_struct
6177 */
6178int perf_event_init_task(struct task_struct *child)
6179{
6180 int ctxn, ret;
6181
6182 for_each_task_context_nr(ctxn) {
6183 ret = perf_event_init_context(child, ctxn);
6184 if (ret)
6185 return ret;
6186 }
6187
6188 return 0;
6189}
6190
5679static void __init perf_event_init_all_cpus(void) 6191static void __init perf_event_init_all_cpus(void)
5680{ 6192{
6193 struct swevent_htable *swhash;
5681 int cpu; 6194 int cpu;
5682 struct perf_cpu_context *cpuctx;
5683 6195
5684 for_each_possible_cpu(cpu) { 6196 for_each_possible_cpu(cpu) {
5685 cpuctx = &per_cpu(perf_cpu_context, cpu); 6197 swhash = &per_cpu(swevent_htable, cpu);
5686 mutex_init(&cpuctx->hlist_mutex); 6198 mutex_init(&swhash->hlist_mutex);
5687 __perf_event_init_context(&cpuctx->ctx, NULL); 6199 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
5688 } 6200 }
5689} 6201}
5690 6202
5691static void __cpuinit perf_event_init_cpu(int cpu) 6203static void __cpuinit perf_event_init_cpu(int cpu)
5692{ 6204{
5693 struct perf_cpu_context *cpuctx; 6205 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5694
5695 cpuctx = &per_cpu(perf_cpu_context, cpu);
5696 6206
5697 spin_lock(&perf_resource_lock); 6207 mutex_lock(&swhash->hlist_mutex);
5698 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 6208 if (swhash->hlist_refcount > 0) {
5699 spin_unlock(&perf_resource_lock);
5700
5701 mutex_lock(&cpuctx->hlist_mutex);
5702 if (cpuctx->hlist_refcount > 0) {
5703 struct swevent_hlist *hlist; 6209 struct swevent_hlist *hlist;
5704 6210
5705 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 6211 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
5706 WARN_ON_ONCE(!hlist); 6212 WARN_ON(!hlist);
5707 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 6213 rcu_assign_pointer(swhash->swevent_hlist, hlist);
5708 } 6214 }
5709 mutex_unlock(&cpuctx->hlist_mutex); 6215 mutex_unlock(&swhash->hlist_mutex);
5710} 6216}
5711 6217
5712#ifdef CONFIG_HOTPLUG_CPU 6218#ifdef CONFIG_HOTPLUG_CPU
5713static void __perf_event_exit_cpu(void *info) 6219static void perf_pmu_rotate_stop(struct pmu *pmu)
5714{ 6220{
5715 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 6221 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
5716 struct perf_event_context *ctx = &cpuctx->ctx; 6222
6223 WARN_ON(!irqs_disabled());
6224
6225 list_del_init(&cpuctx->rotation_list);
6226}
6227
6228static void __perf_event_exit_context(void *__info)
6229{
6230 struct perf_event_context *ctx = __info;
5717 struct perf_event *event, *tmp; 6231 struct perf_event *event, *tmp;
5718 6232
6233 perf_pmu_rotate_stop(ctx->pmu);
6234
5719 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 6235 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5720 __perf_event_remove_from_context(event); 6236 __perf_event_remove_from_context(event);
5721 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 6237 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5722 __perf_event_remove_from_context(event); 6238 __perf_event_remove_from_context(event);
5723} 6239}
6240
6241static void perf_event_exit_cpu_context(int cpu)
6242{
6243 struct perf_event_context *ctx;
6244 struct pmu *pmu;
6245 int idx;
6246
6247 idx = srcu_read_lock(&pmus_srcu);
6248 list_for_each_entry_rcu(pmu, &pmus, entry) {
6249 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
6250
6251 mutex_lock(&ctx->mutex);
6252 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
6253 mutex_unlock(&ctx->mutex);
6254 }
6255 srcu_read_unlock(&pmus_srcu, idx);
6256}
6257
5724static void perf_event_exit_cpu(int cpu) 6258static void perf_event_exit_cpu(int cpu)
5725{ 6259{
5726 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 6260 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5727 struct perf_event_context *ctx = &cpuctx->ctx;
5728 6261
5729 mutex_lock(&cpuctx->hlist_mutex); 6262 mutex_lock(&swhash->hlist_mutex);
5730 swevent_hlist_release(cpuctx); 6263 swevent_hlist_release(swhash);
5731 mutex_unlock(&cpuctx->hlist_mutex); 6264 mutex_unlock(&swhash->hlist_mutex);
5732 6265
5733 mutex_lock(&ctx->mutex); 6266 perf_event_exit_cpu_context(cpu);
5734 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5735 mutex_unlock(&ctx->mutex);
5736} 6267}
5737#else 6268#else
5738static inline void perf_event_exit_cpu(int cpu) { } 6269static inline void perf_event_exit_cpu(int cpu) { }
@@ -5743,15 +6274,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5743{ 6274{
5744 unsigned int cpu = (long)hcpu; 6275 unsigned int cpu = (long)hcpu;
5745 6276
5746 switch (action) { 6277 switch (action & ~CPU_TASKS_FROZEN) {
5747 6278
5748 case CPU_UP_PREPARE: 6279 case CPU_UP_PREPARE:
5749 case CPU_UP_PREPARE_FROZEN: 6280 case CPU_DOWN_FAILED:
5750 perf_event_init_cpu(cpu); 6281 perf_event_init_cpu(cpu);
5751 break; 6282 break;
5752 6283
6284 case CPU_UP_CANCELED:
5753 case CPU_DOWN_PREPARE: 6285 case CPU_DOWN_PREPARE:
5754 case CPU_DOWN_PREPARE_FROZEN:
5755 perf_event_exit_cpu(cpu); 6286 perf_event_exit_cpu(cpu);
5756 break; 6287 break;
5757 6288
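Masking the action with ~CPU_TASKS_FROZEN folds the suspend/resume variants into their normal counterparts (CPU_UP_PREPARE_FROZEN is defined as CPU_UP_PREPARE | CPU_TASKS_FROZEN), which is why the explicit *_FROZEN cases could be dropped; the notifier now also re-initializes on CPU_DOWN_FAILED and tears down on CPU_UP_CANCELED, keeping the per-CPU state consistent when a hotplug attempt is aborted.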
@@ -5762,118 +6293,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5762 return NOTIFY_OK; 6293 return NOTIFY_OK;
5763} 6294}
5764 6295
5765/*
5766 * This has to have a higher priority than migration_notifier in sched.c.
5767 */
5768static struct notifier_block __cpuinitdata perf_cpu_nb = {
5769 .notifier_call = perf_cpu_notify,
5770 .priority = 20,
5771};
5772
5773void __init perf_event_init(void) 6296void __init perf_event_init(void)
5774{ 6297{
5775 perf_event_init_all_cpus(); 6298 perf_event_init_all_cpus();
5776 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 6299 init_srcu_struct(&pmus_srcu);
5777 (void *)(long)smp_processor_id()); 6300 perf_pmu_register(&perf_swevent);
5778 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 6301 perf_pmu_register(&perf_cpu_clock);
5779 (void *)(long)smp_processor_id()); 6302 perf_pmu_register(&perf_task_clock);
5780 register_cpu_notifier(&perf_cpu_nb); 6303 perf_tp_register();
5781} 6304 perf_cpu_notifier(perf_cpu_notify);
5782
5783static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5784 struct sysdev_class_attribute *attr,
5785 char *buf)
5786{
5787 return sprintf(buf, "%d\n", perf_reserved_percpu);
5788}
5789
5790static ssize_t
5791perf_set_reserve_percpu(struct sysdev_class *class,
5792 struct sysdev_class_attribute *attr,
5793 const char *buf,
5794 size_t count)
5795{
5796 struct perf_cpu_context *cpuctx;
5797 unsigned long val;
5798 int err, cpu, mpt;
5799
5800 err = strict_strtoul(buf, 10, &val);
5801 if (err)
5802 return err;
5803 if (val > perf_max_events)
5804 return -EINVAL;
5805
5806 spin_lock(&perf_resource_lock);
5807 perf_reserved_percpu = val;
5808 for_each_online_cpu(cpu) {
5809 cpuctx = &per_cpu(perf_cpu_context, cpu);
5810 raw_spin_lock_irq(&cpuctx->ctx.lock);
5811 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5812 perf_max_events - perf_reserved_percpu);
5813 cpuctx->max_pertask = mpt;
5814 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5815 }
5816 spin_unlock(&perf_resource_lock);
5817
5818 return count;
5819}
5820
5821static ssize_t perf_show_overcommit(struct sysdev_class *class,
5822 struct sysdev_class_attribute *attr,
5823 char *buf)
5824{
5825 return sprintf(buf, "%d\n", perf_overcommit);
5826}
5827
5828static ssize_t
5829perf_set_overcommit(struct sysdev_class *class,
5830 struct sysdev_class_attribute *attr,
5831 const char *buf, size_t count)
5832{
5833 unsigned long val;
5834 int err;
5835
5836 err = strict_strtoul(buf, 10, &val);
5837 if (err)
5838 return err;
5839 if (val > 1)
5840 return -EINVAL;
5841
5842 spin_lock(&perf_resource_lock);
5843 perf_overcommit = val;
5844 spin_unlock(&perf_resource_lock);
5845
5846 return count;
5847}
5848
5849static SYSDEV_CLASS_ATTR(
5850 reserve_percpu,
5851 0644,
5852 perf_show_reserve_percpu,
5853 perf_set_reserve_percpu
5854 );
5855
5856static SYSDEV_CLASS_ATTR(
5857 overcommit,
5858 0644,
5859 perf_show_overcommit,
5860 perf_set_overcommit
5861 );
5862
5863static struct attribute *perfclass_attrs[] = {
5864 &attr_reserve_percpu.attr,
5865 &attr_overcommit.attr,
5866 NULL
5867};
5868
5869static struct attribute_group perfclass_attr_group = {
5870 .attrs = perfclass_attrs,
5871 .name = "perf_events",
5872};
5873
5874static int __init perf_event_sysfs_init(void)
5875{
5876 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5877 &perfclass_attr_group);
5878} 6305}
5879device_initcall(perf_event_sysfs_init);
diff --git a/kernel/pid.c b/kernel/pid.c
index d55c6fb8d087..39b65b69584f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
401 struct task_struct *result = NULL; 401 struct task_struct *result = NULL;
402 if (pid) { 402 if (pid) {
403 struct hlist_node *first; 403 struct hlist_node *first;
404 first = rcu_dereference_check(pid->tasks[type].first, 404 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
405 rcu_read_lock_held() || 405 rcu_read_lock_held() ||
406 lockdep_tasklist_lock_is_held()); 406 lockdep_tasklist_lock_is_held());
407 if (first) 407 if (first)
@@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task);
416 */ 416 */
417struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 417struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
418{ 418{
419 rcu_lockdep_assert(rcu_read_lock_held());
419 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 420 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
420} 421}
421 422
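The new assertion documents the calling convention rather than changing it. A minimal sketch of a correct caller (the helper name is hypothetical):

/* Hypothetical sketch: the lookup and any use of the returned pointer
 * must stay inside an RCU read-side critical section, or the task must
 * be pinned with a reference before the section ends. */
#include <linux/pid.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static struct task_struct *get_task_in_ns(pid_t nr, struct pid_namespace *ns)
{
        struct task_struct *task;

        rcu_read_lock();
        task = find_task_by_pid_ns(nr, ns);
        if (task)
                get_task_struct(task);  /* pin it past rcu_read_unlock() */
        rcu_read_unlock();

        return task;                    /* caller drops with put_task_struct() */
}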
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 996a4dec5f96..c7a8f453919e 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -110,6 +110,7 @@ static const struct file_operations pm_qos_power_fops = {
110 .write = pm_qos_power_write, 110 .write = pm_qos_power_write,
111 .open = pm_qos_power_open, 111 .open = pm_qos_power_open,
112 .release = pm_qos_power_release, 112 .release = pm_qos_power_release,
113 .llseek = noop_llseek,
113}; 114};
114 115
115/* unlocked internal variant */ 116/* unlocked internal variant */
@@ -212,15 +213,17 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active);
212 213
213/** 214/**
214 * pm_qos_add_request - inserts new qos request into the list 215 * pm_qos_add_request - inserts new qos request into the list
215 * @pm_qos_class: identifies which list of qos request to us 216 * @dep: pointer to a preallocated handle
217 * @pm_qos_class: identifies which list of qos request to use
216 * @value: defines the qos request 218 * @value: defines the qos request
217 * 219 *
218 * This function inserts a new entry in the pm_qos_class list of requested qos 220 * This function inserts a new entry in the pm_qos_class list of requested qos
219 * performance characteristics. It recomputes the aggregate QoS expectations 221 * performance characteristics. It recomputes the aggregate QoS expectations
220 * for the pm_qos_class of parameters, and returns the pm_qos_request list 222 * for the pm_qos_class of parameters and initializes the pm_qos_request_list
221 * element as a handle for use in updating and removal. Call needs to save 223 * handle. Caller needs to save this handle for later use in updates and
222 * this handle for later use. 224 * removal.
223 */ 225 */
226
224void pm_qos_add_request(struct pm_qos_request_list *dep, 227void pm_qos_add_request(struct pm_qos_request_list *dep,
225 int pm_qos_class, s32 value) 228 int pm_qos_class, s32 value)
226{ 229{
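A hypothetical driver-side sketch of the handle-based API described in the updated kerneldoc: the request structure is owned by the caller for the lifetime of the constraint, and the class and values shown (PM_QOS_CPU_DMA_LATENCY, 50/100 usec) are arbitrary:

/* Hypothetical sketch, not part of this patch. */
#include <linux/pm_qos_params.h>

static struct pm_qos_request_list my_dma_latency_req;

static void my_driver_start_streaming(void)
{
        /* cap CPU DMA latency at 50 usec while streaming */
        pm_qos_add_request(&my_dma_latency_req, PM_QOS_CPU_DMA_LATENCY, 50);
}

static void my_driver_adjust(void)
{
        pm_qos_update_request(&my_dma_latency_req, 100);
}

static void my_driver_stop_streaming(void)
{
        pm_qos_remove_request(&my_dma_latency_req);
}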
@@ -348,7 +351,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
348 351
349 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 352 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
350 if (pm_qos_class >= 0) { 353 if (pm_qos_class >= 0) {
351 struct pm_qos_request_list *req = kzalloc(GFP_KERNEL, sizeof(*req)); 354 struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL);
352 if (!req) 355 if (!req)
353 return -ENOMEM; 356 return -ENOMEM;
354 357
@@ -387,14 +390,16 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
387 } else if (count == 11) { /* len('0x12345678/0') */ 390 } else if (count == 11) { /* len('0x12345678/0') */
388 if (copy_from_user(ascii_value, buf, 11)) 391 if (copy_from_user(ascii_value, buf, 11))
389 return -EFAULT; 392 return -EFAULT;
393 if (strlen(ascii_value) != 10)
394 return -EINVAL;
390 x = sscanf(ascii_value, "%x", &value); 395 x = sscanf(ascii_value, "%x", &value);
391 if (x != 1) 396 if (x != 1)
392 return -EINVAL; 397 return -EINVAL;
393 pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value); 398 pr_debug("%s, %d, 0x%x\n", ascii_value, x, value);
394 } else 399 } else
395 return -EINVAL; 400 return -EINVAL;
396 401
397 pm_qos_req = (struct pm_qos_request_list *)filp->private_data; 402 pm_qos_req = filp->private_data;
398 pm_qos_update_request(pm_qos_req, value); 403 pm_qos_update_request(pm_qos_req, value);
399 404
400 return count; 405 return count;
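For the misc-device path above, the added length check means the ASCII form must be written as exactly "0x" plus eight hex digits with a terminating NUL (11 bytes in total); a 4-byte write is still interpreted as a raw s32. A user-space sketch (not part of this patch); the request is dropped in pm_qos_power_release(), so the fd has to stay open for as long as the constraint is wanted:

/* Hypothetical user-space sketch of /dev/cpu_dma_latency usage. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        const char req[] = "0x00000032";        /* 0x32 = 50 usec */
        int fd = open("/dev/cpu_dma_latency", O_WRONLY);

        /* sizeof(req) == 11: the parser expects the trailing NUL too */
        if (fd < 0 || write(fd, req, sizeof(req)) < 0) {
                perror("cpu_dma_latency");
                return 1;
        }

        pause();        /* hold the fd, and the constraint, until killed */
        return 0;
}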
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ca6066a6952e..29bff6117abc 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -86,6 +86,7 @@ config PM_SLEEP_SMP
86 depends on SMP 86 depends on SMP
87 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE 87 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
88 depends on PM_SLEEP 88 depends on PM_SLEEP
89 select HOTPLUG
89 select HOTPLUG_CPU 90 select HOTPLUG_CPU
90 default y 91 default y
91 92
@@ -137,6 +138,8 @@ config SUSPEND_FREEZER
137config HIBERNATION 138config HIBERNATION
138 bool "Hibernation (aka 'suspend to disk')" 139 bool "Hibernation (aka 'suspend to disk')"
139 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 140 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
141 select LZO_COMPRESS
142 select LZO_DECOMPRESS
140 select SUSPEND_NVS if HAS_IOMEM 143 select SUSPEND_NVS if HAS_IOMEM
141 ---help--- 144 ---help---
142 Enable the suspend to disk (STD) functionality, which is usually 145 Enable the suspend to disk (STD) functionality, which is usually
@@ -242,3 +245,17 @@ config PM_OPS
242 bool 245 bool
243 depends on PM_SLEEP || PM_RUNTIME 246 depends on PM_SLEEP || PM_RUNTIME
244 default y 247 default y
248
249config PM_OPP
250 bool "Operating Performance Point (OPP) Layer library"
251 depends on PM
252 ---help---
253 SOCs have a standard set of tuples consisting of frequency and
254 voltage pairs that the device will support per voltage domain. This
255 is called Operating Performance Point or OPP. The actual definitions
256 of OPP varies over silicon within the same family of devices.
257
258 OPP layer organizes the data internally using device pointers
259 representing individual voltage domains and provides SOC
260 implementations a ready to use framework to manage OPPs.
261 For more information, read <file:Documentation/power/opp.txt>
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index c77963938bca..657272e91d0a 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -29,6 +29,7 @@
29#include "power.h" 29#include "power.h"
30 30
31 31
32static int nocompress = 0;
32static int noresume = 0; 33static int noresume = 0;
33static char resume_file[256] = CONFIG_PM_STD_PARTITION; 34static char resume_file[256] = CONFIG_PM_STD_PARTITION;
34dev_t swsusp_resume_device; 35dev_t swsusp_resume_device;
@@ -338,7 +339,6 @@ int hibernation_snapshot(int platform_mode)
338 goto Close; 339 goto Close;
339 340
340 suspend_console(); 341 suspend_console();
341 hibernation_freeze_swap();
342 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 342 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
343 error = dpm_suspend_start(PMSG_FREEZE); 343 error = dpm_suspend_start(PMSG_FREEZE);
344 if (error) 344 if (error)
@@ -639,6 +639,8 @@ int hibernate(void)
639 639
640 if (hibernation_mode == HIBERNATION_PLATFORM) 640 if (hibernation_mode == HIBERNATION_PLATFORM)
641 flags |= SF_PLATFORM_MODE; 641 flags |= SF_PLATFORM_MODE;
642 if (nocompress)
643 flags |= SF_NOCOMPRESS_MODE;
642 pr_debug("PM: writing image.\n"); 644 pr_debug("PM: writing image.\n");
643 error = swsusp_write(flags); 645 error = swsusp_write(flags);
644 swsusp_free(); 646 swsusp_free();
@@ -706,7 +708,7 @@ static int software_resume(void)
706 goto Unlock; 708 goto Unlock;
707 } 709 }
708 710
709 pr_debug("PM: Checking image partition %s\n", resume_file); 711 pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
710 712
711 /* Check if the device is there */ 713 /* Check if the device is there */
712 swsusp_resume_device = name_to_dev_t(resume_file); 714 swsusp_resume_device = name_to_dev_t(resume_file);
@@ -731,10 +733,10 @@ static int software_resume(void)
731 } 733 }
732 734
733 Check_image: 735 Check_image:
734 pr_debug("PM: Resume from partition %d:%d\n", 736 pr_debug("PM: Hibernation image partition %d:%d present\n",
735 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); 737 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
736 738
737 pr_debug("PM: Checking hibernation image.\n"); 739 pr_debug("PM: Looking for hibernation image.\n");
738 error = swsusp_check(); 740 error = swsusp_check();
739 if (error) 741 if (error)
740 goto Unlock; 742 goto Unlock;
@@ -766,14 +768,14 @@ static int software_resume(void)
766 goto Done; 768 goto Done;
767 } 769 }
768 770
769 pr_debug("PM: Reading hibernation image.\n"); 771 pr_debug("PM: Loading hibernation image.\n");
770 772
771 error = swsusp_read(&flags); 773 error = swsusp_read(&flags);
772 swsusp_close(FMODE_READ); 774 swsusp_close(FMODE_READ);
773 if (!error) 775 if (!error)
774 hibernation_restore(flags & SF_PLATFORM_MODE); 776 hibernation_restore(flags & SF_PLATFORM_MODE);
775 777
776 printk(KERN_ERR "PM: Restore failed, recovering.\n"); 778 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
777 swsusp_free(); 779 swsusp_free();
778 thaw_processes(); 780 thaw_processes();
779 Done: 781 Done:
@@ -786,7 +788,7 @@ static int software_resume(void)
786 /* For success case, the suspend path will release the lock */ 788 /* For success case, the suspend path will release the lock */
787 Unlock: 789 Unlock:
788 mutex_unlock(&pm_mutex); 790 mutex_unlock(&pm_mutex);
789 pr_debug("PM: Resume from disk failed.\n"); 791 pr_debug("PM: Hibernation image not present or could not be loaded.\n");
790 return error; 792 return error;
791close_finish: 793close_finish:
792 swsusp_close(FMODE_READ); 794 swsusp_close(FMODE_READ);
@@ -1005,6 +1007,15 @@ static int __init resume_offset_setup(char *str)
1005 return 1; 1007 return 1;
1006} 1008}
1007 1009
1010static int __init hibernate_setup(char *str)
1011{
1012 if (!strncmp(str, "noresume", 8))
1013 noresume = 1;
1014 else if (!strncmp(str, "nocompress", 10))
1015 nocompress = 1;
1016 return 1;
1017}
1018
1008static int __init noresume_setup(char *str) 1019static int __init noresume_setup(char *str)
1009{ 1020{
1010 noresume = 1; 1021 noresume = 1;
@@ -1014,3 +1025,4 @@ static int __init noresume_setup(char *str)
1014__setup("noresume", noresume_setup); 1025__setup("noresume", noresume_setup);
1015__setup("resume_offset=", resume_offset_setup); 1026__setup("resume_offset=", resume_offset_setup);
1016__setup("resume=", resume_setup); 1027__setup("resume=", resume_setup);
1028__setup("hibernate=", hibernate_setup);
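With this handler wired up, booting with "hibernate=nocompress" makes the next hibernation skip image compression (the SF_NOCOMPRESS_MODE flag added above), while "hibernate=noresume" behaves like the existing "noresume" parameter; unrecognized values are silently ignored since the handler always returns 1.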
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 62b0bc6e4983..7b5db6a8561e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -237,18 +237,18 @@ static ssize_t wakeup_count_show(struct kobject *kobj,
237 struct kobj_attribute *attr, 237 struct kobj_attribute *attr,
238 char *buf) 238 char *buf)
239{ 239{
240 unsigned long val; 240 unsigned int val;
241 241
242 return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR; 242 return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR;
243} 243}
244 244
245static ssize_t wakeup_count_store(struct kobject *kobj, 245static ssize_t wakeup_count_store(struct kobject *kobj,
246 struct kobj_attribute *attr, 246 struct kobj_attribute *attr,
247 const char *buf, size_t n) 247 const char *buf, size_t n)
248{ 248{
249 unsigned long val; 249 unsigned int val;
250 250
251 if (sscanf(buf, "%lu", &val) == 1) { 251 if (sscanf(buf, "%u", &val) == 1) {
252 if (pm_save_wakeup_count(val)) 252 if (pm_save_wakeup_count(val))
253 return n; 253 return n;
254 } 254 }
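The switch to unsigned int matches the pm_get_wakeup_count()/pm_save_wakeup_count() prototypes. As a rough usage sketch: a suspend daemon reads /sys/power/wakeup_count, writes the same number back, and only then writes the sleep state to /sys/power/state; if wakeup events were processed in between, writing the count fails and the daemon retries instead of racing with the event.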
@@ -281,12 +281,30 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
281} 281}
282 282
283power_attr(pm_trace); 283power_attr(pm_trace);
284
285static ssize_t pm_trace_dev_match_show(struct kobject *kobj,
286 struct kobj_attribute *attr,
287 char *buf)
288{
289 return show_trace_dev_match(buf, PAGE_SIZE);
290}
291
292static ssize_t
293pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr,
294 const char *buf, size_t n)
295{
296 return -EINVAL;
297}
298
299power_attr(pm_trace_dev_match);
300
284#endif /* CONFIG_PM_TRACE */ 301#endif /* CONFIG_PM_TRACE */
285 302
286static struct attribute * g[] = { 303static struct attribute * g[] = {
287 &state_attr.attr, 304 &state_attr.attr,
288#ifdef CONFIG_PM_TRACE 305#ifdef CONFIG_PM_TRACE
289 &pm_trace_attr.attr, 306 &pm_trace_attr.attr,
307 &pm_trace_dev_match_attr.attr,
290#endif 308#endif
291#ifdef CONFIG_PM_SLEEP 309#ifdef CONFIG_PM_SLEEP
292 &pm_async_attr.attr, 310 &pm_async_attr.attr,
@@ -308,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq);
308 326
309static int __init pm_start_workqueue(void) 327static int __init pm_start_workqueue(void)
310{ 328{
311 pm_wq = create_freezeable_workqueue("pm"); 329 pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0);
312 330
313 return pm_wq ? 0 : -ENOMEM; 331 return pm_wq ? 0 : -ENOMEM;
314} 332}
@@ -321,6 +339,7 @@ static int __init pm_init(void)
321 int error = pm_start_workqueue(); 339 int error = pm_start_workqueue();
322 if (error) 340 if (error)
323 return error; 341 return error;
342 hibernate_image_size_init();
324 power_kobj = kobject_create_and_add("power", NULL); 343 power_kobj = kobject_create_and_add("power", NULL);
325 if (!power_kobj) 344 if (!power_kobj)
326 return -ENOMEM; 345 return -ENOMEM;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 006270fe382d..03634be55f62 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -14,6 +14,9 @@ struct swsusp_info {
14} __attribute__((aligned(PAGE_SIZE))); 14} __attribute__((aligned(PAGE_SIZE)));
15 15
16#ifdef CONFIG_HIBERNATION 16#ifdef CONFIG_HIBERNATION
17/* kernel/power/snapshot.c */
18extern void __init hibernate_image_size_init(void);
19
17#ifdef CONFIG_ARCH_HIBERNATION_HEADER 20#ifdef CONFIG_ARCH_HIBERNATION_HEADER
18/* Maximum size of architecture specific data in a hibernation header */ 21/* Maximum size of architecture specific data in a hibernation header */
19#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) 22#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
@@ -49,7 +52,11 @@ static inline char *check_image_kernel(struct swsusp_info *info)
49extern int hibernation_snapshot(int platform_mode); 52extern int hibernation_snapshot(int platform_mode);
50extern int hibernation_restore(int platform_mode); 53extern int hibernation_restore(int platform_mode);
51extern int hibernation_platform_enter(void); 54extern int hibernation_platform_enter(void);
52#endif 55
56#else /* !CONFIG_HIBERNATION */
57
58static inline void hibernate_image_size_init(void) {}
59#endif /* !CONFIG_HIBERNATION */
53 60
54extern int pfn_is_nosave(unsigned long); 61extern int pfn_is_nosave(unsigned long);
55 62
@@ -134,6 +141,7 @@ extern int swsusp_swap_in_use(void);
134 * the image header. 141 * the image header.
135 */ 142 */
136#define SF_PLATFORM_MODE 1 143#define SF_PLATFORM_MODE 1
144#define SF_NOCOMPRESS_MODE 2
137 145
138/* kernel/power/hibernate.c */ 146/* kernel/power/hibernate.c */
139extern int swsusp_check(void); 147extern int swsusp_check(void);
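
SF_PLATFORM_MODE and the new SF_NOCOMPRESS_MODE are independent bits in the image header's flags word; the swap.c changes further down test SF_NOCOMPRESS_MODE to pick between the plain save/load path and the new LZO path. A minimal sketch of that dispatch pattern (standalone C; the stub functions are placeholders, not the kernel helpers):

#include <stdio.h>

#define SF_PLATFORM_MODE   1
#define SF_NOCOMPRESS_MODE 2

/* Stand-ins for save_image()/save_image_lzo(); not the kernel functions. */
static int save_plain(void)      { puts("uncompressed path"); return 0; }
static int save_compressed(void) { puts("LZO path");          return 0; }

static int write_image(unsigned int flags)
{
	/* Each SF_* flag is an independent bit in the header's flags word. */
	return (flags & SF_NOCOMPRESS_MODE) ? save_plain() : save_compressed();
}

int main(void)
{
	write_image(SF_PLATFORM_MODE);                      /* -> LZO path */
	write_image(SF_PLATFORM_MODE | SF_NOCOMPRESS_MODE); /* -> uncompressed */
	return 0;
}
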
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 028a99598f49..e50b4c1b2a0f 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -40,6 +40,7 @@ static int try_to_freeze_tasks(bool sig_only)
40 struct timeval start, end; 40 struct timeval start, end;
41 u64 elapsed_csecs64; 41 u64 elapsed_csecs64;
42 unsigned int elapsed_csecs; 42 unsigned int elapsed_csecs;
43 bool wakeup = false;
43 44
44 do_gettimeofday(&start); 45 do_gettimeofday(&start);
45 46
@@ -78,6 +79,11 @@ static int try_to_freeze_tasks(bool sig_only)
78 if (!todo || time_after(jiffies, end_time)) 79 if (!todo || time_after(jiffies, end_time))
79 break; 80 break;
80 81
82 if (!pm_check_wakeup_events()) {
83 wakeup = true;
84 break;
85 }
86
81 /* 87 /*
82 * We need to retry, but first give the freezing tasks some 88 * We need to retry, but first give the freezing tasks some
 83 * time to enter the refrigerator. 89 * time to enter the refrigerator.
@@ -97,8 +103,9 @@ static int try_to_freeze_tasks(bool sig_only)
97 * but it cleans up leftover PF_FREEZE requests. 103 * but it cleans up leftover PF_FREEZE requests.
98 */ 104 */
99 printk("\n"); 105 printk("\n");
100 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " 106 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds "
101 "(%d tasks refusing to freeze, wq_busy=%d):\n", 107 "(%d tasks refusing to freeze, wq_busy=%d):\n",
108 wakeup ? "aborted" : "failed",
102 elapsed_csecs / 100, elapsed_csecs % 100, 109 elapsed_csecs / 100, elapsed_csecs % 100,
103 todo - wq_busy, wq_busy); 110 todo - wq_busy, wq_busy);
104 111
@@ -107,7 +114,7 @@ static int try_to_freeze_tasks(bool sig_only)
107 read_lock(&tasklist_lock); 114 read_lock(&tasklist_lock);
108 do_each_thread(g, p) { 115 do_each_thread(g, p) {
109 task_lock(p); 116 task_lock(p);
110 if (freezing(p) && !freezer_should_skip(p)) 117 if (!wakeup && freezing(p) && !freezer_should_skip(p))
111 sched_show_task(p); 118 sched_show_task(p);
112 cancel_freezing(p); 119 cancel_freezing(p);
113 task_unlock(p); 120 task_unlock(p);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5e7edfb05e66..0dac75ea4456 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -46,7 +46,12 @@ static void swsusp_unset_page_forbidden(struct page *);
46 * size will not exceed N bytes, but if that is impossible, it will 46 * size will not exceed N bytes, but if that is impossible, it will
47 * try to create the smallest image possible. 47 * try to create the smallest image possible.
48 */ 48 */
49unsigned long image_size = 500 * 1024 * 1024; 49unsigned long image_size;
50
51void __init hibernate_image_size_init(void)
52{
53 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
54}
50 55
51/* List of PBEs needed for restoring the pages that were allocated before 56/* List of PBEs needed for restoring the pages that were allocated before
52 * the suspend and included in the suspend image, but have also been 57 * the suspend and included in the suspend image, but have also been
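
The new default ties the preferred image size to the amount of RAM (2/5 of it) instead of a fixed 500 MB. As a rough worked example (a standalone sketch, not kernel code; the 2 GiB machine and 4 KiB page size are assumptions): totalram_pages is about 524288, so image_size becomes ((524288 * 2) / 5) * 4096, roughly 819 MiB.

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;		/* assumed 4 KiB pages */
	unsigned long totalram_pages = 524288;	/* assumed 2 GiB of RAM */
	unsigned long image_size;

	/* Same arithmetic as hibernate_image_size_init(): 2/5 of RAM. */
	image_size = ((totalram_pages * 2) / 5) * page_size;
	printf("default image_size: %lu bytes (~%lu MiB)\n",
	       image_size, image_size >> 20);
	return 0;
}
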
@@ -979,8 +984,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
979 src = kmap_atomic(s_page, KM_USER0); 984 src = kmap_atomic(s_page, KM_USER0);
980 dst = kmap_atomic(d_page, KM_USER1); 985 dst = kmap_atomic(d_page, KM_USER1);
981 do_copy_page(dst, src); 986 do_copy_page(dst, src);
982 kunmap_atomic(src, KM_USER0);
983 kunmap_atomic(dst, KM_USER1); 987 kunmap_atomic(dst, KM_USER1);
988 kunmap_atomic(src, KM_USER0);
984 } else { 989 } else {
985 if (PageHighMem(d_page)) { 990 if (PageHighMem(d_page)) {
986 /* Page pointed to by src may contain some kernel 991 /* Page pointed to by src may contain some kernel
@@ -988,7 +993,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
988 */ 993 */
989 safe_copy_page(buffer, s_page); 994 safe_copy_page(buffer, s_page);
990 dst = kmap_atomic(d_page, KM_USER0); 995 dst = kmap_atomic(d_page, KM_USER0);
991 memcpy(dst, buffer, PAGE_SIZE); 996 copy_page(dst, buffer);
992 kunmap_atomic(dst, KM_USER0); 997 kunmap_atomic(dst, KM_USER0);
993 } else { 998 } else {
994 safe_copy_page(page_address(d_page), s_page); 999 safe_copy_page(page_address(d_page), s_page);
@@ -1086,7 +1091,6 @@ void swsusp_free(void)
1086 buffer = NULL; 1091 buffer = NULL;
1087 alloc_normal = 0; 1092 alloc_normal = 0;
1088 alloc_highmem = 0; 1093 alloc_highmem = 0;
1089 hibernation_thaw_swap();
1090} 1094}
1091 1095
1092/* Helper functions used for the shrinking of memory. */ 1096/* Helper functions used for the shrinking of memory. */
@@ -1122,9 +1126,19 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
1122 return nr_alloc; 1126 return nr_alloc;
1123} 1127}
1124 1128
1125static unsigned long preallocate_image_memory(unsigned long nr_pages) 1129static unsigned long preallocate_image_memory(unsigned long nr_pages,
1130 unsigned long avail_normal)
1126{ 1131{
1127 return preallocate_image_pages(nr_pages, GFP_IMAGE); 1132 unsigned long alloc;
1133
1134 if (avail_normal <= alloc_normal)
1135 return 0;
1136
1137 alloc = avail_normal - alloc_normal;
1138 if (nr_pages < alloc)
1139 alloc = nr_pages;
1140
1141 return preallocate_image_pages(alloc, GFP_IMAGE);
1128} 1142}
1129 1143
1130#ifdef CONFIG_HIGHMEM 1144#ifdef CONFIG_HIGHMEM
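
The reworked preallocate_image_memory() is essentially a clamp: it never requests more lowmem pages than the caller's budget (avail_normal) still allows once the alloc_normal pages already taken are accounted for. A sketch of just that arithmetic (standalone C; the actual page allocation is omitted and the numbers in main() are invented):

#include <stdio.h>

/* Clamp a request of nr_pages against a lowmem budget, mirroring the
 * bounds check added to preallocate_image_memory(). */
static unsigned long clamp_to_budget(unsigned long nr_pages,
				     unsigned long avail_normal,
				     unsigned long alloc_normal)
{
	unsigned long alloc;

	if (avail_normal <= alloc_normal)
		return 0;			/* budget already used up */

	alloc = avail_normal - alloc_normal;
	if (nr_pages < alloc)
		alloc = nr_pages;		/* request fits in the budget */

	return alloc;
}

int main(void)
{
	/* 10000-page budget, 9500 already allocated: at most 500 more. */
	printf("%lu pages may be requested\n", clamp_to_budget(2000, 10000, 9500));
	return 0;
}
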
@@ -1170,15 +1184,22 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1170 */ 1184 */
1171static void free_unnecessary_pages(void) 1185static void free_unnecessary_pages(void)
1172{ 1186{
1173 unsigned long save_highmem, to_free_normal, to_free_highmem; 1187 unsigned long save, to_free_normal, to_free_highmem;
1174 1188
1175 to_free_normal = alloc_normal - count_data_pages(); 1189 save = count_data_pages();
1176 save_highmem = count_highmem_pages(); 1190 if (alloc_normal >= save) {
1177 if (alloc_highmem > save_highmem) { 1191 to_free_normal = alloc_normal - save;
1178 to_free_highmem = alloc_highmem - save_highmem; 1192 save = 0;
1193 } else {
1194 to_free_normal = 0;
1195 save -= alloc_normal;
1196 }
1197 save += count_highmem_pages();
1198 if (alloc_highmem >= save) {
1199 to_free_highmem = alloc_highmem - save;
1179 } else { 1200 } else {
1180 to_free_highmem = 0; 1201 to_free_highmem = 0;
1181 to_free_normal -= save_highmem - alloc_highmem; 1202 to_free_normal -= save - alloc_highmem;
1182 } 1203 }
1183 1204
1184 memory_bm_position_reset(&copy_bm); 1205 memory_bm_position_reset(&copy_bm);
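
The rewritten bookkeeping in free_unnecessary_pages() lets a shortfall on the lowmem side spill over to highmem: any part of the saveable lowmem pages that alloc_normal does not cover is added to the highmem quota before to_free_highmem is computed. A standalone sketch of that accounting with invented page counts (no bitmaps or actual freeing, just the arithmetic):

#include <stdio.h>

static void split_frees(unsigned long data_pages, unsigned long highmem_pages,
			unsigned long alloc_normal, unsigned long alloc_highmem)
{
	unsigned long save, to_free_normal, to_free_highmem;

	save = data_pages;
	if (alloc_normal >= save) {
		to_free_normal = alloc_normal - save;
		save = 0;
	} else {
		to_free_normal = 0;
		save -= alloc_normal;	/* lowmem shortfall spills to highmem */
	}
	save += highmem_pages;
	if (alloc_highmem >= save) {
		to_free_highmem = alloc_highmem - save;
	} else {
		to_free_highmem = 0;
		to_free_normal -= save - alloc_highmem;
	}
	printf("free %lu normal and %lu highmem pages\n",
	       to_free_normal, to_free_highmem);
}

int main(void)
{
	/* 200 lowmem pages short: 200 extra highmem pages are kept, 100 freed. */
	split_frees(1000, 300, 800, 600);
	return 0;
}
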
@@ -1259,7 +1280,7 @@ int hibernate_preallocate_memory(void)
1259{ 1280{
1260 struct zone *zone; 1281 struct zone *zone;
1261 unsigned long saveable, size, max_size, count, highmem, pages = 0; 1282 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1262 unsigned long alloc, save_highmem, pages_highmem; 1283 unsigned long alloc, save_highmem, pages_highmem, avail_normal;
1263 struct timeval start, stop; 1284 struct timeval start, stop;
1264 int error; 1285 int error;
1265 1286
@@ -1296,26 +1317,38 @@ int hibernate_preallocate_memory(void)
1296 else 1317 else
1297 count += zone_page_state(zone, NR_FREE_PAGES); 1318 count += zone_page_state(zone, NR_FREE_PAGES);
1298 } 1319 }
1320 avail_normal = count;
1299 count += highmem; 1321 count += highmem;
1300 count -= totalreserve_pages; 1322 count -= totalreserve_pages;
1301 1323
1302 /* Compute the maximum number of saveable pages to leave in memory. */ 1324 /* Compute the maximum number of saveable pages to leave in memory. */
1303 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; 1325 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;
1326 /* Compute the desired number of image pages specified by image_size. */
1304 size = DIV_ROUND_UP(image_size, PAGE_SIZE); 1327 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1305 if (size > max_size) 1328 if (size > max_size)
1306 size = max_size; 1329 size = max_size;
1307 /* 1330 /*
1308 * If the maximum is not less than the current number of saveable pages 1331 * If the desired number of image pages is at least as large as the
1309 * in memory, allocate page frames for the image and we're done. 1332 * current number of saveable pages in memory, allocate page frames for
1333 * the image and we're done.
1310 */ 1334 */
1311 if (size >= saveable) { 1335 if (size >= saveable) {
1312 pages = preallocate_image_highmem(save_highmem); 1336 pages = preallocate_image_highmem(save_highmem);
1313 pages += preallocate_image_memory(saveable - pages); 1337 pages += preallocate_image_memory(saveable - pages, avail_normal);
1314 goto out; 1338 goto out;
1315 } 1339 }
1316 1340
1317 /* Estimate the minimum size of the image. */ 1341 /* Estimate the minimum size of the image. */
1318 pages = minimum_image_size(saveable); 1342 pages = minimum_image_size(saveable);
1343 /*
1344 * To avoid excessive pressure on the normal zone, leave room in it to
1345 * accommodate an image of the minimum size (unless it's already too
1346 * small, in which case don't preallocate pages from it at all).
1347 */
1348 if (avail_normal > pages)
1349 avail_normal -= pages;
1350 else
1351 avail_normal = 0;
1319 if (size < pages) 1352 if (size < pages)
1320 size = min_t(unsigned long, pages, max_size); 1353 size = min_t(unsigned long, pages, max_size);
1321 1354
@@ -1336,16 +1369,34 @@ int hibernate_preallocate_memory(void)
1336 */ 1369 */
1337 pages_highmem = preallocate_image_highmem(highmem / 2); 1370 pages_highmem = preallocate_image_highmem(highmem / 2);
1338 alloc = (count - max_size) - pages_highmem; 1371 alloc = (count - max_size) - pages_highmem;
1339 pages = preallocate_image_memory(alloc); 1372 pages = preallocate_image_memory(alloc, avail_normal);
1340 if (pages < alloc) 1373 if (pages < alloc) {
1341 goto err_out; 1374 /* We have exhausted non-highmem pages, try highmem. */
1342 size = max_size - size; 1375 alloc -= pages;
1343 alloc = size; 1376 pages += pages_highmem;
1344 size = preallocate_highmem_fraction(size, highmem, count); 1377 pages_highmem = preallocate_image_highmem(alloc);
1345 pages_highmem += size; 1378 if (pages_highmem < alloc)
1346 alloc -= size; 1379 goto err_out;
1347 pages += preallocate_image_memory(alloc); 1380 pages += pages_highmem;
1348 pages += pages_highmem; 1381 /*
1382 * size is the desired number of saveable pages to leave in
1383 * memory, so try to preallocate (all memory - size) pages.
1384 */
1385 alloc = (count - pages) - size;
1386 pages += preallocate_image_highmem(alloc);
1387 } else {
1388 /*
1389 * There are approximately max_size saveable pages at this point
1390 * and we want to reduce this number down to size.
1391 */
1392 alloc = max_size - size;
1393 size = preallocate_highmem_fraction(alloc, highmem, count);
1394 pages_highmem += size;
1395 alloc -= size;
1396 size = preallocate_image_memory(alloc, avail_normal);
1397 pages_highmem += preallocate_image_highmem(alloc - size);
1398 pages += pages_highmem + size;
1399 }
1349 1400
1350 /* 1401 /*
1351 * We only need as many page frames for the image as there are saveable 1402 * We only need as many page frames for the image as there are saveable
@@ -1636,7 +1687,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1636 memory_bm_position_reset(&orig_bm); 1687 memory_bm_position_reset(&orig_bm);
1637 memory_bm_position_reset(&copy_bm); 1688 memory_bm_position_reset(&copy_bm);
1638 } else if (handle->cur <= nr_meta_pages) { 1689 } else if (handle->cur <= nr_meta_pages) {
1639 memset(buffer, 0, PAGE_SIZE); 1690 clear_page(buffer);
1640 pack_pfns(buffer, &orig_bm); 1691 pack_pfns(buffer, &orig_bm);
1641 } else { 1692 } else {
1642 struct page *page; 1693 struct page *page;
@@ -1650,7 +1701,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
1650 void *kaddr; 1701 void *kaddr;
1651 1702
1652 kaddr = kmap_atomic(page, KM_USER0); 1703 kaddr = kmap_atomic(page, KM_USER0);
1653 memcpy(buffer, kaddr, PAGE_SIZE); 1704 copy_page(buffer, kaddr);
1654 kunmap_atomic(kaddr, KM_USER0); 1705 kunmap_atomic(kaddr, KM_USER0);
1655 handle->buffer = buffer; 1706 handle->buffer = buffer;
1656 } else { 1707 } else {
@@ -1933,7 +1984,7 @@ static void copy_last_highmem_page(void)
1933 void *dst; 1984 void *dst;
1934 1985
1935 dst = kmap_atomic(last_highmem_page, KM_USER0); 1986 dst = kmap_atomic(last_highmem_page, KM_USER0);
1936 memcpy(dst, buffer, PAGE_SIZE); 1987 copy_page(dst, buffer);
1937 kunmap_atomic(dst, KM_USER0); 1988 kunmap_atomic(dst, KM_USER0);
1938 last_highmem_page = NULL; 1989 last_highmem_page = NULL;
1939 } 1990 }
@@ -2219,11 +2270,11 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2219 2270
2220 kaddr1 = kmap_atomic(p1, KM_USER0); 2271 kaddr1 = kmap_atomic(p1, KM_USER0);
2221 kaddr2 = kmap_atomic(p2, KM_USER1); 2272 kaddr2 = kmap_atomic(p2, KM_USER1);
2222 memcpy(buf, kaddr1, PAGE_SIZE); 2273 copy_page(buf, kaddr1);
2223 memcpy(kaddr1, kaddr2, PAGE_SIZE); 2274 copy_page(kaddr1, kaddr2);
2224 memcpy(kaddr2, buf, PAGE_SIZE); 2275 copy_page(kaddr2, buf);
2225 kunmap_atomic(kaddr1, KM_USER0);
2226 kunmap_atomic(kaddr2, KM_USER1); 2276 kunmap_atomic(kaddr2, KM_USER1);
2277 kunmap_atomic(kaddr1, KM_USER0);
2227} 2278}
2228 2279
2229/** 2280/**
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 5d0059eed3e4..a0e4a86ccf94 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -24,10 +24,12 @@
24#include <linux/swapops.h> 24#include <linux/swapops.h>
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/lzo.h>
28#include <linux/vmalloc.h>
27 29
28#include "power.h" 30#include "power.h"
29 31
30#define SWSUSP_SIG "S1SUSPEND" 32#define HIBERNATE_SIG "LINHIB0001"
31 33
32/* 34/*
33 * The swap map is a data structure used for keeping track of each page 35 * The swap map is a data structure used for keeping track of each page
@@ -136,10 +138,10 @@ sector_t alloc_swapdev_block(int swap)
136{ 138{
137 unsigned long offset; 139 unsigned long offset;
138 140
139 offset = swp_offset(get_swap_for_hibernation(swap)); 141 offset = swp_offset(get_swap_page_of_type(swap));
140 if (offset) { 142 if (offset) {
141 if (swsusp_extents_insert(offset)) 143 if (swsusp_extents_insert(offset))
142 swap_free_for_hibernation(swp_entry(swap, offset)); 144 swap_free(swp_entry(swap, offset));
143 else 145 else
144 return swapdev_block(swap, offset); 146 return swapdev_block(swap, offset);
145 } 147 }
@@ -163,7 +165,7 @@ void free_all_swap_pages(int swap)
163 ext = container_of(node, struct swsusp_extent, node); 165 ext = container_of(node, struct swsusp_extent, node);
164 rb_erase(node, &swsusp_extents); 166 rb_erase(node, &swsusp_extents);
165 for (offset = ext->start; offset <= ext->end; offset++) 167 for (offset = ext->start; offset <= ext->end; offset++)
166 swap_free_for_hibernation(swp_entry(swap, offset)); 168 swap_free(swp_entry(swap, offset));
167 169
168 kfree(ext); 170 kfree(ext);
169 } 171 }
@@ -193,7 +195,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
193 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || 195 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
194 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { 196 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
195 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 197 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
196 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 198 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
197 swsusp_header->image = handle->first_sector; 199 swsusp_header->image = handle->first_sector;
198 swsusp_header->flags = flags; 200 swsusp_header->flags = flags;
199 error = hib_bio_write_page(swsusp_resume_block, 201 error = hib_bio_write_page(swsusp_resume_block,
@@ -249,7 +251,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
249 if (bio_chain) { 251 if (bio_chain) {
250 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 252 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
251 if (src) { 253 if (src) {
252 memcpy(src, buf, PAGE_SIZE); 254 copy_page(src, buf);
253 } else { 255 } else {
254 WARN_ON_ONCE(1); 256 WARN_ON_ONCE(1);
255 bio_chain = NULL; /* Go synchronous */ 257 bio_chain = NULL; /* Go synchronous */
@@ -323,7 +325,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
323 error = write_page(handle->cur, handle->cur_swap, NULL); 325 error = write_page(handle->cur, handle->cur_swap, NULL);
324 if (error) 326 if (error)
325 goto out; 327 goto out;
326 memset(handle->cur, 0, PAGE_SIZE); 328 clear_page(handle->cur);
327 handle->cur_swap = offset; 329 handle->cur_swap = offset;
328 handle->k = 0; 330 handle->k = 0;
329 } 331 }
@@ -357,6 +359,18 @@ static int swap_writer_finish(struct swap_map_handle *handle,
357 return error; 359 return error;
358} 360}
359 361
362/* We need to remember how much compressed data we need to read. */
363#define LZO_HEADER sizeof(size_t)
364
365/* Number of pages/bytes we'll compress at one time. */
366#define LZO_UNC_PAGES 32
367#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE)
368
369/* Number of pages/bytes we need for compressed data (worst case). */
370#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \
371 LZO_HEADER, PAGE_SIZE)
372#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
373
360/** 374/**
361 * save_image - save the suspend image data 375 * save_image - save the suspend image data
362 */ 376 */
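
The LZO_* constants above size the two staging buffers: 32 pages of image data are compressed per block, and the compressed buffer must hold the worst-case LZO expansion of such a block plus the size_t length header, rounded up to whole pages. A quick check of the numbers (standalone C; the 4 KiB page size and the usual lzo1x worst-case bound of x + x/16 + 64 + 3 bytes are assumptions made here, not taken from this patch):

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL					/* assumed */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define lzo1x_worst_compress(x) ((x) + ((x) / 16) + 64 + 3)	/* assumed bound */

#define LZO_HEADER	sizeof(size_t)
#define LZO_UNC_PAGES	32
#define LZO_UNC_SIZE	(LZO_UNC_PAGES * PAGE_SIZE)
#define LZO_CMP_PAGES	DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \
				     LZO_HEADER, PAGE_SIZE)

int main(void)
{
	printf("uncompressed block: %lu bytes (%d pages)\n",
	       LZO_UNC_SIZE, LZO_UNC_PAGES);
	printf("compressed buffer : %lu pages\n", (unsigned long)LZO_CMP_PAGES);
	return 0;
}

With these assumptions a 32-page block needs at most 35 compressed pages, which is also where the worst-case ratio used by enough_swap() further down comes from.
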
@@ -404,6 +418,137 @@ static int save_image(struct swap_map_handle *handle,
404 return ret; 418 return ret;
405} 419}
406 420
421
422/**
423 * save_image_lzo - Save the suspend image data compressed with LZO.
 424 * @handle: Swap map handle to use for saving the image.
425 * @snapshot: Image to read data from.
426 * @nr_to_write: Number of pages to save.
427 */
428static int save_image_lzo(struct swap_map_handle *handle,
429 struct snapshot_handle *snapshot,
430 unsigned int nr_to_write)
431{
432 unsigned int m;
433 int ret = 0;
434 int nr_pages;
435 int err2;
436 struct bio *bio;
437 struct timeval start;
438 struct timeval stop;
439 size_t off, unc_len, cmp_len;
440 unsigned char *unc, *cmp, *wrk, *page;
441
442 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
443 if (!page) {
444 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
445 return -ENOMEM;
446 }
447
448 wrk = vmalloc(LZO1X_1_MEM_COMPRESS);
449 if (!wrk) {
450 printk(KERN_ERR "PM: Failed to allocate LZO workspace\n");
451 free_page((unsigned long)page);
452 return -ENOMEM;
453 }
454
455 unc = vmalloc(LZO_UNC_SIZE);
456 if (!unc) {
457 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
458 vfree(wrk);
459 free_page((unsigned long)page);
460 return -ENOMEM;
461 }
462
463 cmp = vmalloc(LZO_CMP_SIZE);
464 if (!cmp) {
465 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
466 vfree(unc);
467 vfree(wrk);
468 free_page((unsigned long)page);
469 return -ENOMEM;
470 }
471
472 printk(KERN_INFO
473 "PM: Compressing and saving image data (%u pages) ... ",
474 nr_to_write);
475 m = nr_to_write / 100;
476 if (!m)
477 m = 1;
478 nr_pages = 0;
479 bio = NULL;
480 do_gettimeofday(&start);
481 for (;;) {
482 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
483 ret = snapshot_read_next(snapshot);
484 if (ret < 0)
485 goto out_finish;
486
487 if (!ret)
488 break;
489
490 memcpy(unc + off, data_of(*snapshot), PAGE_SIZE);
491
492 if (!(nr_pages % m))
493 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
494 nr_pages++;
495 }
496
497 if (!off)
498 break;
499
500 unc_len = off;
501 ret = lzo1x_1_compress(unc, unc_len,
502 cmp + LZO_HEADER, &cmp_len, wrk);
503 if (ret < 0) {
504 printk(KERN_ERR "PM: LZO compression failed\n");
505 break;
506 }
507
508 if (unlikely(!cmp_len ||
509 cmp_len > lzo1x_worst_compress(unc_len))) {
510 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
511 ret = -1;
512 break;
513 }
514
515 *(size_t *)cmp = cmp_len;
516
517 /*
518 * Given we are writing one page at a time to disk, we copy
519 * that much from the buffer, although the last bit will likely
 520 * be smaller than a full page. This is OK - we saved the length
521 * of the compressed data, so any garbage at the end will be
522 * discarded when we read it.
523 */
524 for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
525 memcpy(page, cmp + off, PAGE_SIZE);
526
527 ret = swap_write_page(handle, page, &bio);
528 if (ret)
529 goto out_finish;
530 }
531 }
532
533out_finish:
534 err2 = hib_wait_on_bio_chain(&bio);
535 do_gettimeofday(&stop);
536 if (!ret)
537 ret = err2;
538 if (!ret)
539 printk(KERN_CONT "\b\b\b\bdone\n");
540 else
541 printk(KERN_CONT "\n");
542 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
543
544 vfree(cmp);
545 vfree(unc);
546 vfree(wrk);
547 free_page((unsigned long)page);
548
549 return ret;
550}
551
407/** 552/**
408 * enough_swap - Make sure we have enough swap to save the image. 553 * enough_swap - Make sure we have enough swap to save the image.
409 * 554 *
@@ -411,12 +556,16 @@ static int save_image(struct swap_map_handle *handle,
 411 * space available from the resume partition. 556 * space available from the resume partition.
412 */ 557 */
413 558
414static int enough_swap(unsigned int nr_pages) 559static int enough_swap(unsigned int nr_pages, unsigned int flags)
415{ 560{
416 unsigned int free_swap = count_swap_pages(root_swap, 1); 561 unsigned int free_swap = count_swap_pages(root_swap, 1);
562 unsigned int required;
417 563
418 pr_debug("PM: Free swap pages: %u\n", free_swap); 564 pr_debug("PM: Free swap pages: %u\n", free_swap);
419 return free_swap > nr_pages + PAGES_FOR_IO; 565
566 required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ?
567 nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1);
568 return free_swap > required;
420} 569}
421 570
422/** 571/**
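
enough_swap() now budgets for the worst case of the chosen path: unless SF_NOCOMPRESS_MODE is set, every LZO_UNC_PAGES image pages may need up to LZO_CMP_PAGES pages of swap, on top of PAGES_FOR_IO for the I/O machinery. The reservation is deliberately pessimistic; LZO normally shrinks the data, but the check has to cover blocks that expand slightly. A rough worked example (standalone C; the PAGES_FOR_IO value, the 35/32 ratio and the image size are assumptions):

#include <stdio.h>

#define PAGES_FOR_IO	1024UL	/* assumed: 4 MiB worth of 4 KiB pages */
#define LZO_UNC_PAGES	32UL
#define LZO_CMP_PAGES	35UL	/* assumed worst case per 32-page block */

static unsigned long required_swap(unsigned long nr_pages, int compressed)
{
	return PAGES_FOR_IO + (compressed ?
		(nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1 : nr_pages);
}

int main(void)
{
	unsigned long nr_pages = 200000;	/* assumed ~780 MiB image */

	printf("uncompressed: %lu pages of swap\n", required_swap(nr_pages, 0));
	printf("compressed  : %lu pages of swap\n", required_swap(nr_pages, 1));
	return 0;
}
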
@@ -443,7 +592,7 @@ int swsusp_write(unsigned int flags)
443 printk(KERN_ERR "PM: Cannot get swap writer\n"); 592 printk(KERN_ERR "PM: Cannot get swap writer\n");
444 return error; 593 return error;
445 } 594 }
446 if (!enough_swap(pages)) { 595 if (!enough_swap(pages, flags)) {
447 printk(KERN_ERR "PM: Not enough free swap\n"); 596 printk(KERN_ERR "PM: Not enough free swap\n");
448 error = -ENOSPC; 597 error = -ENOSPC;
449 goto out_finish; 598 goto out_finish;
@@ -458,8 +607,11 @@ int swsusp_write(unsigned int flags)
458 } 607 }
459 header = (struct swsusp_info *)data_of(snapshot); 608 header = (struct swsusp_info *)data_of(snapshot);
460 error = swap_write_page(&handle, header, NULL); 609 error = swap_write_page(&handle, header, NULL);
461 if (!error) 610 if (!error) {
462 error = save_image(&handle, &snapshot, pages - 1); 611 error = (flags & SF_NOCOMPRESS_MODE) ?
612 save_image(&handle, &snapshot, pages - 1) :
613 save_image_lzo(&handle, &snapshot, pages - 1);
614 }
463out_finish: 615out_finish:
464 error = swap_writer_finish(&handle, flags, error); 616 error = swap_writer_finish(&handle, flags, error);
465 return error; 617 return error;
@@ -590,6 +742,127 @@ static int load_image(struct swap_map_handle *handle,
590} 742}
591 743
592/** 744/**
745 * load_image_lzo - Load compressed image data and decompress them with LZO.
746 * @handle: Swap map handle to use for loading data.
747 * @snapshot: Image to copy uncompressed data into.
748 * @nr_to_read: Number of pages to load.
749 */
750static int load_image_lzo(struct swap_map_handle *handle,
751 struct snapshot_handle *snapshot,
752 unsigned int nr_to_read)
753{
754 unsigned int m;
755 int error = 0;
756 struct timeval start;
757 struct timeval stop;
758 unsigned nr_pages;
759 size_t off, unc_len, cmp_len;
760 unsigned char *unc, *cmp, *page;
761
762 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
763 if (!page) {
764 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
765 return -ENOMEM;
766 }
767
768 unc = vmalloc(LZO_UNC_SIZE);
769 if (!unc) {
770 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
771 free_page((unsigned long)page);
772 return -ENOMEM;
773 }
774
775 cmp = vmalloc(LZO_CMP_SIZE);
776 if (!cmp) {
777 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
778 vfree(unc);
779 free_page((unsigned long)page);
780 return -ENOMEM;
781 }
782
783 printk(KERN_INFO
784 "PM: Loading and decompressing image data (%u pages) ... ",
785 nr_to_read);
786 m = nr_to_read / 100;
787 if (!m)
788 m = 1;
789 nr_pages = 0;
790 do_gettimeofday(&start);
791
792 error = snapshot_write_next(snapshot);
793 if (error <= 0)
794 goto out_finish;
795
796 for (;;) {
797 error = swap_read_page(handle, page, NULL); /* sync */
798 if (error)
799 break;
800
801 cmp_len = *(size_t *)page;
802 if (unlikely(!cmp_len ||
803 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) {
804 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
805 error = -1;
806 break;
807 }
808
809 memcpy(cmp, page, PAGE_SIZE);
810 for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
811 error = swap_read_page(handle, page, NULL); /* sync */
812 if (error)
813 goto out_finish;
814
815 memcpy(cmp + off, page, PAGE_SIZE);
816 }
817
818 unc_len = LZO_UNC_SIZE;
819 error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len,
820 unc, &unc_len);
821 if (error < 0) {
822 printk(KERN_ERR "PM: LZO decompression failed\n");
823 break;
824 }
825
826 if (unlikely(!unc_len ||
827 unc_len > LZO_UNC_SIZE ||
828 unc_len & (PAGE_SIZE - 1))) {
829 printk(KERN_ERR "PM: Invalid LZO uncompressed length\n");
830 error = -1;
831 break;
832 }
833
834 for (off = 0; off < unc_len; off += PAGE_SIZE) {
835 memcpy(data_of(*snapshot), unc + off, PAGE_SIZE);
836
837 if (!(nr_pages % m))
838 printk("\b\b\b\b%3d%%", nr_pages / m);
839 nr_pages++;
840
841 error = snapshot_write_next(snapshot);
842 if (error <= 0)
843 goto out_finish;
844 }
845 }
846
847out_finish:
848 do_gettimeofday(&stop);
849 if (!error) {
850 printk("\b\b\b\bdone\n");
851 snapshot_write_finalize(snapshot);
852 if (!snapshot_image_loaded(snapshot))
853 error = -ENODATA;
854 } else
855 printk("\n");
856 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
857
858 vfree(cmp);
859 vfree(unc);
860 free_page((unsigned long)page);
861
862 return error;
863}
864
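
Each compressed block sits on swap as a size_t length header followed by the compressed bytes, padded out to whole pages. The reader above pulls the length out of the first page and from it knows exactly how many further pages belong to the block; any padding beyond cmp_len is simply ignored. A small sketch of that framing arithmetic (standalone C; the block size in main() is invented):

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE  4096UL
#define LZO_HEADER sizeof(size_t)

/* Number of swap pages occupied by a block of cmp_len compressed bytes. */
static unsigned long block_pages(size_t cmp_len)
{
	return (LZO_HEADER + cmp_len + PAGE_SIZE - 1) / PAGE_SIZE;
}

int main(void)
{
	size_t cmp_len = 70000;	/* invented: ~68 KiB of compressed data */

	/* The first page is read unconditionally; it carries the header. */
	printf("block needs %lu pages (1 already read + %lu more)\n",
	       block_pages(cmp_len), block_pages(cmp_len) - 1);
	return 0;
}
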
865/**
593 * swsusp_read - read the hibernation image. 866 * swsusp_read - read the hibernation image.
594 * @flags_p: flags passed by the "frozen" kernel in the image header should 867 * @flags_p: flags passed by the "frozen" kernel in the image header should
 595 * be written into this memory location 868 * be written into this memory location
@@ -612,8 +885,11 @@ int swsusp_read(unsigned int *flags_p)
612 goto end; 885 goto end;
613 if (!error) 886 if (!error)
614 error = swap_read_page(&handle, header, NULL); 887 error = swap_read_page(&handle, header, NULL);
615 if (!error) 888 if (!error) {
616 error = load_image(&handle, &snapshot, header->pages - 1); 889 error = (*flags_p & SF_NOCOMPRESS_MODE) ?
890 load_image(&handle, &snapshot, header->pages - 1) :
891 load_image_lzo(&handle, &snapshot, header->pages - 1);
892 }
617 swap_reader_finish(&handle); 893 swap_reader_finish(&handle);
618end: 894end:
619 if (!error) 895 if (!error)
@@ -634,13 +910,13 @@ int swsusp_check(void)
634 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 910 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
635 if (!IS_ERR(hib_resume_bdev)) { 911 if (!IS_ERR(hib_resume_bdev)) {
636 set_blocksize(hib_resume_bdev, PAGE_SIZE); 912 set_blocksize(hib_resume_bdev, PAGE_SIZE);
637 memset(swsusp_header, 0, PAGE_SIZE); 913 clear_page(swsusp_header);
638 error = hib_bio_read_page(swsusp_resume_block, 914 error = hib_bio_read_page(swsusp_resume_block,
639 swsusp_header, NULL); 915 swsusp_header, NULL);
640 if (error) 916 if (error)
641 goto put; 917 goto put;
642 918
643 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 919 if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
644 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 920 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
645 /* Reset swap signature now */ 921 /* Reset swap signature now */
646 error = hib_bio_write_page(swsusp_resume_block, 922 error = hib_bio_write_page(swsusp_resume_block,
@@ -653,13 +929,13 @@ put:
653 if (error) 929 if (error)
654 blkdev_put(hib_resume_bdev, FMODE_READ); 930 blkdev_put(hib_resume_bdev, FMODE_READ);
655 else 931 else
656 pr_debug("PM: Signature found, resuming\n"); 932 pr_debug("PM: Image signature found, resuming\n");
657 } else { 933 } else {
658 error = PTR_ERR(hib_resume_bdev); 934 error = PTR_ERR(hib_resume_bdev);
659 } 935 }
660 936
661 if (error) 937 if (error)
662 pr_debug("PM: Error %d checking image file\n", error); 938 pr_debug("PM: Image not found (code %d)\n", error);
663 939
664 return error; 940 return error;
665} 941}
diff --git a/kernel/printk.c b/kernel/printk.c
index 8fe465ac008a..b2ebaee8c377 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -85,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress);
85 * provides serialisation for access to the entire console 85 * provides serialisation for access to the entire console
86 * driver system. 86 * driver system.
87 */ 87 */
88static DECLARE_MUTEX(console_sem); 88static DEFINE_SEMAPHORE(console_sem);
89struct console *console_drivers; 89struct console *console_drivers;
90EXPORT_SYMBOL_GPL(console_drivers); 90EXPORT_SYMBOL_GPL(console_drivers);
91 91
@@ -210,7 +210,7 @@ __setup("log_buf_len=", log_buf_len_setup);
210 210
211#ifdef CONFIG_BOOT_PRINTK_DELAY 211#ifdef CONFIG_BOOT_PRINTK_DELAY
212 212
213static unsigned int boot_delay; /* msecs delay after each printk during bootup */ 213static int boot_delay; /* msecs delay after each printk during bootup */
214static unsigned long long loops_per_msec; /* based on boot_delay */ 214static unsigned long long loops_per_msec; /* based on boot_delay */
215 215
216static int __init boot_delay_setup(char *str) 216static int __init boot_delay_setup(char *str)
@@ -556,7 +556,7 @@ static void zap_locks(void)
556 /* If a crash is occurring, make sure we can't deadlock */ 556 /* If a crash is occurring, make sure we can't deadlock */
557 spin_lock_init(&logbuf_lock); 557 spin_lock_init(&logbuf_lock);
558 /* And make sure that we print immediately */ 558 /* And make sure that we print immediately */
559 init_MUTEX(&console_sem); 559 sema_init(&console_sem, 1);
560} 560}
561 561
562#if defined(CONFIG_PRINTK_TIME) 562#if defined(CONFIG_PRINTK_TIME)
@@ -647,6 +647,7 @@ static inline int can_use_console(unsigned int cpu)
647 * released but interrupts still disabled. 647 * released but interrupts still disabled.
648 */ 648 */
649static int acquire_console_semaphore_for_printk(unsigned int cpu) 649static int acquire_console_semaphore_for_printk(unsigned int cpu)
650 __releases(&logbuf_lock)
650{ 651{
651 int retval = 0; 652 int retval = 0;
652 653
@@ -1511,7 +1512,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1511} 1512}
1512EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 1513EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1513 1514
1514static const char const *kmsg_reasons[] = { 1515static const char * const kmsg_reasons[] = {
1515 [KMSG_DUMP_OOPS] = "oops", 1516 [KMSG_DUMP_OOPS] = "oops",
1516 [KMSG_DUMP_PANIC] = "panic", 1517 [KMSG_DUMP_PANIC] = "panic",
1517 [KMSG_DUMP_KEXEC] = "kexec", 1518 [KMSG_DUMP_KEXEC] = "kexec",
diff --git a/kernel/profile.c b/kernel/profile.c
index b22a899934cc..66f841b7fbd3 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -555,6 +555,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
555static const struct file_operations proc_profile_operations = { 555static const struct file_operations proc_profile_operations = {
556 .read = read_profile, 556 .read = read_profile,
557 .write = write_profile, 557 .write = write_profile,
558 .llseek = default_llseek,
558}; 559};
559 560
560#ifdef CONFIG_SMP 561#ifdef CONFIG_SMP
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f34d798ef4a2..99bbaa3e5b0d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task)
181 * under ptrace. 181 * under ptrace.
182 */ 182 */
183 retval = -ERESTARTNOINTR; 183 retval = -ERESTARTNOINTR;
184 if (mutex_lock_interruptible(&task->cred_guard_mutex)) 184 if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
185 goto out; 185 goto out;
186 186
187 task_lock(task); 187 task_lock(task);
@@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task)
208unlock_tasklist: 208unlock_tasklist:
209 write_unlock_irq(&tasklist_lock); 209 write_unlock_irq(&tasklist_lock);
210unlock_creds: 210unlock_creds:
211 mutex_unlock(&task->cred_guard_mutex); 211 mutex_unlock(&task->signal->cred_guard_mutex);
212out: 212out:
213 return retval; 213 return retval;
214} 214}
@@ -329,6 +329,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
329 * and reacquire the lock. 329 * and reacquire the lock.
330 */ 330 */
331void exit_ptrace(struct task_struct *tracer) 331void exit_ptrace(struct task_struct *tracer)
332 __releases(&tasklist_lock)
333 __acquires(&tasklist_lock)
332{ 334{
333 struct task_struct *p, *n; 335 struct task_struct *p, *n;
334 LIST_HEAD(ptrace_dead); 336 LIST_HEAD(ptrace_dead);
@@ -402,7 +404,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
402 return copied; 404 return copied;
403} 405}
404 406
405static int ptrace_setoptions(struct task_struct *child, long data) 407static int ptrace_setoptions(struct task_struct *child, unsigned long data)
406{ 408{
407 child->ptrace &= ~PT_TRACE_MASK; 409 child->ptrace &= ~PT_TRACE_MASK;
408 410
@@ -481,7 +483,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
481#define is_sysemu_singlestep(request) 0 483#define is_sysemu_singlestep(request) 0
482#endif 484#endif
483 485
484static int ptrace_resume(struct task_struct *child, long request, long data) 486static int ptrace_resume(struct task_struct *child, long request,
487 unsigned long data)
485{ 488{
486 if (!valid_signal(data)) 489 if (!valid_signal(data))
487 return -EIO; 490 return -EIO;
@@ -558,10 +561,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
558#endif 561#endif
559 562
560int ptrace_request(struct task_struct *child, long request, 563int ptrace_request(struct task_struct *child, long request,
561 long addr, long data) 564 unsigned long addr, unsigned long data)
562{ 565{
563 int ret = -EIO; 566 int ret = -EIO;
564 siginfo_t siginfo; 567 siginfo_t siginfo;
568 void __user *datavp = (void __user *) data;
569 unsigned long __user *datalp = datavp;
565 570
566 switch (request) { 571 switch (request) {
567 case PTRACE_PEEKTEXT: 572 case PTRACE_PEEKTEXT:
@@ -578,19 +583,17 @@ int ptrace_request(struct task_struct *child, long request,
578 ret = ptrace_setoptions(child, data); 583 ret = ptrace_setoptions(child, data);
579 break; 584 break;
580 case PTRACE_GETEVENTMSG: 585 case PTRACE_GETEVENTMSG:
581 ret = put_user(child->ptrace_message, (unsigned long __user *) data); 586 ret = put_user(child->ptrace_message, datalp);
582 break; 587 break;
583 588
584 case PTRACE_GETSIGINFO: 589 case PTRACE_GETSIGINFO:
585 ret = ptrace_getsiginfo(child, &siginfo); 590 ret = ptrace_getsiginfo(child, &siginfo);
586 if (!ret) 591 if (!ret)
587 ret = copy_siginfo_to_user((siginfo_t __user *) data, 592 ret = copy_siginfo_to_user(datavp, &siginfo);
588 &siginfo);
589 break; 593 break;
590 594
591 case PTRACE_SETSIGINFO: 595 case PTRACE_SETSIGINFO:
592 if (copy_from_user(&siginfo, (siginfo_t __user *) data, 596 if (copy_from_user(&siginfo, datavp, sizeof siginfo))
593 sizeof siginfo))
594 ret = -EFAULT; 597 ret = -EFAULT;
595 else 598 else
596 ret = ptrace_setsiginfo(child, &siginfo); 599 ret = ptrace_setsiginfo(child, &siginfo);
@@ -621,7 +624,7 @@ int ptrace_request(struct task_struct *child, long request,
621 } 624 }
622 mmput(mm); 625 mmput(mm);
623 626
624 ret = put_user(tmp, (unsigned long __user *) data); 627 ret = put_user(tmp, datalp);
625 break; 628 break;
626 } 629 }
627#endif 630#endif
@@ -650,7 +653,7 @@ int ptrace_request(struct task_struct *child, long request,
650 case PTRACE_SETREGSET: 653 case PTRACE_SETREGSET:
651 { 654 {
652 struct iovec kiov; 655 struct iovec kiov;
653 struct iovec __user *uiov = (struct iovec __user *) data; 656 struct iovec __user *uiov = datavp;
654 657
655 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) 658 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
656 return -EFAULT; 659 return -EFAULT;
@@ -691,7 +694,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid)
691#define arch_ptrace_attach(child) do { } while (0) 694#define arch_ptrace_attach(child) do { } while (0)
692#endif 695#endif
693 696
694SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) 697SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
698 unsigned long, data)
695{ 699{
696 struct task_struct *child; 700 struct task_struct *child;
697 long ret; 701 long ret;
@@ -732,7 +736,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
732 return ret; 736 return ret;
733} 737}
734 738
735int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) 739int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
740 unsigned long data)
736{ 741{
737 unsigned long tmp; 742 unsigned long tmp;
738 int copied; 743 int copied;
@@ -743,7 +748,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
743 return put_user(tmp, (unsigned long __user *)data); 748 return put_user(tmp, (unsigned long __user *)data);
744} 749}
745 750
746int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) 751int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
752 unsigned long data)
747{ 753{
748 int copied; 754 int copied;
749 755
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 4d169835fb36..a23a57a976d1 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void)
73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); 73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
74 74
75/** 75/**
76 * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? 76 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
77 * 77 *
78 * Check for bottom half being disabled, which covers both the 78 * Check for bottom half being disabled, which covers both the
79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses 79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses
80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) 80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
81 * will show the situation. 81 * will show the situation. This is useful for debug checks in functions
82 * that require that they be called within an RCU read-side critical
83 * section.
82 * 84 *
83 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. 85 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
84 */ 86 */
@@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void)
86{ 88{
87 if (!debug_lockdep_rcu_enabled()) 89 if (!debug_lockdep_rcu_enabled())
88 return 1; 90 return 1;
89 return in_softirq(); 91 return in_softirq() || irqs_disabled();
90} 92}
91EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 93EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
92 94
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 196ec02f8be0..d806735342ac 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active); 59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61 61
62/* Forward declarations for rcutiny_plugin.h. */
63static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
64static void __call_rcu(struct rcu_head *head,
65 void (*func)(struct rcu_head *rcu),
66 struct rcu_ctrlblk *rcp);
67
68#include "rcutiny_plugin.h"
69
62#ifdef CONFIG_NO_HZ 70#ifdef CONFIG_NO_HZ
63 71
64static long rcu_dynticks_nesting = 1; 72static long rcu_dynticks_nesting = 1;
@@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user)
140 rcu_sched_qs(cpu); 148 rcu_sched_qs(cpu);
141 else if (!in_softirq()) 149 else if (!in_softirq())
142 rcu_bh_qs(cpu); 150 rcu_bh_qs(cpu);
151 rcu_preempt_check_callbacks();
143} 152}
144 153
145/* 154/*
@@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
162 *rcp->donetail = NULL; 171 *rcp->donetail = NULL;
163 if (rcp->curtail == rcp->donetail) 172 if (rcp->curtail == rcp->donetail)
164 rcp->curtail = &rcp->rcucblist; 173 rcp->curtail = &rcp->rcucblist;
174 rcu_preempt_remove_callbacks(rcp);
165 rcp->donetail = &rcp->rcucblist; 175 rcp->donetail = &rcp->rcucblist;
166 local_irq_restore(flags); 176 local_irq_restore(flags);
167 177
@@ -182,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
182{ 192{
183 __rcu_process_callbacks(&rcu_sched_ctrlblk); 193 __rcu_process_callbacks(&rcu_sched_ctrlblk);
184 __rcu_process_callbacks(&rcu_bh_ctrlblk); 194 __rcu_process_callbacks(&rcu_bh_ctrlblk);
195 rcu_preempt_process_callbacks();
185} 196}
186 197
187/* 198/*
@@ -223,15 +234,15 @@ static void __call_rcu(struct rcu_head *head,
223} 234}
224 235
225/* 236/*
226 * Post an RCU callback to be invoked after the end of an RCU grace 237 * Post an RCU callback to be invoked after the end of an RCU-sched grace
227 * period. But since we have but one CPU, that would be after any 238 * period. But since we have but one CPU, that would be after any
228 * quiescent state. 239 * quiescent state.
229 */ 240 */
230void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 241void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
231{ 242{
232 __call_rcu(head, func, &rcu_sched_ctrlblk); 243 __call_rcu(head, func, &rcu_sched_ctrlblk);
233} 244}
234EXPORT_SYMBOL_GPL(call_rcu); 245EXPORT_SYMBOL_GPL(call_rcu_sched);
235 246
236/* 247/*
237 * Post an RCU bottom-half callback to be invoked after any subsequent 248 * Post an RCU bottom-half callback to be invoked after any subsequent
@@ -243,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
243} 254}
244EXPORT_SYMBOL_GPL(call_rcu_bh); 255EXPORT_SYMBOL_GPL(call_rcu_bh);
245 256
246void rcu_barrier(void)
247{
248 struct rcu_synchronize rcu;
249
250 init_rcu_head_on_stack(&rcu.head);
251 init_completion(&rcu.completion);
252 /* Will wake me after RCU finished. */
253 call_rcu(&rcu.head, wakeme_after_rcu);
254 /* Wait for it. */
255 wait_for_completion(&rcu.completion);
256 destroy_rcu_head_on_stack(&rcu.head);
257}
258EXPORT_SYMBOL_GPL(rcu_barrier);
259
260void rcu_barrier_bh(void) 257void rcu_barrier_bh(void)
261{ 258{
262 struct rcu_synchronize rcu; 259 struct rcu_synchronize rcu;
@@ -289,5 +286,3 @@ void __init rcu_init(void)
289{ 286{
290 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 287 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
291} 288}
292
293#include "rcutiny_plugin.h"
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index d223a92bc742..6ceca4f745ff 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -17,11 +17,587 @@
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 * 19 *
20 * Copyright IBM Corporation, 2009 20 * Copyright (c) 2010 Linaro
21 * 21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25#ifdef CONFIG_TINY_PREEMPT_RCU
26
27#include <linux/delay.h>
28
29/* Global control variables for preemptible RCU. */
30struct rcu_preempt_ctrlblk {
31 struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
32 struct rcu_head **nexttail;
33 /* Tasks blocked in a preemptible RCU */
 34 /* read-side critical section while a */
35 /* preemptible-RCU grace period is in */
36 /* progress must wait for a later grace */
37 /* period. This pointer points to the */
38 /* ->next pointer of the last task that */
39 /* must wait for a later grace period, or */
40 /* to &->rcb.rcucblist if there is no */
41 /* such task. */
42 struct list_head blkd_tasks;
43 /* Tasks blocked in RCU read-side critical */
44 /* section. Tasks are placed at the head */
45 /* of this list and age towards the tail. */
46 struct list_head *gp_tasks;
47 /* Pointer to the first task blocking the */
48 /* current grace period, or NULL if there */
 49 /* is no such task. */
50 struct list_head *exp_tasks;
51 /* Pointer to first task blocking the */
52 /* current expedited grace period, or NULL */
53 /* if there is no such task. If there */
54 /* is no current expedited grace period, */
55 /* then there cannot be any such task. */
56 u8 gpnum; /* Current grace period. */
57 u8 gpcpu; /* Last grace period blocked by the CPU. */
58 u8 completed; /* Last grace period completed. */
59 /* If all three are equal, RCU is idle. */
60};
61
62static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
63 .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
64 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
65 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
66 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
67};
68
69static int rcu_preempted_readers_exp(void);
70static void rcu_report_exp_done(void);
71
72/*
73 * Return true if the CPU has not yet responded to the current grace period.
74 */
75static int rcu_cpu_blocking_cur_gp(void)
76{
77 return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
78}
79
80/*
81 * Check for a running RCU reader. Because there is only one CPU,
82 * there can be but one running RCU reader at a time. ;-)
83 */
84static int rcu_preempt_running_reader(void)
85{
86 return current->rcu_read_lock_nesting;
87}
88
89/*
90 * Check for preempted RCU readers blocking any grace period.
91 * If the caller needs a reliable answer, it must disable hard irqs.
92 */
93static int rcu_preempt_blocked_readers_any(void)
94{
95 return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
96}
97
98/*
99 * Check for preempted RCU readers blocking the current grace period.
100 * If the caller needs a reliable answer, it must disable hard irqs.
101 */
102static int rcu_preempt_blocked_readers_cgp(void)
103{
104 return rcu_preempt_ctrlblk.gp_tasks != NULL;
105}
106
107/*
108 * Return true if another preemptible-RCU grace period is needed.
109 */
110static int rcu_preempt_needs_another_gp(void)
111{
112 return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
113}
114
115/*
116 * Return true if a preemptible-RCU grace period is in progress.
117 * The caller must disable hardirqs.
118 */
119static int rcu_preempt_gp_in_progress(void)
120{
121 return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
122}
123
124/*
125 * Record a preemptible-RCU quiescent state for the specified CPU. Note
126 * that this just means that the task currently running on the CPU is
127 * in a quiescent state. There might be any number of tasks blocked
128 * while in an RCU read-side critical section.
129 *
130 * Unlike the other rcu_*_qs() functions, callers to this function
131 * must disable irqs in order to protect the assignment to
132 * ->rcu_read_unlock_special.
133 *
134 * Because this is a single-CPU implementation, the only way a grace
135 * period can end is if the CPU is in a quiescent state. The reason is
136 * that a blocked preemptible-RCU reader can exit its critical section
137 * only if the CPU is running it at the time. Therefore, when the
138 * last task blocking the current grace period exits its RCU read-side
139 * critical section, neither the CPU nor blocked tasks will be stopping
140 * the current grace period. (In contrast, SMP implementations
141 * might have CPUs running in RCU read-side critical sections that
142 * block later grace periods -- but this is not possible given only
143 * one CPU.)
144 */
145static void rcu_preempt_cpu_qs(void)
146{
147 /* Record both CPU and task as having responded to current GP. */
148 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
149 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
150
151 /*
152 * If there is no GP, or if blocked readers are still blocking GP,
153 * then there is nothing more to do.
154 */
155 if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
156 return;
157
158 /* Advance callbacks. */
159 rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
160 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
161 rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
162
163 /* If there are no blocked readers, next GP is done instantly. */
164 if (!rcu_preempt_blocked_readers_any())
165 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
166
167 /* If there are done callbacks, make RCU_SOFTIRQ process them. */
168 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
169 raise_softirq(RCU_SOFTIRQ);
170}
171
172/*
173 * Start a new RCU grace period if warranted. Hard irqs must be disabled.
174 */
175static void rcu_preempt_start_gp(void)
176{
177 if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
178
179 /* Official start of GP. */
180 rcu_preempt_ctrlblk.gpnum++;
181
182 /* Any blocked RCU readers block new GP. */
183 if (rcu_preempt_blocked_readers_any())
184 rcu_preempt_ctrlblk.gp_tasks =
185 rcu_preempt_ctrlblk.blkd_tasks.next;
186
187 /* If there is no running reader, CPU is done with GP. */
188 if (!rcu_preempt_running_reader())
189 rcu_preempt_cpu_qs();
190 }
191}
192
193/*
194 * We have entered the scheduler, and the current task might soon be
195 * context-switched away from. If this task is in an RCU read-side
196 * critical section, we will no longer be able to rely on the CPU to
197 * record that fact, so we enqueue the task on the blkd_tasks list.
198 * If the task started after the current grace period began, as recorded
 199 * by ->gpcpu, we enqueue it at the beginning of the list. Otherwise we
 200 * enqueue it before the element referenced by ->gp_tasks (or at the tail
 201 * if ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
202 * The task will dequeue itself when it exits the outermost enclosing
203 * RCU read-side critical section. Therefore, the current grace period
204 * cannot be permitted to complete until the ->gp_tasks pointer becomes
205 * NULL.
206 *
207 * Caller must disable preemption.
208 */
209void rcu_preempt_note_context_switch(void)
210{
211 struct task_struct *t = current;
212 unsigned long flags;
213
214 local_irq_save(flags); /* must exclude scheduler_tick(). */
215 if (rcu_preempt_running_reader() &&
216 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
217
218 /* Possibly blocking in an RCU read-side critical section. */
219 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
220
221 /*
222 * If this CPU has already checked in, then this task
223 * will hold up the next grace period rather than the
224 * current grace period. Queue the task accordingly.
225 * If the task is queued for the current grace period
226 * (i.e., this CPU has not yet passed through a quiescent
227 * state for the current grace period), then as long
228 * as that task remains queued, the current grace period
229 * cannot end.
230 */
231 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
232 if (rcu_cpu_blocking_cur_gp())
233 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
234 }
235
236 /*
237 * Either we were not in an RCU read-side critical section to
238 * begin with, or we have now recorded that critical section
239 * globally. Either way, we can now note a quiescent state
240 * for this CPU. Again, if we were in an RCU read-side critical
241 * section, and if that critical section was blocking the current
242 * grace period, then the fact that the task has been enqueued
243 * means that current grace period continues to be blocked.
244 */
245 rcu_preempt_cpu_qs();
246 local_irq_restore(flags);
247}
248
249/*
250 * Tiny-preemptible RCU implementation for rcu_read_lock().
251 * Just increment ->rcu_read_lock_nesting, shared state will be updated
252 * if we block.
253 */
254void __rcu_read_lock(void)
255{
256 current->rcu_read_lock_nesting++;
257 barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
258}
259EXPORT_SYMBOL_GPL(__rcu_read_lock);
260
261/*
262 * Handle special cases during rcu_read_unlock(), such as needing to
263 * notify RCU core processing or task having blocked during the RCU
264 * read-side critical section.
265 */
266static void rcu_read_unlock_special(struct task_struct *t)
267{
268 int empty;
269 int empty_exp;
270 unsigned long flags;
271 struct list_head *np;
272 int special;
273
274 /*
275 * NMI handlers cannot block and cannot safely manipulate state.
276 * They therefore cannot possibly be special, so just leave.
277 */
278 if (in_nmi())
279 return;
280
281 local_irq_save(flags);
282
283 /*
284 * If RCU core is waiting for this CPU to exit critical section,
285 * let it know that we have done so.
286 */
287 special = t->rcu_read_unlock_special;
288 if (special & RCU_READ_UNLOCK_NEED_QS)
289 rcu_preempt_cpu_qs();
290
291 /* Hardware IRQ handlers cannot block. */
292 if (in_irq()) {
293 local_irq_restore(flags);
294 return;
295 }
296
297 /* Clean up if blocked during RCU read-side critical section. */
298 if (special & RCU_READ_UNLOCK_BLOCKED) {
299 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
300
301 /*
302 * Remove this task from the ->blkd_tasks list and adjust
303 * any pointers that might have been referencing it.
304 */
305 empty = !rcu_preempt_blocked_readers_cgp();
306 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
307 np = t->rcu_node_entry.next;
308 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
309 np = NULL;
310 list_del(&t->rcu_node_entry);
311 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
312 rcu_preempt_ctrlblk.gp_tasks = np;
313 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
314 rcu_preempt_ctrlblk.exp_tasks = np;
315 INIT_LIST_HEAD(&t->rcu_node_entry);
316
317 /*
318 * If this was the last task on the current list, and if
319 * we aren't waiting on the CPU, report the quiescent state
320 * and start a new grace period if needed.
321 */
322 if (!empty && !rcu_preempt_blocked_readers_cgp()) {
323 rcu_preempt_cpu_qs();
324 rcu_preempt_start_gp();
325 }
326
327 /*
328 * If this was the last task on the expedited lists,
 329 * then we need to wake up the waiting task.
330 */
331 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
332 rcu_report_exp_done();
333 }
334 local_irq_restore(flags);
335}
336
337/*
338 * Tiny-preemptible RCU implementation for rcu_read_unlock().
339 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
340 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
341 * invoke rcu_read_unlock_special() to clean up after a context switch
342 * in an RCU read-side critical section and other special cases.
343 */
344void __rcu_read_unlock(void)
345{
346 struct task_struct *t = current;
347
348 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
349 --t->rcu_read_lock_nesting;
350 barrier(); /* decrement before load of ->rcu_read_unlock_special */
351 if (t->rcu_read_lock_nesting == 0 &&
352 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
353 rcu_read_unlock_special(t);
354#ifdef CONFIG_PROVE_LOCKING
355 WARN_ON_ONCE(t->rcu_read_lock_nesting < 0);
356#endif /* #ifdef CONFIG_PROVE_LOCKING */
357}
358EXPORT_SYMBOL_GPL(__rcu_read_unlock);
359
360/*
361 * Check for a quiescent state from the current CPU. When a task blocks,
362 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
363 * checked elsewhere. This is called from the scheduling-clock interrupt.
364 *
365 * Caller must disable hard irqs.
366 */
367static void rcu_preempt_check_callbacks(void)
368{
369 struct task_struct *t = current;
370
371 if (rcu_preempt_gp_in_progress() &&
372 (!rcu_preempt_running_reader() ||
373 !rcu_cpu_blocking_cur_gp()))
374 rcu_preempt_cpu_qs();
375 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
376 rcu_preempt_ctrlblk.rcb.donetail)
377 raise_softirq(RCU_SOFTIRQ);
378 if (rcu_preempt_gp_in_progress() &&
379 rcu_cpu_blocking_cur_gp() &&
380 rcu_preempt_running_reader())
381 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
382}
383
384/*
385 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
386 * update, so this is invoked from __rcu_process_callbacks() to
387 * handle that case. Of course, it is invoked for all flavors of
388 * RCU, but RCU callbacks can appear only on one of the lists, and
389 * neither ->nexttail nor ->donetail can possibly be NULL, so there
390 * is no need for an explicit check.
391 */
392static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
393{
394 if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
395 rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
396}
397
398/*
399 * Process callbacks for preemptible RCU.
400 */
401static void rcu_preempt_process_callbacks(void)
402{
403 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
404}
405
406/*
407 * Queue a preemptible-RCU callback for invocation after a grace period.
408 */
409void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
410{
411 unsigned long flags;
412
413 debug_rcu_head_queue(head);
414 head->func = func;
415 head->next = NULL;
416
417 local_irq_save(flags);
418 *rcu_preempt_ctrlblk.nexttail = head;
419 rcu_preempt_ctrlblk.nexttail = &head->next;
420 rcu_preempt_start_gp(); /* checks to see if GP needed. */
421 local_irq_restore(flags);
422}
423EXPORT_SYMBOL_GPL(call_rcu);
424
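As a usage illustration (a sketch, not code from this patch): callers embed a struct rcu_head in their own object and free it from the callback once the grace period behind which call_rcu() queued it has elapsed. The my_node type and helpers are hypothetical.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_node {
	int key;
	struct rcu_head rcu;		/* linked onto ->nexttail by call_rcu() */
};

static void my_node_free_rcu(struct rcu_head *head)
{
	/* Runs after a grace period; no pre-existing reader can still see the node. */
	kfree(container_of(head, struct my_node, rcu));
}

static void my_node_retire(struct my_node *node)
{
	call_rcu(&node->rcu, my_node_free_rcu);
}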
425void rcu_barrier(void)
426{
427 struct rcu_synchronize rcu;
428
429 init_rcu_head_on_stack(&rcu.head);
430 init_completion(&rcu.completion);
431 /* Will wake me after RCU finished. */
432 call_rcu(&rcu.head, wakeme_after_rcu);
433 /* Wait for it. */
434 wait_for_completion(&rcu.completion);
435 destroy_rcu_head_on_stack(&rcu.head);
436}
437EXPORT_SYMBOL_GPL(rcu_barrier);
438
439/*
440 * synchronize_rcu - wait until a grace period has elapsed.
441 *
442 * Control will return to the caller some time after a full grace
443 * period has elapsed, in other words after all currently executing RCU
444 * read-side critical sections have completed. RCU read-side critical
445 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
446 * and may be nested.
447 */
448void synchronize_rcu(void)
449{
450#ifdef CONFIG_DEBUG_LOCK_ALLOC
451 if (!rcu_scheduler_active)
452 return;
453#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
454
455 WARN_ON_ONCE(rcu_preempt_running_reader());
456 if (!rcu_preempt_blocked_readers_any())
457 return;
458
459 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
460 rcu_barrier();
461}
462EXPORT_SYMBOL_GPL(synchronize_rcu);
463
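For comparison with the callback-based path above, a hedged sketch of the classic blocking update pattern built on synchronize_rcu(). The my_config type is hypothetical and the code assumes a single updater, which is why rcu_dereference_protected() is used with a trivial condition.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_config {
	int threshold;
};

static struct my_config __rcu *global_cfg;	/* hypothetical shared pointer */

static void update_threshold(int new_threshold)
{
	struct my_config *newp, *oldp;

	newp = kmalloc(sizeof(*newp), GFP_KERNEL);
	if (!newp)
		return;
	newp->threshold = new_threshold;

	oldp = rcu_dereference_protected(global_cfg, 1);	/* sole updater by assumption */
	rcu_assign_pointer(global_cfg, newp);

	synchronize_rcu();		/* wait out readers that may still hold oldp */
	kfree(oldp);
}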
464static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
465static unsigned long sync_rcu_preempt_exp_count;
466static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
467
468/*
469 * Return non-zero if there are any tasks in RCU read-side critical
470 * sections blocking the current preemptible-RCU expedited grace period.
471 * If there is no preemptible-RCU expedited grace period currently in
472 * progress, returns zero unconditionally.
473 */
474static int rcu_preempted_readers_exp(void)
475{
476 return rcu_preempt_ctrlblk.exp_tasks != NULL;
477}
478
479/*
480 * Report the exit from RCU read-side critical section for the last task
481 * that queued itself during or before the current expedited preemptible-RCU
482 * grace period.
483 */
484static void rcu_report_exp_done(void)
485{
486 wake_up(&sync_rcu_preempt_exp_wq);
487}
488
489/*
490 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
491 * is to rely on the fact that there is but one CPU, and that it is
492 * illegal for a task to invoke synchronize_rcu_expedited() while in a
493 * preemptible-RCU read-side critical section. Therefore, any such
494 * critical sections must correspond to blocked tasks, which must therefore
495 * be on the ->blkd_tasks list. So just record the current head of the
496 * list in the ->exp_tasks pointer, and wait for all tasks including and
497 * after the task pointed to by ->exp_tasks to drain.
498 */
499void synchronize_rcu_expedited(void)
500{
501 unsigned long flags;
502 struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
503 unsigned long snap;
504
505 barrier(); /* ensure prior action seen before grace period. */
506
507 WARN_ON_ONCE(rcu_preempt_running_reader());
508
509 /*
510 * Acquire lock so that there is only one preemptible RCU grace
511 * period in flight. Of course, if someone does the expedited
512 * grace period for us while we are acquiring the lock, just leave.
513 */
514 snap = sync_rcu_preempt_exp_count + 1;
515 mutex_lock(&sync_rcu_preempt_exp_mutex);
516 if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
517 goto unlock_mb_ret; /* Others did our work for us. */
518
519 local_irq_save(flags);
520
521 /*
522 * All RCU readers have to already be on blkd_tasks because
523 * we cannot legally be executing in an RCU read-side critical
524 * section.
525 */
526
527 /* Snapshot current head of ->blkd_tasks list. */
528 rpcp->exp_tasks = rpcp->blkd_tasks.next;
529 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
530 rpcp->exp_tasks = NULL;
531 local_irq_restore(flags);
532
533 /* Wait for tail of ->blkd_tasks list to drain. */
534 if (rcu_preempted_readers_exp())
535 wait_event(sync_rcu_preempt_exp_wq,
536 !rcu_preempted_readers_exp());
537
538 /* Clean up and exit. */
539 barrier(); /* ensure expedited GP seen before counter increment. */
540 sync_rcu_preempt_exp_count++;
541unlock_mb_ret:
542 mutex_unlock(&sync_rcu_preempt_exp_mutex);
543 barrier(); /* ensure subsequent action seen after grace period. */
544}
545EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
546
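The snap/sync_rcu_preempt_exp_count handshake above relies on a wraparound-safe "less than". A short sketch of that comparison follows; the macro body is the one appearing in the RCU headers (it is being moved out of rcutree.h elsewhere in this series).

#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

/*
 * Behaves like "<" even across counter wrap:
 *	ULONG_CMP_LT(5, 7)          -> true   (5 is older than 7)
 *	ULONG_CMP_LT(ULONG_MAX, 1)  -> true   (1 was taken after the wrap)
 *	ULONG_CMP_LT(7, 5)          -> false
 * So ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count) becoming true while
 * waiting for the mutex means at least one full expedited grace period
 * completed after the snapshot was taken, and the caller can return early.
 */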
547/*
548 * Does preemptible RCU need the CPU to stay out of dynticks mode?
549 */
550int rcu_preempt_needs_cpu(void)
551{
552 if (!rcu_preempt_running_reader())
553 rcu_preempt_cpu_qs();
554 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
555}
556
557/*
558 * Check for a task exiting while in a preemptible-RCU read-side
559 * critical section, and clean up if so. No need to issue warnings,
560 * as debug_check_no_locks_held() already does this if lockdep
561 * is enabled.
562 */
563void exit_rcu(void)
564{
565 struct task_struct *t = current;
566
567 if (t->rcu_read_lock_nesting == 0)
568 return;
569 t->rcu_read_lock_nesting = 1;
570 rcu_read_unlock();
571}
572
573#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
574
575/*
576 * Because preemptible RCU does not exist, it never has any callbacks
577 * to check.
578 */
579static void rcu_preempt_check_callbacks(void)
580{
581}
582
583/*
584 * Because preemptible RCU does not exist, it never has any callbacks
585 * to remove.
586 */
587static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
588{
589}
590
591/*
592 * Because preemptible RCU does not exist, it never has any callbacks
593 * to process.
594 */
595static void rcu_preempt_process_callbacks(void)
596{
597}
598
599#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
600
25#ifdef CONFIG_DEBUG_LOCK_ALLOC 601#ifdef CONFIG_DEBUG_LOCK_ALLOC
26 602
27#include <linux/kernel_stat.h> 603#include <linux/kernel_stat.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 2e2726d790b9..9d8e8fb2515f 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -120,7 +120,7 @@ struct rcu_torture {
120}; 120};
121 121
122static LIST_HEAD(rcu_torture_freelist); 122static LIST_HEAD(rcu_torture_freelist);
123static struct rcu_torture *rcu_torture_current; 123static struct rcu_torture __rcu *rcu_torture_current;
124static long rcu_torture_current_version; 124static long rcu_torture_current_version;
125static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 125static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
126static DEFINE_SPINLOCK(rcu_torture_lock); 126static DEFINE_SPINLOCK(rcu_torture_lock);
@@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
153#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ 153#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
154#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ 154#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
155static int fullstop = FULLSTOP_RMMOD; 155static int fullstop = FULLSTOP_RMMOD;
156DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ 156/*
157 /* of kthreads. */ 157 * Protect fullstop transitions and spawning of kthreads.
158 */
159static DEFINE_MUTEX(fullstop_mutex);
158 160
159/* 161/*
160 * Detect and respond to a system shutdown. 162 * Detect and respond to a system shutdown.
@@ -303,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
303 mdelay(longdelay_ms); 305 mdelay(longdelay_ms);
304 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) 306 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
305 udelay(shortdelay_us); 307 udelay(shortdelay_us);
308#ifdef CONFIG_PREEMPT
309 if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
310 preempt_schedule(); /* No QS if preempt_disable() in effect */
311#endif
306} 312}
307 313
308static void rcu_torture_read_unlock(int idx) __releases(RCU) 314static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -536,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
536 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); 542 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
537 if (!delay) 543 if (!delay)
538 schedule_timeout_interruptible(longdelay); 544 schedule_timeout_interruptible(longdelay);
545 else
546 rcu_read_delay(rrsp);
539} 547}
540 548
541static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) 549static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
@@ -731,7 +739,8 @@ rcu_torture_writer(void *arg)
731 continue; 739 continue;
732 rp->rtort_pipe_count = 0; 740 rp->rtort_pipe_count = 0;
733 udelay(rcu_random(&rand) & 0x3ff); 741 udelay(rcu_random(&rand) & 0x3ff);
734 old_rp = rcu_torture_current; 742 old_rp = rcu_dereference_check(rcu_torture_current,
743 current == writer_task);
735 rp->rtort_mbtest = 1; 744 rp->rtort_mbtest = 1;
736 rcu_assign_pointer(rcu_torture_current, rp); 745 rcu_assign_pointer(rcu_torture_current, rp);
737 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ 746 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d5bc43976c5a..ccdc04c47981 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -143,6 +143,11 @@ module_param(blimit, int, 0);
143module_param(qhimark, int, 0); 143module_param(qhimark, int, 0);
144module_param(qlowmark, int, 0); 144module_param(qlowmark, int, 0);
145 145
146#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
147int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
148module_param(rcu_cpu_stall_suppress, int, 0644);
149#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
150
146static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 151static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
147static int rcu_pending(int cpu); 152static int rcu_pending(int cpu);
148 153
@@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
450 455
451#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 456#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
452 457
453int rcu_cpu_stall_panicking __read_mostly; 458int rcu_cpu_stall_suppress __read_mostly;
454 459
455static void record_gp_stall_check_time(struct rcu_state *rsp) 460static void record_gp_stall_check_time(struct rcu_state *rsp)
456{ 461{
@@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
482 rcu_print_task_stall(rnp); 487 rcu_print_task_stall(rnp);
483 raw_spin_unlock_irqrestore(&rnp->lock, flags); 488 raw_spin_unlock_irqrestore(&rnp->lock, flags);
484 489
485 /* OK, time to rat on our buddy... */ 490 /*
486 491 * OK, time to rat on our buddy...
492 * See Documentation/RCU/stallwarn.txt for info on how to debug
493 * RCU CPU stall warnings.
494 */
487 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", 495 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
488 rsp->name); 496 rsp->name);
489 rcu_for_each_leaf_node(rsp, rnp) { 497 rcu_for_each_leaf_node(rsp, rnp) {
@@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
512 unsigned long flags; 520 unsigned long flags;
513 struct rcu_node *rnp = rcu_get_root(rsp); 521 struct rcu_node *rnp = rcu_get_root(rsp);
514 522
523 /*
524 * OK, time to rat on ourselves...
525 * See Documentation/RCU/stallwarn.txt for info on how to debug
526 * RCU CPU stall warnings.
527 */
515 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 528 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
516 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 529 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
517 trigger_all_cpu_backtrace(); 530 trigger_all_cpu_backtrace();
@@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
530 long delta; 543 long delta;
531 struct rcu_node *rnp; 544 struct rcu_node *rnp;
532 545
533 if (rcu_cpu_stall_panicking) 546 if (rcu_cpu_stall_suppress)
534 return; 547 return;
535 delta = jiffies - rsp->jiffies_stall; 548 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
536 rnp = rdp->mynode; 549 rnp = rdp->mynode;
537 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { 550 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) {
538 551
539 /* We haven't checked in, so go dump stack. */ 552 /* We haven't checked in, so go dump stack. */
540 print_cpu_stall(rsp); 553 print_cpu_stall(rsp);
@@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
548 561
549static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) 562static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
550{ 563{
551 rcu_cpu_stall_panicking = 1; 564 rcu_cpu_stall_suppress = 1;
552 return NOTIFY_DONE; 565 return NOTIFY_DONE;
553} 566}
554 567
568/**
569 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
570 *
571 * Set the stall-warning timeout way off into the future, thus preventing
572 * any RCU CPU stall-warning messages from appearing in the current set of
573 * RCU grace periods.
574 *
575 * The caller must disable hard irqs.
576 */
577void rcu_cpu_stall_reset(void)
578{
579 rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
580 rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
581 rcu_preempt_stall_reset();
582}
583
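A brief note, based on check_cpu_stall() above rather than on new code, on why the ULONG_MAX / 2 offset silences the warnings:

/*
 * check_cpu_stall() computes
 *
 *	long delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
 *
 * and only complains when delta >= 0. Setting jiffies_stall to
 * jiffies + ULONG_MAX / 2 makes delta start out close to the most negative
 * value a long can hold, so jiffies would have to advance by roughly
 * ULONG_MAX / 2 ticks before another warning could fire, far longer than
 * any real grace period.
 */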
555static struct notifier_block rcu_panic_block = { 584static struct notifier_block rcu_panic_block = {
556 .notifier_call = rcu_panic, 585 .notifier_call = rcu_panic,
557}; 586};
@@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
571{ 600{
572} 601}
573 602
603void rcu_cpu_stall_reset(void)
604{
605}
606
574static void __init check_cpu_stall_init(void) 607static void __init check_cpu_stall_init(void)
575{ 608{
576} 609}
@@ -712,7 +745,7 @@ static void
712rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 745rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
713 __releases(rcu_get_root(rsp)->lock) 746 __releases(rcu_get_root(rsp)->lock)
714{ 747{
715 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 748 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
716 struct rcu_node *rnp = rcu_get_root(rsp); 749 struct rcu_node *rnp = rcu_get_root(rsp);
717 750
718 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { 751 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
@@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
960static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 993static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
961{ 994{
962 int i; 995 int i;
963 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 996 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
964 997
965 if (rdp->nxtlist == NULL) 998 if (rdp->nxtlist == NULL)
966 return; /* irqs disabled, so comparison is stable. */ 999 return; /* irqs disabled, so comparison is stable. */
@@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
971 for (i = 0; i < RCU_NEXT_SIZE; i++) 1004 for (i = 0; i < RCU_NEXT_SIZE; i++)
972 rdp->nxttail[i] = &rdp->nxtlist; 1005 rdp->nxttail[i] = &rdp->nxtlist;
973 rsp->orphan_qlen += rdp->qlen; 1006 rsp->orphan_qlen += rdp->qlen;
1007 rdp->n_cbs_orphaned += rdp->qlen;
974 rdp->qlen = 0; 1008 rdp->qlen = 0;
975 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1009 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
976} 1010}
@@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
984 struct rcu_data *rdp; 1018 struct rcu_data *rdp;
985 1019
986 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1020 raw_spin_lock_irqsave(&rsp->onofflock, flags);
987 rdp = rsp->rda[smp_processor_id()]; 1021 rdp = this_cpu_ptr(rsp->rda);
988 if (rsp->orphan_cbs_list == NULL) { 1022 if (rsp->orphan_cbs_list == NULL) {
989 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1023 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
990 return; 1024 return;
@@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
992 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; 1026 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
993 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; 1027 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
994 rdp->qlen += rsp->orphan_qlen; 1028 rdp->qlen += rsp->orphan_qlen;
1029 rdp->n_cbs_adopted += rsp->orphan_qlen;
995 rsp->orphan_cbs_list = NULL; 1030 rsp->orphan_cbs_list = NULL;
996 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; 1031 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
997 rsp->orphan_qlen = 0; 1032 rsp->orphan_qlen = 0;
@@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1007 unsigned long flags; 1042 unsigned long flags;
1008 unsigned long mask; 1043 unsigned long mask;
1009 int need_report = 0; 1044 int need_report = 0;
1010 struct rcu_data *rdp = rsp->rda[cpu]; 1045 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1011 struct rcu_node *rnp; 1046 struct rcu_node *rnp;
1012 1047
1013 /* Exclude any attempts to start a new grace period. */ 1048 /* Exclude any attempts to start a new grace period. */
@@ -1123,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1123 1158
1124 /* Update count, and requeue any remaining callbacks. */ 1159 /* Update count, and requeue any remaining callbacks. */
1125 rdp->qlen -= count; 1160 rdp->qlen -= count;
1161 rdp->n_cbs_invoked += count;
1126 if (list != NULL) { 1162 if (list != NULL) {
1127 *tail = rdp->nxtlist; 1163 *tail = rdp->nxtlist;
1128 rdp->nxtlist = list; 1164 rdp->nxtlist = list;
@@ -1226,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1226 cpu = rnp->grplo; 1262 cpu = rnp->grplo;
1227 bit = 1; 1263 bit = 1;
1228 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 1264 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
1229 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1265 if ((rnp->qsmask & bit) != 0 &&
1266 f(per_cpu_ptr(rsp->rda, cpu)))
1230 mask |= bit; 1267 mask |= bit;
1231 } 1268 }
1232 if (mask != 0) { 1269 if (mask != 0) {
@@ -1402,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1402 * a quiescent state betweentimes. 1439 * a quiescent state betweentimes.
1403 */ 1440 */
1404 local_irq_save(flags); 1441 local_irq_save(flags);
1405 rdp = rsp->rda[smp_processor_id()]; 1442 rdp = this_cpu_ptr(rsp->rda);
1406 rcu_process_gp_end(rsp, rdp); 1443 rcu_process_gp_end(rsp, rdp);
1407 check_for_new_grace_period(rsp, rdp); 1444 check_for_new_grace_period(rsp, rdp);
1408 1445
@@ -1701,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1701{ 1738{
1702 unsigned long flags; 1739 unsigned long flags;
1703 int i; 1740 int i;
1704 struct rcu_data *rdp = rsp->rda[cpu]; 1741 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1705 struct rcu_node *rnp = rcu_get_root(rsp); 1742 struct rcu_node *rnp = rcu_get_root(rsp);
1706 1743
1707 /* Set up local state, ensuring consistent view of global state. */ 1744 /* Set up local state, ensuring consistent view of global state. */
@@ -1729,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1729{ 1766{
1730 unsigned long flags; 1767 unsigned long flags;
1731 unsigned long mask; 1768 unsigned long mask;
1732 struct rcu_data *rdp = rsp->rda[cpu]; 1769 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1733 struct rcu_node *rnp = rcu_get_root(rsp); 1770 struct rcu_node *rnp = rcu_get_root(rsp);
1734 1771
1735 /* Set up local state, ensuring consistent view of global state. */ 1772 /* Set up local state, ensuring consistent view of global state. */
@@ -1865,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1865/* 1902/*
1866 * Helper function for rcu_init() that initializes one rcu_state structure. 1903 * Helper function for rcu_init() that initializes one rcu_state structure.
1867 */ 1904 */
1868static void __init rcu_init_one(struct rcu_state *rsp) 1905static void __init rcu_init_one(struct rcu_state *rsp,
1906 struct rcu_data __percpu *rda)
1869{ 1907{
1870 static char *buf[] = { "rcu_node_level_0", 1908 static char *buf[] = { "rcu_node_level_0",
1871 "rcu_node_level_1", 1909 "rcu_node_level_1",
@@ -1918,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1918 } 1956 }
1919 } 1957 }
1920 1958
1959 rsp->rda = rda;
1921 rnp = rsp->level[NUM_RCU_LVLS - 1]; 1960 rnp = rsp->level[NUM_RCU_LVLS - 1];
1922 for_each_possible_cpu(i) { 1961 for_each_possible_cpu(i) {
1923 while (i > rnp->grphi) 1962 while (i > rnp->grphi)
1924 rnp++; 1963 rnp++;
1925 rsp->rda[i]->mynode = rnp; 1964 per_cpu_ptr(rsp->rda, i)->mynode = rnp;
1926 rcu_boot_init_percpu_data(i, rsp); 1965 rcu_boot_init_percpu_data(i, rsp);
1927 } 1966 }
1928} 1967}
1929 1968
1930/*
1931 * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
1932 * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
1933 * structure.
1934 */
1935#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1936do { \
1937 int i; \
1938 \
1939 for_each_possible_cpu(i) { \
1940 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1941 } \
1942 rcu_init_one(rsp); \
1943} while (0)
1944
1945void __init rcu_init(void) 1969void __init rcu_init(void)
1946{ 1970{
1947 int cpu; 1971 int cpu;
1948 1972
1949 rcu_bootup_announce(); 1973 rcu_bootup_announce();
1950 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1974 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
1951 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1975 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
1952 __rcu_init_preempt(); 1976 __rcu_init_preempt();
1953 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1977 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1954 1978
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 14c040b18ed0..91d4170c5c13 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -202,6 +202,9 @@ struct rcu_data {
202 long qlen; /* # of queued callbacks */ 202 long qlen; /* # of queued callbacks */
203 long qlen_last_fqs_check; 203 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */ 204 /* qlen at last check for QS forcing */
205 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
206 unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */
207 unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */
205 unsigned long n_force_qs_snap; 208 unsigned long n_force_qs_snap;
206 /* did other CPU force QS recently? */ 209 /* did other CPU force QS recently? */
207 long blimit; /* Upper limit on a processed batch */ 210 long blimit; /* Upper limit on a processed batch */
@@ -254,19 +257,23 @@ struct rcu_data {
254#define RCU_STALL_DELAY_DELTA 0 257#define RCU_STALL_DELAY_DELTA 0
255#endif 258#endif
256 259
257#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) 260#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
261 RCU_STALL_DELAY_DELTA)
258 /* for rsp->jiffies_stall */ 262 /* for rsp->jiffies_stall */
259#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) 263#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
260 /* for rsp->jiffies_stall */ 264 /* for rsp->jiffies_stall */
261#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 265#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
262 /* to take at least one */ 266 /* to take at least one */
263 /* scheduling clock irq */ 267 /* scheduling clock irq */
264 /* before ratting on them. */ 268 /* before ratting on them. */
265 269
266#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 270#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
271#define RCU_CPU_STALL_SUPPRESS_INIT 0
272#else
273#define RCU_CPU_STALL_SUPPRESS_INIT 1
274#endif
267 275
268#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) 276#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
269#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
270 277
271/* 278/*
272 * RCU global state, including node hierarchy. This hierarchy is 279 * RCU global state, including node hierarchy. This hierarchy is
@@ -283,7 +290,7 @@ struct rcu_state {
283 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ 290 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
284 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 291 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
285 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ 292 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
286 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ 293 struct rcu_data __percpu *rda; /* pointer to per-cpu rcu_data. */
287 294
288 /* The following fields are guarded by the root rcu_node's lock. */ 295 /* The following fields are guarded by the root rcu_node's lock. */
289 296
@@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
365#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 372#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
366static void rcu_print_detail_task_stall(struct rcu_state *rsp); 373static void rcu_print_detail_task_stall(struct rcu_state *rsp);
367static void rcu_print_task_stall(struct rcu_node *rnp); 374static void rcu_print_task_stall(struct rcu_node *rnp);
375static void rcu_preempt_stall_reset(void);
368#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 376#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
369static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 377static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
370#ifdef CONFIG_HOTPLUG_CPU 378#ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 0e4f420245d9..71a4147473f9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void)
57 printk(KERN_INFO 57 printk(KERN_INFO
58 "\tRCU-based detection of stalled CPUs is disabled.\n"); 58 "\tRCU-based detection of stalled CPUs is disabled.\n");
59#endif 59#endif
60#ifndef CONFIG_RCU_CPU_STALL_VERBOSE 60#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
62#endif 62#endif
63#if NUM_RCU_LVL_4 != 0 63#if NUM_RCU_LVL_4 != 0
@@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu)
154 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 154 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
155 155
156 /* Possibly blocking in an RCU read-side critical section. */ 156 /* Possibly blocking in an RCU read-side critical section. */
157 rdp = rcu_preempt_state.rda[cpu]; 157 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
158 rnp = rdp->mynode; 158 rnp = rdp->mynode;
159 raw_spin_lock_irqsave(&rnp->lock, flags); 159 raw_spin_lock_irqsave(&rnp->lock, flags);
160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu)
201 */ 201 */
202void __rcu_read_lock(void) 202void __rcu_read_lock(void)
203{ 203{
204 ACCESS_ONCE(current->rcu_read_lock_nesting)++; 204 current->rcu_read_lock_nesting++;
205 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ 205 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
206} 206}
207EXPORT_SYMBOL_GPL(__rcu_read_lock); 207EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -344,7 +344,9 @@ void __rcu_read_unlock(void)
344 struct task_struct *t = current; 344 struct task_struct *t = current;
345 345
346 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ 346 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
347 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 347 --t->rcu_read_lock_nesting;
348 barrier(); /* decrement before load of ->rcu_read_unlock_special */
349 if (t->rcu_read_lock_nesting == 0 &&
348 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 350 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
349 rcu_read_unlock_special(t); 351 rcu_read_unlock_special(t);
350#ifdef CONFIG_PROVE_LOCKING 352#ifdef CONFIG_PROVE_LOCKING
@@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
417 } 419 }
418} 420}
419 421
422/*
423 * Suppress preemptible RCU's CPU stall warnings by pushing the
424 * time of the next stall-warning message comfortably far into the
425 * future.
426 */
427static void rcu_preempt_stall_reset(void)
428{
429 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
430}
431
420#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 432#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
421 433
422/* 434/*
@@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
546 * 558 *
547 * Control will return to the caller some time after a full grace 559 * Control will return to the caller some time after a full grace
548 * period has elapsed, in other words after all currently executing RCU 560 * period has elapsed, in other words after all currently executing RCU
549 * read-side critical sections have completed. RCU read-side critical 561 * read-side critical sections have completed. Note, however, that
550 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 562 * upon return from synchronize_rcu(), the caller might well be executing
551 * and may be nested. 563 * concurrently with new RCU read-side critical sections that began while
564 * synchronize_rcu() was waiting. RCU read-side critical sections are
565 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
552 */ 566 */
553void synchronize_rcu(void) 567void synchronize_rcu(void)
554{ 568{
@@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void)
771 */ 785 */
772static void __init __rcu_init_preempt(void) 786static void __init __rcu_init_preempt(void)
773{ 787{
774 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); 788 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
775} 789}
776 790
777/* 791/*
@@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
865{ 879{
866} 880}
867 881
882/*
883 * Because preemptible RCU does not exist, there is no need to suppress
884 * its CPU stall warnings.
885 */
886static void rcu_preempt_stall_reset(void)
887{
888}
889
868#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 890#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
869 891
870/* 892/*
@@ -919,15 +941,6 @@ static void rcu_preempt_process_callbacks(void)
919} 941}
920 942
921/* 943/*
922 * In classic RCU, call_rcu() is just call_rcu_sched().
923 */
924void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
925{
926 call_rcu_sched(head, func);
927}
928EXPORT_SYMBOL_GPL(call_rcu);
929
930/*
931 * Wait for an rcu-preempt grace period, but make it happen quickly. 944 * Wait for an rcu-preempt grace period, but make it happen quickly.
932 * But because preemptable RCU does not exist, map to rcu-sched. 945 * But because preemptable RCU does not exist, map to rcu-sched.
933 */ 946 */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 36c95b45738e..d15430b9d122 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
64 rdp->dynticks_fqs); 64 rdp->dynticks_fqs);
65#endif /* #ifdef CONFIG_NO_HZ */ 65#endif /* #ifdef CONFIG_NO_HZ */
66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
67 seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); 67 seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit);
68 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
69 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
68} 70}
69 71
70#define PRINT_RCU_DATA(name, func, m) \ 72#define PRINT_RCU_DATA(name, func, m) \
@@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
119 rdp->dynticks_fqs); 121 rdp->dynticks_fqs);
120#endif /* #ifdef CONFIG_NO_HZ */ 122#endif /* #ifdef CONFIG_NO_HZ */
121 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 123 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
122 seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); 124 seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit);
125 seq_printf(m, ",%lu,%lu,%lu\n",
126 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
123} 127}
124 128
125static int show_rcudata_csv(struct seq_file *m, void *unused) 129static int show_rcudata_csv(struct seq_file *m, void *unused)
@@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
128#ifdef CONFIG_NO_HZ 132#ifdef CONFIG_NO_HZ
129 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 133 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
130#endif /* #ifdef CONFIG_NO_HZ */ 134#endif /* #ifdef CONFIG_NO_HZ */
131 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); 135 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
132#ifdef CONFIG_TREE_PREEMPT_RCU 136#ifdef CONFIG_TREE_PREEMPT_RCU
133 seq_puts(m, "\"rcu_preempt:\"\n"); 137 seq_puts(m, "\"rcu_preempt:\"\n");
134 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); 138 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
@@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
262 struct rcu_data *rdp; 266 struct rcu_data *rdp;
263 267
264 for_each_possible_cpu(cpu) { 268 for_each_possible_cpu(cpu) {
265 rdp = rsp->rda[cpu]; 269 rdp = per_cpu_ptr(rsp->rda, cpu);
266 if (rdp->beenonline) 270 if (rdp->beenonline)
267 print_one_rcu_pending(m, rdp); 271 print_one_rcu_pending(m, rdp);
268 } 272 }
diff --git a/kernel/resource.c b/kernel/resource.c
index 7b36976e5dea..9fad33efd0db 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -40,6 +40,23 @@ EXPORT_SYMBOL(iomem_resource);
40 40
41static DEFINE_RWLOCK(resource_lock); 41static DEFINE_RWLOCK(resource_lock);
42 42
43/*
44 * By default, we allocate free space bottom-up. The architecture can request
45 * top-down by clearing this flag. The user can override the architecture's
46 * choice with the "resource_alloc_from_bottom" kernel boot option, but that
47 * should only be a debugging tool.
48 */
49int resource_alloc_from_bottom = 1;
50
51static __init int setup_alloc_from_bottom(char *s)
52{
53 printk(KERN_INFO
54 "resource: allocating from bottom-up; please report a bug\n");
55 resource_alloc_from_bottom = 1;
56 return 0;
57}
58early_param("resource_alloc_from_bottom", setup_alloc_from_bottom);
59
43static void *r_next(struct seq_file *m, void *v, loff_t *pos) 60static void *r_next(struct seq_file *m, void *v, loff_t *pos)
44{ 61{
45 struct resource *p = v; 62 struct resource *p = v;
@@ -357,8 +374,97 @@ int __weak page_is_ram(unsigned long pfn)
357 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; 374 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
358} 375}
359 376
377static resource_size_t simple_align_resource(void *data,
378 const struct resource *avail,
379 resource_size_t size,
380 resource_size_t align)
381{
382 return avail->start;
383}
384
385static void resource_clip(struct resource *res, resource_size_t min,
386 resource_size_t max)
387{
388 if (res->start < min)
389 res->start = min;
390 if (res->end > max)
391 res->end = max;
392}
393
394static bool resource_contains(struct resource *res1, struct resource *res2)
395{
396 return res1->start <= res2->start && res1->end >= res2->end;
397}
398
399/*
400 * Find the resource before "child" in the sibling list of "root" children.
401 */
402static struct resource *find_sibling_prev(struct resource *root, struct resource *child)
403{
404 struct resource *this;
405
406 for (this = root->child; this; this = this->sibling)
407 if (this->sibling == child)
408 return this;
409
410 return NULL;
411}
412
413/*
414 * Find empty slot in the resource tree given range and alignment.
415 * This version allocates from the end of the root resource first.
416 */
417static int find_resource_from_top(struct resource *root, struct resource *new,
418 resource_size_t size, resource_size_t min,
419 resource_size_t max, resource_size_t align,
420 resource_size_t (*alignf)(void *,
421 const struct resource *,
422 resource_size_t,
423 resource_size_t),
424 void *alignf_data)
425{
426 struct resource *this;
427 struct resource tmp, avail, alloc;
428
429 tmp.start = root->end;
430 tmp.end = root->end;
431
432 this = find_sibling_prev(root, NULL);
433 for (;;) {
434 if (this) {
435 if (this->end < root->end)
436 tmp.start = this->end + 1;
437 } else
438 tmp.start = root->start;
439
440 resource_clip(&tmp, min, max);
441
442 /* Check for overflow after ALIGN() */
443 avail = *new;
444 avail.start = ALIGN(tmp.start, align);
445 avail.end = tmp.end;
446 if (avail.start >= tmp.start) {
447 alloc.start = alignf(alignf_data, &avail, size, align);
448 alloc.end = alloc.start + size - 1;
449 if (resource_contains(&avail, &alloc)) {
450 new->start = alloc.start;
451 new->end = alloc.end;
452 return 0;
453 }
454 }
455
456 if (!this || this->start == root->start)
457 break;
458
459 tmp.end = this->start - 1;
460 this = find_sibling_prev(root, this);
461 }
462 return -EBUSY;
463}
464
360/* 465/*
361 * Find empty slot in the resource tree given range and alignment. 466 * Find empty slot in the resource tree given range and alignment.
467 * This version allocates from the beginning of the root resource first.
362 */ 468 */
363static int find_resource(struct resource *root, struct resource *new, 469static int find_resource(struct resource *root, struct resource *new,
364 resource_size_t size, resource_size_t min, 470 resource_size_t size, resource_size_t min,
@@ -370,36 +476,43 @@ static int find_resource(struct resource *root, struct resource *new,
370 void *alignf_data) 476 void *alignf_data)
371{ 477{
372 struct resource *this = root->child; 478 struct resource *this = root->child;
373 struct resource tmp = *new; 479 struct resource tmp = *new, avail, alloc;
374 480
375 tmp.start = root->start; 481 tmp.start = root->start;
376 /* 482 /*
377 * Skip past an allocated resource that starts at 0, since the assignment 483 * Skip past an allocated resource that starts at 0, since the
378 * of this->start - 1 to tmp->end below would cause an underflow. 484 * assignment of this->start - 1 to tmp->end below would cause an
485 * underflow.
379 */ 486 */
380 if (this && this->start == 0) { 487 if (this && this->start == 0) {
381 tmp.start = this->end + 1; 488 tmp.start = this->end + 1;
382 this = this->sibling; 489 this = this->sibling;
383 } 490 }
384 for(;;) { 491 for (;;) {
385 if (this) 492 if (this)
386 tmp.end = this->start - 1; 493 tmp.end = this->start - 1;
387 else 494 else
388 tmp.end = root->end; 495 tmp.end = root->end;
389 if (tmp.start < min) 496
390 tmp.start = min; 497 resource_clip(&tmp, min, max);
391 if (tmp.end > max) 498
392 tmp.end = max; 499 /* Check for overflow after ALIGN() */
393 tmp.start = ALIGN(tmp.start, align); 500 avail = *new;
394 if (alignf) 501 avail.start = ALIGN(tmp.start, align);
395 tmp.start = alignf(alignf_data, &tmp, size, align); 502 avail.end = tmp.end;
396 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { 503 if (avail.start >= tmp.start) {
397 new->start = tmp.start; 504 alloc.start = alignf(alignf_data, &avail, size, align);
398 new->end = tmp.start + size - 1; 505 alloc.end = alloc.start + size - 1;
399 return 0; 506 if (resource_contains(&avail, &alloc)) {
507 new->start = alloc.start;
508 new->end = alloc.end;
509 return 0;
510 }
400 } 511 }
512
401 if (!this) 513 if (!this)
402 break; 514 break;
515
403 tmp.start = this->end + 1; 516 tmp.start = this->end + 1;
404 this = this->sibling; 517 this = this->sibling;
405 } 518 }
@@ -428,8 +541,14 @@ int allocate_resource(struct resource *root, struct resource *new,
428{ 541{
429 int err; 542 int err;
430 543
544 if (!alignf)
545 alignf = simple_align_resource;
546
431 write_lock(&resource_lock); 547 write_lock(&resource_lock);
432 err = find_resource(root, new, size, min, max, align, alignf, alignf_data); 548 if (resource_alloc_from_bottom)
549 err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
550 else
551 err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data);
433 if (err >= 0 && __request_resource(root, new)) 552 if (err >= 0 && __request_resource(root, new))
434 err = -EBUSY; 553 err = -EBUSY;
435 write_unlock(&resource_lock); 554 write_unlock(&resource_lock);
@@ -453,6 +572,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
453 572
454 if (first == parent) 573 if (first == parent)
455 return first; 574 return first;
575 if (WARN_ON(first == new)) /* duplicated insertion */
576 return first;
456 577
457 if ((first->start > new->start) || (first->end < new->end)) 578 if ((first->start > new->start) || (first->end < new->end))
458 break; 579 break;
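To show how the new top-down path is reached, here is a hedged caller sketch; the resource name and range limits are hypothetical. Passing NULL for alignf is fine because allocate_resource() now substitutes simple_align_resource().

#include <linux/ioport.h>

static struct resource my_window = {
	.name  = "my-window",			/* hypothetical */
	.flags = IORESOURCE_MEM,
};

static int my_claim_window(void)
{
	/*
	 * With resource_alloc_from_bottom clear, this searches iomem_resource
	 * top-down via find_resource_from_top(); otherwise it falls back to
	 * the bottom-up find_resource().
	 */
	return allocate_resource(&iomem_resource, &my_window,
				 0x1000,		/* size  */
				 0,			/* min   */
				 (resource_size_t)-1,	/* max   */
				 0x1000,		/* align */
				 NULL, NULL);		/* default alignf */
}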
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index a56f629b057a..66cb89bc5ef1 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -76,7 +76,9 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
76 } 76 }
77 77
78 if (!lockwakeup && td->bkl == 4) { 78 if (!lockwakeup && td->bkl == 4) {
79#ifdef CONFIG_LOCK_KERNEL
79 unlock_kernel(); 80 unlock_kernel();
81#endif
80 td->bkl = 0; 82 td->bkl = 0;
81 } 83 }
82 return 0; 84 return 0;
@@ -133,14 +135,18 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
133 if (td->bkl) 135 if (td->bkl)
134 return 0; 136 return 0;
135 td->bkl = 1; 137 td->bkl = 1;
138#ifdef CONFIG_LOCK_KERNEL
136 lock_kernel(); 139 lock_kernel();
140#endif
137 td->bkl = 4; 141 td->bkl = 4;
138 return 0; 142 return 0;
139 143
140 case RTTEST_UNLOCKBKL: 144 case RTTEST_UNLOCKBKL:
141 if (td->bkl != 4) 145 if (td->bkl != 4)
142 break; 146 break;
147#ifdef CONFIG_LOCK_KERNEL
143 unlock_kernel(); 148 unlock_kernel();
149#endif
144 td->bkl = 0; 150 td->bkl = 0;
145 return 0; 151 return 0;
146 152
diff --git a/kernel/sched.c b/kernel/sched.c
index 41541d79e3c8..aa14a56f9d03 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -426,9 +426,7 @@ struct root_domain {
426 */ 426 */
427 cpumask_var_t rto_mask; 427 cpumask_var_t rto_mask;
428 atomic_t rto_count; 428 atomic_t rto_count;
429#ifdef CONFIG_SMP
430 struct cpupri cpupri; 429 struct cpupri cpupri;
431#endif
432}; 430};
433 431
434/* 432/*
@@ -437,7 +435,7 @@ struct root_domain {
437 */ 435 */
438static struct root_domain def_root_domain; 436static struct root_domain def_root_domain;
439 437
440#endif 438#endif /* CONFIG_SMP */
441 439
442/* 440/*
443 * This is the main, per-CPU runqueue data structure. 441 * This is the main, per-CPU runqueue data structure.
@@ -488,11 +486,12 @@ struct rq {
488 */ 486 */
489 unsigned long nr_uninterruptible; 487 unsigned long nr_uninterruptible;
490 488
491 struct task_struct *curr, *idle; 489 struct task_struct *curr, *idle, *stop;
492 unsigned long next_balance; 490 unsigned long next_balance;
493 struct mm_struct *prev_mm; 491 struct mm_struct *prev_mm;
494 492
495 u64 clock; 493 u64 clock;
494 u64 clock_task;
496 495
497 atomic_t nr_iowait; 496 atomic_t nr_iowait;
498 497
@@ -520,6 +519,10 @@ struct rq {
520 u64 avg_idle; 519 u64 avg_idle;
521#endif 520#endif
522 521
522#ifdef CONFIG_IRQ_TIME_ACCOUNTING
523 u64 prev_irq_time;
524#endif
525
523 /* calc_load related fields */ 526 /* calc_load related fields */
524 unsigned long calc_load_update; 527 unsigned long calc_load_update;
525 long calc_load_active; 528 long calc_load_active;
@@ -643,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
643 646
644#endif /* CONFIG_CGROUP_SCHED */ 647#endif /* CONFIG_CGROUP_SCHED */
645 648
649static u64 irq_time_cpu(int cpu);
650static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
651
646inline void update_rq_clock(struct rq *rq) 652inline void update_rq_clock(struct rq *rq)
647{ 653{
648 if (!rq->skip_clock_update) 654 if (!rq->skip_clock_update) {
649 rq->clock = sched_clock_cpu(cpu_of(rq)); 655 int cpu = cpu_of(rq);
656 u64 irq_time;
657
658 rq->clock = sched_clock_cpu(cpu);
659 irq_time = irq_time_cpu(cpu);
660 if (rq->clock - irq_time > rq->clock_task)
661 rq->clock_task = rq->clock - irq_time;
662
663 sched_irq_time_avg_update(rq, irq_time);
664 }
650} 665}
651 666
652/* 667/*
@@ -723,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
723 size_t cnt, loff_t *ppos) 738 size_t cnt, loff_t *ppos)
724{ 739{
725 char buf[64]; 740 char buf[64];
726 char *cmp = buf; 741 char *cmp;
727 int neg = 0; 742 int neg = 0;
728 int i; 743 int i;
729 744
@@ -734,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
734 return -EFAULT; 749 return -EFAULT;
735 750
736 buf[cnt] = 0; 751 buf[cnt] = 0;
752 cmp = strstrip(buf);
737 753
738 if (strncmp(buf, "NO_", 3) == 0) { 754 if (strncmp(buf, "NO_", 3) == 0) {
739 neg = 1; 755 neg = 1;
@@ -741,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
741 } 757 }
742 758
743 for (i = 0; sched_feat_names[i]; i++) { 759 for (i = 0; sched_feat_names[i]; i++) {
744 int len = strlen(sched_feat_names[i]); 760 if (strcmp(cmp, sched_feat_names[i]) == 0) {
745
746 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
747 if (neg) 761 if (neg)
748 sysctl_sched_features &= ~(1UL << i); 762 sysctl_sched_features &= ~(1UL << i);
749 else 763 else
@@ -1294,6 +1308,10 @@ static void resched_task(struct task_struct *p)
1294static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1308static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1295{ 1309{
1296} 1310}
1311
1312static void sched_avg_update(struct rq *rq)
1313{
1314}
1297#endif /* CONFIG_SMP */ 1315#endif /* CONFIG_SMP */
1298 1316
1299#if BITS_PER_LONG == 32 1317#if BITS_PER_LONG == 32
@@ -1836,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1836 1854
1837static const struct sched_class rt_sched_class; 1855static const struct sched_class rt_sched_class;
1838 1856
1839#define sched_class_highest (&rt_sched_class) 1857#define sched_class_highest (&stop_sched_class)
1840#define for_each_class(class) \ 1858#define for_each_class(class) \
1841 for (class = sched_class_highest; class; class = class->next) 1859 for (class = sched_class_highest; class; class = class->next)
1842 1860
@@ -1854,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
1854 1872
1855static void set_load_weight(struct task_struct *p) 1873static void set_load_weight(struct task_struct *p)
1856{ 1874{
1857 if (task_has_rt_policy(p)) {
1858 p->se.load.weight = 0;
1859 p->se.load.inv_weight = WMULT_CONST;
1860 return;
1861 }
1862
1863 /* 1875 /*
1864 * SCHED_IDLE tasks get minimal weight: 1876 * SCHED_IDLE tasks get minimal weight:
1865 */ 1877 */
@@ -1913,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1913 dec_nr_running(rq); 1925 dec_nr_running(rq);
1914} 1926}
1915 1927
1928#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1929
1930/*
1931 * There are no locks covering percpu hardirq/softirq time.
1932 * They are only modified in account_system_vtime, on the corresponding CPU
1933 * with interrupts disabled, so writes are safe.
1934 * They are read and saved off onto struct rq in update_rq_clock().
1935 * Another CPU may therefore read this CPU's irq time and race with
1936 * irq/account_system_vtime on this CPU. We would then get either the old
1937 * or the new value (or a semi-updated value on 32-bit), with the side
1938 * effect of accounting a slice of irq time to the wrong task when an irq
1939 * is in progress while we read rq->clock. That is a worthy compromise in
1940 * place of having locks on each irq in account_system_time.
1941 */
1942static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1943static DEFINE_PER_CPU(u64, cpu_softirq_time);
1944
1945static DEFINE_PER_CPU(u64, irq_start_time);
1946static int sched_clock_irqtime;
1947
1948void enable_sched_clock_irqtime(void)
1949{
1950 sched_clock_irqtime = 1;
1951}
1952
1953void disable_sched_clock_irqtime(void)
1954{
1955 sched_clock_irqtime = 0;
1956}
1957
1958static u64 irq_time_cpu(int cpu)
1959{
1960 if (!sched_clock_irqtime)
1961 return 0;
1962
1963 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1964}
1965
1966void account_system_vtime(struct task_struct *curr)
1967{
1968 unsigned long flags;
1969 int cpu;
1970 u64 now, delta;
1971
1972 if (!sched_clock_irqtime)
1973 return;
1974
1975 local_irq_save(flags);
1976
1977 cpu = smp_processor_id();
1978 now = sched_clock_cpu(cpu);
1979 delta = now - per_cpu(irq_start_time, cpu);
1980 per_cpu(irq_start_time, cpu) = now;
1981 /*
1982 * We do not account for softirq time from ksoftirqd here.
1983 * We want to continue accounting softirq time to the ksoftirqd thread
1984 * in that case, so as not to confuse the scheduler with a special task
1985 * that does not consume any time but still wants to run.
1986 */
1987 if (hardirq_count())
1988 per_cpu(cpu_hardirq_time, cpu) += delta;
1989 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1990 per_cpu(cpu_softirq_time, cpu) += delta;
1991
1992 local_irq_restore(flags);
1993}
1994EXPORT_SYMBOL_GPL(account_system_vtime);
1995
1996static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
1997{
1998 if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
1999 u64 delta_irq = curr_irq_time - rq->prev_irq_time;
2000 rq->prev_irq_time = curr_irq_time;
2001 sched_rt_avg_update(rq, delta_irq);
2002 }
2003}
2004
2005#else
2006
2007static u64 irq_time_cpu(int cpu)
2008{
2009 return 0;
2010}
2011
2012static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
2013
2014#endif
2015
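A worked example of what the new rq->clock_task bookkeeping buys, assuming CONFIG_IRQ_TIME_ACCOUNTING is enabled and sched_clock_irqtime is set; the numbers are illustrative only.

/*
 * Suppose that between two update_rq_clock() calls on a CPU:
 *
 *	rq->clock      advances by 10,000,000 ns  (10 ms of wall clock)
 *	irq_time_cpu() advances by  2,000,000 ns  ( 2 ms spent in hard/soft irq)
 *
 * Then rq->clock - irq_time has grown by 8,000,000 ns, so rq->clock_task
 * moves forward by only 8 ms. Runtime derived from it, for example
 * ns = rq->clock_task - p->se.exec_start in do_task_delta_exec() later in
 * this patch, therefore no longer charges interrupt time to whichever task
 * happened to be running when the interrupts arrived.
 */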
1916#include "sched_idletask.c" 2016#include "sched_idletask.c"
1917#include "sched_fair.c" 2017#include "sched_fair.c"
1918#include "sched_rt.c" 2018#include "sched_rt.c"
2019#include "sched_stoptask.c"
1919#ifdef CONFIG_SCHED_DEBUG 2020#ifdef CONFIG_SCHED_DEBUG
1920# include "sched_debug.c" 2021# include "sched_debug.c"
1921#endif 2022#endif
1922 2023
2024void sched_set_stop_task(int cpu, struct task_struct *stop)
2025{
2026 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2027 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2028
2029 if (stop) {
2030 /*
2031 * Make it appear like a SCHED_FIFO task; it's something
2032 * userspace knows about and won't get confused about.
2033 *
2034 * Also, it will make PI more or less work without too
2035 * much confusion -- but then, stop work should not
2036 * rely on PI working anyway.
2037 */
2038 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2039
2040 stop->sched_class = &stop_sched_class;
2041 }
2042
2043 cpu_rq(cpu)->stop = stop;
2044
2045 if (old_stop) {
2046 /*
2047 * Reset it back to a normal scheduling class so that
2048 * it can die in pieces.
2049 */
2050 old_stop->sched_class = &rt_sched_class;
2051 }
2052}
2053
1923/* 2054/*
1924 * __normal_prio - return the priority that is based on the static prio 2055 * __normal_prio - return the priority that is based on the static prio
1925 */ 2056 */
@@ -1999,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1999 if (p->sched_class != &fair_sched_class) 2130 if (p->sched_class != &fair_sched_class)
2000 return 0; 2131 return 0;
2001 2132
2133 if (unlikely(p->policy == SCHED_IDLE))
2134 return 0;
2135
2002 /* 2136 /*
2003 * Buddy candidates are cache hot: 2137 * Buddy candidates are cache hot:
2004 */ 2138 */
@@ -2848,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2848 */ 2982 */
2849 arch_start_context_switch(prev); 2983 arch_start_context_switch(prev);
2850 2984
2851 if (likely(!mm)) { 2985 if (!mm) {
2852 next->active_mm = oldmm; 2986 next->active_mm = oldmm;
2853 atomic_inc(&oldmm->mm_count); 2987 atomic_inc(&oldmm->mm_count);
2854 enter_lazy_tlb(oldmm, next); 2988 enter_lazy_tlb(oldmm, next);
2855 } else 2989 } else
2856 switch_mm(oldmm, mm, next); 2990 switch_mm(oldmm, mm, next);
2857 2991
2858 if (likely(!prev->mm)) { 2992 if (!prev->mm) {
2859 prev->active_mm = NULL; 2993 prev->active_mm = NULL;
2860 rq->prev_mm = oldmm; 2994 rq->prev_mm = oldmm;
2861 } 2995 }
@@ -3182,6 +3316,8 @@ static void update_cpu_load(struct rq *this_rq)
3182 3316
3183 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; 3317 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3184 } 3318 }
3319
3320 sched_avg_update(this_rq);
3185} 3321}
3186 3322
3187static void update_cpu_load_active(struct rq *this_rq) 3323static void update_cpu_load_active(struct rq *this_rq)
@@ -3242,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3242 3378
3243 if (task_current(rq, p)) { 3379 if (task_current(rq, p)) {
3244 update_rq_clock(rq); 3380 update_rq_clock(rq);
3245 ns = rq->clock - p->se.exec_start; 3381 ns = rq->clock_task - p->se.exec_start;
3246 if ((s64)ns < 0) 3382 if ((s64)ns < 0)
3247 ns = 0; 3383 ns = 0;
3248 } 3384 }
@@ -3391,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3391 tmp = cputime_to_cputime64(cputime); 3527 tmp = cputime_to_cputime64(cputime);
3392 if (hardirq_count() - hardirq_offset) 3528 if (hardirq_count() - hardirq_offset)
3393 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3529 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3394 else if (softirq_count()) 3530 else if (in_serving_softirq())
3395 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3531 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3396 else 3532 else
3397 cpustat->system = cputime64_add(cpustat->system, tmp); 3533 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3507,9 +3643,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3507 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3643 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3508 3644
3509 if (total) { 3645 if (total) {
3510 u64 temp; 3646 u64 temp = rtime;
3511 3647
3512 temp = (u64)(rtime * utime); 3648 temp *= utime;
3513 do_div(temp, total); 3649 do_div(temp, total);
3514 utime = (cputime_t)temp; 3650 utime = (cputime_t)temp;
3515 } else 3651 } else
@@ -3540,9 +3676,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3540 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3676 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3541 3677
3542 if (total) { 3678 if (total) {
3543 u64 temp; 3679 u64 temp = rtime;
3544 3680
3545 temp = (u64)(rtime * cputime.utime); 3681 temp *= cputime.utime;
3546 do_div(temp, total); 3682 do_div(temp, total);
3547 utime = (cputime_t)temp; 3683 utime = (cputime_t)temp;
3548 } else 3684 } else
@@ -3578,7 +3714,7 @@ void scheduler_tick(void)
3578 curr->sched_class->task_tick(rq, curr, 0); 3714 curr->sched_class->task_tick(rq, curr, 0);
3579 raw_spin_unlock(&rq->lock); 3715 raw_spin_unlock(&rq->lock);
3580 3716
3581 perf_event_task_tick(curr); 3717 perf_event_task_tick();
3582 3718
3583#ifdef CONFIG_SMP 3719#ifdef CONFIG_SMP
3584 rq->idle_at_tick = idle_cpu(cpu); 3720 rq->idle_at_tick = idle_cpu(cpu);
@@ -3717,17 +3853,13 @@ pick_next_task(struct rq *rq)
3717 return p; 3853 return p;
3718 } 3854 }
3719 3855
3720 class = sched_class_highest; 3856 for_each_class(class) {
3721 for ( ; ; ) {
3722 p = class->pick_next_task(rq); 3857 p = class->pick_next_task(rq);
3723 if (p) 3858 if (p)
3724 return p; 3859 return p;
3725 /*
3726 * Will never be NULL as the idle class always
3727 * returns a non-NULL p:
3728 */
3729 class = class->next;
3730 } 3860 }
3861
3862 BUG(); /* the idle class will always have a runnable task */
3731} 3863}
3732 3864
3733/* 3865/*
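The open-coded walk over scheduling classes is replaced with the for_each_class() iterator, and the old "idle class always returns a task" assumption becomes an explicit BUG() if the walk falls through. The iterator is assumed to be the usual linked-list walk from this tree:

	/* assumed definition in kernel/sched.c */
	#define for_each_class(class) \
		for (class = sched_class_highest; class; class = class->next)

With the new stop class added later in this series (kernel/sched_stoptask.c below), sched_class_highest is expected to start the walk at stop_sched_class rather than rt_sched_class.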
@@ -3865,8 +3997,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3865 /* 3997 /*
3866 * Owner changed, break to re-assess state. 3998 * Owner changed, break to re-assess state.
3867 */ 3999 */
3868 if (lock->owner != owner) 4000 if (lock->owner != owner) {
4001 /*
4002 * If the lock has switched to a different owner,
4003 * we likely have heavy contention. Return 0 to quit
4004 * optimistic spinning and not contend further:
4005 */
4006 if (lock->owner)
4007 return 0;
3869 break; 4008 break;
4009 }
3870 4010
3871 /* 4011 /*
3872 * Is that owner really running on that cpu? 4012 * Is that owner really running on that cpu?
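The added check separates two reasons the owner field can change while spinning: the mutex was released (owner became NULL, so keep going and try to take it) or another task won it (owner changed to someone else, heavy contention, so stop wasting cycles). Condensed, the decision now made inside the spin loop is:

	if (lock->owner != owner) {	/* owner field changed under us */
		if (lock->owner)	/* ...to another task: contended, give up spinning */
			return 0;
		break;			/* ...to NULL: released, fall out and acquire */
	}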
@@ -4344,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4344 4484
4345 rq = task_rq_lock(p, &flags); 4485 rq = task_rq_lock(p, &flags);
4346 4486
4487 trace_sched_pi_setprio(p, prio);
4347 oldprio = p->prio; 4488 oldprio = p->prio;
4348 prev_class = p->sched_class; 4489 prev_class = p->sched_class;
4349 on_rq = p->se.on_rq; 4490 on_rq = p->se.on_rq;
@@ -4631,7 +4772,7 @@ recheck:
4631 } 4772 }
4632 4773
4633 if (user) { 4774 if (user) {
4634 retval = security_task_setscheduler(p, policy, param); 4775 retval = security_task_setscheduler(p);
4635 if (retval) 4776 if (retval)
4636 return retval; 4777 return retval;
4637 } 4778 }
@@ -4647,6 +4788,15 @@ recheck:
4647 */ 4788 */
4648 rq = __task_rq_lock(p); 4789 rq = __task_rq_lock(p);
4649 4790
4791 /*
4792	 * Changing the policy of the stop threads is a very bad idea
4793 */
4794 if (p == rq->stop) {
4795 __task_rq_unlock(rq);
4796 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4797 return -EINVAL;
4798 }
4799
4650#ifdef CONFIG_RT_GROUP_SCHED 4800#ifdef CONFIG_RT_GROUP_SCHED
4651 if (user) { 4801 if (user) {
4652 /* 4802 /*
@@ -4873,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4873 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5023 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
4874 goto out_unlock; 5024 goto out_unlock;
4875 5025
4876 retval = security_task_setscheduler(p, 0, NULL); 5026 retval = security_task_setscheduler(p);
4877 if (retval) 5027 if (retval)
4878 goto out_unlock; 5028 goto out_unlock;
4879 5029
4880 cpuset_cpus_allowed(p, cpus_allowed); 5030 cpuset_cpus_allowed(p, cpus_allowed);
4881 cpumask_and(new_mask, in_mask, cpus_allowed); 5031 cpumask_and(new_mask, in_mask, cpus_allowed);
4882 again: 5032again:
4883 retval = set_cpus_allowed_ptr(p, new_mask); 5033 retval = set_cpus_allowed_ptr(p, new_mask);
4884 5034
4885 if (!retval) { 5035 if (!retval) {
@@ -5323,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5323 idle->se.exec_start = sched_clock(); 5473 idle->se.exec_start = sched_clock();
5324 5474
5325 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5475 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5476 /*
5477	 * We're having a chicken and egg problem: even though we are
5478 * holding rq->lock, the cpu isn't yet set to this cpu so the
5479 * lockdep check in task_group() will fail.
5480 *
5481 * Similar case to sched_fork(). / Alternatively we could
5482 * use task_rq_lock() here and obtain the other rq->lock.
5483 *
5484 * Silence PROVE_RCU
5485 */
5486 rcu_read_lock();
5326 __set_task_cpu(idle, cpu); 5487 __set_task_cpu(idle, cpu);
5488 rcu_read_unlock();
5327 5489
5328 rq->curr = rq->idle = idle; 5490 rq->curr = rq->idle = idle;
5329#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5491#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -6500,6 +6662,7 @@ struct s_data {
6500 cpumask_var_t nodemask; 6662 cpumask_var_t nodemask;
6501 cpumask_var_t this_sibling_map; 6663 cpumask_var_t this_sibling_map;
6502 cpumask_var_t this_core_map; 6664 cpumask_var_t this_core_map;
6665 cpumask_var_t this_book_map;
6503 cpumask_var_t send_covered; 6666 cpumask_var_t send_covered;
6504 cpumask_var_t tmpmask; 6667 cpumask_var_t tmpmask;
6505 struct sched_group **sched_group_nodes; 6668 struct sched_group **sched_group_nodes;
@@ -6511,6 +6674,7 @@ enum s_alloc {
6511 sa_rootdomain, 6674 sa_rootdomain,
6512 sa_tmpmask, 6675 sa_tmpmask,
6513 sa_send_covered, 6676 sa_send_covered,
6677 sa_this_book_map,
6514 sa_this_core_map, 6678 sa_this_core_map,
6515 sa_this_sibling_map, 6679 sa_this_sibling_map,
6516 sa_nodemask, 6680 sa_nodemask,
@@ -6546,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
6546#ifdef CONFIG_SCHED_MC 6710#ifdef CONFIG_SCHED_MC
6547static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6711static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6548static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 6712static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6549#endif /* CONFIG_SCHED_MC */
6550 6713
6551#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6552static int 6714static int
6553cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6715cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6554 struct sched_group **sg, struct cpumask *mask) 6716 struct sched_group **sg, struct cpumask *mask)
6555{ 6717{
6556 int group; 6718 int group;
6557 6719#ifdef CONFIG_SCHED_SMT
6558 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6720 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6559 group = cpumask_first(mask); 6721 group = cpumask_first(mask);
6722#else
6723 group = cpu;
6724#endif
6560 if (sg) 6725 if (sg)
6561 *sg = &per_cpu(sched_group_core, group).sg; 6726 *sg = &per_cpu(sched_group_core, group).sg;
6562 return group; 6727 return group;
6563} 6728}
6564#elif defined(CONFIG_SCHED_MC) 6729#endif /* CONFIG_SCHED_MC */
6730
6731/*
6732 * book sched-domains:
6733 */
6734#ifdef CONFIG_SCHED_BOOK
6735static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6736static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6737
6565static int 6738static int
6566cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6739cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6567 struct sched_group **sg, struct cpumask *unused) 6740 struct sched_group **sg, struct cpumask *mask)
6568{ 6741{
6742 int group = cpu;
6743#ifdef CONFIG_SCHED_MC
6744 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6745 group = cpumask_first(mask);
6746#elif defined(CONFIG_SCHED_SMT)
6747 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6748 group = cpumask_first(mask);
6749#endif
6569 if (sg) 6750 if (sg)
6570 *sg = &per_cpu(sched_group_core, cpu).sg; 6751 *sg = &per_cpu(sched_group_book, group).sg;
6571 return cpu; 6752 return group;
6572} 6753}
6573#endif 6754#endif /* CONFIG_SCHED_BOOK */
6574 6755
6575static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6756static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
6576static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6757static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6580,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6580 struct sched_group **sg, struct cpumask *mask) 6761 struct sched_group **sg, struct cpumask *mask)
6581{ 6762{
6582 int group; 6763 int group;
6583#ifdef CONFIG_SCHED_MC 6764#ifdef CONFIG_SCHED_BOOK
6765 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6766 group = cpumask_first(mask);
6767#elif defined(CONFIG_SCHED_MC)
6584 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 6768 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6585 group = cpumask_first(mask); 6769 group = cpumask_first(mask);
6586#elif defined(CONFIG_SCHED_SMT) 6770#elif defined(CONFIG_SCHED_SMT)
@@ -6841,6 +7025,9 @@ SD_INIT_FUNC(CPU)
6841#ifdef CONFIG_SCHED_MC 7025#ifdef CONFIG_SCHED_MC
6842 SD_INIT_FUNC(MC) 7026 SD_INIT_FUNC(MC)
6843#endif 7027#endif
7028#ifdef CONFIG_SCHED_BOOK
7029 SD_INIT_FUNC(BOOK)
7030#endif
6844 7031
6845static int default_relax_domain_level = -1; 7032static int default_relax_domain_level = -1;
6846 7033
@@ -6890,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6890 free_cpumask_var(d->tmpmask); /* fall through */ 7077 free_cpumask_var(d->tmpmask); /* fall through */
6891 case sa_send_covered: 7078 case sa_send_covered:
6892 free_cpumask_var(d->send_covered); /* fall through */ 7079 free_cpumask_var(d->send_covered); /* fall through */
7080 case sa_this_book_map:
7081 free_cpumask_var(d->this_book_map); /* fall through */
6893 case sa_this_core_map: 7082 case sa_this_core_map:
6894 free_cpumask_var(d->this_core_map); /* fall through */ 7083 free_cpumask_var(d->this_core_map); /* fall through */
6895 case sa_this_sibling_map: 7084 case sa_this_sibling_map:
@@ -6936,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6936 return sa_nodemask; 7125 return sa_nodemask;
6937 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 7126 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
6938 return sa_this_sibling_map; 7127 return sa_this_sibling_map;
6939 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 7128 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
6940 return sa_this_core_map; 7129 return sa_this_core_map;
7130 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7131 return sa_this_book_map;
6941 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 7132 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
6942 return sa_send_covered; 7133 return sa_send_covered;
6943 d->rd = alloc_rootdomain(); 7134 d->rd = alloc_rootdomain();
@@ -6995,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
6995 return sd; 7186 return sd;
6996} 7187}
6997 7188
7189static struct sched_domain *__build_book_sched_domain(struct s_data *d,
7190 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7191 struct sched_domain *parent, int i)
7192{
7193 struct sched_domain *sd = parent;
7194#ifdef CONFIG_SCHED_BOOK
7195 sd = &per_cpu(book_domains, i).sd;
7196 SD_INIT(sd, BOOK);
7197 set_domain_attribute(sd, attr);
7198 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7199 sd->parent = parent;
7200 parent->child = sd;
7201 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7202#endif
7203 return sd;
7204}
7205
6998static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7206static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
6999 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7207 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7000 struct sched_domain *parent, int i) 7208 struct sched_domain *parent, int i)
@@ -7052,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
7052 d->send_covered, d->tmpmask); 7260 d->send_covered, d->tmpmask);
7053 break; 7261 break;
7054#endif 7262#endif
7263#ifdef CONFIG_SCHED_BOOK
7264 case SD_LV_BOOK: /* set up book groups */
7265 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7266 if (cpu == cpumask_first(d->this_book_map))
7267 init_sched_build_groups(d->this_book_map, cpu_map,
7268 &cpu_to_book_group,
7269 d->send_covered, d->tmpmask);
7270 break;
7271#endif
7055 case SD_LV_CPU: /* set up physical groups */ 7272 case SD_LV_CPU: /* set up physical groups */
7056 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7273 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7057 if (!cpumask_empty(d->nodemask)) 7274 if (!cpumask_empty(d->nodemask))
@@ -7099,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7099 7316
7100 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7317 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7101 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7318 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7319 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
7102 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 7320 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7103 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 7321 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7104 } 7322 }
7105 7323
7106 for_each_cpu(i, cpu_map) { 7324 for_each_cpu(i, cpu_map) {
7107 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7325 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7326 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7108 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7327 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7109 } 7328 }
7110 7329
@@ -7135,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7135 init_sched_groups_power(i, sd); 7354 init_sched_groups_power(i, sd);
7136 } 7355 }
7137#endif 7356#endif
7357#ifdef CONFIG_SCHED_BOOK
7358 for_each_cpu(i, cpu_map) {
7359 sd = &per_cpu(book_domains, i).sd;
7360 init_sched_groups_power(i, sd);
7361 }
7362#endif
7138 7363
7139 for_each_cpu(i, cpu_map) { 7364 for_each_cpu(i, cpu_map) {
7140 sd = &per_cpu(phys_domains, i).sd; 7365 sd = &per_cpu(phys_domains, i).sd;
@@ -7160,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7160 sd = &per_cpu(cpu_domains, i).sd; 7385 sd = &per_cpu(cpu_domains, i).sd;
7161#elif defined(CONFIG_SCHED_MC) 7386#elif defined(CONFIG_SCHED_MC)
7162 sd = &per_cpu(core_domains, i).sd; 7387 sd = &per_cpu(core_domains, i).sd;
7388#elif defined(CONFIG_SCHED_BOOK)
7389 sd = &per_cpu(book_domains, i).sd;
7163#else 7390#else
7164 sd = &per_cpu(phys_domains, i).sd; 7391 sd = &per_cpu(phys_domains, i).sd;
7165#endif 7392#endif
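Taken together, the SCHED_BOOK hunks insert a new topology level, introduced for s390-style "book" packaging, between the physical (CPU) level and the multi-core (MC) level. Per CPU, the domain chain built by __build_sched_domains() now looks roughly like this, with each level compiled out when its CONFIG_SCHED_* option is disabled:

	/* sketch of the parent -> child domain chain per CPU */
	NUMA (allnodes/node)
	  -> CPU  (phys_domains)
	    -> BOOK (book_domains, CONFIG_SCHED_BOOK)
	      -> MC   (core_domains, CONFIG_SCHED_MC)
	        -> SMT  (cpu_domains,  CONFIG_SCHED_SMT)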
@@ -8064,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8064 8291
8065 return 1; 8292 return 1;
8066 8293
8067 err_free_rq: 8294err_free_rq:
8068 kfree(cfs_rq); 8295 kfree(cfs_rq);
8069 err: 8296err:
8070 return 0; 8297 return 0;
8071} 8298}
8072 8299
@@ -8154,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8154 8381
8155 return 1; 8382 return 1;
8156 8383
8157 err_free_rq: 8384err_free_rq:
8158 kfree(rt_rq); 8385 kfree(rt_rq);
8159 err: 8386err:
8160 return 0; 8387 return 0;
8161} 8388}
8162 8389
@@ -8283,12 +8510,12 @@ void sched_move_task(struct task_struct *tsk)
8283 if (unlikely(running)) 8510 if (unlikely(running))
8284 tsk->sched_class->put_prev_task(rq, tsk); 8511 tsk->sched_class->put_prev_task(rq, tsk);
8285 8512
8286 set_task_rq(tsk, task_cpu(tsk));
8287
8288#ifdef CONFIG_FAIR_GROUP_SCHED 8513#ifdef CONFIG_FAIR_GROUP_SCHED
8289 if (tsk->sched_class->moved_group) 8514 if (tsk->sched_class->task_move_group)
8290 tsk->sched_class->moved_group(tsk, on_rq); 8515 tsk->sched_class->task_move_group(tsk, on_rq);
8516 else
8291#endif 8517#endif
8518 set_task_rq(tsk, task_cpu(tsk));
8292 8519
8293 if (unlikely(running)) 8520 if (unlikely(running))
8294 tsk->sched_class->set_curr_task(rq); 8521 tsk->sched_class->set_curr_task(rq);
@@ -8514,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
8514 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8741 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8515 } 8742 }
8516 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8743 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8517 unlock: 8744unlock:
8518 read_unlock(&tasklist_lock); 8745 read_unlock(&tasklist_lock);
8519 mutex_unlock(&rt_constraints_mutex); 8746 mutex_unlock(&rt_constraints_mutex);
8520 8747
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 806d1b227a21..f4f6a8326dd0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,7 +25,7 @@
25 25
26/* 26/*
27 * Targeted preemption latency for CPU-bound tasks: 27 * Targeted preemption latency for CPU-bound tasks:
28 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) 28 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
29 * 29 *
30 * NOTE: this latency value is not the same as the concept of 30 * NOTE: this latency value is not the same as the concept of
31 * 'timeslice length' - timeslices in CFS are of variable length 31 * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 52
53/* 53/*
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 2000000ULL; 57unsigned int sysctl_sched_min_granularity = 750000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
59 59
60/* 60/*
61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
62 */ 62 */
63static unsigned int sched_nr_latency = 3; 63static unsigned int sched_nr_latency = 8;
64 64
65/* 65/*
66 * After fork, child runs first. If set to 0 (default) then 66 * After fork, child runs first. If set to 0 (default) then
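The three new defaults stay mutually consistent: with one CPU the (1 + ilog(ncpus)) scaling factor is 1, and the static initializers satisfy the relation stated in the comment above:

	/* sched_nr_latency = sysctl_sched_latency / sysctl_sched_min_granularity */
	/*                  =        6,000,000 ns /               750,000 ns = 8  */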
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
519static void update_curr(struct cfs_rq *cfs_rq) 519static void update_curr(struct cfs_rq *cfs_rq)
520{ 520{
521 struct sched_entity *curr = cfs_rq->curr; 521 struct sched_entity *curr = cfs_rq->curr;
522 u64 now = rq_of(cfs_rq)->clock; 522 u64 now = rq_of(cfs_rq)->clock_task;
523 unsigned long delta_exec; 523 unsigned long delta_exec;
524 524
525 if (unlikely(!curr)) 525 if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
602 /* 602 /*
603 * We are starting a new run period: 603 * We are starting a new run period:
604 */ 604 */
605 se->exec_start = rq_of(cfs_rq)->clock; 605 se->exec_start = rq_of(cfs_rq)->clock_task;
606} 606}
607 607
608/************************************************** 608/**************************************************
@@ -1313,7 +1313,7 @@ static struct sched_group *
1313find_idlest_group(struct sched_domain *sd, struct task_struct *p, 1313find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1314 int this_cpu, int load_idx) 1314 int this_cpu, int load_idx)
1315{ 1315{
1316 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; 1316 struct sched_group *idlest = NULL, *group = sd->groups;
1317 unsigned long min_load = ULONG_MAX, this_load = 0; 1317 unsigned long min_load = ULONG_MAX, this_load = 0;
1318 int imbalance = 100 + (sd->imbalance_pct-100)/2; 1318 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1319 1319
@@ -1348,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1348 1348
1349 if (local_group) { 1349 if (local_group) {
1350 this_load = avg_load; 1350 this_load = avg_load;
1351 this = group;
1352 } else if (avg_load < min_load) { 1351 } else if (avg_load < min_load) {
1353 min_load = avg_load; 1352 min_load = avg_load;
1354 idlest = group; 1353 idlest = group;
@@ -1765,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
1765 set_task_cpu(p, this_cpu); 1764 set_task_cpu(p, this_cpu);
1766 activate_task(this_rq, p, 0); 1765 activate_task(this_rq, p, 0);
1767 check_preempt_curr(this_rq, p, 0); 1766 check_preempt_curr(this_rq, p, 0);
1767
1768 /* re-arm NEWIDLE balancing when moving tasks */
1769 src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
1770 this_rq->idle_stamp = 0;
1768} 1771}
1769 1772
1770/* 1773/*
@@ -1799,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1799 * 2) too many balance attempts have failed. 1802 * 2) too many balance attempts have failed.
1800 */ 1803 */
1801 1804
1802 tsk_cache_hot = task_hot(p, rq->clock, sd); 1805 tsk_cache_hot = task_hot(p, rq->clock_task, sd);
1803 if (!tsk_cache_hot || 1806 if (!tsk_cache_hot ||
1804 sd->nr_balance_failed > sd->cache_nice_tries) { 1807 sd->nr_balance_failed > sd->cache_nice_tries) {
1805#ifdef CONFIG_SCHEDSTATS 1808#ifdef CONFIG_SCHEDSTATS
@@ -2031,12 +2034,14 @@ struct sd_lb_stats {
2031 unsigned long this_load; 2034 unsigned long this_load;
2032 unsigned long this_load_per_task; 2035 unsigned long this_load_per_task;
2033 unsigned long this_nr_running; 2036 unsigned long this_nr_running;
2037 unsigned long this_has_capacity;
2034 2038
2035 /* Statistics of the busiest group */ 2039 /* Statistics of the busiest group */
2036 unsigned long max_load; 2040 unsigned long max_load;
2037 unsigned long busiest_load_per_task; 2041 unsigned long busiest_load_per_task;
2038 unsigned long busiest_nr_running; 2042 unsigned long busiest_nr_running;
2039 unsigned long busiest_group_capacity; 2043 unsigned long busiest_group_capacity;
2044 unsigned long busiest_has_capacity;
2040 2045
2041 int group_imb; /* Is there imbalance in this sd */ 2046 int group_imb; /* Is there imbalance in this sd */
2042#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2047#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2059,6 +2064,7 @@ struct sg_lb_stats {
2059 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2064 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2060 unsigned long group_capacity; 2065 unsigned long group_capacity;
2061 int group_imb; /* Is there an imbalance in the group ? */ 2066 int group_imb; /* Is there an imbalance in the group ? */
2067 int group_has_capacity; /* Is there extra capacity in the group? */
2062}; 2068};
2063 2069
2064/** 2070/**
@@ -2268,10 +2274,14 @@ unsigned long scale_rt_power(int cpu)
2268 struct rq *rq = cpu_rq(cpu); 2274 struct rq *rq = cpu_rq(cpu);
2269 u64 total, available; 2275 u64 total, available;
2270 2276
2271 sched_avg_update(rq);
2272
2273 total = sched_avg_period() + (rq->clock - rq->age_stamp); 2277 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2274 available = total - rq->rt_avg; 2278
2279 if (unlikely(total < rq->rt_avg)) {
2280 /* Ensures that power won't end up being negative */
2281 available = 0;
2282 } else {
2283 available = total - rq->rt_avg;
2284 }
2275 2285
2276 if (unlikely((s64)total < SCHED_LOAD_SCALE)) 2286 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2277 total = SCHED_LOAD_SCALE; 2287 total = SCHED_LOAD_SCALE;
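The new branch guards against unsigned underflow: rq->rt_avg can momentarily exceed the accounted period, and since both operands are u64, total - rq->rt_avg would then wrap to an enormous value and inflate the computed power rather than clamping it. A standalone userspace illustration with hypothetical numbers:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t total = 100, rt_avg = 150;

		uint64_t wrapped = total - rt_avg;                        /* ~1.8e19, not -50 */
		uint64_t clamped = (total < rt_avg) ? 0 : total - rt_avg; /* 0, as intended   */

		printf("wrapped=%llu clamped=%llu\n",
		       (unsigned long long)wrapped, (unsigned long long)clamped);
		return 0;
	}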
@@ -2381,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2381 int local_group, const struct cpumask *cpus, 2391 int local_group, const struct cpumask *cpus,
2382 int *balance, struct sg_lb_stats *sgs) 2392 int *balance, struct sg_lb_stats *sgs)
2383{ 2393{
2384 unsigned long load, max_cpu_load, min_cpu_load; 2394 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
2385 int i; 2395 int i;
2386 unsigned int balance_cpu = -1, first_idle_cpu = 0; 2396 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2387 unsigned long avg_load_per_task = 0; 2397 unsigned long avg_load_per_task = 0;
@@ -2392,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2392 /* Tally up the load of all CPUs in the group */ 2402 /* Tally up the load of all CPUs in the group */
2393 max_cpu_load = 0; 2403 max_cpu_load = 0;
2394 min_cpu_load = ~0UL; 2404 min_cpu_load = ~0UL;
2405 max_nr_running = 0;
2395 2406
2396 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 2407 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2397 struct rq *rq = cpu_rq(i); 2408 struct rq *rq = cpu_rq(i);
@@ -2409,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2409 load = target_load(i, load_idx); 2420 load = target_load(i, load_idx);
2410 } else { 2421 } else {
2411 load = source_load(i, load_idx); 2422 load = source_load(i, load_idx);
2412 if (load > max_cpu_load) 2423 if (load > max_cpu_load) {
2413 max_cpu_load = load; 2424 max_cpu_load = load;
2425 max_nr_running = rq->nr_running;
2426 }
2414 if (min_cpu_load > load) 2427 if (min_cpu_load > load)
2415 min_cpu_load = load; 2428 min_cpu_load = load;
2416 } 2429 }
@@ -2450,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2450 if (sgs->sum_nr_running) 2463 if (sgs->sum_nr_running)
2451 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 2464 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2452 2465
2453 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 2466 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
2454 sgs->group_imb = 1; 2467 sgs->group_imb = 1;
2455 2468
2456 sgs->group_capacity = 2469 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2457 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2458 if (!sgs->group_capacity) 2470 if (!sgs->group_capacity)
2459 sgs->group_capacity = fix_small_capacity(sd, group); 2471 sgs->group_capacity = fix_small_capacity(sd, group);
2472
2473 if (sgs->group_capacity > sgs->sum_nr_running)
2474 sgs->group_has_capacity = 1;
2460} 2475}
2461 2476
2462/** 2477/**
@@ -2545,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2545 /* 2560 /*
2546 * In case the child domain prefers tasks go to siblings 2561 * In case the child domain prefers tasks go to siblings
2547 * first, lower the sg capacity to one so that we'll try 2562 * first, lower the sg capacity to one so that we'll try
2548 * and move all the excess tasks away. 2563 * and move all the excess tasks away. We lower the capacity
2564 * of a group only if the local group has the capacity to fit
2565 * these excess tasks, i.e. nr_running < group_capacity. The
2566 * extra check prevents the case where you always pull from the
2567 * heaviest group when it is already under-utilized (possible
2568	 * when a large weight task outweighs the tasks on the system).
2549 */ 2569 */
2550 if (prefer_sibling) 2570 if (prefer_sibling && !local_group && sds->this_has_capacity)
2551 sgs.group_capacity = min(sgs.group_capacity, 1UL); 2571 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2552 2572
2553 if (local_group) { 2573 if (local_group) {
@@ -2555,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2555 sds->this = sg; 2575 sds->this = sg;
2556 sds->this_nr_running = sgs.sum_nr_running; 2576 sds->this_nr_running = sgs.sum_nr_running;
2557 sds->this_load_per_task = sgs.sum_weighted_load; 2577 sds->this_load_per_task = sgs.sum_weighted_load;
2578 sds->this_has_capacity = sgs.group_has_capacity;
2558 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 2579 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2559 sds->max_load = sgs.avg_load; 2580 sds->max_load = sgs.avg_load;
2560 sds->busiest = sg; 2581 sds->busiest = sg;
2561 sds->busiest_nr_running = sgs.sum_nr_running; 2582 sds->busiest_nr_running = sgs.sum_nr_running;
2562 sds->busiest_group_capacity = sgs.group_capacity; 2583 sds->busiest_group_capacity = sgs.group_capacity;
2563 sds->busiest_load_per_task = sgs.sum_weighted_load; 2584 sds->busiest_load_per_task = sgs.sum_weighted_load;
2585 sds->busiest_has_capacity = sgs.group_has_capacity;
2564 sds->group_imb = sgs.group_imb; 2586 sds->group_imb = sgs.group_imb;
2565 } 2587 }
2566 2588
@@ -2757,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2757 return fix_small_imbalance(sds, this_cpu, imbalance); 2779 return fix_small_imbalance(sds, this_cpu, imbalance);
2758 2780
2759} 2781}
2782
2760/******* find_busiest_group() helpers end here *********************/ 2783/******* find_busiest_group() helpers end here *********************/
2761 2784
2762/** 2785/**
@@ -2808,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2808 * 4) This group is more busy than the avg busieness at this 2831 * 4) This group is more busy than the avg busieness at this
2809 * sched_domain. 2832 * sched_domain.
2810 * 5) The imbalance is within the specified limit. 2833 * 5) The imbalance is within the specified limit.
2834 *
2835 * Note: when doing newidle balance, if the local group has excess
2836 * capacity (i.e. nr_running < group_capacity) and the busiest group
2837 * does not have any capacity, we force a load balance to pull tasks
2838 * to the local group. In this case, we skip past checks 3, 4 and 5.
2811 */ 2839 */
2812 if (!(*balance)) 2840 if (!(*balance))
2813 goto ret; 2841 goto ret;
@@ -2819,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2819 if (!sds.busiest || sds.busiest_nr_running == 0) 2847 if (!sds.busiest || sds.busiest_nr_running == 0)
2820 goto out_balanced; 2848 goto out_balanced;
2821 2849
2850 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
2851 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
2852 !sds.busiest_has_capacity)
2853 goto force_balance;
2854
2822 if (sds.this_load >= sds.max_load) 2855 if (sds.this_load >= sds.max_load)
2823 goto out_balanced; 2856 goto out_balanced;
2824 2857
@@ -2830,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2830 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 2863 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2831 goto out_balanced; 2864 goto out_balanced;
2832 2865
2866force_balance:
2833 /* Looks like there is an imbalance. Compute it */ 2867 /* Looks like there is an imbalance. Compute it */
2834 calculate_imbalance(&sds, this_cpu, imbalance); 2868 calculate_imbalance(&sds, this_cpu, imbalance);
2835 return sds.busiest; 2869 return sds.busiest;
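The effect is that a newly idle CPU whose local group still has spare capacity will always pull from a busiest group that has none, even when the regular comparisons (checks 3 to 5 in the comment above) would have declared the domain balanced; only that one condition jumps straight to force_balance:

	/* sketch of the short-circuit added above */
	if (idle == CPU_NEWLY_IDLE &&
	    sds.this_has_capacity && !sds.busiest_has_capacity)
		goto force_balance;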
@@ -3034,7 +3068,14 @@ redo:
3034 3068
3035 if (!ld_moved) { 3069 if (!ld_moved) {
3036 schedstat_inc(sd, lb_failed[idle]); 3070 schedstat_inc(sd, lb_failed[idle]);
3037 sd->nr_balance_failed++; 3071 /*
3072 * Increment the failure counter only on periodic balance.
3073 * We do not want newidle balance, which can be very
3074 * frequent, pollute the failure counter causing
3075 * excessive cache_hot migrations and active balances.
3076 */
3077 if (idle != CPU_NEWLY_IDLE)
3078 sd->nr_balance_failed++;
3038 3079
3039 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), 3080 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
3040 this_cpu)) { 3081 this_cpu)) {
@@ -3156,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3156 interval = msecs_to_jiffies(sd->balance_interval); 3197 interval = msecs_to_jiffies(sd->balance_interval);
3157 if (time_after(next_balance, sd->last_balance + interval)) 3198 if (time_after(next_balance, sd->last_balance + interval))
3158 next_balance = sd->last_balance + interval; 3199 next_balance = sd->last_balance + interval;
3159 if (pulled_task) { 3200 if (pulled_task)
3160 this_rq->idle_stamp = 0;
3161 break; 3201 break;
3162 }
3163 } 3202 }
3164 3203
3165 raw_spin_lock(&this_rq->lock); 3204 raw_spin_lock(&this_rq->lock);
@@ -3633,7 +3672,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
3633 if (time_before(now, nohz.next_balance)) 3672 if (time_before(now, nohz.next_balance))
3634 return 0; 3673 return 0;
3635 3674
3636 if (!rq->nr_running) 3675 if (rq->idle_at_tick)
3637 return 0; 3676 return 0;
3638 3677
3639 first_pick_cpu = atomic_read(&nohz.first_pick_cpu); 3678 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
@@ -3752,8 +3791,13 @@ static void task_fork_fair(struct task_struct *p)
3752 3791
3753 raw_spin_lock_irqsave(&rq->lock, flags); 3792 raw_spin_lock_irqsave(&rq->lock, flags);
3754 3793
3755 if (unlikely(task_cpu(p) != this_cpu)) 3794 update_rq_clock(rq);
3795
3796 if (unlikely(task_cpu(p) != this_cpu)) {
3797 rcu_read_lock();
3756 __set_task_cpu(p, this_cpu); 3798 __set_task_cpu(p, this_cpu);
3799 rcu_read_unlock();
3800 }
3757 3801
3758 update_curr(cfs_rq); 3802 update_curr(cfs_rq);
3759 3803
@@ -3825,13 +3869,26 @@ static void set_curr_task_fair(struct rq *rq)
3825} 3869}
3826 3870
3827#ifdef CONFIG_FAIR_GROUP_SCHED 3871#ifdef CONFIG_FAIR_GROUP_SCHED
3828static void moved_group_fair(struct task_struct *p, int on_rq) 3872static void task_move_group_fair(struct task_struct *p, int on_rq)
3829{ 3873{
3830 struct cfs_rq *cfs_rq = task_cfs_rq(p); 3874 /*
3831 3875 * If the task was not on the rq at the time of this cgroup movement
3832 update_curr(cfs_rq); 3876 * it must have been asleep, sleeping tasks keep their ->vruntime
3877 * absolute on their old rq until wakeup (needed for the fair sleeper
3878 * bonus in place_entity()).
3879 *
3880 * If it was on the rq, we've just 'preempted' it, which does convert
3881 * ->vruntime to a relative base.
3882 *
3883 * Make sure both cases convert their relative position when migrating
3884 * to another cgroup's rq. This does somewhat interfere with the
3885 * fair sleeper stuff for the first placement, but who cares.
3886 */
3887 if (!on_rq)
3888 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
3889 set_task_rq(p, task_cpu(p));
3833 if (!on_rq) 3890 if (!on_rq)
3834 place_entity(cfs_rq, &p->se, 1); 3891 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
3835} 3892}
3836#endif 3893#endif
3837 3894
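A worked example of the normalization performed for a sleeping task, with illustrative numbers only: the task went to sleep with an absolute vruntime of 10,040,000 on a cfs_rq whose min_vruntime is 10,000,000, and the destination cgroup's cfs_rq on the same CPU has min_vruntime 2,000,000.

	u64 vruntime = 10040000;	/* absolute on the old queue                      */
	vruntime    -= 10000000;	/* minus old min_vruntime: relative, 40000        */
	/* set_task_rq() switches the task to the new cgroup's cfs_rq here             */
	vruntime    +=  2000000;	/* plus new min_vruntime: absolute again, 2040000 */

The 40,000 ns of lag relative to the old queue is preserved relative to the new one.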
@@ -3883,7 +3940,7 @@ static const struct sched_class fair_sched_class = {
3883 .get_rr_interval = get_rr_interval_fair, 3940 .get_rr_interval = get_rr_interval_fair,
3884 3941
3885#ifdef CONFIG_FAIR_GROUP_SCHED 3942#ifdef CONFIG_FAIR_GROUP_SCHED
3886 .moved_group = moved_group_fair, 3943 .task_move_group = task_move_group_fair,
3887#endif 3944#endif
3888}; 3945};
3889 3946
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 83c66e8ad3ee..185f920ec1a2 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
61 * release the lock. Decreases scheduling overhead. 61 * release the lock. Decreases scheduling overhead.
62 */ 62 */
63SCHED_FEAT(OWNER_SPIN, 1) 63SCHED_FEAT(OWNER_SPIN, 1)
64
65/*
66 * Decrement CPU power based on irq activity
67 */
68SCHED_FEAT(NONIRQ_POWER, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d10c80ebb67a..bea7d79f7e9c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq)
609 if (!task_has_rt_policy(curr)) 609 if (!task_has_rt_policy(curr))
610 return; 610 return;
611 611
612 delta_exec = rq->clock - curr->se.exec_start; 612 delta_exec = rq->clock_task - curr->se.exec_start;
613 if (unlikely((s64)delta_exec < 0)) 613 if (unlikely((s64)delta_exec < 0))
614 delta_exec = 0; 614 delta_exec = 0;
615 615
@@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq)
618 curr->se.sum_exec_runtime += delta_exec; 618 curr->se.sum_exec_runtime += delta_exec;
619 account_group_exec_runtime(curr, delta_exec); 619 account_group_exec_runtime(curr, delta_exec);
620 620
621 curr->se.exec_start = rq->clock; 621 curr->se.exec_start = rq->clock_task;
622 cpuacct_charge(curr, delta_exec); 622 cpuacct_charge(curr, delta_exec);
623 623
624 sched_rt_avg_update(rq, delta_exec); 624 sched_rt_avg_update(rq, delta_exec);
@@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
960 * runqueue. Otherwise simply start this RT task 960 * runqueue. Otherwise simply start this RT task
961 * on its current runqueue. 961 * on its current runqueue.
962 * 962 *
963 * We want to avoid overloading runqueues. Even if 963 * We want to avoid overloading runqueues. If the woken
964 * the RT task is of higher priority than the current RT task. 964 * task is a higher priority, then it will stay on this CPU
965 * RT tasks behave differently than other tasks. If 965 * and the lower prio task should be moved to another CPU.
966 * one gets preempted, we try to push it off to another queue. 966 * Even though this will probably make the lower prio task
967 * So trying to keep a preempting RT task on the same 967 * lose its cache, we do not want to bounce a higher task
968 * cache hot CPU will force the running RT task to 968 * around just because it gave up its CPU, perhaps for a
969 * a cold CPU. So we waste all the cache for the lower 969 * lock?
970 * RT task in hopes of saving some of a RT task 970 *
971 * that is just being woken and probably will have 971 * For equal prio tasks, we just let the scheduler sort it out.
972 * cold cache anyway.
973 */ 972 */
974 if (unlikely(rt_task(rq->curr)) && 973 if (unlikely(rt_task(rq->curr)) &&
974 (rq->curr->rt.nr_cpus_allowed < 2 ||
975 rq->curr->prio < p->prio) &&
975 (p->rt.nr_cpus_allowed > 1)) { 976 (p->rt.nr_cpus_allowed > 1)) {
976 int cpu = find_lowest_rq(p); 977 int cpu = find_lowest_rq(p);
977 978
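Spelled out, the new placement rule at RT wakeup is: send the woken task p to another runqueue only when the task already running here is itself RT and should not be disturbed, either because it cannot migrate or because it outranks p, and p itself is allowed to run elsewhere. A sketch with the priority semantics made explicit (a lower prio value means higher priority):

	if (rt_task(rq->curr) &&
	    (rq->curr->rt.nr_cpus_allowed < 2 ||	/* current can't be pushed away...  */
	     rq->curr->prio < p->prio) &&		/* ...or outranks the woken task    */
	    p->rt.nr_cpus_allowed > 1)			/* and p may run on other CPUs      */
		cpu = find_lowest_rq(p);		/* so place p on the best other rq  */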
@@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1074 } while (rt_rq); 1075 } while (rt_rq);
1075 1076
1076 p = rt_task_of(rt_se); 1077 p = rt_task_of(rt_se);
1077 p->se.exec_start = rq->clock; 1078 p->se.exec_start = rq->clock_task;
1078 1079
1079 return p; 1080 return p;
1080} 1081}
@@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1139 for_each_leaf_rt_rq(rt_rq, rq) { 1140 for_each_leaf_rt_rq(rt_rq, rq) {
1140 array = &rt_rq->active; 1141 array = &rt_rq->active;
1141 idx = sched_find_first_bit(array->bitmap); 1142 idx = sched_find_first_bit(array->bitmap);
1142 next_idx: 1143next_idx:
1143 if (idx >= MAX_RT_PRIO) 1144 if (idx >= MAX_RT_PRIO)
1144 continue; 1145 continue;
1145 if (next && next->prio < idx) 1146 if (next && next->prio < idx)
@@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq)
1315 if (!next_task) 1316 if (!next_task)
1316 return 0; 1317 return 0;
1317 1318
1318 retry: 1319retry:
1319 if (unlikely(next_task == rq->curr)) { 1320 if (unlikely(next_task == rq->curr)) {
1320 WARN_ON(1); 1321 WARN_ON(1);
1321 return 0; 1322 return 0;
@@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq)
1463 * but possible) 1464 * but possible)
1464 */ 1465 */
1465 } 1466 }
1466 skip: 1467skip:
1467 double_unlock_balance(this_rq, src_rq); 1468 double_unlock_balance(this_rq, src_rq);
1468 } 1469 }
1469 1470
@@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1491 if (!task_running(rq, p) && 1492 if (!task_running(rq, p) &&
1492 !test_tsk_need_resched(rq->curr) && 1493 !test_tsk_need_resched(rq->curr) &&
1493 has_pushable_tasks(rq) && 1494 has_pushable_tasks(rq) &&
1494 p->rt.nr_cpus_allowed > 1) 1495 p->rt.nr_cpus_allowed > 1 &&
1496 rt_task(rq->curr) &&
1497 (rq->curr->rt.nr_cpus_allowed < 2 ||
1498 rq->curr->prio < p->prio))
1495 push_rt_tasks(rq); 1499 push_rt_tasks(rq);
1496} 1500}
1497 1501
@@ -1709,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
1709{ 1713{
1710 struct task_struct *p = rq->curr; 1714 struct task_struct *p = rq->curr;
1711 1715
1712 p->se.exec_start = rq->clock; 1716 p->se.exec_start = rq->clock_task;
1713 1717
1714 /* The running task is never eligible for pushing */ 1718 /* The running task is never eligible for pushing */
1715 dequeue_pushable_task(rq, p); 1719 dequeue_pushable_task(rq, p);
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 25c2f962f6fc..48ddf431db0e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -157,15 +157,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
157} 157}
158 158
159/* 159/*
160 * Called when a process is dequeued from the active array and given 160 * We are interested in knowing how long it was from the *first* time a
161 * the cpu. We should note that with the exception of interactive
162 * tasks, the expired queue will become the active queue after the active
163 * queue is empty, without explicitly dequeuing and requeuing tasks in the
164 * expired queue. (Interactive tasks may be requeued directly to the
165 * active queue, thus delaying tasks in the expired queue from running;
166 * see scheduler_tick()).
167 *
168 * Though we are interested in knowing how long it was from the *first* time a
169 * task was queued to the time that it finally hit a cpu, we call this routine 161 * task was queued to the time that it finally hit a cpu, we call this routine
170 * from dequeue_task() to account for possible rq->clock skew across cpus. The 162 * from dequeue_task() to account for possible rq->clock skew across cpus. The
171 * delta taken on each cpu would annul the skew. 163 * delta taken on each cpu would annul the skew.
@@ -203,16 +195,6 @@ static void sched_info_arrive(struct task_struct *t)
203} 195}
204 196
205/* 197/*
206 * Called when a process is queued into either the active or expired
207 * array. The time is noted and later used to determine how long we
208 * had to wait for us to reach the cpu. Since the expired queue will
209 * become the active queue after active queue is empty, without dequeuing
210 * and requeuing any tasks, we are interested in queuing to either. It
211 * is unusual but not impossible for tasks to be dequeued and immediately
212 * requeued in the same or another array: this can happen in sched_yield(),
213 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
214 * to runqueue.
215 *
216 * This function is only called from enqueue_task(), but also only updates 198 * This function is only called from enqueue_task(), but also only updates
217 * the timestamp if it is already not set. It's assumed that 199 * the timestamp if it is already not set. It's assumed that
218 * sched_info_dequeued() will clear that stamp when appropriate. 200 * sched_info_dequeued() will clear that stamp when appropriate.
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
new file mode 100644
index 000000000000..45bddc0c1048
--- /dev/null
+++ b/kernel/sched_stoptask.c
@@ -0,0 +1,108 @@
1/*
2 * stop-task scheduling class.
3 *
4 * The stop task is the highest priority task in the system, it preempts
5 * everything and will be preempted by nothing.
6 *
7 * See kernel/stop_machine.c
8 */
9
10#ifdef CONFIG_SMP
11static int
12select_task_rq_stop(struct rq *rq, struct task_struct *p,
13 int sd_flag, int flags)
14{
15	return task_cpu(p); /* stop tasks never migrate */
16}
17#endif /* CONFIG_SMP */
18
19static void
20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
21{
22 resched_task(rq->curr); /* we preempt everything */
23}
24
25static struct task_struct *pick_next_task_stop(struct rq *rq)
26{
27 struct task_struct *stop = rq->stop;
28
29 if (stop && stop->state == TASK_RUNNING)
30 return stop;
31
32 return NULL;
33}
34
35static void
36enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
37{
38}
39
40static void
41dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
42{
43}
44
45static void yield_task_stop(struct rq *rq)
46{
47	BUG(); /* the stop task should never yield, it's pointless. */
48}
49
50static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
51{
52}
53
54static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
55{
56}
57
58static void set_curr_task_stop(struct rq *rq)
59{
60}
61
62static void switched_to_stop(struct rq *rq, struct task_struct *p,
63 int running)
64{
65	BUG(); /* it's impossible to change to this class */
66}
67
68static void prio_changed_stop(struct rq *rq, struct task_struct *p,
69 int oldprio, int running)
70{
71 BUG(); /* how!?, what priority? */
72}
73
74static unsigned int
75get_rr_interval_stop(struct rq *rq, struct task_struct *task)
76{
77 return 0;
78}
79
80/*
81 * Simple, special scheduling class for the per-CPU stop tasks:
82 */
83static const struct sched_class stop_sched_class = {
84 .next = &rt_sched_class,
85
86 .enqueue_task = enqueue_task_stop,
87 .dequeue_task = dequeue_task_stop,
88 .yield_task = yield_task_stop,
89
90 .check_preempt_curr = check_preempt_curr_stop,
91
92 .pick_next_task = pick_next_task_stop,
93 .put_prev_task = put_prev_task_stop,
94
95#ifdef CONFIG_SMP
96 .select_task_rq = select_task_rq_stop,
97#endif
98
99 .set_curr_task = set_curr_task_stop,
100 .task_tick = task_tick_stop,
101
102 .get_rr_interval = get_rr_interval_stop,
103
104 .prio_changed = prio_changed_stop,
105 .switched_to = switched_to_stop,
106
107 /* no .task_new for stop tasks */
108};
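With this file in place the class list gains a new head, so pick_next_task() (rewritten with for_each_class() earlier in this patch) always considers the per-CPU stop task before anything else. Assuming sched_class_highest in kernel/sched.c is redefined to point at the new class, the precedence chain through the .next pointers becomes:

	/* highest precedence first (sketch) */
	stop_sched_class -> rt_sched_class -> fair_sched_class -> idle_sched_class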
diff --git a/kernel/signal.c b/kernel/signal.c
index bded65187780..4e3cff10fdce 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1105,7 +1105,8 @@ int zap_other_threads(struct task_struct *p)
1105 return count; 1105 return count;
1106} 1106}
1107 1107
1108struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1108struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1109 unsigned long *flags)
1109{ 1110{
1110 struct sighand_struct *sighand; 1111 struct sighand_struct *sighand;
1111 1112
@@ -1617,6 +1618,8 @@ static int sigkill_pending(struct task_struct *tsk)
1617 * is gone, we keep current->exit_code unless clear_code. 1618 * is gone, we keep current->exit_code unless clear_code.
1618 */ 1619 */
1619static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) 1620static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1621 __releases(&current->sighand->siglock)
1622 __acquires(&current->sighand->siglock)
1620{ 1623{
1621 if (arch_ptrace_stop_needed(exit_code, info)) { 1624 if (arch_ptrace_stop_needed(exit_code, info)) {
1622 /* 1625 /*
@@ -2215,6 +2218,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2215#ifdef __ARCH_SI_TRAPNO 2218#ifdef __ARCH_SI_TRAPNO
2216 err |= __put_user(from->si_trapno, &to->si_trapno); 2219 err |= __put_user(from->si_trapno, &to->si_trapno);
2217#endif 2220#endif
2221#ifdef BUS_MCEERR_AO
2222 /*
2223 * Other callers might not initialize the si_lsb field,
2224	 * so check explicitly for the right codes here.
2225 */
2226 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
2227 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
2228#endif
2218 break; 2229 break;
2219 case __SI_CHLD: 2230 case __SI_CHLD:
2220 err |= __put_user(from->si_pid, &to->si_pid); 2231 err |= __put_user(from->si_pid, &to->si_pid);
diff --git a/kernel/smp.c b/kernel/smp.c
index 75c970c715d3..12ed8b013e2d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -267,7 +267,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
267 * 267 *
268 * Returns 0 on success, else a negative status code. 268 * Returns 0 on success, else a negative status code.
269 */ 269 */
270int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 270int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
271 int wait) 271 int wait)
272{ 272{
273 struct call_single_data d = { 273 struct call_single_data d = {
@@ -336,7 +336,7 @@ EXPORT_SYMBOL(smp_call_function_single);
336 * 3) any other online cpu in @mask 336 * 3) any other online cpu in @mask
337 */ 337 */
338int smp_call_function_any(const struct cpumask *mask, 338int smp_call_function_any(const struct cpumask *mask,
339 void (*func)(void *info), void *info, int wait) 339 smp_call_func_t func, void *info, int wait)
340{ 340{
341 unsigned int cpu; 341 unsigned int cpu;
342 const struct cpumask *nodemask; 342 const struct cpumask *nodemask;
@@ -365,9 +365,10 @@ call:
365EXPORT_SYMBOL_GPL(smp_call_function_any); 365EXPORT_SYMBOL_GPL(smp_call_function_any);
366 366
367/** 367/**
368 * __smp_call_function_single(): Run a function on another CPU 368 * __smp_call_function_single(): Run a function on a specific CPU
369 * @cpu: The CPU to run on. 369 * @cpu: The CPU to run on.
370 * @data: Pre-allocated and setup data structure 370 * @data: Pre-allocated and setup data structure
371 * @wait: If true, wait until function has completed on specified CPU.
371 * 372 *
372 * Like smp_call_function_single(), but allow caller to pass in a 373 * Like smp_call_function_single(), but allow caller to pass in a
373 * pre-allocated data structure. Useful for embedding @data inside 374 * pre-allocated data structure. Useful for embedding @data inside
@@ -376,8 +377,10 @@ EXPORT_SYMBOL_GPL(smp_call_function_any);
376void __smp_call_function_single(int cpu, struct call_single_data *data, 377void __smp_call_function_single(int cpu, struct call_single_data *data,
377 int wait) 378 int wait)
378{ 379{
379 csd_lock(data); 380 unsigned int this_cpu;
381 unsigned long flags;
380 382
383 this_cpu = get_cpu();
381 /* 384 /*
382 * Can deadlock when called with interrupts disabled. 385 * Can deadlock when called with interrupts disabled.
383 * We allow cpu's that are not yet online though, as no one else can 386 * We allow cpu's that are not yet online though, as no one else can
@@ -387,7 +390,15 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
387 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() 390 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
388 && !oops_in_progress); 391 && !oops_in_progress);
389 392
390 generic_exec_single(cpu, data, wait); 393 if (cpu == this_cpu) {
394 local_irq_save(flags);
395 data->func(data->info);
396 local_irq_restore(flags);
397 } else {
398 csd_lock(data);
399 generic_exec_single(cpu, data, wait);
400 }
401 put_cpu();
391} 402}
392 403
393/** 404/**
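After this hunk __smp_call_function_single() handles being asked to run on the calling CPU by invoking the function inline with interrupts disabled, instead of csd-locking and queueing to itself. A hedged usage sketch; the callback, data and target_cpu are made up for illustration, and only the fields this path actually uses are shown:

	static void my_func(void *info)			/* hypothetical callback */
	{
		pr_info("ran on cpu %d\n", smp_processor_id());
	}

	static struct call_single_data my_csd = {
		.func = my_func,
		.info = NULL,
	};

	/* now also fine when target_cpu == smp_processor_id() */
	__smp_call_function_single(target_cpu, &my_csd, 0);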
@@ -405,7 +416,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
405 * must be disabled when calling this function. 416 * must be disabled when calling this function.
406 */ 417 */
407void smp_call_function_many(const struct cpumask *mask, 418void smp_call_function_many(const struct cpumask *mask,
408 void (*func)(void *), void *info, bool wait) 419 smp_call_func_t func, void *info, bool wait)
409{ 420{
410 struct call_function_data *data; 421 struct call_function_data *data;
411 unsigned long flags; 422 unsigned long flags;
@@ -489,7 +500,7 @@ EXPORT_SYMBOL(smp_call_function_many);
489 * You must not call this function with disabled interrupts or from a 500 * You must not call this function with disabled interrupts or from a
490 * hardware interrupt handler or from a bottom half handler. 501 * hardware interrupt handler or from a bottom half handler.
491 */ 502 */
492int smp_call_function(void (*func)(void *), void *info, int wait) 503int smp_call_function(smp_call_func_t func, void *info, int wait)
493{ 504{
494 preempt_disable(); 505 preempt_disable();
495 smp_call_function_many(cpu_online_mask, func, info, wait); 506 smp_call_function_many(cpu_online_mask, func, info, wait);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 07b4f1b1a73a..18f4be0d5fe0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -67,7 +67,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {
67 * to the pending events, so lets the scheduler to balance 67 * to the pending events, so lets the scheduler to balance
68 * the softirq load for us. 68 * the softirq load for us.
69 */ 69 */
70void wakeup_softirqd(void) 70static void wakeup_softirqd(void)
71{ 71{
72 /* Interrupts are disabled: no need to stop preemption */ 72 /* Interrupts are disabled: no need to stop preemption */
73 struct task_struct *tsk = __get_cpu_var(ksoftirqd); 73 struct task_struct *tsk = __get_cpu_var(ksoftirqd);
@@ -77,11 +77,21 @@ void wakeup_softirqd(void)
77} 77}
78 78
79/* 79/*
80 * preempt_count and SOFTIRQ_OFFSET usage:
81 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
82 * softirq processing.
83 * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
84 * on local_bh_disable or local_bh_enable.
85 * This lets us distinguish between whether we are currently processing
86 * softirq and whether we just have bh disabled.
87 */
88
89/*
80 * This one is for softirq.c-internal use, 90 * This one is for softirq.c-internal use,
81 * where hardirqs are disabled legitimately: 91 * where hardirqs are disabled legitimately:
82 */ 92 */
83#ifdef CONFIG_TRACE_IRQFLAGS 93#ifdef CONFIG_TRACE_IRQFLAGS
84static void __local_bh_disable(unsigned long ip) 94static void __local_bh_disable(unsigned long ip, unsigned int cnt)
85{ 95{
86 unsigned long flags; 96 unsigned long flags;
87 97
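The comment block above is what makes the in_serving_softirq() switch in kernel/sched.c (account_system_time) work: local_bh_disable() now adds 2*SOFTIRQ_OFFSET while actual softirq execution adds SOFTIRQ_OFFSET, so the low bit of the softirq part of preempt_count is set only while a softirq handler is really running. The helpers are assumed to follow from that scheme roughly as in include/linux/hardirq.h:

	#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
	#define in_softirq()           (softirq_count())                  /* bh disabled OR serving */
	#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET) /* serving a softirq only */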
@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip)
95 * We must manually increment preempt_count here and manually 105 * We must manually increment preempt_count here and manually
96 * call the trace_preempt_off later. 106 * call the trace_preempt_off later.
97 */ 107 */
98 preempt_count() += SOFTIRQ_OFFSET; 108 preempt_count() += cnt;
99 /* 109 /*
100 * Were softirqs turned off above: 110 * Were softirqs turned off above:
101 */ 111 */
102 if (softirq_count() == SOFTIRQ_OFFSET) 112 if (softirq_count() == cnt)
103 trace_softirqs_off(ip); 113 trace_softirqs_off(ip);
104 raw_local_irq_restore(flags); 114 raw_local_irq_restore(flags);
105 115
106 if (preempt_count() == SOFTIRQ_OFFSET) 116 if (preempt_count() == cnt)
107 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
108} 118}
109#else /* !CONFIG_TRACE_IRQFLAGS */ 119#else /* !CONFIG_TRACE_IRQFLAGS */
110static inline void __local_bh_disable(unsigned long ip) 120static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
111{ 121{
112 add_preempt_count(SOFTIRQ_OFFSET); 122 add_preempt_count(cnt);
113 barrier(); 123 barrier();
114} 124}
115#endif /* CONFIG_TRACE_IRQFLAGS */ 125#endif /* CONFIG_TRACE_IRQFLAGS */
116 126
117void local_bh_disable(void) 127void local_bh_disable(void)
118{ 128{
119 __local_bh_disable((unsigned long)__builtin_return_address(0)); 129 __local_bh_disable((unsigned long)__builtin_return_address(0),
130 SOFTIRQ_DISABLE_OFFSET);
120} 131}
121 132
122EXPORT_SYMBOL(local_bh_disable); 133EXPORT_SYMBOL(local_bh_disable);
123 134
135static void __local_bh_enable(unsigned int cnt)
136{
137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled());
139
140 if (softirq_count() == cnt)
141 trace_softirqs_on((unsigned long)__builtin_return_address(0));
142 sub_preempt_count(cnt);
143}
144
124/* 145/*
125 * Special-case - softirqs can safely be enabled in 146 * Special-case - softirqs can safely be enabled in
126 * cond_resched_softirq(), or by __do_softirq(), 147 * cond_resched_softirq(), or by __do_softirq(),
@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable);
128 */ 149 */
129void _local_bh_enable(void) 150void _local_bh_enable(void)
130{ 151{
131 WARN_ON_ONCE(in_irq()); 152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
132 WARN_ON_ONCE(!irqs_disabled());
133
134 if (softirq_count() == SOFTIRQ_OFFSET)
135 trace_softirqs_on((unsigned long)__builtin_return_address(0));
136 sub_preempt_count(SOFTIRQ_OFFSET);
137} 153}
138 154
139EXPORT_SYMBOL(_local_bh_enable); 155EXPORT_SYMBOL(_local_bh_enable);
@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
147 /* 163 /*
148 * Are softirqs going to be turned on now: 164 * Are softirqs going to be turned on now:
149 */ 165 */
150 if (softirq_count() == SOFTIRQ_OFFSET) 166 if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
151 trace_softirqs_on(ip); 167 trace_softirqs_on(ip);
152 /* 168 /*
153 * Keep preemption disabled until we are done with 169 * Keep preemption disabled until we are done with
154 * softirq processing: 170 * softirq processing:
155 */ 171 */
156 sub_preempt_count(SOFTIRQ_OFFSET - 1); 172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
157 173
158 if (unlikely(!in_interrupt() && local_softirq_pending())) 174 if (unlikely(!in_interrupt() && local_softirq_pending()))
159 do_softirq(); 175 do_softirq();
@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void)
198 pending = local_softirq_pending(); 214 pending = local_softirq_pending();
199 account_system_vtime(current); 215 account_system_vtime(current);
200 216
201 __local_bh_disable((unsigned long)__builtin_return_address(0)); 217 __local_bh_disable((unsigned long)__builtin_return_address(0),
218 SOFTIRQ_OFFSET);
202 lockdep_softirq_enter(); 219 lockdep_softirq_enter();
203 220
204 cpu = smp_processor_id(); 221 cpu = smp_processor_id();
@@ -212,18 +229,20 @@ restart:
212 229
213 do { 230 do {
214 if (pending & 1) { 231 if (pending & 1) {
232 unsigned int vec_nr = h - softirq_vec;
215 int prev_count = preempt_count(); 233 int prev_count = preempt_count();
216 kstat_incr_softirqs_this_cpu(h - softirq_vec);
217 234
218 trace_softirq_entry(h, softirq_vec); 235 kstat_incr_softirqs_this_cpu(vec_nr);
236
237 trace_softirq_entry(vec_nr);
219 h->action(h); 238 h->action(h);
220 trace_softirq_exit(h, softirq_vec); 239 trace_softirq_exit(vec_nr);
221 if (unlikely(prev_count != preempt_count())) { 240 if (unlikely(prev_count != preempt_count())) {
222 printk(KERN_ERR "huh, entered softirq %td %s %p" 241 printk(KERN_ERR "huh, entered softirq %u %s %p"
223 "with preempt_count %08x," 242 "with preempt_count %08x,"
224 " exited with %08x?\n", h - softirq_vec, 243 " exited with %08x?\n", vec_nr,
225 softirq_to_name[h - softirq_vec], 244 softirq_to_name[vec_nr], h->action,
226 h->action, prev_count, preempt_count()); 245 prev_count, preempt_count());
227 preempt_count() = prev_count; 246 preempt_count() = prev_count;
228 } 247 }
229 248
@@ -245,7 +264,7 @@ restart:
245 lockdep_softirq_exit(); 264 lockdep_softirq_exit();
246 265
247 account_system_vtime(current); 266 account_system_vtime(current);
248 _local_bh_enable(); 267 __local_bh_enable(SOFTIRQ_OFFSET);
249} 268}
250 269
251#ifndef __ARCH_HAS_DO_SOFTIRQ 270#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -279,10 +298,16 @@ void irq_enter(void)
279 298
280 rcu_irq_enter(); 299 rcu_irq_enter();
281 if (idle_cpu(cpu) && !in_interrupt()) { 300 if (idle_cpu(cpu) && !in_interrupt()) {
282 __irq_enter(); 301 /*
302 * Prevent raise_softirq from needlessly waking up ksoftirqd
303 * here, as softirq will be serviced on return from interrupt.
304 */
305 local_bh_disable();
283 tick_check_idle(cpu); 306 tick_check_idle(cpu);
284 } else 307 _local_bh_enable();
285 __irq_enter(); 308 }
309
310 __irq_enter();
286} 311}
287 312
288#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -696,6 +721,7 @@ static int run_ksoftirqd(void * __bind_cpu)
696{ 721{
697 set_current_state(TASK_INTERRUPTIBLE); 722 set_current_state(TASK_INTERRUPTIBLE);
698 723
724 current->flags |= PF_KSOFTIRQD;
699 while (!kthread_should_stop()) { 725 while (!kthread_should_stop()) {
700 preempt_disable(); 726 preempt_disable();
701 if (!local_softirq_pending()) { 727 if (!local_softirq_pending()) {
@@ -886,17 +912,14 @@ int __init __weak early_irq_init(void)
886 return 0; 912 return 0;
887} 913}
888 914
915#ifdef CONFIG_GENERIC_HARDIRQS
889int __init __weak arch_probe_nr_irqs(void) 916int __init __weak arch_probe_nr_irqs(void)
890{ 917{
891 return 0; 918 return NR_IRQS_LEGACY;
892} 919}
893 920
894int __init __weak arch_early_irq_init(void) 921int __init __weak arch_early_irq_init(void)
895{ 922{
896 return 0; 923 return 0;
897} 924}
898 925#endif
899int __weak arch_init_chip_data(struct irq_desc *desc, int node)
900{
901 return 0;
902}
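
The softirq.c hunks above split the softirq part of preempt_count into two offsets so that "a softirq handler is running" (SOFTIRQ_OFFSET) can be told apart from "bottom halves are merely disabled" (SOFTIRQ_DISABLE_OFFSET). That distinction is what lets irq_enter() wrap tick_check_idle() in local_bh_disable()/_local_bh_enable() without appearing to be in serving-softirq context. A minimal stand-alone model of the accounting; the bit layout is assumed here for illustration and is not copied from this tree's hardirq.h:

/* User-space model of the split softirq accounting; builds with any C compiler. */
#include <stdio.h>

#define SOFTIRQ_SHIFT           8
#define SOFTIRQ_OFFSET          (1U << SOFTIRQ_SHIFT)      /* serving a softirq  */
#define SOFTIRQ_DISABLE_OFFSET  (2U * SOFTIRQ_OFFSET)      /* local_bh_disable() */
#define SOFTIRQ_MASK            (0xffU << SOFTIRQ_SHIFT)

static unsigned int preempt_count;

static unsigned int softirq_count(void) { return preempt_count & SOFTIRQ_MASK; }
static int in_serving_softirq(void)     { return softirq_count() & SOFTIRQ_OFFSET; }

static void local_bh_disable(void) { preempt_count += SOFTIRQ_DISABLE_OFFSET; }
static void local_bh_enable(void)  { preempt_count -= SOFTIRQ_DISABLE_OFFSET; }

int main(void)
{
	local_bh_disable();
	/* BHs are off, but no handler is running yet. */
	printf("bh disabled: count=%#x serving=%d\n",
	       softirq_count(), !!in_serving_softirq());

	preempt_count += SOFTIRQ_OFFSET;        /* what __do_softirq() now adds */
	printf("in handler:  count=%#x serving=%d\n",
	       softirq_count(), !!in_serving_softirq());
	preempt_count -= SOFTIRQ_OFFSET;

	local_bh_enable();      /* the real kernel also runs pending softirqs here */
	return 0;
}
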
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2980da3fd509..c71e07500536 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
46int __init_srcu_struct(struct srcu_struct *sp, const char *name, 46int __init_srcu_struct(struct srcu_struct *sp, const char *name,
47 struct lock_class_key *key) 47 struct lock_class_key *key)
48{ 48{
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50 /* Don't re-initialize a lock while it is held. */ 49 /* Don't re-initialize a lock while it is held. */
51 debug_check_no_locks_freed((void *)sp, sizeof(*sp)); 50 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
52 lockdep_init_map(&sp->dep_map, name, key, 0); 51 lockdep_init_map(&sp->dep_map, name, key, 0);
53#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
54 return init_srcu_struct_fields(sp); 52 return init_srcu_struct_fields(sp);
55} 53}
56EXPORT_SYMBOL_GPL(__init_srcu_struct); 54EXPORT_SYMBOL_GPL(__init_srcu_struct);
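
The #ifdef removed here was redundant: __init_srcu_struct() is only compiled and called when CONFIG_DEBUG_LOCK_ALLOC is set, while non-lockdep builds use plain init_srcu_struct() with no name/key arguments at all. Roughly the header arrangement this relies on, shown as a sketch rather than the verbatim include/linux/srcu.h of this tree:

#ifdef CONFIG_DEBUG_LOCK_ALLOC

int __init_srcu_struct(struct srcu_struct *sp, const char *name,
		       struct lock_class_key *key);

#define init_srcu_struct(sp)					\
({								\
	static struct lock_class_key __srcu_key;		\
								\
	__init_srcu_struct((sp), #sp, &__srcu_key);		\
})

#else /* !CONFIG_DEBUG_LOCK_ALLOC */

/* No lockdep map to set up, hence no name/key parameters. */
int init_srcu_struct(struct srcu_struct *sp);

#endif /* CONFIG_DEBUG_LOCK_ALLOC */
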
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 4372ccb25127..2df820b03beb 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -262,7 +262,7 @@ repeat:
262 cpu_stop_fn_t fn = work->fn; 262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg; 263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done; 264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN]; 265 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
266 266
267 __set_current_state(TASK_RUNNING); 267 __set_current_state(TASK_RUNNING);
268 268
@@ -287,11 +287,12 @@ repeat:
287 goto repeat; 287 goto repeat;
288} 288}
289 289
290extern void sched_set_stop_task(int cpu, struct task_struct *stop);
291
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ 292/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, 293static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu) 294 unsigned long action, void *hcpu)
293{ 295{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu; 296 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 297 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct task_struct *p; 298 struct task_struct *p;
@@ -303,14 +304,14 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
303 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", 304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
304 cpu); 305 cpu);
305 if (IS_ERR(p)) 306 if (IS_ERR(p))
306 return NOTIFY_BAD; 307 return notifier_from_errno(PTR_ERR(p));
307 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
308 get_task_struct(p); 308 get_task_struct(p);
309 kthread_bind(p, cpu);
310 sched_set_stop_task(cpu, p);
309 stopper->thread = p; 311 stopper->thread = p;
310 break; 312 break;
311 313
312 case CPU_ONLINE: 314 case CPU_ONLINE:
313 kthread_bind(stopper->thread, cpu);
314 /* strictly unnecessary, as first user will wake it */ 315 /* strictly unnecessary, as first user will wake it */
315 wake_up_process(stopper->thread); 316 wake_up_process(stopper->thread);
316 /* mark enabled */ 317 /* mark enabled */
@@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
325 { 326 {
326 struct cpu_stop_work *work; 327 struct cpu_stop_work *work;
327 328
329 sched_set_stop_task(cpu, NULL);
328 /* kill the stopper */ 330 /* kill the stopper */
329 kthread_stop(stopper->thread); 331 kthread_stop(stopper->thread);
330 /* drain remaining works */ 332 /* drain remaining works */
@@ -370,7 +372,7 @@ static int __init cpu_stop_init(void)
370 /* start one for the boot cpu */ 372 /* start one for the boot cpu */
371 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, 373 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
372 bcpu); 374 bcpu);
373 BUG_ON(err == NOTIFY_BAD); 375 BUG_ON(err != NOTIFY_OK);
374 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); 376 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
375 register_cpu_notifier(&cpu_stop_cpu_notifier); 377 register_cpu_notifier(&cpu_stop_cpu_notifier);
376 378
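
Two things change in this hotplug callback: the stopper thread is handed to the scheduler via sched_set_stop_task() instead of being made SCHED_FIFO here, and a kthread_create() failure is now reported as an encoded errno rather than the opaque NOTIFY_BAD, which is why cpu_stop_init() can check for NOTIFY_OK. A hedged sketch of the error-reporting half, reusing the existing notifier_from_errno() helper; the example names are made up for illustration:

static int example_thread_fn(void *arg)
{
	/* ... per-cpu worker body ... */
	return 0;
}

static int __cpuinit example_cpu_callback(struct notifier_block *nfb,
					  unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct task_struct *p;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		p = kthread_create(example_thread_fn, NULL, "example/%u", cpu);
		if (IS_ERR(p))
			/* propagate -ENOMEM and friends, not just "bad" */
			return notifier_from_errno(PTR_ERR(p));
		kthread_bind(p, cpu);
		break;
	}
	return NOTIFY_OK;
}

/*
 * Whoever invoked the notifier chain can then use notifier_to_errno() on the
 * return value: 0 for NOTIFY_OK/NOTIFY_DONE, the original negative errno
 * otherwise.
 */
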
diff --git a/kernel/sys.c b/kernel/sys.c
index e9ad44489828..7f5a0cd296a9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -931,6 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
931 pgid = pid; 931 pgid = pid;
932 if (pgid < 0) 932 if (pgid < 0)
933 return -EINVAL; 933 return -EINVAL;
934 rcu_read_lock();
934 935
935 /* From this point forward we keep holding onto the tasklist lock 936 /* From this point forward we keep holding onto the tasklist lock
936 * so that our parent does not change from under us. -DaveM 937 * so that our parent does not change from under us. -DaveM
@@ -984,6 +985,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
984out: 985out:
985 /* All paths lead to here, thus we are safe. -DaveM */ 986 /* All paths lead to here, thus we are safe. -DaveM */
986 write_unlock_irq(&tasklist_lock); 987 write_unlock_irq(&tasklist_lock);
988 rcu_read_unlock();
987 return err; 989 return err;
988} 990}
989 991
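
The added rcu_read_lock()/rcu_read_unlock() pair covers the pid-to-task lookups that setpgid() performs; those lookups are RCU-protected and are not guaranteed safe by tasklist_lock alone. A sketch of the resulting lock nesting only, not the full setpgid() logic:

static int example_set_pgid(pid_t pid)
{
	struct task_struct *p;
	int err = -ESRCH;

	rcu_read_lock();
	write_lock_irq(&tasklist_lock);

	p = find_task_by_vpid(pid);	/* valid only under rcu_read_lock() */
	if (!p)
		goto out;

	/* ... permission checks and the actual pgid update go here ... */
	err = 0;
out:
	write_unlock_irq(&tasklist_lock);
	rcu_read_unlock();		/* looked-up task pointers die here */
	return err;
}
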
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bad369ec5403..c782fe9924c7 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -50,6 +50,7 @@ cond_syscall(compat_sys_sendmsg);
50cond_syscall(sys_recvmsg); 50cond_syscall(sys_recvmsg);
51cond_syscall(sys_recvmmsg); 51cond_syscall(sys_recvmmsg);
52cond_syscall(compat_sys_recvmsg); 52cond_syscall(compat_sys_recvmsg);
53cond_syscall(compat_sys_recv);
53cond_syscall(compat_sys_recvfrom); 54cond_syscall(compat_sys_recvfrom);
54cond_syscall(compat_sys_recvmmsg); 55cond_syscall(compat_sys_recvmmsg);
55cond_syscall(sys_socketcall); 56cond_syscall(sys_socketcall);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ca38e8e3e907..c33a1edb799f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -161,8 +161,6 @@ extern int no_unaligned_warning;
161extern int unaligned_dump_stack; 161extern int unaligned_dump_stack;
162#endif 162#endif
163 163
164extern struct ratelimit_state printk_ratelimit_state;
165
166#ifdef CONFIG_PROC_SYSCTL 164#ifdef CONFIG_PROC_SYSCTL
167static int proc_do_cad_pid(struct ctl_table *table, int write, 165static int proc_do_cad_pid(struct ctl_table *table, int write,
168 void __user *buffer, size_t *lenp, loff_t *ppos); 166 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -1340,28 +1338,28 @@ static struct ctl_table fs_table[] = {
1340 .data = &inodes_stat, 1338 .data = &inodes_stat,
1341 .maxlen = 2*sizeof(int), 1339 .maxlen = 2*sizeof(int),
1342 .mode = 0444, 1340 .mode = 0444,
1343 .proc_handler = proc_dointvec, 1341 .proc_handler = proc_nr_inodes,
1344 }, 1342 },
1345 { 1343 {
1346 .procname = "inode-state", 1344 .procname = "inode-state",
1347 .data = &inodes_stat, 1345 .data = &inodes_stat,
1348 .maxlen = 7*sizeof(int), 1346 .maxlen = 7*sizeof(int),
1349 .mode = 0444, 1347 .mode = 0444,
1350 .proc_handler = proc_dointvec, 1348 .proc_handler = proc_nr_inodes,
1351 }, 1349 },
1352 { 1350 {
1353 .procname = "file-nr", 1351 .procname = "file-nr",
1354 .data = &files_stat, 1352 .data = &files_stat,
1355 .maxlen = 3*sizeof(int), 1353 .maxlen = sizeof(files_stat),
1356 .mode = 0444, 1354 .mode = 0444,
1357 .proc_handler = proc_nr_files, 1355 .proc_handler = proc_nr_files,
1358 }, 1356 },
1359 { 1357 {
1360 .procname = "file-max", 1358 .procname = "file-max",
1361 .data = &files_stat.max_files, 1359 .data = &files_stat.max_files,
1362 .maxlen = sizeof(int), 1360 .maxlen = sizeof(files_stat.max_files),
1363 .mode = 0644, 1361 .mode = 0644,
1364 .proc_handler = proc_dointvec, 1362 .proc_handler = proc_doulongvec_minmax,
1365 }, 1363 },
1366 { 1364 {
1367 .procname = "nr_open", 1365 .procname = "nr_open",
@@ -1377,7 +1375,7 @@ static struct ctl_table fs_table[] = {
1377 .data = &dentry_stat, 1375 .data = &dentry_stat,
1378 .maxlen = 6*sizeof(int), 1376 .maxlen = 6*sizeof(int),
1379 .mode = 0444, 1377 .mode = 0444,
1380 .proc_handler = proc_dointvec, 1378 .proc_handler = proc_nr_dentry,
1381 }, 1379 },
1382 { 1380 {
1383 .procname = "overflowuid", 1381 .procname = "overflowuid",
@@ -1713,10 +1711,7 @@ static __init int sysctl_init(void)
1713{ 1711{
1714 sysctl_set_parent(NULL, root_table); 1712 sysctl_set_parent(NULL, root_table);
1715#ifdef CONFIG_SYSCTL_SYSCALL_CHECK 1713#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1716 { 1714 sysctl_check_table(current->nsproxy, root_table);
1717 int err;
1718 err = sysctl_check_table(current->nsproxy, root_table);
1719 }
1720#endif 1715#endif
1721 return 0; 1716 return 0;
1722} 1717}
@@ -2488,7 +2483,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2488 kbuf[left] = 0; 2483 kbuf[left] = 0;
2489 } 2484 }
2490 2485
2491 for (; left && vleft--; i++, min++, max++, first=0) { 2486 for (; left && vleft--; i++, first = 0) {
2492 unsigned long val; 2487 unsigned long val;
2493 2488
2494 if (write) { 2489 if (write) {
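
The file-max hunk shows the rule the surrounding changes follow: the proc handler must match the C type behind .data, so once files_stat.max_files is an unsigned long the entry needs proc_doulongvec_minmax and a .maxlen taken from the field itself rather than sizeof(int). A sketch of a matching entry for a hypothetical my_max_things tunable:

static unsigned long my_max_things = 1024;	/* hypothetical tunable */

static struct ctl_table example_table[] = {
	{
		.procname	= "my-max-things",
		.data		= &my_max_things,
		.maxlen		= sizeof(my_max_things),	/* not sizeof(int) */
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
	{ }
};
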
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 04cdcf72c827..10b90d8a03c4 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -143,15 +143,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
143 if (!table->maxlen) 143 if (!table->maxlen)
144 set_fail(&fail, table, "No maxlen"); 144 set_fail(&fail, table, "No maxlen");
145 } 145 }
146 if ((table->proc_handler == proc_doulongvec_minmax) ||
147 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
148 if (table->maxlen > sizeof (unsigned long)) {
149 if (!table->extra1)
150 set_fail(&fail, table, "No min");
151 if (!table->extra2)
152 set_fail(&fail, table, "No max");
153 }
154 }
155#ifdef CONFIG_PROC_SYSCTL 146#ifdef CONFIG_PROC_SYSCTL
156 if (table->procname && !table->proc_handler) 147 if (table->procname && !table->proc_handler)
157 set_fail(&fail, table, "No proc_handler"); 148 set_fail(&fail, table, "No proc_handler");
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 11281d5792bd..c8231fb15708 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -175,22 +175,8 @@ static void send_cpu_listeners(struct sk_buff *skb,
175 up_write(&listeners->sem); 175 up_write(&listeners->sem);
176} 176}
177 177
178static int fill_pid(pid_t pid, struct task_struct *tsk, 178static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
179 struct taskstats *stats)
180{ 179{
181 int rc = 0;
182
183 if (!tsk) {
184 rcu_read_lock();
185 tsk = find_task_by_vpid(pid);
186 if (tsk)
187 get_task_struct(tsk);
188 rcu_read_unlock();
189 if (!tsk)
190 return -ESRCH;
191 } else
192 get_task_struct(tsk);
193
194 memset(stats, 0, sizeof(*stats)); 180 memset(stats, 0, sizeof(*stats));
195 /* 181 /*
196 * Each accounting subsystem adds calls to its functions to 182 * Each accounting subsystem adds calls to its functions to
@@ -209,17 +195,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
209 195
210 /* fill in extended acct fields */ 196 /* fill in extended acct fields */
211 xacct_add_tsk(stats, tsk); 197 xacct_add_tsk(stats, tsk);
198}
212 199
213 /* Define err: label here if needed */ 200static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
214 put_task_struct(tsk); 201{
215 return rc; 202 struct task_struct *tsk;
216 203
204 rcu_read_lock();
205 tsk = find_task_by_vpid(pid);
206 if (tsk)
207 get_task_struct(tsk);
208 rcu_read_unlock();
209 if (!tsk)
210 return -ESRCH;
211 fill_stats(tsk, stats);
212 put_task_struct(tsk);
213 return 0;
217} 214}
218 215
219static int fill_tgid(pid_t tgid, struct task_struct *first, 216static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
220 struct taskstats *stats)
221{ 217{
222 struct task_struct *tsk; 218 struct task_struct *tsk, *first;
223 unsigned long flags; 219 unsigned long flags;
224 int rc = -ESRCH; 220 int rc = -ESRCH;
225 221
@@ -228,8 +224,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
228 * leaders who are already counted with the dead tasks 224 * leaders who are already counted with the dead tasks
229 */ 225 */
230 rcu_read_lock(); 226 rcu_read_lock();
231 if (!first) 227 first = find_task_by_vpid(tgid);
232 first = find_task_by_vpid(tgid);
233 228
234 if (!first || !lock_task_sighand(first, &flags)) 229 if (!first || !lock_task_sighand(first, &flags))
235 goto out; 230 goto out;
@@ -268,7 +263,6 @@ out:
268 return rc; 263 return rc;
269} 264}
270 265
271
272static void fill_tgid_exit(struct task_struct *tsk) 266static void fill_tgid_exit(struct task_struct *tsk)
273{ 267{
274 unsigned long flags; 268 unsigned long flags;
@@ -360,6 +354,12 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
360 struct nlattr *na, *ret; 354 struct nlattr *na, *ret;
361 int aggr; 355 int aggr;
362 356
357 /* If we don't pad, we end up with alignment on a 4 byte boundary.
358 * This causes lots of runtime warnings on systems requiring 8 byte
359 * alignment */
360 u32 pids[2] = { pid, 0 };
361 int pid_size = ALIGN(sizeof(pid), sizeof(long));
362
363 aggr = (type == TASKSTATS_TYPE_PID) 363 aggr = (type == TASKSTATS_TYPE_PID)
364 ? TASKSTATS_TYPE_AGGR_PID 364 ? TASKSTATS_TYPE_AGGR_PID
365 : TASKSTATS_TYPE_AGGR_TGID; 365 : TASKSTATS_TYPE_AGGR_TGID;
@@ -367,7 +367,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
367 na = nla_nest_start(skb, aggr); 367 na = nla_nest_start(skb, aggr);
368 if (!na) 368 if (!na)
369 goto err; 369 goto err;
370 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 370 if (nla_put(skb, type, pid_size, pids) < 0)
371 goto err; 371 goto err;
372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
373 if (!ret) 373 if (!ret)
@@ -424,39 +424,46 @@ err:
424 return rc; 424 return rc;
425} 425}
426 426
427static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 427static int cmd_attr_register_cpumask(struct genl_info *info)
428{ 428{
429 int rc;
430 struct sk_buff *rep_skb;
431 struct taskstats *stats;
432 size_t size;
433 cpumask_var_t mask; 429 cpumask_var_t mask;
430 int rc;
434 431
435 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 432 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
436 return -ENOMEM; 433 return -ENOMEM;
437
438 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 434 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
439 if (rc < 0) 435 if (rc < 0)
440 goto free_return_rc; 436 goto out;
441 if (rc == 0) { 437 rc = add_del_listener(info->snd_pid, mask, REGISTER);
442 rc = add_del_listener(info->snd_pid, mask, REGISTER); 438out:
443 goto free_return_rc; 439 free_cpumask_var(mask);
444 } 440 return rc;
441}
442
443static int cmd_attr_deregister_cpumask(struct genl_info *info)
444{
445 cpumask_var_t mask;
446 int rc;
445 447
448 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
449 return -ENOMEM;
446 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 450 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
447 if (rc < 0) 451 if (rc < 0)
448 goto free_return_rc; 452 goto out;
449 if (rc == 0) { 453 rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
450 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 454out:
451free_return_rc:
452 free_cpumask_var(mask);
453 return rc;
454 }
455 free_cpumask_var(mask); 455 free_cpumask_var(mask);
456 return rc;
457}
458
459static int cmd_attr_pid(struct genl_info *info)
460{
461 struct taskstats *stats;
462 struct sk_buff *rep_skb;
463 size_t size;
464 u32 pid;
465 int rc;
456 466
457 /*
458 * Size includes space for nested attributes
459 */
460 size = nla_total_size(sizeof(u32)) + 467 size = nla_total_size(sizeof(u32)) +
461 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 468 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
462 469
@@ -465,33 +472,64 @@ free_return_rc:
465 return rc; 472 return rc;
466 473
467 rc = -EINVAL; 474 rc = -EINVAL;
468 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 475 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
469 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 476 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
470 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 477 if (!stats)
471 if (!stats) 478 goto err;
472 goto err; 479
473 480 rc = fill_stats_for_pid(pid, stats);
474 rc = fill_pid(pid, NULL, stats); 481 if (rc < 0)
475 if (rc < 0) 482 goto err;
476 goto err; 483 return send_reply(rep_skb, info);
477 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 484err:
478 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 485 nlmsg_free(rep_skb);
479 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 486 return rc;
480 if (!stats) 487}
481 goto err; 488
482 489static int cmd_attr_tgid(struct genl_info *info)
483 rc = fill_tgid(tgid, NULL, stats); 490{
484 if (rc < 0) 491 struct taskstats *stats;
485 goto err; 492 struct sk_buff *rep_skb;
486 } else 493 size_t size;
494 u32 tgid;
495 int rc;
496
497 size = nla_total_size(sizeof(u32)) +
498 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
499
500 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
501 if (rc < 0)
502 return rc;
503
504 rc = -EINVAL;
505 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
506 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
507 if (!stats)
487 goto err; 508 goto err;
488 509
510 rc = fill_stats_for_tgid(tgid, stats);
511 if (rc < 0)
512 goto err;
489 return send_reply(rep_skb, info); 513 return send_reply(rep_skb, info);
490err: 514err:
491 nlmsg_free(rep_skb); 515 nlmsg_free(rep_skb);
492 return rc; 516 return rc;
493} 517}
494 518
519static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
520{
521 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK])
522 return cmd_attr_register_cpumask(info);
523 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK])
524 return cmd_attr_deregister_cpumask(info);
525 else if (info->attrs[TASKSTATS_CMD_ATTR_PID])
526 return cmd_attr_pid(info);
527 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID])
528 return cmd_attr_tgid(info);
529 else
530 return -EINVAL;
531}
532
495static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 533static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
496{ 534{
497 struct signal_struct *sig = tsk->signal; 535 struct signal_struct *sig = tsk->signal;
@@ -555,9 +593,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
555 if (!stats) 593 if (!stats)
556 goto err; 594 goto err;
557 595
558 rc = fill_pid(-1, tsk, stats); 596 fill_stats(tsk, stats);
559 if (rc < 0)
560 goto err;
561 597
562 /* 598 /*
563 * Doesn't matter if tsk is the leader or the last group member leaving 599 * Doesn't matter if tsk is the leader or the last group member leaving
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 4f104515a19b..f8b11a283171 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -115,7 +115,9 @@ static int test_kprobes(void)
115 int ret; 115 int ret;
116 struct kprobe *kps[2] = {&kp, &kp2}; 116 struct kprobe *kps[2] = {&kp, &kp2};
117 117
118 kp.addr = 0; /* addr should be cleard for reusing kprobe. */ 118 /* addr and flags should be cleard for reusing kprobe. */
119 kp.addr = NULL;
120 kp.flags = 0;
119 ret = register_kprobes(kps, 2); 121 ret = register_kprobes(kps, 2);
120 if (ret < 0) { 122 if (ret < 0) {
121 printk(KERN_ERR "Kprobe smoke test failed: " 123 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -210,7 +212,9 @@ static int test_jprobes(void)
210 int ret; 212 int ret;
211 struct jprobe *jps[2] = {&jp, &jp2}; 213 struct jprobe *jps[2] = {&jp, &jp2};
212 214
213 jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ 215 /* addr and flags should be cleard for reusing kprobe. */
216 jp.kp.addr = NULL;
217 jp.kp.flags = 0;
214 ret = register_jprobes(jps, 2); 218 ret = register_jprobes(jps, 2);
215 if (ret < 0) { 219 if (ret < 0) {
216 printk(KERN_ERR "Kprobe smoke test failed: " 220 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -323,7 +327,9 @@ static int test_kretprobes(void)
323 int ret; 327 int ret;
324 struct kretprobe *rps[2] = {&rp, &rp2}; 328 struct kretprobe *rps[2] = {&rp, &rp2};
325 329
326 rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ 330 /* addr and flags should be cleard for reusing kprobe. */
331 rp.kp.addr = NULL;
332 rp.kp.flags = 0;
327 ret = register_kretprobes(rps, 2); 333 ret = register_kretprobes(rps, 2);
328 if (ret < 0) { 334 if (ret < 0) {
329 printk(KERN_ERR "Kprobe smoke test failed: " 335 printk(KERN_ERR "Kprobe smoke test failed: "
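
All three hunks fix the same pattern: a statically allocated kprobe keeps internal state in kp.addr and kp.flags once it has been registered and unregistered, so both must be cleared before the same object is registered again. A hedged sketch of that reuse cycle; target_function and the example names are hypothetical:

static int example_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	return 0;	/* let the probed instruction execute normally */
}

static struct kprobe example_kp = {
	.symbol_name	= "target_function",	/* hypothetical symbol */
	.pre_handler	= example_pre_handler,
};

static int example_reuse(void)
{
	int ret;

	ret = register_kprobe(&example_kp);
	if (ret < 0)
		return ret;
	unregister_kprobe(&example_kp);

	/* Reuse: wipe what register/unregister left behind. */
	example_kp.addr = NULL;
	example_kp.flags = 0;

	ret = register_kprobe(&example_kp);
	if (ret < 0)
		return ret;
	unregister_kprobe(&example_kp);
	return 0;
}
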
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c63116863a80..d2321891538f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -149,10 +149,18 @@ static void ntp_update_offset(long offset)
149 time_reftime = get_seconds(); 149 time_reftime = get_seconds();
150 150
151 offset64 = offset; 151 offset64 = offset;
152 freq_adj = (offset64 * secs) << 152 freq_adj = ntp_update_offset_fll(offset64, secs);
153 (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
154 153
155 freq_adj += ntp_update_offset_fll(offset64, secs); 154 /*
155 * Clamp update interval to reduce PLL gain with low
156 * sampling rate (e.g. intermittent network connection)
157 * to avoid instability.
158 */
159 if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant)))
160 secs = 1 << (SHIFT_PLL + 1 + time_constant);
161
162 freq_adj += (offset64 * secs) <<
163 (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
156 164
157 freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); 165 freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED);
158 166
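
The reorder matters because the PLL term grows with the sampling interval: a host that only gets an NTP sample occasionally (the intermittent-connection case the new comment mentions) would otherwise apply a huge correction in one step, so the interval is clamped before the PLL contribution is computed. A stand-alone model of that clamp; SHIFT_PLL, NTP_SCALE_SHIFT and the time constant are illustrative values, not copied from this tree's timex.h:

#include <stdio.h>

#define NTP_SCALE_SHIFT	32	/* illustrative */
#define SHIFT_PLL	2	/* illustrative */
static const int time_constant = 2;

static long long pll_term(long long offset64, long long secs, int clamp)
{
	if (clamp && secs > (1LL << (SHIFT_PLL + 1 + time_constant)))
		secs = 1LL << (SHIFT_PLL + 1 + time_constant);

	return (offset64 * secs) <<
		(NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
}

int main(void)
{
	long long offset = 500000;	/* 500 us of phase error */
	long long secs = 86400;		/* one sample per day    */

	printf("unclamped PLL term: %lld\n", pll_term(offset, secs, 0));
	printf("clamped PLL term:   %lld\n", pll_term(offset, secs, 1));
	return 0;
}
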
diff --git a/kernel/timer.c b/kernel/timer.c
index 97bf05baade7..68a9ae7679b7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_event.h> 40#include <linux/irq_work.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43 43
@@ -1279,7 +1279,10 @@ void update_process_times(int user_tick)
1279 run_local_timers(); 1279 run_local_timers();
1280 rcu_check_callbacks(cpu, user_tick); 1280 rcu_check_callbacks(cpu, user_tick);
1281 printk_tick(); 1281 printk_tick();
1282 perf_event_do_pending(); 1282#ifdef CONFIG_IRQ_WORK
1283 if (in_irq())
1284 irq_work_run();
1285#endif
1283 scheduler_tick(); 1286 scheduler_tick();
1284 run_posix_cpu_timers(p); 1287 run_posix_cpu_timers(p);
1285} 1288}
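
update_process_times() now drains the generic irq_work queue instead of calling the perf-specific hook, so any producer running in NMI or deep IRQ context can defer a callback to the next tick the same way perf does. A hedged sketch of the producer side, assuming the irq_work API added by this series (init_irq_work()/irq_work_queue()); the example names are made up:

#include <linux/irq_work.h>

static void example_irq_work_fn(struct irq_work *work)
{
	/* Runs later from a safe IRQ context, e.g. the tick path above. */
	pr_info("deferred work ran\n");
}

static struct irq_work example_work;

static void example_init(void)
{
	init_irq_work(&example_work, example_irq_work_fn);
}

/* Safe to call even from NMI context, where almost nothing else is: */
static void example_raise(void)
{
	irq_work_queue(&example_work);
}
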
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 538501c6ea50..e04b8bcdef88 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS
49 help 49 help
50 See Documentation/trace/ftrace-design.txt 50 See Documentation/trace/ftrace-design.txt
51 51
52config HAVE_C_RECORDMCOUNT
53 bool
54 help
55 C version of recordmcount available?
56
52config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
53 bool 58 bool
54 59
@@ -121,7 +126,7 @@ if FTRACE
121config FUNCTION_TRACER 126config FUNCTION_TRACER
122 bool "Kernel Function Tracer" 127 bool "Kernel Function Tracer"
123 depends on HAVE_FUNCTION_TRACER 128 depends on HAVE_FUNCTION_TRACER
124 select FRAME_POINTER 129 select FRAME_POINTER if (!ARM_UNWIND)
125 select KALLSYMS 130 select KALLSYMS
126 select GENERIC_TRACER 131 select GENERIC_TRACER
127 select CONTEXT_SWITCH_TRACER 132 select CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 959f8d6c8cc1..bc251ed66724 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,7 +23,6 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/smp_lock.h>
27#include <linux/time.h> 26#include <linux/time.h>
28#include <linux/uaccess.h> 27#include <linux/uaccess.h>
29 28
@@ -326,6 +325,7 @@ static const struct file_operations blk_dropped_fops = {
326 .owner = THIS_MODULE, 325 .owner = THIS_MODULE,
327 .open = blk_dropped_open, 326 .open = blk_dropped_open,
328 .read = blk_dropped_read, 327 .read = blk_dropped_read,
328 .llseek = default_llseek,
329}; 329};
330 330
331static int blk_msg_open(struct inode *inode, struct file *filp) 331static int blk_msg_open(struct inode *inode, struct file *filp)
@@ -365,6 +365,7 @@ static const struct file_operations blk_msg_fops = {
365 .owner = THIS_MODULE, 365 .owner = THIS_MODULE,
366 .open = blk_msg_open, 366 .open = blk_msg_open,
367 .write = blk_msg_write, 367 .write = blk_msg_write,
368 .llseek = noop_llseek,
368}; 369};
369 370
370/* 371/*
@@ -639,7 +640,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
639 if (!q) 640 if (!q)
640 return -ENXIO; 641 return -ENXIO;
641 642
642 lock_kernel();
643 mutex_lock(&bdev->bd_mutex); 643 mutex_lock(&bdev->bd_mutex);
644 644
645 switch (cmd) { 645 switch (cmd) {
@@ -667,7 +667,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
667 } 667 }
668 668
669 mutex_unlock(&bdev->bd_mutex); 669 mutex_unlock(&bdev->bd_mutex);
670 unlock_kernel();
671 return ret; 670 return ret;
672} 671}
673 672
@@ -1652,10 +1651,9 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1652 struct block_device *bdev; 1651 struct block_device *bdev;
1653 ssize_t ret = -ENXIO; 1652 ssize_t ret = -ENXIO;
1654 1653
1655 lock_kernel();
1656 bdev = bdget(part_devt(p)); 1654 bdev = bdget(part_devt(p));
1657 if (bdev == NULL) 1655 if (bdev == NULL)
1658 goto out_unlock_kernel; 1656 goto out;
1659 1657
1660 q = blk_trace_get_queue(bdev); 1658 q = blk_trace_get_queue(bdev);
1661 if (q == NULL) 1659 if (q == NULL)
@@ -1683,8 +1681,7 @@ out_unlock_bdev:
1683 mutex_unlock(&bdev->bd_mutex); 1681 mutex_unlock(&bdev->bd_mutex);
1684out_bdput: 1682out_bdput:
1685 bdput(bdev); 1683 bdput(bdev);
1686out_unlock_kernel: 1684out:
1687 unlock_kernel();
1688 return ret; 1685 return ret;
1689} 1686}
1690 1687
@@ -1714,11 +1711,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1714 1711
1715 ret = -ENXIO; 1712 ret = -ENXIO;
1716 1713
1717 lock_kernel();
1718 p = dev_to_part(dev); 1714 p = dev_to_part(dev);
1719 bdev = bdget(part_devt(p)); 1715 bdev = bdget(part_devt(p));
1720 if (bdev == NULL) 1716 if (bdev == NULL)
1721 goto out_unlock_kernel; 1717 goto out;
1722 1718
1723 q = blk_trace_get_queue(bdev); 1719 q = blk_trace_get_queue(bdev);
1724 if (q == NULL) 1720 if (q == NULL)
@@ -1753,8 +1749,6 @@ out_unlock_bdev:
1753 mutex_unlock(&bdev->bd_mutex); 1749 mutex_unlock(&bdev->bd_mutex);
1754out_bdput: 1750out_bdput:
1755 bdput(bdev); 1751 bdput(bdev);
1756out_unlock_kernel:
1757 unlock_kernel();
1758out: 1752out:
1759 return ret ? ret : count; 1753 return ret ? ret : count;
1760} 1754}
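
Besides dropping lock_kernel(), these hunks spell out the llseek behaviour of the blktrace debugfs files instead of relying on the historical fallback: the readable dropped-counter keeps default_llseek, while the write-only message file uses noop_llseek. A hedged sketch of the pattern with hypothetical example files:

static ssize_t example_read(struct file *filp, char __user *ubuf,
			    size_t count, loff_t *ppos)
{
	return 0;	/* placeholder; a real file would copy data out */
}

static ssize_t example_write(struct file *filp, const char __user *ubuf,
			     size_t count, loff_t *ppos)
{
	return count;	/* placeholder; accept and discard the input */
}

static const struct file_operations example_stat_fops = {
	.owner	= THIS_MODULE,
	.read	= example_read,
	.llseek	= default_llseek,	/* seekable, the old implicit behaviour */
};

static const struct file_operations example_ctl_fops = {
	.owner	= THIS_MODULE,
	.write	= example_write,
	.llseek	= noop_llseek,		/* "succeeds" without moving f_pos */
};
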
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0d88ce9b9fb8..f3dadae83883 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -381,12 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v)
381{ 381{
382 struct ftrace_profile *rec = v; 382 struct ftrace_profile *rec = v;
383 char str[KSYM_SYMBOL_LEN]; 383 char str[KSYM_SYMBOL_LEN];
384 int ret = 0;
384#ifdef CONFIG_FUNCTION_GRAPH_TRACER 385#ifdef CONFIG_FUNCTION_GRAPH_TRACER
385 static DEFINE_MUTEX(mutex);
386 static struct trace_seq s; 386 static struct trace_seq s;
387 unsigned long long avg; 387 unsigned long long avg;
388 unsigned long long stddev; 388 unsigned long long stddev;
389#endif 389#endif
390 mutex_lock(&ftrace_profile_lock);
391
392 /* we raced with function_profile_reset() */
393 if (unlikely(rec->counter == 0)) {
394 ret = -EBUSY;
395 goto out;
396 }
390 397
391 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 398 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
392 seq_printf(m, " %-30.30s %10lu", str, rec->counter); 399 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
@@ -408,7 +415,6 @@ static int function_stat_show(struct seq_file *m, void *v)
408 do_div(stddev, (rec->counter - 1) * 1000); 415 do_div(stddev, (rec->counter - 1) * 1000);
409 } 416 }
410 417
411 mutex_lock(&mutex);
412 trace_seq_init(&s); 418 trace_seq_init(&s);
413 trace_print_graph_duration(rec->time, &s); 419 trace_print_graph_duration(rec->time, &s);
414 trace_seq_puts(&s, " "); 420 trace_seq_puts(&s, " ");
@@ -416,11 +422,12 @@ static int function_stat_show(struct seq_file *m, void *v)
416 trace_seq_puts(&s, " "); 422 trace_seq_puts(&s, " ");
417 trace_print_graph_duration(stddev, &s); 423 trace_print_graph_duration(stddev, &s);
418 trace_print_seq(m, &s); 424 trace_print_seq(m, &s);
419 mutex_unlock(&mutex);
420#endif 425#endif
421 seq_putc(m, '\n'); 426 seq_putc(m, '\n');
427out:
428 mutex_unlock(&ftrace_profile_lock);
422 429
423 return 0; 430 return ret;
424} 431}
425 432
426static void ftrace_profile_reset(struct ftrace_profile_stat *stat) 433static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
@@ -793,6 +800,7 @@ static const struct file_operations ftrace_profile_fops = {
793 .open = tracing_open_generic, 800 .open = tracing_open_generic,
794 .read = ftrace_profile_read, 801 .read = ftrace_profile_read,
795 .write = ftrace_profile_write, 802 .write = ftrace_profile_write,
803 .llseek = default_llseek,
796}; 804};
797 805
798/* used to initialize the real stat files */ 806/* used to initialize the real stat files */
@@ -877,10 +885,8 @@ enum {
877 FTRACE_ENABLE_CALLS = (1 << 0), 885 FTRACE_ENABLE_CALLS = (1 << 0),
878 FTRACE_DISABLE_CALLS = (1 << 1), 886 FTRACE_DISABLE_CALLS = (1 << 1),
879 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 887 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
880 FTRACE_ENABLE_MCOUNT = (1 << 3), 888 FTRACE_START_FUNC_RET = (1 << 3),
881 FTRACE_DISABLE_MCOUNT = (1 << 4), 889 FTRACE_STOP_FUNC_RET = (1 << 4),
882 FTRACE_START_FUNC_RET = (1 << 5),
883 FTRACE_STOP_FUNC_RET = (1 << 6),
884}; 890};
885 891
886static int ftrace_filtered; 892static int ftrace_filtered;
@@ -1219,8 +1225,6 @@ static void ftrace_shutdown(int command)
1219 1225
1220static void ftrace_startup_sysctl(void) 1226static void ftrace_startup_sysctl(void)
1221{ 1227{
1222 int command = FTRACE_ENABLE_MCOUNT;
1223
1224 if (unlikely(ftrace_disabled)) 1228 if (unlikely(ftrace_disabled))
1225 return; 1229 return;
1226 1230
@@ -1228,23 +1232,17 @@ static void ftrace_startup_sysctl(void)
1228 saved_ftrace_func = NULL; 1232 saved_ftrace_func = NULL;
1229 /* ftrace_start_up is true if we want ftrace running */ 1233 /* ftrace_start_up is true if we want ftrace running */
1230 if (ftrace_start_up) 1234 if (ftrace_start_up)
1231 command |= FTRACE_ENABLE_CALLS; 1235 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1232
1233 ftrace_run_update_code(command);
1234} 1236}
1235 1237
1236static void ftrace_shutdown_sysctl(void) 1238static void ftrace_shutdown_sysctl(void)
1237{ 1239{
1238 int command = FTRACE_DISABLE_MCOUNT;
1239
1240 if (unlikely(ftrace_disabled)) 1240 if (unlikely(ftrace_disabled))
1241 return; 1241 return;
1242 1242
1243 /* ftrace_start_up is true if ftrace is running */ 1243 /* ftrace_start_up is true if ftrace is running */
1244 if (ftrace_start_up) 1244 if (ftrace_start_up)
1245 command |= FTRACE_DISABLE_CALLS; 1245 ftrace_run_update_code(FTRACE_DISABLE_CALLS);
1246
1247 ftrace_run_update_code(command);
1248} 1246}
1249 1247
1250static cycle_t ftrace_update_time; 1248static cycle_t ftrace_update_time;
@@ -1361,24 +1359,29 @@ enum {
1361#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1359#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
1362 1360
1363struct ftrace_iterator { 1361struct ftrace_iterator {
1364 struct ftrace_page *pg; 1362 loff_t pos;
1365 int hidx; 1363 loff_t func_pos;
1366 int idx; 1364 struct ftrace_page *pg;
1367 unsigned flags; 1365 struct dyn_ftrace *func;
1368 struct trace_parser parser; 1366 struct ftrace_func_probe *probe;
1367 struct trace_parser parser;
1368 int hidx;
1369 int idx;
1370 unsigned flags;
1369}; 1371};
1370 1372
1371static void * 1373static void *
1372t_hash_next(struct seq_file *m, void *v, loff_t *pos) 1374t_hash_next(struct seq_file *m, loff_t *pos)
1373{ 1375{
1374 struct ftrace_iterator *iter = m->private; 1376 struct ftrace_iterator *iter = m->private;
1375 struct hlist_node *hnd = v; 1377 struct hlist_node *hnd = NULL;
1376 struct hlist_head *hhd; 1378 struct hlist_head *hhd;
1377 1379
1378 WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
1379
1380 (*pos)++; 1380 (*pos)++;
1381 iter->pos = *pos;
1381 1382
1383 if (iter->probe)
1384 hnd = &iter->probe->node;
1382 retry: 1385 retry:
1383 if (iter->hidx >= FTRACE_FUNC_HASHSIZE) 1386 if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
1384 return NULL; 1387 return NULL;
@@ -1401,7 +1404,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos)
1401 } 1404 }
1402 } 1405 }
1403 1406
1404 return hnd; 1407 if (WARN_ON_ONCE(!hnd))
1408 return NULL;
1409
1410 iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
1411
1412 return iter;
1405} 1413}
1406 1414
1407static void *t_hash_start(struct seq_file *m, loff_t *pos) 1415static void *t_hash_start(struct seq_file *m, loff_t *pos)
@@ -1410,26 +1418,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1410 void *p = NULL; 1418 void *p = NULL;
1411 loff_t l; 1419 loff_t l;
1412 1420
1413 if (!(iter->flags & FTRACE_ITER_HASH)) 1421 if (iter->func_pos > *pos)
1414 *pos = 0; 1422 return NULL;
1415
1416 iter->flags |= FTRACE_ITER_HASH;
1417 1423
1418 iter->hidx = 0; 1424 iter->hidx = 0;
1419 for (l = 0; l <= *pos; ) { 1425 for (l = 0; l <= (*pos - iter->func_pos); ) {
1420 p = t_hash_next(m, p, &l); 1426 p = t_hash_next(m, &l);
1421 if (!p) 1427 if (!p)
1422 break; 1428 break;
1423 } 1429 }
1424 return p; 1430 if (!p)
1431 return NULL;
1432
1433 /* Only set this if we have an item */
1434 iter->flags |= FTRACE_ITER_HASH;
1435
1436 return iter;
1425} 1437}
1426 1438
1427static int t_hash_show(struct seq_file *m, void *v) 1439static int
1440t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
1428{ 1441{
1429 struct ftrace_func_probe *rec; 1442 struct ftrace_func_probe *rec;
1430 struct hlist_node *hnd = v;
1431 1443
1432 rec = hlist_entry(hnd, struct ftrace_func_probe, node); 1444 rec = iter->probe;
1445 if (WARN_ON_ONCE(!rec))
1446 return -EIO;
1433 1447
1434 if (rec->ops->print) 1448 if (rec->ops->print)
1435 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1449 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
@@ -1450,12 +1464,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1450 struct dyn_ftrace *rec = NULL; 1464 struct dyn_ftrace *rec = NULL;
1451 1465
1452 if (iter->flags & FTRACE_ITER_HASH) 1466 if (iter->flags & FTRACE_ITER_HASH)
1453 return t_hash_next(m, v, pos); 1467 return t_hash_next(m, pos);
1454 1468
1455 (*pos)++; 1469 (*pos)++;
1470 iter->pos = *pos;
1456 1471
1457 if (iter->flags & FTRACE_ITER_PRINTALL) 1472 if (iter->flags & FTRACE_ITER_PRINTALL)
1458 return NULL; 1473 return t_hash_start(m, pos);
1459 1474
1460 retry: 1475 retry:
1461 if (iter->idx >= iter->pg->index) { 1476 if (iter->idx >= iter->pg->index) {
@@ -1484,7 +1499,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1484 } 1499 }
1485 } 1500 }
1486 1501
1487 return rec; 1502 if (!rec)
1503 return t_hash_start(m, pos);
1504
1505 iter->func_pos = *pos;
1506 iter->func = rec;
1507
1508 return iter;
1509}
1510
1511static void reset_iter_read(struct ftrace_iterator *iter)
1512{
1513 iter->pos = 0;
1514 iter->func_pos = 0;
1515 iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH);
1488} 1516}
1489 1517
1490static void *t_start(struct seq_file *m, loff_t *pos) 1518static void *t_start(struct seq_file *m, loff_t *pos)
@@ -1495,6 +1523,12 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1495 1523
1496 mutex_lock(&ftrace_lock); 1524 mutex_lock(&ftrace_lock);
1497 /* 1525 /*
1526 * If an lseek was done, then reset and start from beginning.
1527 */
1528 if (*pos < iter->pos)
1529 reset_iter_read(iter);
1530
1531 /*
1498 * For set_ftrace_filter reading, if we have the filter 1532 * For set_ftrace_filter reading, if we have the filter
1499 * off, we can short cut and just print out that all 1533 * off, we can short cut and just print out that all
1500 * functions are enabled. 1534 * functions are enabled.
@@ -1503,12 +1537,19 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1503 if (*pos > 0) 1537 if (*pos > 0)
1504 return t_hash_start(m, pos); 1538 return t_hash_start(m, pos);
1505 iter->flags |= FTRACE_ITER_PRINTALL; 1539 iter->flags |= FTRACE_ITER_PRINTALL;
1540 /* reset in case of seek/pread */
1541 iter->flags &= ~FTRACE_ITER_HASH;
1506 return iter; 1542 return iter;
1507 } 1543 }
1508 1544
1509 if (iter->flags & FTRACE_ITER_HASH) 1545 if (iter->flags & FTRACE_ITER_HASH)
1510 return t_hash_start(m, pos); 1546 return t_hash_start(m, pos);
1511 1547
1548 /*
1549 * Unfortunately, we need to restart at ftrace_pages_start
1550 * every time we let go of the ftrace_mutex. This is because
1551 * those pointers can change without the lock.
1552 */
1512 iter->pg = ftrace_pages_start; 1553 iter->pg = ftrace_pages_start;
1513 iter->idx = 0; 1554 iter->idx = 0;
1514 for (l = 0; l <= *pos; ) { 1555 for (l = 0; l <= *pos; ) {
@@ -1517,10 +1558,14 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1517 break; 1558 break;
1518 } 1559 }
1519 1560
1520 if (!p && iter->flags & FTRACE_ITER_FILTER) 1561 if (!p) {
1521 return t_hash_start(m, pos); 1562 if (iter->flags & FTRACE_ITER_FILTER)
1563 return t_hash_start(m, pos);
1522 1564
1523 return p; 1565 return NULL;
1566 }
1567
1568 return iter;
1524} 1569}
1525 1570
1526static void t_stop(struct seq_file *m, void *p) 1571static void t_stop(struct seq_file *m, void *p)
@@ -1531,16 +1576,18 @@ static void t_stop(struct seq_file *m, void *p)
1531static int t_show(struct seq_file *m, void *v) 1576static int t_show(struct seq_file *m, void *v)
1532{ 1577{
1533 struct ftrace_iterator *iter = m->private; 1578 struct ftrace_iterator *iter = m->private;
1534 struct dyn_ftrace *rec = v; 1579 struct dyn_ftrace *rec;
1535 1580
1536 if (iter->flags & FTRACE_ITER_HASH) 1581 if (iter->flags & FTRACE_ITER_HASH)
1537 return t_hash_show(m, v); 1582 return t_hash_show(m, iter);
1538 1583
1539 if (iter->flags & FTRACE_ITER_PRINTALL) { 1584 if (iter->flags & FTRACE_ITER_PRINTALL) {
1540 seq_printf(m, "#### all functions enabled ####\n"); 1585 seq_printf(m, "#### all functions enabled ####\n");
1541 return 0; 1586 return 0;
1542 } 1587 }
1543 1588
1589 rec = iter->func;
1590
1544 if (!rec) 1591 if (!rec)
1545 return 0; 1592 return 0;
1546 1593
@@ -1592,8 +1639,8 @@ ftrace_failures_open(struct inode *inode, struct file *file)
1592 1639
1593 ret = ftrace_avail_open(inode, file); 1640 ret = ftrace_avail_open(inode, file);
1594 if (!ret) { 1641 if (!ret) {
1595 m = (struct seq_file *)file->private_data; 1642 m = file->private_data;
1596 iter = (struct ftrace_iterator *)m->private; 1643 iter = m->private;
1597 iter->flags = FTRACE_ITER_FAILURES; 1644 iter->flags = FTRACE_ITER_FAILURES;
1598 } 1645 }
1599 1646
@@ -2623,6 +2670,7 @@ static const struct file_operations ftrace_graph_fops = {
2623 .read = seq_read, 2670 .read = seq_read,
2624 .write = ftrace_graph_write, 2671 .write = ftrace_graph_write,
2625 .release = ftrace_graph_release, 2672 .release = ftrace_graph_release,
2673 .llseek = seq_lseek,
2626}; 2674};
2627#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2675#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2628 2676
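
The iterator rework exists so that lseek()/pread() on set_ftrace_filter behaves sanely: t_start() now compares the requested *pos with the position the iterator last reported (iter->pos) and calls reset_iter_read() when the caller seeked backwards, instead of walking forward from stale state. A stand-alone model of that start/next pattern over a plain array:

#include <stdio.h>

struct example_iter {
	long pos;	/* last position handed out */
	int idx;	/* private cursor into the data */
};

static const int data[] = { 10, 20, 30, 40, 50 };
#define NDATA ((int)(sizeof(data) / sizeof(data[0])))

/* ->start(): rewind private state if the caller seeked backwards. */
static const int *example_start(struct example_iter *it, long *pos)
{
	if (*pos < it->pos)
		it->idx = 0;
	while (it->idx < *pos && it->idx < NDATA)
		it->idx++;
	it->pos = *pos;
	return it->idx < NDATA ? &data[it->idx] : NULL;
}

/* ->next(): advance the public position and the private cursor together. */
static const int *example_next(struct example_iter *it, long *pos)
{
	(*pos)++;
	it->pos = *pos;
	it->idx++;
	return it->idx < NDATA ? &data[it->idx] : NULL;
}

int main(void)
{
	struct example_iter it = { 0, 0 };
	const int *p;
	long pos = 3;

	p = example_start(&it, &pos);	/* read at offset 3 */
	printf("at %ld: %d\n", pos, p ? *p : -1);
	p = example_next(&it, &pos);
	printf("at %ld: %d\n", pos, p ? *p : -1);

	pos = 1;			/* pread() at an earlier offset */
	p = example_start(&it, &pos);	/* must rewind, not return garbage */
	printf("at %ld: %d\n", pos, p ? *p : -1);
	return 0;
}
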
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3632ce87674f..9ed509a015d8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -224,6 +224,9 @@ enum {
224 RB_LEN_TIME_STAMP = 16, 224 RB_LEN_TIME_STAMP = 16,
225}; 225};
226 226
227#define skip_time_extend(event) \
228 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
229
227static inline int rb_null_event(struct ring_buffer_event *event) 230static inline int rb_null_event(struct ring_buffer_event *event)
228{ 231{
229 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; 232 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)
248 return length + RB_EVNT_HDR_SIZE; 251 return length + RB_EVNT_HDR_SIZE;
249} 252}
250 253
251/* inline for ring buffer fast paths */ 254/*
252static unsigned 255 * Return the length of the given event. Will return
256 * the length of the time extend if the event is a
257 * time extend.
258 */
259static inline unsigned
253rb_event_length(struct ring_buffer_event *event) 260rb_event_length(struct ring_buffer_event *event)
254{ 261{
255 switch (event->type_len) { 262 switch (event->type_len) {
@@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)
274 return 0; 281 return 0;
275} 282}
276 283
284/*
285 * Return total length of time extend and data,
286 * or just the event length for all other events.
287 */
288static inline unsigned
289rb_event_ts_length(struct ring_buffer_event *event)
290{
291 unsigned len = 0;
292
293 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
294 /* time extends include the data event after it */
295 len = RB_LEN_TIME_EXTEND;
296 event = skip_time_extend(event);
297 }
298 return len + rb_event_length(event);
299}
300
277/** 301/**
278 * ring_buffer_event_length - return the length of the event 302 * ring_buffer_event_length - return the length of the event
279 * @event: the event to get the length of 303 * @event: the event to get the length of
304 *
305 * Returns the size of the data load of a data event.
306 * If the event is something other than a data event, it
307 * returns the size of the event itself. With the exception
308 * of a TIME EXTEND, where it still returns the size of the
309 * data load of the data event after it.
280 */ 310 */
281unsigned ring_buffer_event_length(struct ring_buffer_event *event) 311unsigned ring_buffer_event_length(struct ring_buffer_event *event)
282{ 312{
283 unsigned length = rb_event_length(event); 313 unsigned length;
314
315 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
316 event = skip_time_extend(event);
317
318 length = rb_event_length(event);
284 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 319 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
285 return length; 320 return length;
286 length -= RB_EVNT_HDR_SIZE; 321 length -= RB_EVNT_HDR_SIZE;
@@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
294static void * 329static void *
295rb_event_data(struct ring_buffer_event *event) 330rb_event_data(struct ring_buffer_event *event)
296{ 331{
332 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
333 event = skip_time_extend(event);
297 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); 334 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
298 /* If length is in len field, then array[0] has the data */ 335 /* If length is in len field, then array[0] has the data */
299 if (event->type_len) 336 if (event->type_len)
@@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta)
404/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ 441/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
405#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) 442#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
406 443
407/* Max number of timestamps that can fit on a page */
408#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
409
410int ring_buffer_print_page_header(struct trace_seq *s) 444int ring_buffer_print_page_header(struct trace_seq *s)
411{ 445{
412 struct buffer_data_page field; 446 struct buffer_data_page field;
@@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1546 iter->head = 0; 1580 iter->head = 0;
1547} 1581}
1548 1582
1583/* Slow path, do not inline */
1584static noinline struct ring_buffer_event *
1585rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1586{
1587 event->type_len = RINGBUF_TYPE_TIME_EXTEND;
1588
1589 /* Not the first event on the page? */
1590 if (rb_event_index(event)) {
1591 event->time_delta = delta & TS_MASK;
1592 event->array[0] = delta >> TS_SHIFT;
1593 } else {
1594 /* nope, just zero it */
1595 event->time_delta = 0;
1596 event->array[0] = 0;
1597 }
1598
1599 return skip_time_extend(event);
1600}
1601
1549/** 1602/**
1550 * ring_buffer_update_event - update event type and data 1603 * ring_buffer_update_event - update event type and data
1551 * @event: the even to update 1604 * @event: the even to update
@@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1558 * data field. 1611 * data field.
1559 */ 1612 */
1560static void 1613static void
1561rb_update_event(struct ring_buffer_event *event, 1614rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
1562 unsigned type, unsigned length) 1615 struct ring_buffer_event *event, unsigned length,
1616 int add_timestamp, u64 delta)
1563{ 1617{
1564 event->type_len = type; 1618 /* Only a commit updates the timestamp */
1565 1619 if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
1566 switch (type) { 1620 delta = 0;
1567
1568 case RINGBUF_TYPE_PADDING:
1569 case RINGBUF_TYPE_TIME_EXTEND:
1570 case RINGBUF_TYPE_TIME_STAMP:
1571 break;
1572 1621
1573 case 0: 1622 /*
1574 length -= RB_EVNT_HDR_SIZE; 1623 * If we need to add a timestamp, then we
1575 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) 1624 * add it to the start of the resevered space.
1576 event->array[0] = length; 1625 */
1577 else 1626 if (unlikely(add_timestamp)) {
1578 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1627 event = rb_add_time_stamp(event, delta);
1579 break; 1628 length -= RB_LEN_TIME_EXTEND;
1580 default: 1629 delta = 0;
1581 BUG();
1582 } 1630 }
1631
1632 event->time_delta = delta;
1633 length -= RB_EVNT_HDR_SIZE;
1634 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
1635 event->type_len = 0;
1636 event->array[0] = length;
1637 } else
1638 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1583} 1639}
1584 1640
1585/* 1641/*
@@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1823 local_sub(length, &tail_page->write); 1879 local_sub(length, &tail_page->write);
1824} 1880}
1825 1881
1826static struct ring_buffer_event * 1882/*
1883 * This is the slow path, force gcc not to inline it.
1884 */
1885static noinline struct ring_buffer_event *
1827rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1886rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1828 unsigned long length, unsigned long tail, 1887 unsigned long length, unsigned long tail,
1829 struct buffer_page *tail_page, u64 *ts) 1888 struct buffer_page *tail_page, u64 ts)
1830{ 1889{
1831 struct buffer_page *commit_page = cpu_buffer->commit_page; 1890 struct buffer_page *commit_page = cpu_buffer->commit_page;
1832 struct ring_buffer *buffer = cpu_buffer->buffer; 1891 struct ring_buffer *buffer = cpu_buffer->buffer;
@@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1909 * Nested commits always have zero deltas, so 1968 * Nested commits always have zero deltas, so
1910 * just reread the time stamp 1969 * just reread the time stamp
1911 */ 1970 */
1912 *ts = rb_time_stamp(buffer); 1971 ts = rb_time_stamp(buffer);
1913 next_page->page->time_stamp = *ts; 1972 next_page->page->time_stamp = ts;
1914 } 1973 }
1915 1974
1916 out_again: 1975 out_again:
@@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1929 1988
1930static struct ring_buffer_event * 1989static struct ring_buffer_event *
1931__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1990__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1932 unsigned type, unsigned long length, u64 *ts) 1991 unsigned long length, u64 ts,
1992 u64 delta, int add_timestamp)
1933{ 1993{
1934 struct buffer_page *tail_page; 1994 struct buffer_page *tail_page;
1935 struct ring_buffer_event *event; 1995 struct ring_buffer_event *event;
1936 unsigned long tail, write; 1996 unsigned long tail, write;
1937 1997
1998 /*
1999 * If the time delta since the last event is too big to
2000 * hold in the time field of the event, then we append a
2001 * TIME EXTEND event ahead of the data event.
2002 */
2003 if (unlikely(add_timestamp))
2004 length += RB_LEN_TIME_EXTEND;
2005
1938 tail_page = cpu_buffer->tail_page; 2006 tail_page = cpu_buffer->tail_page;
1939 write = local_add_return(length, &tail_page->write); 2007 write = local_add_return(length, &tail_page->write);
1940 2008
@@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1943 tail = write - length; 2011 tail = write - length;
1944 2012
1945 /* See if we shot pass the end of this buffer page */ 2013 /* See if we shot pass the end of this buffer page */
1946 if (write > BUF_PAGE_SIZE) 2014 if (unlikely(write > BUF_PAGE_SIZE))
1947 return rb_move_tail(cpu_buffer, length, tail, 2015 return rb_move_tail(cpu_buffer, length, tail,
1948 tail_page, ts); 2016 tail_page, ts);
1949 2017
@@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1951 2019
1952 event = __rb_page_index(tail_page, tail); 2020 event = __rb_page_index(tail_page, tail);
1953 kmemcheck_annotate_bitfield(event, bitfield); 2021 kmemcheck_annotate_bitfield(event, bitfield);
1954 rb_update_event(event, type, length); 2022 rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
1955 2023
1956 /* The passed in type is zero for DATA */ 2024 local_inc(&tail_page->entries);
1957 if (likely(!type))
1958 local_inc(&tail_page->entries);
1959 2025
1960 /* 2026 /*
1961 * If this is the first commit on the page, then update 2027 * If this is the first commit on the page, then update
1962 * its timestamp. 2028 * its timestamp.
1963 */ 2029 */
1964 if (!tail) 2030 if (!tail)
1965 tail_page->page->time_stamp = *ts; 2031 tail_page->page->time_stamp = ts;
1966 2032
1967 return event; 2033 return event;
1968} 2034}
@@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1977 unsigned long addr; 2043 unsigned long addr;
1978 2044
1979 new_index = rb_event_index(event); 2045 new_index = rb_event_index(event);
1980 old_index = new_index + rb_event_length(event); 2046 old_index = new_index + rb_event_ts_length(event);
1981 addr = (unsigned long)event; 2047 addr = (unsigned long)event;
1982 addr &= PAGE_MASK; 2048 addr &= PAGE_MASK;
1983 2049
@@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2003 return 0; 2069 return 0;
2004} 2070}
2005 2071
2006static int
2007rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2008 u64 *ts, u64 *delta)
2009{
2010 struct ring_buffer_event *event;
2011 int ret;
2012
2013 WARN_ONCE(*delta > (1ULL << 59),
2014 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2015 (unsigned long long)*delta,
2016 (unsigned long long)*ts,
2017 (unsigned long long)cpu_buffer->write_stamp);
2018
2019 /*
2020 * The delta is too big, we to add a
2021 * new timestamp.
2022 */
2023 event = __rb_reserve_next(cpu_buffer,
2024 RINGBUF_TYPE_TIME_EXTEND,
2025 RB_LEN_TIME_EXTEND,
2026 ts);
2027 if (!event)
2028 return -EBUSY;
2029
2030 if (PTR_ERR(event) == -EAGAIN)
2031 return -EAGAIN;
2032
2033 /* Only a commited time event can update the write stamp */
2034 if (rb_event_is_commit(cpu_buffer, event)) {
2035 /*
2036 * If this is the first on the page, then it was
2037 * updated with the page itself. Try to discard it
2038 * and if we can't just make it zero.
2039 */
2040 if (rb_event_index(event)) {
2041 event->time_delta = *delta & TS_MASK;
2042 event->array[0] = *delta >> TS_SHIFT;
2043 } else {
2044 /* try to discard, since we do not need this */
2045 if (!rb_try_to_discard(cpu_buffer, event)) {
2046 /* nope, just zero it */
2047 event->time_delta = 0;
2048 event->array[0] = 0;
2049 }
2050 }
2051 cpu_buffer->write_stamp = *ts;
2052 /* let the caller know this was the commit */
2053 ret = 1;
2054 } else {
2055 /* Try to discard the event */
2056 if (!rb_try_to_discard(cpu_buffer, event)) {
2057 /* Darn, this is just wasted space */
2058 event->time_delta = 0;
2059 event->array[0] = 0;
2060 }
2061 ret = 0;
2062 }
2063
2064 *delta = 0;
2065
2066 return ret;
2067}
2068
2069static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) 2072static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2070{ 2073{
2071 local_inc(&cpu_buffer->committing); 2074 local_inc(&cpu_buffer->committing);
2072 local_inc(&cpu_buffer->commits); 2075 local_inc(&cpu_buffer->commits);
2073} 2076}
2074 2077
2075static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) 2078static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2076{ 2079{
2077 unsigned long commits; 2080 unsigned long commits;
2078 2081
@@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2110 unsigned long length) 2113 unsigned long length)
2111{ 2114{
2112 struct ring_buffer_event *event; 2115 struct ring_buffer_event *event;
2113 u64 ts, delta = 0; 2116 u64 ts, delta;
2114 int commit = 0;
2115 int nr_loops = 0; 2117 int nr_loops = 0;
2118 int add_timestamp;
2119 u64 diff;
2116 2120
2117 rb_start_commit(cpu_buffer); 2121 rb_start_commit(cpu_buffer);
2118 2122
@@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2133 2137
2134 length = rb_calculate_event_length(length); 2138 length = rb_calculate_event_length(length);
2135 again: 2139 again:
2140 add_timestamp = 0;
2141 delta = 0;
2142
2136 /* 2143 /*
2137 * We allow for interrupts to reenter here and do a trace. 2144 * We allow for interrupts to reenter here and do a trace.
2138 * If one does, it will cause this original code to loop 2145 * If one does, it will cause this original code to loop
@@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2146 goto out_fail; 2153 goto out_fail;
2147 2154
2148 ts = rb_time_stamp(cpu_buffer->buffer); 2155 ts = rb_time_stamp(cpu_buffer->buffer);
2156 diff = ts - cpu_buffer->write_stamp;
2149 2157
2150 /* 2158 /* make sure this diff is calculated here */
2151 * Only the first commit can update the timestamp. 2159 barrier();
2152 * Yes there is a race here. If an interrupt comes in
2153 * just after the conditional and it traces too, then it
2154 * will also check the deltas. More than one timestamp may
2155 * also be made. But only the entry that did the actual
2156 * commit will be something other than zero.
2157 */
2158 if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
2159 rb_page_write(cpu_buffer->tail_page) ==
2160 rb_commit_index(cpu_buffer))) {
2161 u64 diff;
2162
2163 diff = ts - cpu_buffer->write_stamp;
2164
2165 /* make sure this diff is calculated here */
2166 barrier();
2167
2168 /* Did the write stamp get updated already? */
2169 if (unlikely(ts < cpu_buffer->write_stamp))
2170 goto get_event;
2171 2160
2161 /* Did the write stamp get updated already? */
2162 if (likely(ts >= cpu_buffer->write_stamp)) {
2172 delta = diff; 2163 delta = diff;
2173 if (unlikely(test_time_stamp(delta))) { 2164 if (unlikely(test_time_stamp(delta))) {
2174 2165 WARN_ONCE(delta > (1ULL << 59),
2175 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 2166 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2176 if (commit == -EBUSY) 2167 (unsigned long long)delta,
2177 goto out_fail; 2168 (unsigned long long)ts,
2178 2169 (unsigned long long)cpu_buffer->write_stamp);
2179 if (commit == -EAGAIN) 2170 add_timestamp = 1;
2180 goto again;
2181
2182 RB_WARN_ON(cpu_buffer, commit < 0);
2183 } 2171 }
2184 } 2172 }
2185 2173
2186 get_event: 2174 event = __rb_reserve_next(cpu_buffer, length, ts,
2187 event = __rb_reserve_next(cpu_buffer, 0, length, &ts); 2175 delta, add_timestamp);
2188 if (unlikely(PTR_ERR(event) == -EAGAIN)) 2176 if (unlikely(PTR_ERR(event) == -EAGAIN))
2189 goto again; 2177 goto again;
2190 2178
2191 if (!event) 2179 if (!event)
2192 goto out_fail; 2180 goto out_fail;
2193 2181
2194 if (!rb_event_is_commit(cpu_buffer, event))
2195 delta = 0;
2196
2197 event->time_delta = delta;
2198
2199 return event; 2182 return event;
2200 2183
2201 out_fail: 2184 out_fail:
@@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2207 2190
2208#define TRACE_RECURSIVE_DEPTH 16 2191#define TRACE_RECURSIVE_DEPTH 16
2209 2192
2210static int trace_recursive_lock(void) 2193/* Keep this code out of the fast path cache */
2194static noinline void trace_recursive_fail(void)
2211{ 2195{
2212 current->trace_recursion++;
2213
2214 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2215 return 0;
2216
2217 /* Disable all tracing before we do anything else */ 2196 /* Disable all tracing before we do anything else */
2218 tracing_off_permanent(); 2197 tracing_off_permanent();
2219 2198
@@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void)
2225 in_nmi()); 2204 in_nmi());
2226 2205
2227 WARN_ON_ONCE(1); 2206 WARN_ON_ONCE(1);
2207}
2208
2209static inline int trace_recursive_lock(void)
2210{
2211 current->trace_recursion++;
2212
2213 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2214 return 0;
2215
2216 trace_recursive_fail();
2217
2228 return -1; 2218 return -1;
2229} 2219}
2230 2220
2231static void trace_recursive_unlock(void) 2221static inline void trace_recursive_unlock(void)
2232{ 2222{
2233 WARN_ON_ONCE(!current->trace_recursion); 2223 WARN_ON_ONCE(!current->trace_recursion);
2234 2224
@@ -2308,12 +2298,28 @@ static void
2308rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, 2298rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2309 struct ring_buffer_event *event) 2299 struct ring_buffer_event *event)
2310{ 2300{
2301 u64 delta;
2302
2311 /* 2303 /*
2312 * The event first in the commit queue updates the 2304 * The event first in the commit queue updates the
2313 * time stamp. 2305 * time stamp.
2314 */ 2306 */
2315 if (rb_event_is_commit(cpu_buffer, event)) 2307 if (rb_event_is_commit(cpu_buffer, event)) {
2316 cpu_buffer->write_stamp += event->time_delta; 2308 /*
2309 * A commit event that is first on a page
2310 * updates the write timestamp with the page stamp
2311 */
2312 if (!rb_event_index(event))
2313 cpu_buffer->write_stamp =
2314 cpu_buffer->commit_page->page->time_stamp;
2315 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
2316 delta = event->array[0];
2317 delta <<= TS_SHIFT;
2318 delta += event->time_delta;
2319 cpu_buffer->write_stamp += delta;
2320 } else
2321 cpu_buffer->write_stamp += event->time_delta;
2322 }
2317} 2323}
2318 2324
2319static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2325static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2353,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
2353 2359
2354static inline void rb_event_discard(struct ring_buffer_event *event) 2360static inline void rb_event_discard(struct ring_buffer_event *event)
2355{ 2361{
2362 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
2363 event = skip_time_extend(event);
2364
2356 /* array[0] holds the actual length for the discarded event */ 2365 /* array[0] holds the actual length for the discarded event */
2357 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; 2366 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
2358 event->type_len = RINGBUF_TYPE_PADDING; 2367 event->type_len = RINGBUF_TYPE_PADDING;
@@ -2606,6 +2615,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2606} 2615}
2607EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); 2616EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2608 2617
2618/*
2619 * The total entries in the ring buffer is the running counter
2620 * of entries entered into the ring buffer, minus the sum of
2621 * the entries read from the ring buffer and the number of
2622 * entries that were overwritten.
2623 */
2624static inline unsigned long
2625rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2626{
2627 return local_read(&cpu_buffer->entries) -
2628 (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
2629}
2630
2609/** 2631/**
2610 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 2632 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2611 * @buffer: The ring buffer 2633 * @buffer: The ring buffer
@@ -2614,16 +2636,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2614unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) 2636unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
2615{ 2637{
2616 struct ring_buffer_per_cpu *cpu_buffer; 2638 struct ring_buffer_per_cpu *cpu_buffer;
2617 unsigned long ret;
2618 2639
2619 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2640 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2620 return 0; 2641 return 0;
2621 2642
2622 cpu_buffer = buffer->buffers[cpu]; 2643 cpu_buffer = buffer->buffers[cpu];
2623 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
2624 - cpu_buffer->read;
2625 2644
2626 return ret; 2645 return rb_num_of_entries(cpu_buffer);
2627} 2646}
2628EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 2647EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
2629 2648
@@ -2684,8 +2703,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2684 /* if you care about this being correct, lock the buffer */ 2703 /* if you care about this being correct, lock the buffer */
2685 for_each_buffer_cpu(buffer, cpu) { 2704 for_each_buffer_cpu(buffer, cpu) {
2686 cpu_buffer = buffer->buffers[cpu]; 2705 cpu_buffer = buffer->buffers[cpu];
2687 entries += (local_read(&cpu_buffer->entries) - 2706 entries += rb_num_of_entries(cpu_buffer);
2688 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2689 } 2707 }
2690 2708
2691 return entries; 2709 return entries;
@@ -2985,13 +3003,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
2985 3003
2986static void rb_advance_iter(struct ring_buffer_iter *iter) 3004static void rb_advance_iter(struct ring_buffer_iter *iter)
2987{ 3005{
2988 struct ring_buffer *buffer;
2989 struct ring_buffer_per_cpu *cpu_buffer; 3006 struct ring_buffer_per_cpu *cpu_buffer;
2990 struct ring_buffer_event *event; 3007 struct ring_buffer_event *event;
2991 unsigned length; 3008 unsigned length;
2992 3009
2993 cpu_buffer = iter->cpu_buffer; 3010 cpu_buffer = iter->cpu_buffer;
2994 buffer = cpu_buffer->buffer;
2995 3011
2996 /* 3012 /*
2997 * Check if we are at the end of the buffer. 3013 * Check if we are at the end of the buffer.
@@ -3042,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3042 3058
3043 again: 3059 again:
3044 /* 3060 /*
3045 * We repeat when a timestamp is encountered. It is possible 3061 * We repeat when a time extend is encountered.
3046 * to get multiple timestamps from an interrupt entering just 3062 * Since the time extend is always attached to a data event,
3047 * as one timestamp is about to be written, or from discarded 3063 * we should never loop more than once.
3048 * commits. The most that we can have is the number on a single page. 3064 * (We never hit the following condition more than twice).
3049 */ 3065 */
3050 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3066 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3051 return NULL; 3067 return NULL;
3052 3068
3053 reader = rb_get_reader_page(cpu_buffer); 3069 reader = rb_get_reader_page(cpu_buffer);
@@ -3123,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3123 return NULL; 3139 return NULL;
3124 3140
3125 /* 3141 /*
3126 * We repeat when a timestamp is encountered. 3142 * We repeat when a time extend is encountered.
3127 * We can get multiple timestamps by nested interrupts or also 3143 * Since the time extend is always attached to a data event,
3128 * if filtering is on (discarding commits). Since discarding 3144 * we should never loop more than once.
3129 * commits can be frequent we can get a lot of timestamps. 3145 * (We never hit the following condition more than twice).
3130 * But we limit them by not adding timestamps if they begin
3131 * at the start of a page.
3132 */ 3146 */
3133 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3147 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3134 return NULL; 3148 return NULL;
3135 3149
3136 if (rb_per_cpu_empty(cpu_buffer)) 3150 if (rb_per_cpu_empty(cpu_buffer))
@@ -3828,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3828 if (len > (commit - read)) 3842 if (len > (commit - read))
3829 len = (commit - read); 3843 len = (commit - read);
3830 3844
3831 size = rb_event_length(event); 3845 /* Always keep the time extend and data together */
3846 size = rb_event_ts_length(event);
3832 3847
3833 if (len < size) 3848 if (len < size)
3834 goto out_unlock; 3849 goto out_unlock;
@@ -3846,8 +3861,12 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3846 rpos = reader->read; 3861 rpos = reader->read;
3847 pos += size; 3862 pos += size;
3848 3863
3864 if (rpos >= commit)
3865 break;
3866
3849 event = rb_reader_event(cpu_buffer); 3867 event = rb_reader_event(cpu_buffer);
3850 size = rb_event_length(event); 3868 /* Always keep the time extend and data together */
3869 size = rb_event_ts_length(event);
3851 } while (len > size); 3870 } while (len > size);
3852 3871
3853 /* update bpage */ 3872 /* update bpage */
@@ -3964,6 +3983,7 @@ static const struct file_operations rb_simple_fops = {
3964 .open = tracing_open_generic, 3983 .open = tracing_open_generic,
3965 .read = rb_simple_read, 3984 .read = rb_simple_read,
3966 .write = rb_simple_write, 3985 .write = rb_simple_write,
3986 .llseek = default_llseek,
3967}; 3987};
3968 3988
3969 3989
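
The ring_buffer.c hunks above move oversized-delta handling into explicit RINGBUF_TYPE_TIME_EXTEND records: when the computed delta no longer fits in the event header, its upper bits go into array[0] and rb_update_write_stamp() reassembles them. Below is a minimal userspace sketch of that split-and-join, assuming TS_SHIFT is 27 as defined in ring_buffer.c; the struct and helper names are illustrative, not kernel API.

/*
 * Minimal userspace model of the delta split used by time-extend records:
 * the low TS_SHIFT bits travel in the event header, the upper bits in
 * array[0], and rb_update_write_stamp() reassembles them.
 * TS_SHIFT/TS_MASK mirror ring_buffer.c; the struct below is illustrative.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TS_SHIFT 27
#define TS_MASK  ((1ULL << TS_SHIFT) - 1)

struct sketch_event {
	uint32_t time_delta;	/* low TS_SHIFT bits of the delta */
	uint32_t array0;	/* upper bits, as a time extend stores them */
};

static void split_delta(uint64_t delta, struct sketch_event *e)
{
	e->time_delta = delta & TS_MASK;
	e->array0 = delta >> TS_SHIFT;
}

static uint64_t join_delta(const struct sketch_event *e)
{
	/* same reconstruction rb_update_write_stamp() does for TIME_EXTEND */
	uint64_t delta = e->array0;

	delta <<= TS_SHIFT;
	delta += e->time_delta;
	return delta;
}

int main(void)
{
	struct sketch_event e;
	uint64_t delta = (1ULL << 40) + 12345;	/* too big for 27 bits */

	split_delta(delta, &e);
	assert(join_delta(&e) == delta);
	printf("delta %llu -> low %u, high %u\n", (unsigned long long)delta,
	       (unsigned)e.time_delta, (unsigned)e.array0);
	return 0;
}
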
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ba14a22be4cc..82d9b8106cd0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2196,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2196 2196
2197static int tracing_release(struct inode *inode, struct file *file) 2197static int tracing_release(struct inode *inode, struct file *file)
2198{ 2198{
2199 struct seq_file *m = (struct seq_file *)file->private_data; 2199 struct seq_file *m = file->private_data;
2200 struct trace_iterator *iter; 2200 struct trace_iterator *iter;
2201 int cpu; 2201 int cpu;
2202 2202
@@ -3463,6 +3463,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3463 size_t cnt, loff_t *fpos) 3463 size_t cnt, loff_t *fpos)
3464{ 3464{
3465 char *buf; 3465 char *buf;
3466 size_t written;
3466 3467
3467 if (tracing_disabled) 3468 if (tracing_disabled)
3468 return -EINVAL; 3469 return -EINVAL;
@@ -3484,11 +3485,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3484 } else 3485 } else
3485 buf[cnt] = '\0'; 3486 buf[cnt] = '\0';
3486 3487
3487 cnt = mark_printk("%s", buf); 3488 written = mark_printk("%s", buf);
3488 kfree(buf); 3489 kfree(buf);
3489 *fpos += cnt; 3490 *fpos += written;
3490 3491
3491 return cnt; 3492 /* don't tell userspace we wrote more - it might confuse them */
3493 if (written > cnt)
3494 written = cnt;
3495
3496 return written;
3492} 3497}
3493 3498
3494static int tracing_clock_show(struct seq_file *m, void *v) 3499static int tracing_clock_show(struct seq_file *m, void *v)
@@ -3991,13 +3996,9 @@ static void tracing_init_debugfs_percpu(long cpu)
3991{ 3996{
3992 struct dentry *d_percpu = tracing_dentry_percpu(); 3997 struct dentry *d_percpu = tracing_dentry_percpu();
3993 struct dentry *d_cpu; 3998 struct dentry *d_cpu;
3994 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ 3999 char cpu_dir[30]; /* 30 characters should be more than enough */
3995 char cpu_dir[7];
3996
3997 if (cpu > 999 || cpu < 0)
3998 return;
3999 4000
4000 sprintf(cpu_dir, "cpu%ld", cpu); 4001 snprintf(cpu_dir, 30, "cpu%ld", cpu);
4001 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 4002 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
4002 if (!d_cpu) { 4003 if (!d_cpu) {
4003 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); 4004 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
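
The tracing_mark_write() change above clamps the reported byte count to what the caller asked to write. A tiny standalone sketch of that pattern, with a made-up backend standing in for mark_printk(); this is illustrative, not the kernel function.

/*
 * Sketch of the clamp tracing_mark_write() now applies: a write handler
 * must not report more bytes consumed than the caller passed in, even if
 * the backend emitted more output.
 */
#include <stdio.h>
#include <string.h>

/* pretend backend that expands its input, e.g. by adding a prefix */
static size_t backend_write(const char *buf)
{
	return (size_t)printf("marker: %s\n", buf);
}

static long sketch_mark_write(const char *buf, size_t cnt)
{
	size_t written = backend_write(buf);

	/* don't tell the caller we wrote more than it asked for */
	if (written > cnt)
		written = cnt;
	return (long)written;
}

int main(void)
{
	const char *msg = "hello";

	printf("reported %ld of %zu bytes\n",
	       sketch_mark_write(msg, strlen(msg)), strlen(msg));
	return 0;
}
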
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d39b3c5454a5..9021f8c0c0c3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr,
343 unsigned long ip, 343 unsigned long ip,
344 unsigned long parent_ip, 344 unsigned long parent_ip,
345 unsigned long flags, int pc); 345 unsigned long flags, int pc);
346void trace_graph_function(struct trace_array *tr,
347 unsigned long ip,
348 unsigned long parent_ip,
349 unsigned long flags, int pc);
346void trace_default_header(struct seq_file *m); 350void trace_default_header(struct seq_file *m);
347void print_trace_header(struct seq_file *m, struct trace_iterator *iter); 351void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
348int trace_empty(struct trace_iterator *iter); 352int trace_empty(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 000e6e85b445..39c059ca670e 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,7 +9,7 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12static char *perf_trace_buf[4]; 12static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
13 13
14/* 14/*
15 * Force it to be aligned to unsigned long to avoid misaligned accesses 15 * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -24,7 +24,7 @@ static int total_ref_count;
24static int perf_trace_event_init(struct ftrace_event_call *tp_event, 24static int perf_trace_event_init(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 25 struct perf_event *p_event)
26{ 26{
27 struct hlist_head *list; 27 struct hlist_head __percpu *list;
28 int ret = -ENOMEM; 28 int ret = -ENOMEM;
29 int cpu; 29 int cpu;
30 30
@@ -42,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
42 tp_event->perf_events = list; 42 tp_event->perf_events = list;
43 43
44 if (!total_ref_count) { 44 if (!total_ref_count) {
45 char *buf; 45 char __percpu *buf;
46 int i; 46 int i;
47 47
48 for (i = 0; i < 4; i++) { 48 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
49 buf = (char *)alloc_percpu(perf_trace_t); 49 buf = (char __percpu *)alloc_percpu(perf_trace_t);
50 if (!buf) 50 if (!buf)
51 goto fail; 51 goto fail;
52 52
@@ -65,7 +65,7 @@ fail:
65 if (!total_ref_count) { 65 if (!total_ref_count) {
66 int i; 66 int i;
67 67
68 for (i = 0; i < 4; i++) { 68 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
69 free_percpu(perf_trace_buf[i]); 69 free_percpu(perf_trace_buf[i]);
70 perf_trace_buf[i] = NULL; 70 perf_trace_buf[i] = NULL;
71 } 71 }
@@ -91,6 +91,8 @@ int perf_trace_init(struct perf_event *p_event)
91 tp_event->class && tp_event->class->reg && 91 tp_event->class && tp_event->class->reg &&
92 try_module_get(tp_event->mod)) { 92 try_module_get(tp_event->mod)) {
93 ret = perf_trace_event_init(tp_event, p_event); 93 ret = perf_trace_event_init(tp_event, p_event);
94 if (ret)
95 module_put(tp_event->mod);
94 break; 96 break;
95 } 97 }
96 } 98 }
@@ -99,22 +101,26 @@ int perf_trace_init(struct perf_event *p_event)
99 return ret; 101 return ret;
100} 102}
101 103
102int perf_trace_enable(struct perf_event *p_event) 104int perf_trace_add(struct perf_event *p_event, int flags)
103{ 105{
104 struct ftrace_event_call *tp_event = p_event->tp_event; 106 struct ftrace_event_call *tp_event = p_event->tp_event;
107 struct hlist_head __percpu *pcpu_list;
105 struct hlist_head *list; 108 struct hlist_head *list;
106 109
107 list = tp_event->perf_events; 110 pcpu_list = tp_event->perf_events;
108 if (WARN_ON_ONCE(!list)) 111 if (WARN_ON_ONCE(!pcpu_list))
109 return -EINVAL; 112 return -EINVAL;
110 113
111 list = this_cpu_ptr(list); 114 if (!(flags & PERF_EF_START))
115 p_event->hw.state = PERF_HES_STOPPED;
116
117 list = this_cpu_ptr(pcpu_list);
112 hlist_add_head_rcu(&p_event->hlist_entry, list); 118 hlist_add_head_rcu(&p_event->hlist_entry, list);
113 119
114 return 0; 120 return 0;
115} 121}
116 122
117void perf_trace_disable(struct perf_event *p_event) 123void perf_trace_del(struct perf_event *p_event, int flags)
118{ 124{
119 hlist_del_rcu(&p_event->hlist_entry); 125 hlist_del_rcu(&p_event->hlist_entry);
120} 126}
@@ -140,12 +146,13 @@ void perf_trace_destroy(struct perf_event *p_event)
140 tp_event->perf_events = NULL; 146 tp_event->perf_events = NULL;
141 147
142 if (!--total_ref_count) { 148 if (!--total_ref_count) {
143 for (i = 0; i < 4; i++) { 149 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
144 free_percpu(perf_trace_buf[i]); 150 free_percpu(perf_trace_buf[i]);
145 perf_trace_buf[i] = NULL; 151 perf_trace_buf[i] = NULL;
146 } 152 }
147 } 153 }
148out: 154out:
155 module_put(tp_event->mod);
149 mutex_unlock(&event_mutex); 156 mutex_unlock(&event_mutex);
150} 157}
151 158
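
The perf_trace_init()/perf_trace_destroy() hunks above balance the reference taken with try_module_get(): it is now dropped on the init failure path and again at destroy time. A small userspace model of that rule, using a plain counter in place of the module refcount; all names here are illustrative.

/*
 * Reference-balance sketch: a ref taken before init must be dropped if
 * init fails, and the destroy path drops the ref held since init.
 */
#include <stdio.h>

static int module_refs;

static int fake_try_get(void)	{ module_refs++; return 1; }
static void fake_put(void)	{ module_refs--; }

static int event_init(int should_fail)
{
	return should_fail ? -1 : 0;
}

static int perf_init_sketch(int should_fail)
{
	int ret;

	if (!fake_try_get())
		return -1;

	ret = event_init(should_fail);
	if (ret)
		fake_put();	/* the fix: drop the ref when init fails */
	return ret;
}

static void perf_destroy_sketch(void)
{
	/* destroy now drops the reference taken at init time */
	fake_put();
}

int main(void)
{
	perf_init_sketch(1);			/* failing init */
	if (perf_init_sketch(0) == 0)		/* successful init */
		perf_destroy_sketch();
	printf("leaked refs: %d\n", module_refs);	/* expect 0 */
	return 0;
}
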
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 09b4fa6e4d3b..0725eeab1937 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -598,88 +598,146 @@ out:
598 return ret; 598 return ret;
599} 599}
600 600
601static void print_event_fields(struct trace_seq *s, struct list_head *head) 601enum {
602 FORMAT_HEADER = 1,
603 FORMAT_FIELD_SEPERATOR = 2,
604 FORMAT_PRINTFMT = 3,
605};
606
607static void *f_next(struct seq_file *m, void *v, loff_t *pos)
602{ 608{
609 struct ftrace_event_call *call = m->private;
603 struct ftrace_event_field *field; 610 struct ftrace_event_field *field;
611 struct list_head *common_head = &ftrace_common_fields;
612 struct list_head *head = trace_get_fields(call);
604 613
605 list_for_each_entry_reverse(field, head, link) { 614 (*pos)++;
606 /*
607 * Smartly shows the array type(except dynamic array).
608 * Normal:
609 * field:TYPE VAR
610 * If TYPE := TYPE[LEN], it is shown:
611 * field:TYPE VAR[LEN]
612 */
613 const char *array_descriptor = strchr(field->type, '[');
614 615
615 if (!strncmp(field->type, "__data_loc", 10)) 616 switch ((unsigned long)v) {
616 array_descriptor = NULL; 617 case FORMAT_HEADER:
618 if (unlikely(list_empty(common_head)))
619 return NULL;
617 620
618 if (!array_descriptor) { 621 field = list_entry(common_head->prev,
619 trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" 622 struct ftrace_event_field, link);
620 "\tsize:%u;\tsigned:%d;\n", 623 return field;
621 field->type, field->name, field->offset, 624
622 field->size, !!field->is_signed); 625 case FORMAT_FIELD_SEPERATOR:
623 } else { 626 if (unlikely(list_empty(head)))
624 trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" 627 return NULL;
625 "\tsize:%u;\tsigned:%d;\n", 628
626 (int)(array_descriptor - field->type), 629 field = list_entry(head->prev, struct ftrace_event_field, link);
627 field->type, field->name, 630 return field;
628 array_descriptor, field->offset, 631
629 field->size, !!field->is_signed); 632 case FORMAT_PRINTFMT:
630 } 633 /* all done */
634 return NULL;
631 } 635 }
636
637 field = v;
638 if (field->link.prev == common_head)
639 return (void *)FORMAT_FIELD_SEPERATOR;
640 else if (field->link.prev == head)
641 return (void *)FORMAT_PRINTFMT;
642
643 field = list_entry(field->link.prev, struct ftrace_event_field, link);
644
645 return field;
632} 646}
633 647
634static ssize_t 648static void *f_start(struct seq_file *m, loff_t *pos)
635event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
636 loff_t *ppos)
637{ 649{
638 struct ftrace_event_call *call = filp->private_data; 650 loff_t l = 0;
639 struct list_head *head; 651 void *p;
640 struct trace_seq *s;
641 char *buf;
642 int r;
643 652
644 if (*ppos) 653 /* Start by showing the header */
654 if (!*pos)
655 return (void *)FORMAT_HEADER;
656
657 p = (void *)FORMAT_HEADER;
658 do {
659 p = f_next(m, p, &l);
660 } while (p && l < *pos);
661
662 return p;
663}
664
665static int f_show(struct seq_file *m, void *v)
666{
667 struct ftrace_event_call *call = m->private;
668 struct ftrace_event_field *field;
669 const char *array_descriptor;
670
671 switch ((unsigned long)v) {
672 case FORMAT_HEADER:
673 seq_printf(m, "name: %s\n", call->name);
674 seq_printf(m, "ID: %d\n", call->event.type);
675 seq_printf(m, "format:\n");
645 return 0; 676 return 0;
646 677
647 s = kmalloc(sizeof(*s), GFP_KERNEL); 678 case FORMAT_FIELD_SEPERATOR:
648 if (!s) 679 seq_putc(m, '\n');
649 return -ENOMEM; 680 return 0;
650 681
651 trace_seq_init(s); 682 case FORMAT_PRINTFMT:
683 seq_printf(m, "\nprint fmt: %s\n",
684 call->print_fmt);
685 return 0;
686 }
652 687
653 trace_seq_printf(s, "name: %s\n", call->name); 688 field = v;
654 trace_seq_printf(s, "ID: %d\n", call->event.type);
655 trace_seq_printf(s, "format:\n");
656 689
657 /* print common fields */ 690 /*
658 print_event_fields(s, &ftrace_common_fields); 691 * Smartly shows the array type(except dynamic array).
692 * Normal:
693 * field:TYPE VAR
694 * If TYPE := TYPE[LEN], it is shown:
695 * field:TYPE VAR[LEN]
696 */
697 array_descriptor = strchr(field->type, '[');
659 698
660 trace_seq_putc(s, '\n'); 699 if (!strncmp(field->type, "__data_loc", 10))
700 array_descriptor = NULL;
661 701
662 /* print event specific fields */ 702 if (!array_descriptor)
663 head = trace_get_fields(call); 703 seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
664 print_event_fields(s, head); 704 field->type, field->name, field->offset,
705 field->size, !!field->is_signed);
706 else
707 seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
708 (int)(array_descriptor - field->type),
709 field->type, field->name,
710 array_descriptor, field->offset,
711 field->size, !!field->is_signed);
665 712
666 r = trace_seq_printf(s, "\nprint fmt: %s\n", call->print_fmt); 713 return 0;
714}
667 715
668 if (!r) { 716static void f_stop(struct seq_file *m, void *p)
669 /* 717{
670 * ug! The format output is bigger than a PAGE!! 718}
671 */
672 buf = "FORMAT TOO BIG\n";
673 r = simple_read_from_buffer(ubuf, cnt, ppos,
674 buf, strlen(buf));
675 goto out;
676 }
677 719
678 r = simple_read_from_buffer(ubuf, cnt, ppos, 720static const struct seq_operations trace_format_seq_ops = {
679 s->buffer, s->len); 721 .start = f_start,
680 out: 722 .next = f_next,
681 kfree(s); 723 .stop = f_stop,
682 return r; 724 .show = f_show,
725};
726
727static int trace_format_open(struct inode *inode, struct file *file)
728{
729 struct ftrace_event_call *call = inode->i_private;
730 struct seq_file *m;
731 int ret;
732
733 ret = seq_open(file, &trace_format_seq_ops);
734 if (ret < 0)
735 return ret;
736
737 m = file->private_data;
738 m->private = call;
739
740 return 0;
683} 741}
684 742
685static ssize_t 743static ssize_t
@@ -874,39 +932,47 @@ static const struct file_operations ftrace_enable_fops = {
874 .open = tracing_open_generic, 932 .open = tracing_open_generic,
875 .read = event_enable_read, 933 .read = event_enable_read,
876 .write = event_enable_write, 934 .write = event_enable_write,
935 .llseek = default_llseek,
877}; 936};
878 937
879static const struct file_operations ftrace_event_format_fops = { 938static const struct file_operations ftrace_event_format_fops = {
880 .open = tracing_open_generic, 939 .open = trace_format_open,
881 .read = event_format_read, 940 .read = seq_read,
941 .llseek = seq_lseek,
942 .release = seq_release,
882}; 943};
883 944
884static const struct file_operations ftrace_event_id_fops = { 945static const struct file_operations ftrace_event_id_fops = {
885 .open = tracing_open_generic, 946 .open = tracing_open_generic,
886 .read = event_id_read, 947 .read = event_id_read,
948 .llseek = default_llseek,
887}; 949};
888 950
889static const struct file_operations ftrace_event_filter_fops = { 951static const struct file_operations ftrace_event_filter_fops = {
890 .open = tracing_open_generic, 952 .open = tracing_open_generic,
891 .read = event_filter_read, 953 .read = event_filter_read,
892 .write = event_filter_write, 954 .write = event_filter_write,
955 .llseek = default_llseek,
893}; 956};
894 957
895static const struct file_operations ftrace_subsystem_filter_fops = { 958static const struct file_operations ftrace_subsystem_filter_fops = {
896 .open = tracing_open_generic, 959 .open = tracing_open_generic,
897 .read = subsystem_filter_read, 960 .read = subsystem_filter_read,
898 .write = subsystem_filter_write, 961 .write = subsystem_filter_write,
962 .llseek = default_llseek,
899}; 963};
900 964
901static const struct file_operations ftrace_system_enable_fops = { 965static const struct file_operations ftrace_system_enable_fops = {
902 .open = tracing_open_generic, 966 .open = tracing_open_generic,
903 .read = system_enable_read, 967 .read = system_enable_read,
904 .write = system_enable_write, 968 .write = system_enable_write,
969 .llseek = default_llseek,
905}; 970};
906 971
907static const struct file_operations ftrace_show_header_fops = { 972static const struct file_operations ftrace_show_header_fops = {
908 .open = tracing_open_generic, 973 .open = tracing_open_generic,
909 .read = show_header, 974 .read = show_header,
975 .llseek = default_llseek,
910}; 976};
911 977
912static struct dentry *event_trace_events_dir(void) 978static struct dentry *event_trace_events_dir(void)
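
The "format" file above switches from a single trace_seq dump (which could overflow a page and print "FORMAT TOO BIG") to a seq_file iterator driven by synthetic cursor values for the header, the separator between common and event fields, and the trailing print fmt. A userspace sketch of that state machine, with a fixed array standing in for the field lists; this is a model, not kernel code.

/*
 * seq_file-style cursor sketch: special positions for header, separator
 * and print fmt, with the real fields walked in between.
 */
#include <stdio.h>

enum { HEADER = 1, SEPARATOR = 2, PRINTFMT = 3, FIELDS_BASE = 4 };

static const char *common_fields[] = { "type", "flags", "pid" };
static const char *event_fields[]  = { "prev_comm", "next_pid" };
#define N_COMMON (sizeof(common_fields) / sizeof(common_fields[0]))
#define N_EVENT  (sizeof(event_fields) / sizeof(event_fields[0]))

static long f_next(long pos)
{
	if (pos == HEADER)
		return FIELDS_BASE;			/* first common field */
	if (pos == SEPARATOR)
		return FIELDS_BASE + N_COMMON;		/* first event field */
	if (pos == PRINTFMT)
		return 0;				/* all done */
	if (pos == FIELDS_BASE + N_COMMON - 1)
		return SEPARATOR;			/* common list exhausted */
	if (pos == FIELDS_BASE + N_COMMON + N_EVENT - 1)
		return PRINTFMT;			/* event list exhausted */
	return pos + 1;
}

static void f_show(long pos)
{
	if (pos == HEADER)
		printf("name: sketch_event\nformat:\n");
	else if (pos == SEPARATOR)
		printf("\n");
	else if (pos == PRINTFMT)
		printf("\nprint fmt: \"...\"\n");
	else if (pos < (long)(FIELDS_BASE + N_COMMON))
		printf("\tfield:%s;\n", common_fields[pos - FIELDS_BASE]);
	else
		printf("\tfield:%s;\n",
		       event_fields[pos - FIELDS_BASE - N_COMMON]);
}

int main(void)
{
	for (long pos = HEADER; pos; pos = f_next(pos))
		f_show(pos);
	return 0;
}
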
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 6bff23625781..76b05980225c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,15 +15,19 @@
15#include "trace.h" 15#include "trace.h"
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18/* When set, irq functions will be ignored */
19static int ftrace_graph_skip_irqs;
20
18struct fgraph_cpu_data { 21struct fgraph_cpu_data {
19 pid_t last_pid; 22 pid_t last_pid;
20 int depth; 23 int depth;
24 int depth_irq;
21 int ignore; 25 int ignore;
22 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; 26 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
23}; 27};
24 28
25struct fgraph_data { 29struct fgraph_data {
26 struct fgraph_cpu_data *cpu_data; 30 struct fgraph_cpu_data __percpu *cpu_data;
27 31
28 /* Place to preserve last processed entry. */ 32 /* Place to preserve last processed entry. */
29 struct ftrace_graph_ent_entry ent; 33 struct ftrace_graph_ent_entry ent;
@@ -41,6 +45,7 @@ struct fgraph_data {
41#define TRACE_GRAPH_PRINT_PROC 0x8 45#define TRACE_GRAPH_PRINT_PROC 0x8
42#define TRACE_GRAPH_PRINT_DURATION 0x10 46#define TRACE_GRAPH_PRINT_DURATION 0x10
43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40
44 49
45static struct tracer_opt trace_opts[] = { 50static struct tracer_opt trace_opts[] = {
46 /* Display overruns? (for self-debug purpose) */ 51 /* Display overruns? (for self-debug purpose) */
@@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = {
55 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, 60 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
56 /* Display absolute time of an entry */ 61 /* Display absolute time of an entry */
57 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, 62 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
63 /* Display interrupts */
64 { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
58 { } /* Empty entry */ 65 { } /* Empty entry */
59}; 66};
60 67
61static struct tracer_flags tracer_flags = { 68static struct tracer_flags tracer_flags = {
62 /* Don't display overruns and proc by default */ 69 /* Don't display overruns and proc by default */
63 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | 70 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
64 TRACE_GRAPH_PRINT_DURATION, 71 TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
65 .opts = trace_opts 72 .opts = trace_opts
66}; 73};
67 74
@@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr,
204 return 1; 211 return 1;
205} 212}
206 213
214static inline int ftrace_graph_ignore_irqs(void)
215{
216 if (!ftrace_graph_skip_irqs)
217 return 0;
218
219 return in_irq();
220}
221
207int trace_graph_entry(struct ftrace_graph_ent *trace) 222int trace_graph_entry(struct ftrace_graph_ent *trace)
208{ 223{
209 struct trace_array *tr = graph_array; 224 struct trace_array *tr = graph_array;
@@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
218 return 0; 233 return 0;
219 234
220 /* trace it when it is-nested-in or is a function enabled. */ 235 /* trace it when it is-nested-in or is a function enabled. */
221 if (!(trace->depth || ftrace_graph_addr(trace->func))) 236 if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
237 ftrace_graph_ignore_irqs())
222 return 0; 238 return 0;
223 239
224 local_irq_save(flags); 240 local_irq_save(flags);
@@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
246 return trace_graph_entry(trace); 262 return trace_graph_entry(trace);
247} 263}
248 264
265static void
266__trace_graph_function(struct trace_array *tr,
267 unsigned long ip, unsigned long flags, int pc)
268{
269 u64 time = trace_clock_local();
270 struct ftrace_graph_ent ent = {
271 .func = ip,
272 .depth = 0,
273 };
274 struct ftrace_graph_ret ret = {
275 .func = ip,
276 .depth = 0,
277 .calltime = time,
278 .rettime = time,
279 };
280
281 __trace_graph_entry(tr, &ent, flags, pc);
282 __trace_graph_return(tr, &ret, flags, pc);
283}
284
285void
286trace_graph_function(struct trace_array *tr,
287 unsigned long ip, unsigned long parent_ip,
288 unsigned long flags, int pc)
289{
290 __trace_graph_function(tr, ip, flags, pc);
291}
292
249void __trace_graph_return(struct trace_array *tr, 293void __trace_graph_return(struct trace_array *tr,
250 struct ftrace_graph_ret *trace, 294 struct ftrace_graph_ret *trace,
251 unsigned long flags, 295 unsigned long flags,
@@ -507,7 +551,15 @@ get_return_for_leaf(struct trace_iterator *iter,
507 * if the output fails. 551 * if the output fails.
508 */ 552 */
509 data->ent = *curr; 553 data->ent = *curr;
510 data->ret = *next; 554 /*
555 * If the next event is not a return type, then
556 * we only care about what type it is. Otherwise we can
557 * safely copy the entire event.
558 */
559 if (next->ent.type == TRACE_GRAPH_RET)
560 data->ret = *next;
561 else
562 data->ret.ent.type = next->ent.type;
511 } 563 }
512 } 564 }
513 565
@@ -641,8 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
641 693
642 /* Print nsecs (we don't want to exceed 7 numbers) */ 694 /* Print nsecs (we don't want to exceed 7 numbers) */
643 if (len < 7) { 695 if (len < 7) {
644 snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", 696 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
645 nsecs_rem); 697
698 snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
646 ret = trace_seq_printf(s, ".%s", nsecs_str); 699 ret = trace_seq_printf(s, ".%s", nsecs_str);
647 if (!ret) 700 if (!ret)
648 return TRACE_TYPE_PARTIAL_LINE; 701 return TRACE_TYPE_PARTIAL_LINE;
@@ -847,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
847 return 0; 900 return 0;
848} 901}
849 902
903/*
904 * Entry check for irq code
905 *
906 * returns 1 if
907 * - we are inside irq code
908 * - we just entered irq code
909 *
910 * returns 0 if
911 * - funcgraph-interrupts option is set
912 * - we are not inside irq code
913 */
914static int
915check_irq_entry(struct trace_iterator *iter, u32 flags,
916 unsigned long addr, int depth)
917{
918 int cpu = iter->cpu;
919 int *depth_irq;
920 struct fgraph_data *data = iter->private;
921
922 /*
923 * If we are either displaying irqs, or we got called as
924 * a graph event and private data does not exist,
925 * then we bypass the irq check.
926 */
927 if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
928 (!data))
929 return 0;
930
931 depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
932
933 /*
934 * We are inside the irq code
935 */
936 if (*depth_irq >= 0)
937 return 1;
938
939 if ((addr < (unsigned long)__irqentry_text_start) ||
940 (addr >= (unsigned long)__irqentry_text_end))
941 return 0;
942
943 /*
944 * We are entering irq code.
945 */
946 *depth_irq = depth;
947 return 1;
948}
949
950/*
951 * Return check for irq code
952 *
953 * returns 1 if
954 * - we are inside irq code
955 * - we just left irq code
956 *
957 * returns 0 if
958 * - funcgraph-interrupts option is set
959 * - we are not inside irq code
960 */
961static int
962check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
963{
964 int cpu = iter->cpu;
965 int *depth_irq;
966 struct fgraph_data *data = iter->private;
967
968 /*
969 * If we are either displaying irqs, or we got called as
970 * a graph event and private data does not exist,
971 * then we bypass the irq check.
972 */
973 if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
974 (!data))
975 return 0;
976
977 depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
978
979 /*
980 * We are not inside the irq code.
981 */
982 if (*depth_irq == -1)
983 return 0;
984
985 /*
986 * We are inside the irq code, and this is returning entry.
987 * Let's not trace it and clear the entry depth, since
988 * we are out of irq code.
989 *
990 * This condition ensures that we 'leave the irq code' once
991 * we are out of the entry depth. Thus protecting us from
992 * the RETURN entry loss.
993 */
994 if (*depth_irq >= depth) {
995 *depth_irq = -1;
996 return 1;
997 }
998
999 /*
1000 * We are inside the irq code, and this is not the entry.
1001 */
1002 return 1;
1003}
1004
850static enum print_line_t 1005static enum print_line_t
851print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 1006print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
852 struct trace_iterator *iter, u32 flags) 1007 struct trace_iterator *iter, u32 flags)
@@ -857,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
857 static enum print_line_t ret; 1012 static enum print_line_t ret;
858 int cpu = iter->cpu; 1013 int cpu = iter->cpu;
859 1014
1015 if (check_irq_entry(iter, flags, call->func, call->depth))
1016 return TRACE_TYPE_HANDLED;
1017
860 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) 1018 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
861 return TRACE_TYPE_PARTIAL_LINE; 1019 return TRACE_TYPE_PARTIAL_LINE;
862 1020
@@ -894,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
894 int ret; 1052 int ret;
895 int i; 1053 int i;
896 1054
1055 if (check_irq_return(iter, flags, trace->depth))
1056 return TRACE_TYPE_HANDLED;
1057
897 if (data) { 1058 if (data) {
898 struct fgraph_cpu_data *cpu_data; 1059 struct fgraph_cpu_data *cpu_data;
899 int cpu = iter->cpu; 1060 int cpu = iter->cpu;
@@ -1046,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1046 1207
1047 1208
1048enum print_line_t 1209enum print_line_t
1049print_graph_function_flags(struct trace_iterator *iter, u32 flags) 1210__print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1050{ 1211{
1051 struct ftrace_graph_ent_entry *field; 1212 struct ftrace_graph_ent_entry *field;
1052 struct fgraph_data *data = iter->private; 1213 struct fgraph_data *data = iter->private;
@@ -1109,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1109static enum print_line_t 1270static enum print_line_t
1110print_graph_function(struct trace_iterator *iter) 1271print_graph_function(struct trace_iterator *iter)
1111{ 1272{
1112 return print_graph_function_flags(iter, tracer_flags.val); 1273 return __print_graph_function_flags(iter, tracer_flags.val);
1274}
1275
1276enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
1277 u32 flags)
1278{
1279 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1280 flags |= TRACE_GRAPH_PRINT_DURATION;
1281 else
1282 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1283
1284 return __print_graph_function_flags(iter, flags);
1113} 1285}
1114 1286
1115static enum print_line_t 1287static enum print_line_t
@@ -1141,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
1141 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1313 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1142} 1314}
1143 1315
1144void print_graph_headers_flags(struct seq_file *s, u32 flags) 1316static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1145{ 1317{
1146 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1318 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1147 1319
@@ -1182,6 +1354,23 @@ void print_graph_headers(struct seq_file *s)
1182 print_graph_headers_flags(s, tracer_flags.val); 1354 print_graph_headers_flags(s, tracer_flags.val);
1183} 1355}
1184 1356
1357void print_graph_headers_flags(struct seq_file *s, u32 flags)
1358{
1359 struct trace_iterator *iter = s->private;
1360
1361 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
1362 /* print nothing if the buffers are empty */
1363 if (trace_empty(iter))
1364 return;
1365
1366 print_trace_header(s, iter);
1367 flags |= TRACE_GRAPH_PRINT_DURATION;
1368 } else
1369 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1370
1371 __print_graph_headers_flags(s, flags);
1372}
1373
1185void graph_trace_open(struct trace_iterator *iter) 1374void graph_trace_open(struct trace_iterator *iter)
1186{ 1375{
1187 /* pid and depth on the last trace processed */ 1376 /* pid and depth on the last trace processed */
@@ -1202,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter)
1202 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); 1391 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
1203 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 1392 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
1204 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); 1393 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
1394 int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
1395
1205 *pid = -1; 1396 *pid = -1;
1206 *depth = 0; 1397 *depth = 0;
1207 *ignore = 0; 1398 *ignore = 0;
1399 *depth_irq = -1;
1208 } 1400 }
1209 1401
1210 iter->private = data; 1402 iter->private = data;
@@ -1227,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter)
1227 } 1419 }
1228} 1420}
1229 1421
1422static int func_graph_set_flag(u32 old_flags, u32 bit, int set)
1423{
1424 if (bit == TRACE_GRAPH_PRINT_IRQS)
1425 ftrace_graph_skip_irqs = !set;
1426
1427 return 0;
1428}
1429
1230static struct trace_event_functions graph_functions = { 1430static struct trace_event_functions graph_functions = {
1231 .trace = print_graph_function_event, 1431 .trace = print_graph_function_event,
1232}; 1432};
@@ -1253,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = {
1253 .print_line = print_graph_function, 1453 .print_line = print_graph_function,
1254 .print_header = print_graph_headers, 1454 .print_header = print_graph_headers,
1255 .flags = &tracer_flags, 1455 .flags = &tracer_flags,
1456 .set_flag = func_graph_set_flag,
1256#ifdef CONFIG_FTRACE_SELFTEST 1457#ifdef CONFIG_FTRACE_SELFTEST
1257 .selftest = trace_selftest_startup_function_graph, 1458 .selftest = trace_selftest_startup_function_graph,
1258#endif 1459#endif
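
The funcgraph-irqs hunks above hide interrupt subtrees by latching the call depth at which execution entered the __irqentry_text range and unlatching it on a return at that depth or shallower, so a lost RETURN event cannot wedge the filter. A standalone sketch of that bookkeeping; the address range and events are made up for illustration.

/*
 * depth_irq sketch: entry into the irq text range latches the current
 * depth, any return at that depth or shallower clears it.
 */
#include <stdio.h>

static int depth_irq = -1;	/* -1: not inside irq code */

static int in_irq_text(unsigned long addr)
{
	/* stand-in for the __irqentry_text_start/_end range check */
	return addr >= 0x1000 && addr < 0x2000;
}

static int check_irq_entry(unsigned long addr, int depth)
{
	if (depth_irq >= 0)
		return 1;		/* already inside irq code */
	if (!in_irq_text(addr))
		return 0;
	depth_irq = depth;		/* entering irq code */
	return 1;
}

static int check_irq_return(int depth)
{
	if (depth_irq == -1)
		return 0;		/* not inside irq code */
	if (depth_irq >= depth) {
		depth_irq = -1;		/* left the irq subtree */
		return 1;
	}
	return 1;			/* still inside irq code */
}

int main(void)
{
	/* depth-2 call lands in the irq range: it and its children are hidden */
	printf("entry 0x1234 depth 2 hidden=%d\n", check_irq_entry(0x1234, 2));
	printf("entry 0x3000 depth 3 hidden=%d\n", check_irq_entry(0x3000, 3));
	printf("return depth 3 hidden=%d\n", check_irq_return(3));
	printf("return depth 2 hidden=%d\n", check_irq_return(2));
	printf("return depth 1 hidden=%d\n", check_irq_return(1));	/* shown */
	return 0;
}
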
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 73a6b0601f2e..5cf8c602b880 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence;
87 87
88#ifdef CONFIG_FUNCTION_TRACER 88#ifdef CONFIG_FUNCTION_TRACER
89/* 89/*
90 * irqsoff uses its own tracer function to keep the overhead down: 90 * Prologue for the preempt and irqs off function tracers.
91 *
92 * Returns 1 if it is OK to continue, and data->disabled is
93 * incremented.
94 * 0 if the trace is to be ignored, and data->disabled
95 * is kept the same.
96 *
97 * Note, this function is also used outside this ifdef but
98 * inside the #ifdef of the function graph tracer below.
99 * This is OK, since the function graph tracer is
100 * dependent on the function tracer.
91 */ 101 */
92static void 102static int func_prolog_dec(struct trace_array *tr,
93irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) 103 struct trace_array_cpu **data,
104 unsigned long *flags)
94{ 105{
95 struct trace_array *tr = irqsoff_trace;
96 struct trace_array_cpu *data;
97 unsigned long flags;
98 long disabled; 106 long disabled;
99 int cpu; 107 int cpu;
100 108
@@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
106 */ 114 */
107 cpu = raw_smp_processor_id(); 115 cpu = raw_smp_processor_id();
108 if (likely(!per_cpu(tracing_cpu, cpu))) 116 if (likely(!per_cpu(tracing_cpu, cpu)))
109 return; 117 return 0;
110 118
111 local_save_flags(flags); 119 local_save_flags(*flags);
112 /* slight chance to get a false positive on tracing_cpu */ 120 /* slight chance to get a false positive on tracing_cpu */
113 if (!irqs_disabled_flags(flags)) 121 if (!irqs_disabled_flags(*flags))
114 return; 122 return 0;
115 123
116 data = tr->data[cpu]; 124 *data = tr->data[cpu];
117 disabled = atomic_inc_return(&data->disabled); 125 disabled = atomic_inc_return(&(*data)->disabled);
118 126
119 if (likely(disabled == 1)) 127 if (likely(disabled == 1))
120 trace_function(tr, ip, parent_ip, flags, preempt_count()); 128 return 1;
129
130 atomic_dec(&(*data)->disabled);
131
132 return 0;
133}
134
135/*
136 * irqsoff uses its own tracer function to keep the overhead down:
137 */
138static void
139irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
140{
141 struct trace_array *tr = irqsoff_trace;
142 struct trace_array_cpu *data;
143 unsigned long flags;
144
145 if (!func_prolog_dec(tr, &data, &flags))
146 return;
147
148 trace_function(tr, ip, parent_ip, flags, preempt_count());
121 149
122 atomic_dec(&data->disabled); 150 atomic_dec(&data->disabled);
123} 151}
@@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
155 struct trace_array *tr = irqsoff_trace; 183 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data; 184 struct trace_array_cpu *data;
157 unsigned long flags; 185 unsigned long flags;
158 long disabled;
159 int ret; 186 int ret;
160 int cpu;
161 int pc; 187 int pc;
162 188
163 cpu = raw_smp_processor_id(); 189 if (!func_prolog_dec(tr, &data, &flags))
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0; 190 return 0;
166 191
167 local_save_flags(flags); 192 pc = preempt_count();
168 /* slight chance to get a false positive on tracing_cpu */ 193 ret = __trace_graph_entry(tr, trace, flags, pc);
169 if (!irqs_disabled_flags(flags))
170 return 0;
171
172 data = tr->data[cpu];
173 disabled = atomic_inc_return(&data->disabled);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled); 194 atomic_dec(&data->disabled);
195
182 return ret; 196 return ret;
183} 197}
184 198
@@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
187 struct trace_array *tr = irqsoff_trace; 201 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data; 202 struct trace_array_cpu *data;
189 unsigned long flags; 203 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc; 204 int pc;
193 205
194 cpu = raw_smp_processor_id(); 206 if (!func_prolog_dec(tr, &data, &flags))
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return; 207 return;
197 208
198 local_save_flags(flags); 209 pc = preempt_count();
199 /* slight chance to get a false positive on tracing_cpu */ 210 __trace_graph_return(tr, trace, flags, pc);
200 if (!irqs_disabled_flags(flags))
201 return;
202
203 data = tr->data[cpu];
204 disabled = atomic_inc_return(&data->disabled);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled); 211 atomic_dec(&data->disabled);
212} 212}
213 213
@@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
229 229
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) 230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{ 231{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /* 232 /*
240 * In graph mode call the graph tracer output function, 233 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler 234 * otherwise go with the TRACE_FN event handler
242 */ 235 */
243 if (is_graph()) 236 if (is_graph())
244 return print_graph_function_flags(iter, flags); 237 return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
245 238
246 return TRACE_TYPE_UNHANDLED; 239 return TRACE_TYPE_UNHANDLED;
247} 240}
248 241
249static void irqsoff_print_header(struct seq_file *s) 242static void irqsoff_print_header(struct seq_file *s)
250{ 243{
251 if (is_graph()) { 244 if (is_graph())
252 struct trace_iterator *iter = s->private; 245 print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
253 u32 flags = GRAPH_TRACER_FLAGS; 246 else
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s); 247 trace_default_header(s);
268} 248}
269 249
270static void 250static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr, 251__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip, 252 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc) 253 unsigned long flags, int pc)
294{ 254{
295 if (!is_graph()) 255 if (is_graph())
256 trace_graph_function(tr, ip, parent_ip, flags, pc);
257 else
296 trace_function(tr, ip, parent_ip, flags, pc); 258 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301} 259}
302 260
303#else 261#else
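
The irqsoff hunks above factor the shared preamble into func_prolog_dec(): it either returns 1 with the per-cpu disabled counter held, which the caller must drop, or returns 0 with the counter untouched. A plain-C model of that contract, with illustrative names only.

/*
 * Prologue/epilogue sketch: the prologue hands back ownership of the
 * disabled counter exactly when it returns 1.
 */
#include <stdio.h>

struct cpu_data { int disabled; };

static struct cpu_data pcpu;
static int tracing_this_cpu = 1;
static int irqs_off = 1;

static int func_prolog_dec(struct cpu_data **data)
{
	if (!tracing_this_cpu || !irqs_off)
		return 0;

	*data = &pcpu;
	if (++(*data)->disabled == 1)
		return 1;		/* caller owns the reference */

	(*data)->disabled--;		/* recursion/race: back off */
	return 0;
}

static void tracer_callback(void)
{
	struct cpu_data *data;

	if (!func_prolog_dec(&data))
		return;

	printf("record trace event (disabled=%d)\n", data->disabled);
	data->disabled--;		/* epilogue mirrors the prologue */
}

int main(void)
{
	tracer_callback();		/* records */
	irqs_off = 0;
	tracer_callback();		/* filtered out by the prologue */
	printf("disabled counter back to %d\n", pcpu.disabled);
	return 0;
}
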
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 7b8ecd751d93..3c5c5dfea0b3 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -13,7 +13,6 @@
13#include <linux/kdb.h> 13#include <linux/kdb.h>
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15 15
16#include "../debug/kdb/kdb_private.h"
17#include "trace.h" 16#include "trace.h"
18#include "trace_output.h" 17#include "trace_output.h"
19 18
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8b27c9849b42..2dec9bcde8b4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -31,7 +31,6 @@
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h> 32#include <linux/stringify.h>
33#include <linux/limits.h> 33#include <linux/limits.h>
34#include <linux/uaccess.h>
35#include <asm/bitsperlong.h> 34#include <asm/bitsperlong.h>
36 35
37#include "trace.h" 36#include "trace.h"
@@ -514,8 +513,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
514static int kretprobe_dispatcher(struct kretprobe_instance *ri, 513static int kretprobe_dispatcher(struct kretprobe_instance *ri,
515 struct pt_regs *regs); 514 struct pt_regs *regs);
516 515
517/* Check the name is good for event/group */ 516/* Check the name is good for event/group/fields */
518static int check_event_name(const char *name) 517static int is_good_name(const char *name)
519{ 518{
520 if (!isalpha(*name) && *name != '_') 519 if (!isalpha(*name) && *name != '_')
521 return 0; 520 return 0;
@@ -557,7 +556,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
557 else 556 else
558 tp->rp.kp.pre_handler = kprobe_dispatcher; 557 tp->rp.kp.pre_handler = kprobe_dispatcher;
559 558
560 if (!event || !check_event_name(event)) { 559 if (!event || !is_good_name(event)) {
561 ret = -EINVAL; 560 ret = -EINVAL;
562 goto error; 561 goto error;
563 } 562 }
@@ -567,7 +566,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
567 if (!tp->call.name) 566 if (!tp->call.name)
568 goto error; 567 goto error;
569 568
570 if (!group || !check_event_name(group)) { 569 if (!group || !is_good_name(group)) {
571 ret = -EINVAL; 570 ret = -EINVAL;
572 goto error; 571 goto error;
573 } 572 }
@@ -648,7 +647,7 @@ static int register_trace_probe(struct trace_probe *tp)
648 } 647 }
649 ret = register_probe_event(tp); 648 ret = register_probe_event(tp);
650 if (ret) { 649 if (ret) {
651 pr_warning("Faild to register probe event(%d)\n", ret); 650 pr_warning("Failed to register probe event(%d)\n", ret);
652 goto end; 651 goto end;
653 } 652 }
654 653
@@ -883,7 +882,7 @@ static int create_trace_probe(int argc, char **argv)
883 int i, ret = 0; 882 int i, ret = 0;
884 int is_return = 0, is_delete = 0; 883 int is_return = 0, is_delete = 0;
885 char *symbol = NULL, *event = NULL, *group = NULL; 884 char *symbol = NULL, *event = NULL, *group = NULL;
886 char *arg, *tmp; 885 char *arg;
887 unsigned long offset = 0; 886 unsigned long offset = 0;
888 void *addr = NULL; 887 void *addr = NULL;
889 char buf[MAX_EVENT_NAME_LEN]; 888 char buf[MAX_EVENT_NAME_LEN];
@@ -992,26 +991,36 @@ static int create_trace_probe(int argc, char **argv)
992 /* parse arguments */ 991 /* parse arguments */
993 ret = 0; 992 ret = 0;
994 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { 993 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
994 /* Increment count for freeing args in error case */
995 tp->nr_args++;
996
995 /* Parse argument name */ 997 /* Parse argument name */
996 arg = strchr(argv[i], '='); 998 arg = strchr(argv[i], '=');
997 if (arg) 999 if (arg) {
998 *arg++ = '\0'; 1000 *arg++ = '\0';
999 else 1001 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
1002 } else {
1000 arg = argv[i]; 1003 arg = argv[i];
1004 /* If argument name is omitted, set "argN" */
1005 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
1006 tp->args[i].name = kstrdup(buf, GFP_KERNEL);
1007 }
1001 1008
1002 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
1003 if (!tp->args[i].name) { 1009 if (!tp->args[i].name) {
1004 pr_info("Failed to allocate argument%d name '%s'.\n", 1010 pr_info("Failed to allocate argument[%d] name.\n", i);
1005 i, argv[i]);
1006 ret = -ENOMEM; 1011 ret = -ENOMEM;
1007 goto error; 1012 goto error;
1008 } 1013 }
1009 tmp = strchr(tp->args[i].name, ':'); 1014
1010 if (tmp) 1015 if (!is_good_name(tp->args[i].name)) {
1011 *tmp = '_'; /* convert : to _ */ 1016 pr_info("Invalid argument[%d] name: %s\n",
1017 i, tp->args[i].name);
1018 ret = -EINVAL;
1019 goto error;
1020 }
1012 1021
1013 if (conflict_field_name(tp->args[i].name, tp->args, i)) { 1022 if (conflict_field_name(tp->args[i].name, tp->args, i)) {
1014 pr_info("Argument%d name '%s' conflicts with " 1023 pr_info("Argument[%d] name '%s' conflicts with "
1015 "another field.\n", i, argv[i]); 1024 "another field.\n", i, argv[i]);
1016 ret = -EINVAL; 1025 ret = -EINVAL;
1017 goto error; 1026 goto error;
@@ -1020,12 +1029,9 @@ static int create_trace_probe(int argc, char **argv)
1020 /* Parse fetch argument */ 1029 /* Parse fetch argument */
1021 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); 1030 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
1022 if (ret) { 1031 if (ret) {
1023 pr_info("Parse error at argument%d. (%d)\n", i, ret); 1032 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
1024 kfree(tp->args[i].name);
1025 goto error; 1033 goto error;
1026 } 1034 }
1027
1028 tp->nr_args++;
1029 } 1035 }
1030 1036
1031 ret = register_trace_probe(tp); 1037 ret = register_trace_probe(tp);
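
The trace_kprobe.c hunks above extend is_good_name() to argument names and fall back to "argN" when a name is omitted, instead of silently rewriting ':' to '_'. A standalone sketch of the check and the fallback; illustrative, not the kernel functions.

/*
 * C-identifier-style name check plus the "argN" fallback used when the
 * caller omits an argument name.
 */
#include <ctype.h>
#include <stdio.h>
#include <string.h>

static int is_good_name(const char *name)
{
	if (!isalpha((unsigned char)*name) && *name != '_')
		return 0;
	while (*++name != '\0') {
		if (!isalpha((unsigned char)*name) &&
		    !isdigit((unsigned char)*name) && *name != '_')
			return 0;
	}
	return 1;
}

static void pick_arg_name(const char *given, int idx, char *buf, size_t len)
{
	if (given && *given)
		snprintf(buf, len, "%s", given);
	else
		snprintf(buf, len, "arg%d", idx + 1);	/* omitted: use argN */
}

int main(void)
{
	char name[32];
	const char *candidates[] = { "myarg", "2bad", "with:colon", "", NULL };

	for (int i = 0; candidates[i]; i++) {
		pick_arg_name(candidates[i], i, name, sizeof(name));
		printf("%-12s -> %-8s %s\n", candidates[i], name,
		       is_good_name(name) ? "ok" : "rejected");
	}
	return 0;
}
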
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 4086eae6e81b..7319559ed59f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -31,48 +31,98 @@ static int wakeup_rt;
31static arch_spinlock_t wakeup_lock = 31static arch_spinlock_t wakeup_lock =
32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
33 33
34static void wakeup_reset(struct trace_array *tr);
34static void __wakeup_reset(struct trace_array *tr); 35static void __wakeup_reset(struct trace_array *tr);
36static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
37static void wakeup_graph_return(struct ftrace_graph_ret *trace);
35 38
36static int save_lat_flag; 39static int save_lat_flag;
37 40
41#define TRACE_DISPLAY_GRAPH 1
42
43static struct tracer_opt trace_opts[] = {
44#ifdef CONFIG_FUNCTION_GRAPH_TRACER
45 /* display latency trace as call graph */
46 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
47#endif
48 { } /* Empty entry */
49};
50
51static struct tracer_flags tracer_flags = {
52 .val = 0,
53 .opts = trace_opts,
54};
55
56#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
57
38#ifdef CONFIG_FUNCTION_TRACER 58#ifdef CONFIG_FUNCTION_TRACER
59
39/* 60/*
40 * irqsoff uses its own tracer function to keep the overhead down: 61 * Prologue for the wakeup function tracers.
62 *
63 * Returns 1 if it is OK to continue, and preemption
64 * is disabled and data->disabled is incremented.
65 * 0 if the trace is to be ignored, and preemption
66 * is not disabled and data->disabled is
67 * kept the same.
68 *
69 * Note, this function is also used outside this ifdef but
70 * inside the #ifdef of the function graph tracer below.
71 * This is OK, since the function graph tracer is
72 * dependent on the function tracer.
41 */ 73 */
42static void 74static int
43wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) 75func_prolog_preempt_disable(struct trace_array *tr,
76 struct trace_array_cpu **data,
77 int *pc)
44{ 78{
45 struct trace_array *tr = wakeup_trace;
46 struct trace_array_cpu *data;
47 unsigned long flags;
48 long disabled; 79 long disabled;
49 int cpu; 80 int cpu;
50 int pc;
51 81
52 if (likely(!wakeup_task)) 82 if (likely(!wakeup_task))
53 return; 83 return 0;
54 84
55 pc = preempt_count(); 85 *pc = preempt_count();
56 preempt_disable_notrace(); 86 preempt_disable_notrace();
57 87
58 cpu = raw_smp_processor_id(); 88 cpu = raw_smp_processor_id();
59 if (cpu != wakeup_current_cpu) 89 if (cpu != wakeup_current_cpu)
60 goto out_enable; 90 goto out_enable;
61 91
62 data = tr->data[cpu]; 92 *data = tr->data[cpu];
63 disabled = atomic_inc_return(&data->disabled); 93 disabled = atomic_inc_return(&(*data)->disabled);
64 if (unlikely(disabled != 1)) 94 if (unlikely(disabled != 1))
65 goto out; 95 goto out;
66 96
67 local_irq_save(flags); 97 return 1;
68 98
69 trace_function(tr, ip, parent_ip, flags, pc); 99out:
100 atomic_dec(&(*data)->disabled);
101
102out_enable:
103 preempt_enable_notrace();
104 return 0;
105}
70 106
107/*
108 * wakeup uses its own tracer function to keep the overhead down:
109 */
110static void
111wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
112{
113 struct trace_array *tr = wakeup_trace;
114 struct trace_array_cpu *data;
115 unsigned long flags;
116 int pc;
117
118 if (!func_prolog_preempt_disable(tr, &data, &pc))
119 return;
120
121 local_irq_save(flags);
122 trace_function(tr, ip, parent_ip, flags, pc);
71 local_irq_restore(flags); 123 local_irq_restore(flags);
72 124
73 out:
74 atomic_dec(&data->disabled); 125 atomic_dec(&data->disabled);
75 out_enable:
76 preempt_enable_notrace(); 126 preempt_enable_notrace();
77} 127}
78 128
@@ -82,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly =
82}; 132};
83#endif /* CONFIG_FUNCTION_TRACER */ 133#endif /* CONFIG_FUNCTION_TRACER */
84 134
135static int start_func_tracer(int graph)
136{
137 int ret;
138
139 if (!graph)
140 ret = register_ftrace_function(&trace_ops);
141 else
142 ret = register_ftrace_graph(&wakeup_graph_return,
143 &wakeup_graph_entry);
144
145 if (!ret && tracing_is_enabled())
146 tracer_enabled = 1;
147 else
148 tracer_enabled = 0;
149
150 return ret;
151}
152
153static void stop_func_tracer(int graph)
154{
155 tracer_enabled = 0;
156
157 if (!graph)
158 unregister_ftrace_function(&trace_ops);
159 else
160 unregister_ftrace_graph();
161}
162
163#ifdef CONFIG_FUNCTION_GRAPH_TRACER
164static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
165{
166
167 if (!(bit & TRACE_DISPLAY_GRAPH))
168 return -EINVAL;
169
170 if (!(is_graph() ^ set))
171 return 0;
172
173 stop_func_tracer(!set);
174
175 wakeup_reset(wakeup_trace);
176 tracing_max_latency = 0;
177
178 return start_func_tracer(set);
179}
180
181static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
182{
183 struct trace_array *tr = wakeup_trace;
184 struct trace_array_cpu *data;
185 unsigned long flags;
186 int pc, ret = 0;
187
188 if (!func_prolog_preempt_disable(tr, &data, &pc))
189 return 0;
190
191 local_save_flags(flags);
192 ret = __trace_graph_entry(tr, trace, flags, pc);
193 atomic_dec(&data->disabled);
194 preempt_enable_notrace();
195
196 return ret;
197}
198
199static void wakeup_graph_return(struct ftrace_graph_ret *trace)
200{
201 struct trace_array *tr = wakeup_trace;
202 struct trace_array_cpu *data;
203 unsigned long flags;
204 int pc;
205
206 if (!func_prolog_preempt_disable(tr, &data, &pc))
207 return;
208
209 local_save_flags(flags);
210 __trace_graph_return(tr, trace, flags, pc);
211 atomic_dec(&data->disabled);
212
213 preempt_enable_notrace();
214 return;
215}
216
217static void wakeup_trace_open(struct trace_iterator *iter)
218{
219 if (is_graph())
220 graph_trace_open(iter);
221}
222
223static void wakeup_trace_close(struct trace_iterator *iter)
224{
225 if (iter->private)
226 graph_trace_close(iter);
227}
228
229#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC)
230
231static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
232{
233 /*
234 * In graph mode call the graph tracer output function,
235 * otherwise go with the TRACE_FN event handler
236 */
237 if (is_graph())
238 return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
239
240 return TRACE_TYPE_UNHANDLED;
241}
242
243static void wakeup_print_header(struct seq_file *s)
244{
245 if (is_graph())
246 print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
247 else
248 trace_default_header(s);
249}
250
251static void
252__trace_function(struct trace_array *tr,
253 unsigned long ip, unsigned long parent_ip,
254 unsigned long flags, int pc)
255{
256 if (is_graph())
257 trace_graph_function(tr, ip, parent_ip, flags, pc);
258 else
259 trace_function(tr, ip, parent_ip, flags, pc);
260}
261#else
262#define __trace_function trace_function
263
264static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
265{
266 return -EINVAL;
267}
268
269static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
270{
271 return -1;
272}
273
274static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
275{
276 return TRACE_TYPE_UNHANDLED;
277}
278
279static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
280static void wakeup_print_header(struct seq_file *s) { }
281static void wakeup_trace_open(struct trace_iterator *iter) { }
282static void wakeup_trace_close(struct trace_iterator *iter) { }
283#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
284
85/* 285/*
86 * Should this new latency be reported/recorded? 286 * Should this new latency be reported/recorded?
87 */ 287 */
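wakeup_set_flag() accepts only the display-graph bit, returns early when the requested value already matches is_graph(), and otherwise stops the callback flavor that is currently registered, resets the trace, and starts the other one. A compact model of that toggle logic, with invented backend names:

#include <stdio.h>

#define OPT_DISPLAY_GRAPH 0x1                   /* modeled tracer option bit */

static unsigned int flags;                      /* current option bits */

static int start_backend(int graph)
{
        printf("start %s callbacks\n", graph ? "graph" : "function");
        return 0;
}

static void stop_backend(int graph)
{
        printf("stop %s callbacks\n", graph ? "graph" : "function");
}

static void reset_trace(void)
{
        puts("reset trace buffer and max latency");
}

static int set_flag(unsigned int bit, int set)
{
        int cur = !!(flags & OPT_DISPLAY_GRAPH);

        if (!(bit & OPT_DISPLAY_GRAPH))
                return -1;                      /* only the display-graph bit is handled */
        if (cur == !!set)
                return 0;                       /* requested state already in effect */

        stop_backend(cur);                      /* tear down the flavor currently running */
        reset_trace();                          /* old data would mix the two output formats */

        flags = set ? (flags | bit) : (flags & ~bit);
        return start_backend(!cur);             /* start the other flavor */
}

int main(void)
{
        set_flag(OPT_DISPLAY_GRAPH, 1);         /* switch to call-graph output */
        set_flag(OPT_DISPLAY_GRAPH, 1);         /* already set: no-op */
        set_flag(OPT_DISPLAY_GRAPH, 0);         /* back to plain function output */
        return 0;
}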
@@ -152,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore,
152 /* The task we are waiting for is waking up */ 352 /* The task we are waiting for is waking up */
153 data = wakeup_trace->data[wakeup_cpu]; 353 data = wakeup_trace->data[wakeup_cpu];
154 354
155 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 355 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
156 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 356 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
157 357
158 T0 = data->preempt_timestamp; 358 T0 = data->preempt_timestamp;
@@ -252,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
252 * is not called by an assembly function (whereas schedule is) 452 * is not called by an assembly function (whereas schedule is)
253 * it should be safe to use it here. 453 * it should be safe to use it here.
254 */ 454 */
255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 455 __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
256 456
257out_locked: 457out_locked:
258 arch_spin_unlock(&wakeup_lock); 458 arch_spin_unlock(&wakeup_lock);
@@ -303,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
303 */ 503 */
304 smp_wmb(); 504 smp_wmb();
305 505
306 register_ftrace_function(&trace_ops); 506 if (start_func_tracer(is_graph()))
307 507 printk(KERN_ERR "failed to start wakeup tracer\n");
308 if (tracing_is_enabled())
309 tracer_enabled = 1;
310 else
311 tracer_enabled = 0;
312 508
313 return; 509 return;
314fail_deprobe_wake_new: 510fail_deprobe_wake_new:
@@ -320,7 +516,7 @@ fail_deprobe:
320static void stop_wakeup_tracer(struct trace_array *tr) 516static void stop_wakeup_tracer(struct trace_array *tr)
321{ 517{
322 tracer_enabled = 0; 518 tracer_enabled = 0;
323 unregister_ftrace_function(&trace_ops); 519 stop_func_tracer(is_graph());
324 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); 520 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
325 unregister_trace_sched_wakeup_new(probe_wakeup, NULL); 521 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
326 unregister_trace_sched_wakeup(probe_wakeup, NULL); 522 unregister_trace_sched_wakeup(probe_wakeup, NULL);
@@ -379,9 +575,15 @@ static struct tracer wakeup_tracer __read_mostly =
379 .start = wakeup_tracer_start, 575 .start = wakeup_tracer_start,
380 .stop = wakeup_tracer_stop, 576 .stop = wakeup_tracer_stop,
381 .print_max = 1, 577 .print_max = 1,
578 .print_header = wakeup_print_header,
579 .print_line = wakeup_print_line,
580 .flags = &tracer_flags,
581 .set_flag = wakeup_set_flag,
382#ifdef CONFIG_FTRACE_SELFTEST 582#ifdef CONFIG_FTRACE_SELFTEST
383 .selftest = trace_selftest_startup_wakeup, 583 .selftest = trace_selftest_startup_wakeup,
384#endif 584#endif
585 .open = wakeup_trace_open,
586 .close = wakeup_trace_close,
385 .use_max_tr = 1, 587 .use_max_tr = 1,
386}; 588};
387 589
@@ -394,9 +596,15 @@ static struct tracer wakeup_rt_tracer __read_mostly =
394 .stop = wakeup_tracer_stop, 596 .stop = wakeup_tracer_stop,
395 .wait_pipe = poll_wait_pipe, 597 .wait_pipe = poll_wait_pipe,
396 .print_max = 1, 598 .print_max = 1,
599 .print_header = wakeup_print_header,
600 .print_line = wakeup_print_line,
601 .flags = &tracer_flags,
602 .set_flag = wakeup_set_flag,
397#ifdef CONFIG_FTRACE_SELFTEST 603#ifdef CONFIG_FTRACE_SELFTEST
398 .selftest = trace_selftest_startup_wakeup, 604 .selftest = trace_selftest_startup_wakeup,
399#endif 605#endif
606 .open = wakeup_trace_open,
607 .close = wakeup_trace_close,
400 .use_max_tr = 1, 608 .use_max_tr = 1,
401}; 609};
402 610
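With the display-graph option and the open/close/print callbacks in place, the wakeup tracers can render their output as a function-call graph. A small helper that selects the tracer and flips the option could look like the following; the debugfs mount point and the options/display-graph file name are assumptions about how tracer-specific options are usually exposed, so treat the paths as illustrative:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Assumed tracing directory; debugfs is commonly mounted here. */
#define TRACING "/sys/kernel/debug/tracing/"

static int write_str(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror(path);
                return -1;
        }
        if (write(fd, val, strlen(val)) < 0) {
                perror(path);
                close(fd);
                return -1;
        }
        return close(fd);
}

int main(void)
{
        /* Select the wakeup latency tracer... */
        if (write_str(TRACING "current_tracer", "wakeup"))
                return 1;
        /* ...and ask it to display the latency trace as a call graph. */
        if (write_str(TRACING "options/display-graph", "1"))
                return 1;
        puts("wakeup tracer enabled with graph output");
        return 0;
}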
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 056468eae7cf..4c5dead0c239 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -195,6 +195,7 @@ static const struct file_operations stack_max_size_fops = {
195 .open = tracing_open_generic, 195 .open = tracing_open_generic,
196 .read = stack_max_size_read, 196 .read = stack_max_size_read,
197 .write = stack_max_size_write, 197 .write = stack_max_size_write,
198 .llseek = default_llseek,
198}; 199};
199 200
200static void * 201static void *
@@ -249,7 +250,7 @@ static int trace_lookup_stack(struct seq_file *m, long i)
249{ 250{
250 unsigned long addr = stack_dump_trace[i]; 251 unsigned long addr = stack_dump_trace[i];
251 252
252 return seq_printf(m, "%pF\n", (void *)addr); 253 return seq_printf(m, "%pS\n", (void *)addr);
253} 254}
254 255
255static void print_disabled(struct seq_file *m) 256static void print_disabled(struct seq_file *m)
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index a7cc3793baf6..209b379a4721 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void)
263{ 263{
264 int ret, cpu; 264 int ret, cpu;
265 265
266 for_each_possible_cpu(cpu) {
267 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
268 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
269 }
270
266 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); 271 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
267 if (ret) 272 if (ret)
268 goto out; 273 goto out;
@@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void)
279 if (ret) 284 if (ret)
280 goto no_creation; 285 goto no_creation;
281 286
282 for_each_possible_cpu(cpu) {
283 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
284 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
285 }
286
287 return 0; 287 return 0;
288 288
289no_creation: 289no_creation:
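The change above moves the per-CPU lock and list initialization ahead of the tracepoint registrations, because a probe may fire as soon as it is registered and would otherwise touch uninitialized state. The sketch below restates that ordering rule with plain pthreads; the callback and registration helper are invented for illustration (compile with -pthread):

#include <pthread.h>
#include <stdio.h>

struct stat_bucket {
        pthread_mutex_t lock;
        int count;
};

static struct stat_bucket bucket;
static void (*probe)(void);             /* "registered" callback, if any */

static void count_event(void)
{
        pthread_mutex_lock(&bucket.lock);       /* relies on bucket being initialized */
        bucket.count++;
        pthread_mutex_unlock(&bucket.lock);
}

static void register_probe(void (*fn)(void))
{
        probe = fn;                     /* events may hit the probe right away */
}

static void fire_event(void)
{
        if (probe)
                probe();
}

int main(void)
{
        /* Initialize the data the probe touches *before* registering it... */
        pthread_mutex_init(&bucket.lock, NULL);
        bucket.count = 0;

        /* ...only then make the callback reachable. */
        register_probe(count_event);

        fire_event();
        fire_event();
        printf("events counted: %d\n", bucket.count);
        return 0;
}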
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index c77f3eceea25..e95ee7f31d43 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,6 +25,7 @@
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/jump_label.h>
28 29
29extern struct tracepoint __start___tracepoints[]; 30extern struct tracepoint __start___tracepoints[];
30extern struct tracepoint __stop___tracepoints[]; 31extern struct tracepoint __stop___tracepoints[];
@@ -263,7 +264,13 @@ static void set_tracepoint(struct tracepoint_entry **entry,
263 * is used. 264 * is used.
264 */ 265 */
265 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 266 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
266 elem->state = active; 267 if (!elem->state && active) {
268 jump_label_enable(&elem->state);
269 elem->state = active;
270 } else if (elem->state && !active) {
271 jump_label_disable(&elem->state);
272 elem->state = active;
273 }
267} 274}
268 275
269/* 276/*
@@ -277,7 +284,10 @@ static void disable_tracepoint(struct tracepoint *elem)
277 if (elem->unregfunc && elem->state) 284 if (elem->unregfunc && elem->state)
278 elem->unregfunc(); 285 elem->unregfunc();
279 286
280 elem->state = 0; 287 if (elem->state) {
288 jump_label_disable(&elem->state);
289 elem->state = 0;
290 }
281 rcu_assign_pointer(elem->funcs, NULL); 291 rcu_assign_pointer(elem->funcs, NULL);
282} 292}
283 293
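The tracepoint hunks tie the existing state field to jump labels, calling jump_label_enable()/jump_label_disable() only on real 0-to-active and active-to-0 transitions so each edge is patched exactly once. A minimal userspace model of that edge-triggered update, with enable_fast_path()/disable_fast_path() standing in for the jump-label calls:

#include <stdio.h>

static int state;                       /* 0 = off, non-zero = active */

/* Stand-ins for jump_label_enable()/jump_label_disable(): expensive,
 * must be called exactly once per transition. */
static void enable_fast_path(void)  { puts("patch in fast path"); }
static void disable_fast_path(void) { puts("patch out fast path"); }

static void set_state(int active)
{
        if (!state && active) {
                enable_fast_path();     /* 0 -> active edge only */
                state = active;
        } else if (state && !active) {
                disable_fast_path();    /* active -> 0 edge only */
                state = 0;
        } else {
                state = active;         /* same side of the edge: no patching */
        }
}

int main(void)
{
        set_state(1);                   /* patches in  */
        set_state(1);                   /* no-op       */
        set_state(0);                   /* patches out */
        set_state(0);                   /* no-op       */
        return 0;
}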
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 0a67e041edf8..24dc60d9fa1f 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
63 stats->ac_ppid = pid_alive(tsk) ? 63 stats->ac_ppid = pid_alive(tsk) ?
64 rcu_dereference(tsk->real_parent)->tgid : 0; 64 rcu_dereference(tsk->real_parent)->tgid : 0;
65 rcu_read_unlock(); 65 rcu_read_unlock();
66 stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; 66 stats->ac_utime = cputime_to_usecs(tsk->utime);
67 stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; 67 stats->ac_stime = cputime_to_usecs(tsk->stime);
68 stats->ac_utimescaled = 68 stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled);
69 cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; 69 stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled);
70 stats->ac_stimescaled =
71 cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
72 stats->ac_minflt = tsk->min_flt; 70 stats->ac_minflt = tsk->min_flt;
73 stats->ac_majflt = tsk->maj_flt; 71 stats->ac_majflt = tsk->maj_flt;
74 72
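Switching from cputime_to_msecs() * USEC_PER_MSEC to cputime_to_usecs() keeps the sub-millisecond part of the accounted time instead of truncating it at millisecond granularity. A short arithmetic check makes the difference visible, using plain microsecond values in place of cputime_t:

#include <stdio.h>

#define USEC_PER_MSEC 1000ULL

int main(void)
{
        unsigned long long runtime_us = 1750;   /* 1.75 ms of accounted time */

        /* old style: truncate to milliseconds, then scale back up */
        unsigned long long via_msecs = (runtime_us / USEC_PER_MSEC) * USEC_PER_MSEC;
        /* new style: report microseconds directly */
        unsigned long long direct_us = runtime_us;

        printf("via msecs: %llu us, direct: %llu us\n", via_msecs, direct_us);
        /* prints 1000 vs 1750: the old path silently dropped 750 us */
        return 0;
}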
diff --git a/kernel/user.c b/kernel/user.c
index 7e72614b736d..2c7d8d5914b1 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -91,6 +91,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
91 * upon function exit. 91 * upon function exit.
92 */ 92 */
93static void free_user(struct user_struct *up, unsigned long flags) 93static void free_user(struct user_struct *up, unsigned long flags)
94 __releases(&uidhash_lock)
94{ 95{
95 uid_hash_remove(up); 96 uid_hash_remove(up);
96 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
diff --git a/kernel/wait.c b/kernel/wait.c
index c4bd3d825f35..b0310eb6cc1e 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -92,7 +92,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
92} 92}
93EXPORT_SYMBOL(prepare_to_wait_exclusive); 93EXPORT_SYMBOL(prepare_to_wait_exclusive);
94 94
95/* 95/**
96 * finish_wait - clean up after waiting in a queue 96 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on 97 * @q: waitqueue waited on
98 * @wait: wait descriptor 98 * @wait: wait descriptor
@@ -127,11 +127,11 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
127} 127}
128EXPORT_SYMBOL(finish_wait); 128EXPORT_SYMBOL(finish_wait);
129 129
130/* 130/**
131 * abort_exclusive_wait - abort exclusive waiting in a queue 131 * abort_exclusive_wait - abort exclusive waiting in a queue
132 * @q: waitqueue waited on 132 * @q: waitqueue waited on
133 * @wait: wait descriptor 133 * @wait: wait descriptor
134 * @state: runstate of the waiter to be woken 134 * @mode: runstate of the waiter to be woken
135 * @key: key to identify a wait bit queue or %NULL 135 * @key: key to identify a wait bit queue or %NULL
136 * 136 *
137 * Sets current thread back to running state and removes 137 * Sets current thread back to running state and removes
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 613bc1f04610..bafba687a6d8 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -43,7 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 44#endif
45 45
46static int __read_mostly did_panic;
47static int __initdata no_watchdog; 46static int __initdata no_watchdog;
48 47
49 48
@@ -122,7 +121,7 @@ static void __touch_watchdog(void)
122 121
123void touch_softlockup_watchdog(void) 122void touch_softlockup_watchdog(void)
124{ 123{
125 __get_cpu_var(watchdog_touch_ts) = 0; 124 __raw_get_cpu_var(watchdog_touch_ts) = 0;
126} 125}
127EXPORT_SYMBOL(touch_softlockup_watchdog); 126EXPORT_SYMBOL(touch_softlockup_watchdog);
128 127
@@ -142,7 +141,14 @@ void touch_all_softlockup_watchdogs(void)
142#ifdef CONFIG_HARDLOCKUP_DETECTOR 141#ifdef CONFIG_HARDLOCKUP_DETECTOR
143void touch_nmi_watchdog(void) 142void touch_nmi_watchdog(void)
144{ 143{
145 __get_cpu_var(watchdog_nmi_touch) = true; 144 if (watchdog_enabled) {
145 unsigned cpu;
146
147 for_each_present_cpu(cpu) {
148 if (per_cpu(watchdog_nmi_touch, cpu) != true)
149 per_cpu(watchdog_nmi_touch, cpu) = true;
150 }
151 }
146 touch_softlockup_watchdog(); 152 touch_softlockup_watchdog();
147} 153}
148EXPORT_SYMBOL(touch_nmi_watchdog); 154EXPORT_SYMBOL(touch_nmi_watchdog);
@@ -180,18 +186,6 @@ static int is_softlockup(unsigned long touch_ts)
180 return 0; 186 return 0;
181} 187}
182 188
183static int
184watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
185{
186 did_panic = 1;
187
188 return NOTIFY_DONE;
189}
190
191static struct notifier_block panic_block = {
192 .notifier_call = watchdog_panic,
193};
194
195#ifdef CONFIG_HARDLOCKUP_DETECTOR 189#ifdef CONFIG_HARDLOCKUP_DETECTOR
196static struct perf_event_attr wd_hw_attr = { 190static struct perf_event_attr wd_hw_attr = {
197 .type = PERF_TYPE_HARDWARE, 191 .type = PERF_TYPE_HARDWARE,
@@ -202,10 +196,13 @@ static struct perf_event_attr wd_hw_attr = {
202}; 196};
203 197
204/* Callback function for perf event subsystem */ 198/* Callback function for perf event subsystem */
205void watchdog_overflow_callback(struct perf_event *event, int nmi, 199static void watchdog_overflow_callback(struct perf_event *event, int nmi,
206 struct perf_sample_data *data, 200 struct perf_sample_data *data,
207 struct pt_regs *regs) 201 struct pt_regs *regs)
208{ 202{
203 /* Ensure the watchdog never gets throttled */
204 event->hw.interrupts = 0;
205
209 if (__get_cpu_var(watchdog_nmi_touch) == true) { 206 if (__get_cpu_var(watchdog_nmi_touch) == true) {
210 __get_cpu_var(watchdog_nmi_touch) = false; 207 __get_cpu_var(watchdog_nmi_touch) = false;
211 return; 208 return;
@@ -361,14 +358,14 @@ static int watchdog_nmi_enable(int cpu)
361 /* Try to register using hardware perf events */ 358 /* Try to register using hardware perf events */
362 wd_attr = &wd_hw_attr; 359 wd_attr = &wd_hw_attr;
363 wd_attr->sample_period = hw_nmi_get_sample_period(); 360 wd_attr->sample_period = hw_nmi_get_sample_period();
364 event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); 361 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
365 if (!IS_ERR(event)) { 362 if (!IS_ERR(event)) {
366 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 363 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
367 goto out_save; 364 goto out_save;
368 } 365 }
369 366
370 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); 367 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
371 return -1; 368 return PTR_ERR(event);
372 369
373 /* success path */ 370 /* success path */
374out_save: 371out_save:
@@ -412,17 +409,19 @@ static int watchdog_prepare_cpu(int cpu)
412static int watchdog_enable(int cpu) 409static int watchdog_enable(int cpu)
413{ 410{
414 struct task_struct *p = per_cpu(softlockup_watchdog, cpu); 411 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
412 int err;
415 413
416 /* enable the perf event */ 414 /* enable the perf event */
417 if (watchdog_nmi_enable(cpu) != 0) 415 err = watchdog_nmi_enable(cpu);
418 return -1; 416 if (err)
417 return err;
419 418
420 /* create the watchdog thread */ 419 /* create the watchdog thread */
421 if (!p) { 420 if (!p) {
422 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 421 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
423 if (IS_ERR(p)) { 422 if (IS_ERR(p)) {
424 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 423 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
425 return -1; 424 return PTR_ERR(p);
426 } 425 }
427 kthread_bind(p, cpu); 426 kthread_bind(p, cpu);
428 per_cpu(watchdog_touch_ts, cpu) = 0; 427 per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -430,6 +429,9 @@ static int watchdog_enable(int cpu)
430 wake_up_process(p); 429 wake_up_process(p);
431 } 430 }
432 431
432 /* if any cpu succeeds, watchdog is considered enabled for the system */
433 watchdog_enabled = 1;
434
433 return 0; 435 return 0;
434} 436}
435 437
@@ -452,9 +454,6 @@ static void watchdog_disable(int cpu)
452 per_cpu(softlockup_watchdog, cpu) = NULL; 454 per_cpu(softlockup_watchdog, cpu) = NULL;
453 kthread_stop(p); 455 kthread_stop(p);
454 } 456 }
455
456 /* if any cpu succeeds, watchdog is considered enabled for the system */
457 watchdog_enabled = 1;
458} 457}
459 458
460static void watchdog_enable_all_cpus(void) 459static void watchdog_enable_all_cpus(void)
@@ -474,6 +473,9 @@ static void watchdog_disable_all_cpus(void)
474{ 473{
475 int cpu; 474 int cpu;
476 475
476 if (no_watchdog)
477 return;
478
477 for_each_online_cpu(cpu) 479 for_each_online_cpu(cpu)
478 watchdog_disable(cpu); 480 watchdog_disable(cpu);
479 481
@@ -516,17 +518,16 @@ static int __cpuinit
516cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 518cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
517{ 519{
518 int hotcpu = (unsigned long)hcpu; 520 int hotcpu = (unsigned long)hcpu;
521 int err = 0;
519 522
520 switch (action) { 523 switch (action) {
521 case CPU_UP_PREPARE: 524 case CPU_UP_PREPARE:
522 case CPU_UP_PREPARE_FROZEN: 525 case CPU_UP_PREPARE_FROZEN:
523 if (watchdog_prepare_cpu(hotcpu)) 526 err = watchdog_prepare_cpu(hotcpu);
524 return NOTIFY_BAD;
525 break; 527 break;
526 case CPU_ONLINE: 528 case CPU_ONLINE:
527 case CPU_ONLINE_FROZEN: 529 case CPU_ONLINE_FROZEN:
528 if (watchdog_enable(hotcpu)) 530 err = watchdog_enable(hotcpu);
529 return NOTIFY_BAD;
530 break; 531 break;
531#ifdef CONFIG_HOTPLUG_CPU 532#ifdef CONFIG_HOTPLUG_CPU
532 case CPU_UP_CANCELED: 533 case CPU_UP_CANCELED:
@@ -539,7 +540,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
539 break; 540 break;
540#endif /* CONFIG_HOTPLUG_CPU */ 541#endif /* CONFIG_HOTPLUG_CPU */
541 } 542 }
542 return NOTIFY_OK; 543 return notifier_from_errno(err);
543} 544}
544 545
545static struct notifier_block __cpuinitdata cpu_nfb = { 546static struct notifier_block __cpuinitdata cpu_nfb = {
@@ -555,13 +556,11 @@ static int __init spawn_watchdog_task(void)
555 return 0; 556 return 0;
556 557
557 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 558 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
558 WARN_ON(err == NOTIFY_BAD); 559 WARN_ON(notifier_to_errno(err));
559 560
560 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 561 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
561 register_cpu_notifier(&cpu_nfb); 562 register_cpu_notifier(&cpu_nfb);
562 563
563 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
564
565 return 0; 564 return 0;
566} 565}
567early_initcall(spawn_watchdog_task); 566early_initcall(spawn_watchdog_task);
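The hotplug callback now hands real error codes to notifier_from_errno(), so the notifier core can stop the chain while callers using notifier_to_errno() still see the original -errno. The standalone model below demonstrates that round-trip property; the constants and bit layout are simplified stand-ins, not the values from include/linux/notifier.h:

#include <stdio.h>

/* Simplified stand-ins for the notifier return values. */
#define NOTIFY_OK        0x0001
#define NOTIFY_STOP_MASK 0x8000

/* Encode a -errno so the chain stops but the original error is recoverable. */
static int from_errno(int err)
{
        return err ? (NOTIFY_STOP_MASK | -err) : NOTIFY_OK;
}

/* Recover the -errno (or 0 for success) from a notifier return value. */
static int to_errno(int ret)
{
        return (ret & NOTIFY_STOP_MASK) ? -(ret & ~NOTIFY_STOP_MASK) : 0;
}

/* A hotplug-style callback that reports failures as plain error codes. */
static int cpu_callback(int cpu)
{
        return from_errno(cpu == 2 ? -12 /* pretend -ENOMEM on cpu 2 */ : 0);
}

int main(void)
{
        for (int cpu = 0; cpu < 4; cpu++) {
                int ret = cpu_callback(cpu);

                printf("cpu%d: notifier 0x%04x -> errno %d\n",
                       cpu, ret, to_errno(ret));
        }
        return 0;
}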
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2994a0e3a61c..90db1bd1a978 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1,19 +1,26 @@
1/* 1/*
2 * linux/kernel/workqueue.c 2 * kernel/workqueue.c - generic async execution with shared worker pool
3 * 3 *
4 * Generic mechanism for defining kernel helper threads for running 4 * Copyright (C) 2002 Ingo Molnar
5 * arbitrary tasks in process context.
6 * 5 *
7 * Started by Ingo Molnar, Copyright (C) 2002 6 * Derived from the taskqueue/keventd code by:
7 * David Woodhouse <dwmw2@infradead.org>
8 * Andrew Morton
9 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
10 * Theodore Ts'o <tytso@mit.edu>
8 * 11 *
9 * Derived from the taskqueue/keventd code by: 12 * Made to use alloc_percpu by Christoph Lameter.
10 * 13 *
11 * David Woodhouse <dwmw2@infradead.org> 14 * Copyright (C) 2010 SUSE Linux Products GmbH
12 * Andrew Morton 15 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
13 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
14 * Theodore Ts'o <tytso@mit.edu>
15 * 16 *
16 * Made to use alloc_percpu by Christoph Lameter. 17 * This is the generic async execution mechanism. Work items are
18 * executed in process context. The worker pool is shared and
19 * automatically managed. There is one worker pool for each CPU and
20 * one extra for works which are better served by workers which are
21 * not bound to any specific CPU.
22 *
23 * Please read Documentation/workqueue.txt for details.
17 */ 24 */
18 25
19#include <linux/module.h> 26#include <linux/module.h>
@@ -87,7 +94,8 @@ enum {
87/* 94/*
88 * Structure fields follow one of the following exclusion rules. 95 * Structure fields follow one of the following exclusion rules.
89 * 96 *
90 * I: Set during initialization and read-only afterwards. 97 * I: Modifiable by initialization/destruction paths and read-only for
98 * everyone else.
91 * 99 *
92 * P: Preemption protected. Disabling preemption is enough and should 100 * P: Preemption protected. Disabling preemption is enough and should
93 * only be modified and accessed from the local cpu. 101 * only be modified and accessed from the local cpu.
@@ -195,7 +203,7 @@ typedef cpumask_var_t mayday_mask_t;
195 cpumask_test_and_set_cpu((cpu), (mask)) 203 cpumask_test_and_set_cpu((cpu), (mask))
196#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) 204#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask))
197#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) 205#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask))
198#define alloc_mayday_mask(maskp, gfp) alloc_cpumask_var((maskp), (gfp)) 206#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp))
199#define free_mayday_mask(mask) free_cpumask_var((mask)) 207#define free_mayday_mask(mask) free_cpumask_var((mask))
200#else 208#else
201typedef unsigned long mayday_mask_t; 209typedef unsigned long mayday_mask_t;
@@ -246,6 +254,9 @@ EXPORT_SYMBOL_GPL(system_long_wq);
246EXPORT_SYMBOL_GPL(system_nrt_wq); 254EXPORT_SYMBOL_GPL(system_nrt_wq);
247EXPORT_SYMBOL_GPL(system_unbound_wq); 255EXPORT_SYMBOL_GPL(system_unbound_wq);
248 256
257#define CREATE_TRACE_POINTS
258#include <trace/events/workqueue.h>
259
249#define for_each_busy_worker(worker, i, pos, gcwq) \ 260#define for_each_busy_worker(worker, i, pos, gcwq) \
250 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ 261 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
251 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) 262 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
@@ -299,21 +310,6 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
299 (cpu) < WORK_CPU_NONE; \ 310 (cpu) < WORK_CPU_NONE; \
300 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) 311 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
301 312
302#ifdef CONFIG_LOCKDEP
303/**
304 * in_workqueue_context() - in context of specified workqueue?
305 * @wq: the workqueue of interest
306 *
307 * Checks lockdep state to see if the current task is executing from
308 * within a workqueue item. This function exists only if lockdep is
309 * enabled.
310 */
311int in_workqueue_context(struct workqueue_struct *wq)
312{
313 return lock_is_held(&wq->lockdep_map);
314}
315#endif
316
317#ifdef CONFIG_DEBUG_OBJECTS_WORK 313#ifdef CONFIG_DEBUG_OBJECTS_WORK
318 314
319static struct debug_obj_descr work_debug_descr; 315static struct debug_obj_descr work_debug_descr;
@@ -593,7 +589,9 @@ static bool keep_working(struct global_cwq *gcwq)
593{ 589{
594 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); 590 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
595 591
596 return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1; 592 return !list_empty(&gcwq->worklist) &&
593 (atomic_read(nr_running) <= 1 ||
594 gcwq->flags & GCWQ_HIGHPRI_PENDING);
597} 595}
598 596
599/* Do we need a new worker? Called from manager. */ 597/* Do we need a new worker? Called from manager. */
@@ -940,10 +938,14 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
940 struct global_cwq *gcwq; 938 struct global_cwq *gcwq;
941 struct cpu_workqueue_struct *cwq; 939 struct cpu_workqueue_struct *cwq;
942 struct list_head *worklist; 940 struct list_head *worklist;
941 unsigned int work_flags;
943 unsigned long flags; 942 unsigned long flags;
944 943
945 debug_work_activate(work); 944 debug_work_activate(work);
946 945
946 if (WARN_ON_ONCE(wq->flags & WQ_DYING))
947 return;
948
947 /* determine gcwq to use */ 949 /* determine gcwq to use */
948 if (!(wq->flags & WQ_UNBOUND)) { 950 if (!(wq->flags & WQ_UNBOUND)) {
949 struct global_cwq *last_gcwq; 951 struct global_cwq *last_gcwq;
@@ -982,18 +984,23 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
982 984
983 /* gcwq determined, get cwq and queue */ 985 /* gcwq determined, get cwq and queue */
984 cwq = get_cwq(gcwq->cpu, wq); 986 cwq = get_cwq(gcwq->cpu, wq);
987 trace_workqueue_queue_work(cpu, cwq, work);
985 988
986 BUG_ON(!list_empty(&work->entry)); 989 BUG_ON(!list_empty(&work->entry));
987 990
988 cwq->nr_in_flight[cwq->work_color]++; 991 cwq->nr_in_flight[cwq->work_color]++;
992 work_flags = work_color_to_flags(cwq->work_color);
989 993
990 if (likely(cwq->nr_active < cwq->max_active)) { 994 if (likely(cwq->nr_active < cwq->max_active)) {
995 trace_workqueue_activate_work(work);
991 cwq->nr_active++; 996 cwq->nr_active++;
992 worklist = gcwq_determine_ins_pos(gcwq, cwq); 997 worklist = gcwq_determine_ins_pos(gcwq, cwq);
993 } else 998 } else {
999 work_flags |= WORK_STRUCT_DELAYED;
994 worklist = &cwq->delayed_works; 1000 worklist = &cwq->delayed_works;
1001 }
995 1002
996 insert_work(cwq, work, worklist, work_color_to_flags(cwq->work_color)); 1003 insert_work(cwq, work, worklist, work_flags);
997 1004
998 spin_unlock_irqrestore(&gcwq->lock, flags); 1005 spin_unlock_irqrestore(&gcwq->lock, flags);
999} 1006}
@@ -1212,6 +1219,7 @@ static void worker_leave_idle(struct worker *worker)
1212 * bound), %false if offline. 1219 * bound), %false if offline.
1213 */ 1220 */
1214static bool worker_maybe_bind_and_lock(struct worker *worker) 1221static bool worker_maybe_bind_and_lock(struct worker *worker)
1222__acquires(&gcwq->lock)
1215{ 1223{
1216 struct global_cwq *gcwq = worker->gcwq; 1224 struct global_cwq *gcwq = worker->gcwq;
1217 struct task_struct *task = worker->task; 1225 struct task_struct *task = worker->task;
@@ -1485,6 +1493,8 @@ static void gcwq_mayday_timeout(unsigned long __gcwq)
1485 * otherwise. 1493 * otherwise.
1486 */ 1494 */
1487static bool maybe_create_worker(struct global_cwq *gcwq) 1495static bool maybe_create_worker(struct global_cwq *gcwq)
1496__releases(&gcwq->lock)
1497__acquires(&gcwq->lock)
1488{ 1498{
1489 if (!need_to_create_worker(gcwq)) 1499 if (!need_to_create_worker(gcwq))
1490 return false; 1500 return false;
@@ -1658,7 +1668,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1658 struct work_struct, entry); 1668 struct work_struct, entry);
1659 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); 1669 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1660 1670
1671 trace_workqueue_activate_work(work);
1661 move_linked_works(work, pos, NULL); 1672 move_linked_works(work, pos, NULL);
1673 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1662 cwq->nr_active++; 1674 cwq->nr_active++;
1663} 1675}
1664 1676
@@ -1666,6 +1678,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1666 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight 1678 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1667 * @cwq: cwq of interest 1679 * @cwq: cwq of interest
1668 * @color: color of work which left the queue 1680 * @color: color of work which left the queue
1681 * @delayed: for a delayed work
1669 * 1682 *
1670 * A work either has completed or is removed from pending queue, 1683 * A work either has completed or is removed from pending queue,
1671 * decrement nr_in_flight of its cwq and handle workqueue flushing. 1684 * decrement nr_in_flight of its cwq and handle workqueue flushing.
@@ -1673,19 +1686,22 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1673 * CONTEXT: 1686 * CONTEXT:
1674 * spin_lock_irq(gcwq->lock). 1687 * spin_lock_irq(gcwq->lock).
1675 */ 1688 */
1676static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) 1689static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
1690 bool delayed)
1677{ 1691{
1678 /* ignore uncolored works */ 1692 /* ignore uncolored works */
1679 if (color == WORK_NO_COLOR) 1693 if (color == WORK_NO_COLOR)
1680 return; 1694 return;
1681 1695
1682 cwq->nr_in_flight[color]--; 1696 cwq->nr_in_flight[color]--;
1683 cwq->nr_active--;
1684 1697
1685 if (!list_empty(&cwq->delayed_works)) { 1698 if (!delayed) {
1686 /* one down, submit a delayed one */ 1699 cwq->nr_active--;
1687 if (cwq->nr_active < cwq->max_active) 1700 if (!list_empty(&cwq->delayed_works)) {
1688 cwq_activate_first_delayed(cwq); 1701 /* one down, submit a delayed one */
1702 if (cwq->nr_active < cwq->max_active)
1703 cwq_activate_first_delayed(cwq);
1704 }
1689 } 1705 }
1690 1706
1691 /* is flush in progress and are we at the flushing tip? */ 1707 /* is flush in progress and are we at the flushing tip? */
@@ -1722,6 +1738,8 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
1722 * spin_lock_irq(gcwq->lock) which is released and regrabbed. 1738 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1723 */ 1739 */
1724static void process_one_work(struct worker *worker, struct work_struct *work) 1740static void process_one_work(struct worker *worker, struct work_struct *work)
1741__releases(&gcwq->lock)
1742__acquires(&gcwq->lock)
1725{ 1743{
1726 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 1744 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1727 struct global_cwq *gcwq = cwq->gcwq; 1745 struct global_cwq *gcwq = cwq->gcwq;
@@ -1790,7 +1808,13 @@ static void process_one_work(struct worker *worker, struct work_struct *work)
1790 work_clear_pending(work); 1808 work_clear_pending(work);
1791 lock_map_acquire(&cwq->wq->lockdep_map); 1809 lock_map_acquire(&cwq->wq->lockdep_map);
1792 lock_map_acquire(&lockdep_map); 1810 lock_map_acquire(&lockdep_map);
1811 trace_workqueue_execute_start(work);
1793 f(work); 1812 f(work);
1813 /*
1814 * While we must be careful to not use "work" after this, the trace
1815 * point will only record its address.
1816 */
1817 trace_workqueue_execute_end(work);
1794 lock_map_release(&lockdep_map); 1818 lock_map_release(&lockdep_map);
1795 lock_map_release(&cwq->wq->lockdep_map); 1819 lock_map_release(&cwq->wq->lockdep_map);
1796 1820
@@ -1814,7 +1838,7 @@ static void process_one_work(struct worker *worker, struct work_struct *work)
1814 hlist_del_init(&worker->hentry); 1838 hlist_del_init(&worker->hentry);
1815 worker->current_work = NULL; 1839 worker->current_work = NULL;
1816 worker->current_cwq = NULL; 1840 worker->current_cwq = NULL;
1817 cwq_dec_nr_in_flight(cwq, work_color); 1841 cwq_dec_nr_in_flight(cwq, work_color, false);
1818} 1842}
1819 1843
1820/** 1844/**
@@ -2040,7 +2064,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2040 * checks and call back into the fixup functions where we 2064 * checks and call back into the fixup functions where we
2041 * might deadlock. 2065 * might deadlock.
2042 */ 2066 */
2043 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); 2067 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2044 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); 2068 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2045 init_completion(&barr->done); 2069 init_completion(&barr->done);
2046 2070
@@ -2292,27 +2316,17 @@ out_unlock:
2292} 2316}
2293EXPORT_SYMBOL_GPL(flush_workqueue); 2317EXPORT_SYMBOL_GPL(flush_workqueue);
2294 2318
2295/** 2319static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2296 * flush_work - block until a work_struct's callback has terminated 2320 bool wait_executing)
2297 * @work: the work which is to be flushed
2298 *
2299 * Returns false if @work has already terminated.
2300 *
2301 * It is expected that, prior to calling flush_work(), the caller has
2302 * arranged for the work to not be requeued, otherwise it doesn't make
2303 * sense to use this function.
2304 */
2305int flush_work(struct work_struct *work)
2306{ 2321{
2307 struct worker *worker = NULL; 2322 struct worker *worker = NULL;
2308 struct global_cwq *gcwq; 2323 struct global_cwq *gcwq;
2309 struct cpu_workqueue_struct *cwq; 2324 struct cpu_workqueue_struct *cwq;
2310 struct wq_barrier barr;
2311 2325
2312 might_sleep(); 2326 might_sleep();
2313 gcwq = get_work_gcwq(work); 2327 gcwq = get_work_gcwq(work);
2314 if (!gcwq) 2328 if (!gcwq)
2315 return 0; 2329 return false;
2316 2330
2317 spin_lock_irq(&gcwq->lock); 2331 spin_lock_irq(&gcwq->lock);
2318 if (!list_empty(&work->entry)) { 2332 if (!list_empty(&work->entry)) {
@@ -2325,28 +2339,127 @@ int flush_work(struct work_struct *work)
2325 cwq = get_work_cwq(work); 2339 cwq = get_work_cwq(work);
2326 if (unlikely(!cwq || gcwq != cwq->gcwq)) 2340 if (unlikely(!cwq || gcwq != cwq->gcwq))
2327 goto already_gone; 2341 goto already_gone;
2328 } else { 2342 } else if (wait_executing) {
2329 worker = find_worker_executing_work(gcwq, work); 2343 worker = find_worker_executing_work(gcwq, work);
2330 if (!worker) 2344 if (!worker)
2331 goto already_gone; 2345 goto already_gone;
2332 cwq = worker->current_cwq; 2346 cwq = worker->current_cwq;
2333 } 2347 } else
2348 goto already_gone;
2334 2349
2335 insert_wq_barrier(cwq, &barr, work, worker); 2350 insert_wq_barrier(cwq, barr, work, worker);
2336 spin_unlock_irq(&gcwq->lock); 2351 spin_unlock_irq(&gcwq->lock);
2337 2352
2338 lock_map_acquire(&cwq->wq->lockdep_map); 2353 lock_map_acquire(&cwq->wq->lockdep_map);
2339 lock_map_release(&cwq->wq->lockdep_map); 2354 lock_map_release(&cwq->wq->lockdep_map);
2340 2355 return true;
2341 wait_for_completion(&barr.done);
2342 destroy_work_on_stack(&barr.work);
2343 return 1;
2344already_gone: 2356already_gone:
2345 spin_unlock_irq(&gcwq->lock); 2357 spin_unlock_irq(&gcwq->lock);
2346 return 0; 2358 return false;
2359}
2360
2361/**
2362 * flush_work - wait for a work to finish executing the last queueing instance
2363 * @work: the work to flush
2364 *
2365 * Wait until @work has finished execution. This function considers
2366 * only the last queueing instance of @work. If @work has been
2367 * enqueued across different CPUs on a non-reentrant workqueue or on
2368 * multiple workqueues, @work might still be executing on return on
2369 * some of the CPUs from earlier queueing.
2370 *
2371 * If @work was queued only on a non-reentrant, ordered or unbound
2372 * workqueue, @work is guaranteed to be idle on return if it hasn't
2373 * been requeued since flush started.
2374 *
2375 * RETURNS:
2376 * %true if flush_work() waited for the work to finish execution,
2377 * %false if it was already idle.
2378 */
2379bool flush_work(struct work_struct *work)
2380{
2381 struct wq_barrier barr;
2382
2383 if (start_flush_work(work, &barr, true)) {
2384 wait_for_completion(&barr.done);
2385 destroy_work_on_stack(&barr.work);
2386 return true;
2387 } else
2388 return false;
2347} 2389}
2348EXPORT_SYMBOL_GPL(flush_work); 2390EXPORT_SYMBOL_GPL(flush_work);
2349 2391
2392static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
2393{
2394 struct wq_barrier barr;
2395 struct worker *worker;
2396
2397 spin_lock_irq(&gcwq->lock);
2398
2399 worker = find_worker_executing_work(gcwq, work);
2400 if (unlikely(worker))
2401 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2402
2403 spin_unlock_irq(&gcwq->lock);
2404
2405 if (unlikely(worker)) {
2406 wait_for_completion(&barr.done);
2407 destroy_work_on_stack(&barr.work);
2408 return true;
2409 } else
2410 return false;
2411}
2412
2413static bool wait_on_work(struct work_struct *work)
2414{
2415 bool ret = false;
2416 int cpu;
2417
2418 might_sleep();
2419
2420 lock_map_acquire(&work->lockdep_map);
2421 lock_map_release(&work->lockdep_map);
2422
2423 for_each_gcwq_cpu(cpu)
2424 ret |= wait_on_cpu_work(get_gcwq(cpu), work);
2425 return ret;
2426}
2427
2428/**
2429 * flush_work_sync - wait until a work has finished execution
2430 * @work: the work to flush
2431 *
2432 * Wait until @work has finished execution. On return, it's
2433 * guaranteed that all queueing instances of @work which happened
2434 * before this function is called are finished. In other words, if
2435 * @work hasn't been requeued since this function was called, @work is
2436 * guaranteed to be idle on return.
2437 *
2438 * RETURNS:
2439 * %true if flush_work_sync() waited for the work to finish execution,
2440 * %false if it was already idle.
2441 */
2442bool flush_work_sync(struct work_struct *work)
2443{
2444 struct wq_barrier barr;
2445 bool pending, waited;
2446
2447 /* we'll wait for executions separately, queue barr only if pending */
2448 pending = start_flush_work(work, &barr, false);
2449
2450 /* wait for executions to finish */
2451 waited = wait_on_work(work);
2452
2453 /* wait for the pending one */
2454 if (pending) {
2455 wait_for_completion(&barr.done);
2456 destroy_work_on_stack(&barr.work);
2457 }
2458
2459 return pending || waited;
2460}
2461EXPORT_SYMBOL_GPL(flush_work_sync);
2462
2350/* 2463/*
2351 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, 2464 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
2352 * so this work can't be re-armed in any way. 2465 * so this work can't be re-armed in any way.
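start_flush_work() queues a wq_barrier behind the work (or behind the worker currently executing it), and the flush then just waits on the barrier's completion. The pthread sketch below models that idea with one worker thread and a FIFO queue: flushing enqueues a barrier item whose only job is to signal the waiter, so the flush returns once everything queued before it has run. It is a toy model of the barrier technique, not the gcwq/cwq machinery (compile with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct work {
        void (*fn)(struct work *w);
        struct work *next;
};

struct barrier_work {                   /* modeled wq_barrier */
        struct work work;
        pthread_mutex_t lock;
        pthread_cond_t done_cv;
        bool done;
};

static struct work *head, *tail;
static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t q_cv = PTHREAD_COND_INITIALIZER;

static void queue_work(struct work *w)
{
        pthread_mutex_lock(&q_lock);
        w->next = NULL;
        if (tail)
                tail->next = w;
        else
                head = w;
        tail = w;
        pthread_cond_signal(&q_cv);
        pthread_mutex_unlock(&q_lock);
}

static void *worker(void *arg)
{
        (void)arg;
        for (;;) {
                pthread_mutex_lock(&q_lock);
                while (!head)
                        pthread_cond_wait(&q_cv, &q_lock);
                struct work *w = head;
                head = w->next;
                if (!head)
                        tail = NULL;
                pthread_mutex_unlock(&q_lock);
                w->fn(w);               /* run the item outside the queue lock */
        }
        return NULL;
}

static void barrier_fn(struct work *w)  /* modeled wq_barrier_func */
{
        struct barrier_work *b = (struct barrier_work *)w;

        pthread_mutex_lock(&b->lock);
        b->done = true;
        pthread_cond_signal(&b->done_cv);
        pthread_mutex_unlock(&b->lock);
}

/* Flush: queue a barrier behind everything already queued and wait for it. */
static void flush_queue(void)
{
        struct barrier_work b = { .work = { .fn = barrier_fn }, .done = false };

        pthread_mutex_init(&b.lock, NULL);
        pthread_cond_init(&b.done_cv, NULL);
        queue_work(&b.work);

        pthread_mutex_lock(&b.lock);
        while (!b.done)
                pthread_cond_wait(&b.done_cv, &b.lock);
        pthread_mutex_unlock(&b.lock);
}

static void say_hello(struct work *w)
{
        (void)w;
        puts("work item ran");
}

int main(void)
{
        pthread_t tid;
        struct work w1 = { .fn = say_hello };

        pthread_create(&tid, NULL, worker, NULL);
        queue_work(&w1);
        flush_queue();                  /* returns only after w1 has run */
        puts("flushed");
        return 0;
}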
@@ -2379,7 +2492,8 @@ static int try_to_grab_pending(struct work_struct *work)
2379 debug_work_deactivate(work); 2492 debug_work_deactivate(work);
2380 list_del_init(&work->entry); 2493 list_del_init(&work->entry);
2381 cwq_dec_nr_in_flight(get_work_cwq(work), 2494 cwq_dec_nr_in_flight(get_work_cwq(work),
2382 get_work_color(work)); 2495 get_work_color(work),
2496 *work_data_bits(work) & WORK_STRUCT_DELAYED);
2383 ret = 1; 2497 ret = 1;
2384 } 2498 }
2385 } 2499 }
@@ -2388,39 +2502,7 @@ static int try_to_grab_pending(struct work_struct *work)
2388 return ret; 2502 return ret;
2389} 2503}
2390 2504
2391static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) 2505static bool __cancel_work_timer(struct work_struct *work,
2392{
2393 struct wq_barrier barr;
2394 struct worker *worker;
2395
2396 spin_lock_irq(&gcwq->lock);
2397
2398 worker = find_worker_executing_work(gcwq, work);
2399 if (unlikely(worker))
2400 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2401
2402 spin_unlock_irq(&gcwq->lock);
2403
2404 if (unlikely(worker)) {
2405 wait_for_completion(&barr.done);
2406 destroy_work_on_stack(&barr.work);
2407 }
2408}
2409
2410static void wait_on_work(struct work_struct *work)
2411{
2412 int cpu;
2413
2414 might_sleep();
2415
2416 lock_map_acquire(&work->lockdep_map);
2417 lock_map_release(&work->lockdep_map);
2418
2419 for_each_gcwq_cpu(cpu)
2420 wait_on_cpu_work(get_gcwq(cpu), work);
2421}
2422
2423static int __cancel_work_timer(struct work_struct *work,
2424 struct timer_list* timer) 2506 struct timer_list* timer)
2425{ 2507{
2426 int ret; 2508 int ret;
@@ -2437,42 +2519,81 @@ static int __cancel_work_timer(struct work_struct *work,
2437} 2519}
2438 2520
2439/** 2521/**
2440 * cancel_work_sync - block until a work_struct's callback has terminated 2522 * cancel_work_sync - cancel a work and wait for it to finish
2441 * @work: the work which is to be flushed 2523 * @work: the work to cancel
2442 *
2443 * Returns true if @work was pending.
2444 *
2445 * cancel_work_sync() will cancel the work if it is queued. If the work's
2446 * callback appears to be running, cancel_work_sync() will block until it
2447 * has completed.
2448 * 2524 *
2449 * It is possible to use this function if the work re-queues itself. It can 2525 * Cancel @work and wait for its execution to finish. This function
2450 * cancel the work even if it migrates to another workqueue, however in that 2526 * can be used even if the work re-queues itself or migrates to
2451 * case it only guarantees that work->func() has completed on the last queued 2527 * another workqueue. On return from this function, @work is
2452 * workqueue. 2528 * guaranteed to be not pending or executing on any CPU.
2453 * 2529 *
2454 * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not 2530 * cancel_work_sync(&delayed_work->work) must not be used for
2455 * pending, otherwise it goes into a busy-wait loop until the timer expires. 2531 * delayed_work's. Use cancel_delayed_work_sync() instead.
2456 * 2532 *
2457 * The caller must ensure that workqueue_struct on which this work was last 2533 * The caller must ensure that the workqueue on which @work was last
2458 * queued can't be destroyed before this function returns. 2534 * queued can't be destroyed before this function returns.
2535 *
2536 * RETURNS:
2537 * %true if @work was pending, %false otherwise.
2459 */ 2538 */
2460int cancel_work_sync(struct work_struct *work) 2539bool cancel_work_sync(struct work_struct *work)
2461{ 2540{
2462 return __cancel_work_timer(work, NULL); 2541 return __cancel_work_timer(work, NULL);
2463} 2542}
2464EXPORT_SYMBOL_GPL(cancel_work_sync); 2543EXPORT_SYMBOL_GPL(cancel_work_sync);
2465 2544
2466/** 2545/**
2467 * cancel_delayed_work_sync - reliably kill off a delayed work. 2546 * flush_delayed_work - wait for a dwork to finish executing the last queueing
2468 * @dwork: the delayed work struct 2547 * @dwork: the delayed work to flush
2469 * 2548 *
2470 * Returns true if @dwork was pending. 2549 * Delayed timer is cancelled and the pending work is queued for
2550 * immediate execution. Like flush_work(), this function only
2551 * considers the last queueing instance of @dwork.
2471 * 2552 *
2472 * It is possible to use this function if @dwork rearms itself via queue_work() 2553 * RETURNS:
2473 * or queue_delayed_work(). See also the comment for cancel_work_sync(). 2554 * %true if flush_work() waited for the work to finish execution,
2555 * %false if it was already idle.
2474 */ 2556 */
2475int cancel_delayed_work_sync(struct delayed_work *dwork) 2557bool flush_delayed_work(struct delayed_work *dwork)
2558{
2559 if (del_timer_sync(&dwork->timer))
2560 __queue_work(raw_smp_processor_id(),
2561 get_work_cwq(&dwork->work)->wq, &dwork->work);
2562 return flush_work(&dwork->work);
2563}
2564EXPORT_SYMBOL(flush_delayed_work);
2565
2566/**
2567 * flush_delayed_work_sync - wait for a dwork to finish
2568 * @dwork: the delayed work to flush
2569 *
2570 * Delayed timer is cancelled and the pending work is queued for
2571 * execution immediately. Other than timer handling, its behavior
2572 * is identical to flush_work_sync().
2573 *
2574 * RETURNS:
2575 * %true if flush_work_sync() waited for the work to finish execution,
2576 * %false if it was already idle.
2577 */
2578bool flush_delayed_work_sync(struct delayed_work *dwork)
2579{
2580 if (del_timer_sync(&dwork->timer))
2581 __queue_work(raw_smp_processor_id(),
2582 get_work_cwq(&dwork->work)->wq, &dwork->work);
2583 return flush_work_sync(&dwork->work);
2584}
2585EXPORT_SYMBOL(flush_delayed_work_sync);
2586
2587/**
2588 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
2589 * @dwork: the delayed work to cancel
2590 *
2591 * This is cancel_work_sync() for delayed works.
2592 *
2593 * RETURNS:
2594 * %true if @dwork was pending, %false otherwise.
2595 */
2596bool cancel_delayed_work_sync(struct delayed_work *dwork)
2476{ 2597{
2477 return __cancel_work_timer(&dwork->work, &dwork->timer); 2598 return __cancel_work_timer(&dwork->work, &dwork->timer);
2478} 2599}
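flush_delayed_work() and flush_delayed_work_sync() first try del_timer_sync(); if the timer was still pending, the work is queued for immediate execution before the normal flush, so the caller never sits out the remaining delay. A single-threaded toy model of that pull-the-delay-forward-then-flush behavior:

#include <stdbool.h>
#include <stdio.h>

struct delayed_job {
        void (*fn)(void);
        bool timer_pending;             /* delay timer still armed? */
        bool queued;                    /* sitting on the run queue? */
};

static void run_queue(struct delayed_job *job)  /* toy "workqueue" */
{
        if (job->queued) {
                job->queued = false;
                job->fn();
        }
}

/* Returns true if the timer was pending and has been cancelled. */
static bool cancel_timer(struct delayed_job *job)
{
        bool was_pending = job->timer_pending;

        job->timer_pending = false;
        return was_pending;
}

static void flush_delayed_job(struct delayed_job *job)
{
        /* If the delay hadn't expired yet, queue the job right away... */
        if (cancel_timer(job))
                job->queued = true;
        /* ...then wait for the queue to drain (modeled by running it). */
        run_queue(job);
}

static void greet(void) { puts("delayed job ran"); }

int main(void)
{
        struct delayed_job job = { .fn = greet, .timer_pending = true, .queued = false };

        flush_delayed_job(&job);        /* runs immediately, no delay left to wait for */
        return 0;
}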
@@ -2524,23 +2645,6 @@ int schedule_delayed_work(struct delayed_work *dwork,
2524EXPORT_SYMBOL(schedule_delayed_work); 2645EXPORT_SYMBOL(schedule_delayed_work);
2525 2646
2526/** 2647/**
2527 * flush_delayed_work - block until a dwork_struct's callback has terminated
2528 * @dwork: the delayed work which is to be flushed
2529 *
2530 * Any timeout is cancelled, and any pending work is run immediately.
2531 */
2532void flush_delayed_work(struct delayed_work *dwork)
2533{
2534 if (del_timer_sync(&dwork->timer)) {
2535 __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq,
2536 &dwork->work);
2537 put_cpu();
2538 }
2539 flush_work(&dwork->work);
2540}
2541EXPORT_SYMBOL(flush_delayed_work);
2542
2543/**
2544 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 2648 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2545 * @cpu: cpu to use 2649 * @cpu: cpu to use
2546 * @dwork: job to be done 2650 * @dwork: job to be done
@@ -2557,13 +2661,15 @@ int schedule_delayed_work_on(int cpu,
2557EXPORT_SYMBOL(schedule_delayed_work_on); 2661EXPORT_SYMBOL(schedule_delayed_work_on);
2558 2662
2559/** 2663/**
2560 * schedule_on_each_cpu - call a function on each online CPU from keventd 2664 * schedule_on_each_cpu - execute a function synchronously on each online CPU
2561 * @func: the function to call 2665 * @func: the function to call
2562 * 2666 *
2563 * Returns zero on success. 2667 * schedule_on_each_cpu() executes @func on each online CPU using the
2564 * Returns -ve errno on failure. 2668 * system workqueue and blocks until all CPUs have completed.
2565 *
2566 * schedule_on_each_cpu() is very slow. 2669 * schedule_on_each_cpu() is very slow.
2670 *
2671 * RETURNS:
2672 * 0 on success, -errno on failure.
2567 */ 2673 */
2568int schedule_on_each_cpu(work_func_t func) 2674int schedule_on_each_cpu(work_func_t func)
2569{ 2675{
@@ -2685,7 +2791,9 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2685 } 2791 }
2686 } 2792 }
2687 2793
2688 /* just in case, make sure it's actually aligned */ 2794 /* just in case, make sure it's actually aligned
2795 * - this is affected by PERCPU() alignment in vmlinux.lds.S
2796 */
2689 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); 2797 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2690 return wq->cpu_wq.v ? 0 : -ENOMEM; 2798 return wq->cpu_wq.v ? 0 : -ENOMEM;
2691} 2799}
@@ -2729,6 +2837,13 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2729 unsigned int cpu; 2837 unsigned int cpu;
2730 2838
2731 /* 2839 /*
2840 * Workqueues which may be used during memory reclaim should
2841 * have a rescuer to guarantee forward progress.
2842 */
2843 if (flags & WQ_MEM_RECLAIM)
2844 flags |= WQ_RESCUER;
2845
2846 /*
2732 * Unbound workqueues aren't concurrency managed and should be 2847 * Unbound workqueues aren't concurrency managed and should be
2733 * dispatched to workers immediately. 2848 * dispatched to workers immediately.
2734 */ 2849 */
@@ -2782,7 +2897,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2782 if (IS_ERR(rescuer->task)) 2897 if (IS_ERR(rescuer->task))
2783 goto err; 2898 goto err;
2784 2899
2785 wq->rescuer = rescuer;
2786 rescuer->task->flags |= PF_THREAD_BOUND; 2900 rescuer->task->flags |= PF_THREAD_BOUND;
2787 wake_up_process(rescuer->task); 2901 wake_up_process(rescuer->task);
2788 } 2902 }
@@ -2824,6 +2938,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
2824{ 2938{
2825 unsigned int cpu; 2939 unsigned int cpu;
2826 2940
2941 wq->flags |= WQ_DYING;
2827 flush_workqueue(wq); 2942 flush_workqueue(wq);
2828 2943
2829 /* 2944 /*
@@ -2848,6 +2963,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
2848 if (wq->flags & WQ_RESCUER) { 2963 if (wq->flags & WQ_RESCUER) {
2849 kthread_stop(wq->rescuer->task); 2964 kthread_stop(wq->rescuer->task);
2850 free_mayday_mask(wq->mayday_mask); 2965 free_mayday_mask(wq->mayday_mask);
2966 kfree(wq->rescuer);
2851 } 2967 }
2852 2968
2853 free_cwqs(wq); 2969 free_cwqs(wq);
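destroy_workqueue() now sets WQ_DYING before the final flush, and __queue_work() warns and bails out once it sees the flag, so nothing new can land on a queue that is being drained; the rescuer is also freed after its thread is stopped. The sketch below captures the mark-dying, reject-new-work, then-drain ordering with an atomic flag; it is a pattern illustration, not the workqueue implementation:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct queue {
        atomic_bool dying;
        int pending;                    /* number of queued-but-unrun items */
};

static bool queue_item(struct queue *q)
{
        if (atomic_load(&q->dying)) {
                fprintf(stderr, "warning: queueing on a dying queue rejected\n");
                return false;           /* modeled WARN_ON_ONCE(wq->flags & WQ_DYING) */
        }
        q->pending++;
        return true;
}

static void drain(struct queue *q)
{
        while (q->pending) {
                q->pending--;
                puts("ran one item");
        }
}

static void destroy_queue(struct queue *q)
{
        atomic_store(&q->dying, true);  /* reject new work from here on... */
        drain(q);                       /* ...then flush what is already queued */
}

int main(void)
{
        struct queue q = { .pending = 0 };

        atomic_init(&q.dying, false);
        queue_item(&q);
        queue_item(&q);
        destroy_queue(&q);
        queue_item(&q);                 /* too late: rejected with a warning */
        return 0;
}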
@@ -3230,6 +3346,8 @@ static int __cpuinit trustee_thread(void *__gcwq)
3230 * multiple times. To be used by cpu_callback. 3346 * multiple times. To be used by cpu_callback.
3231 */ 3347 */
3232static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) 3348static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
3349__releases(&gcwq->lock)
3350__acquires(&gcwq->lock)
3233{ 3351{
3234 if (!(gcwq->trustee_state == state || 3352 if (!(gcwq->trustee_state == state ||
3235 gcwq->trustee_state == TRUSTEE_DONE)) { 3353 gcwq->trustee_state == TRUSTEE_DONE)) {
@@ -3536,8 +3654,7 @@ static int __init init_workqueues(void)
3536 spin_lock_init(&gcwq->lock); 3654 spin_lock_init(&gcwq->lock);
3537 INIT_LIST_HEAD(&gcwq->worklist); 3655 INIT_LIST_HEAD(&gcwq->worklist);
3538 gcwq->cpu = cpu; 3656 gcwq->cpu = cpu;
3539 if (cpu == WORK_CPU_UNBOUND) 3657 gcwq->flags |= GCWQ_DISASSOCIATED;
3540 gcwq->flags |= GCWQ_DISASSOCIATED;
3541 3658
3542 INIT_LIST_HEAD(&gcwq->idle_list); 3659 INIT_LIST_HEAD(&gcwq->idle_list);
3543 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) 3660 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
@@ -3561,6 +3678,8 @@ static int __init init_workqueues(void)
3561 struct global_cwq *gcwq = get_gcwq(cpu); 3678 struct global_cwq *gcwq = get_gcwq(cpu);
3562 struct worker *worker; 3679 struct worker *worker;
3563 3680
3681 if (cpu != WORK_CPU_UNBOUND)
3682 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3564 worker = create_worker(gcwq, true); 3683 worker = create_worker(gcwq, true);
3565 BUG_ON(!worker); 3684 BUG_ON(!worker);
3566 spin_lock_irq(&gcwq->lock); 3685 spin_lock_irq(&gcwq->lock);