Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 2
-rw-r--r--  kernel/Makefile | 8
-rw-r--r--  kernel/audit.c | 8
-rw-r--r--  kernel/audit_tree.c | 2
-rw-r--r--  kernel/audit_watch.c | 85
-rw-r--r--  kernel/auditfilter.c | 10
-rw-r--r--  kernel/auditsc.c | 29
-rw-r--r--  kernel/bounds.c | 2
-rw-r--r--  kernel/capability.c | 112
-rw-r--r--  kernel/cgroup.c | 678
-rw-r--r--  kernel/cgroup_freezer.c | 26
-rw-r--r--  kernel/compat.c | 191
-rw-r--r--  kernel/cpu.c | 13
-rw-r--r--  kernel/cpuset.c | 178
-rw-r--r--  kernel/crash_dump.c | 34
-rw-r--r--  kernel/cred.c | 14
-rw-r--r--  kernel/debug/debug_core.c | 2
-rw-r--r--  kernel/debug/gdbstub.c | 30
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 10
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 2
-rw-r--r--  kernel/events/Makefile | 6
-rw-r--r--  kernel/events/core.c (renamed from kernel/perf_event.c) | 1096
-rw-r--r--  kernel/events/hw_breakpoint.c (renamed from kernel/hw_breakpoint.c) | 0
-rw-r--r--  kernel/exit.c | 115
-rw-r--r--  kernel/extable.c | 18
-rw-r--r--  kernel/fork.c | 270
-rw-r--r--  kernel/freezer.c | 4
-rw-r--r--  kernel/futex.c | 160
-rw-r--r--  kernel/futex_compat.c | 11
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/gcov/Makefile | 2
-rw-r--r--  kernel/groups.c | 2
-rw-r--r--  kernel/hrtimer.c | 228
-rw-r--r--  kernel/hung_task.c | 2
-rw-r--r--  kernel/irq/Kconfig | 50
-rw-r--r--  kernel/irq/Makefile | 1
-rw-r--r--  kernel/irq/autoprobe.c | 52
-rw-r--r--  kernel/irq/chip.c | 699
-rw-r--r--  kernel/irq/debug.h | 45
-rw-r--r--  kernel/irq/dummychip.c | 9
-rw-r--r--  kernel/irq/generic-chip.c | 354
-rw-r--r--  kernel/irq/handle.c | 129
-rw-r--r--  kernel/irq/internals.h | 161
-rw-r--r--  kernel/irq/irqdesc.c | 108
-rw-r--r--  kernel/irq/manage.c | 634
-rw-r--r--  kernel/irq/migration.c | 29
-rw-r--r--  kernel/irq/pm.c | 30
-rw-r--r--  kernel/irq/proc.c | 143
-rw-r--r--  kernel/irq/resend.c | 18
-rw-r--r--  kernel/irq/settings.h | 142
-rw-r--r--  kernel/irq/spurious.c | 162
-rw-r--r--  kernel/jump_label.c | 539
-rw-r--r--  kernel/kallsyms.c | 58
-rw-r--r--  kernel/kexec.c | 17
-rw-r--r--  kernel/kmod.c | 116
-rw-r--r--  kernel/ksysfs.c | 10
-rw-r--r--  kernel/kthread.c | 33
-rw-r--r--  kernel/latencytop.c | 2
-rw-r--r--  kernel/lockdep.c | 210
-rw-r--r--  kernel/lockdep_proc.c | 9
-rw-r--r--  kernel/module.c | 119
-rw-r--r--  kernel/mutex-debug.c | 2
-rw-r--r--  kernel/mutex-debug.h | 2
-rw-r--r--  kernel/mutex.c | 36
-rw-r--r--  kernel/mutex.h | 2
-rw-r--r--  kernel/ns_cgroup.c | 118
-rw-r--r--  kernel/nsproxy.c | 50
-rw-r--r--  kernel/padata.c | 8
-rw-r--r--  kernel/panic.c | 10
-rw-r--r--  kernel/params.c | 25
-rw-r--r--  kernel/pid.c | 7
-rw-r--r--  kernel/pid_namespace.c | 11
-rw-r--r--  kernel/pm_qos_params.c | 24
-rw-r--r--  kernel/posix-cpu-timers.c | 112
-rw-r--r--  kernel/posix-timers.c | 369
-rw-r--r--  kernel/power/Kconfig | 239
-rw-r--r--  kernel/power/Makefile | 3
-rw-r--r--  kernel/power/block_io.c | 2
-rw-r--r--  kernel/power/hibernate.c | 53
-rw-r--r--  kernel/power/main.c | 8
-rw-r--r--  kernel/power/power.h | 4
-rw-r--r--  kernel/power/process.c | 6
-rw-r--r--  kernel/power/snapshot.c | 32
-rw-r--r--  kernel/power/suspend.c | 11
-rw-r--r--  kernel/power/user.c | 5
-rw-r--r--  kernel/printk.c | 315
-rw-r--r--  kernel/profile.c | 22
-rw-r--r--  kernel/ptrace.c | 170
-rw-r--r--  kernel/rcupdate.c | 28
-rw-r--r--  kernel/rcutiny.c | 46
-rw-r--r--  kernel/rcutiny_plugin.h | 205
-rw-r--r--  kernel/rcutorture.c | 27
-rw-r--r--  kernel/rcutree.c | 527
-rw-r--r--  kernel/rcutree.h | 104
-rw-r--r--  kernel/rcutree_plugin.h | 568
-rw-r--r--  kernel/rcutree_trace.c | 180
-rw-r--r--  kernel/res_counter.c | 14
-rw-r--r--  kernel/rtmutex-debug.c | 1
-rw-r--r--  kernel/rtmutex-tester.c | 40
-rw-r--r--  kernel/rtmutex.c | 318
-rw-r--r--  kernel/rtmutex_common.h | 16
-rw-r--r--  kernel/sched.c | 2122
-rw-r--r--  kernel/sched_autogroup.c | 17
-rw-r--r--  kernel/sched_autogroup.h | 5
-rw-r--r--  kernel/sched_debug.c | 8
-rw-r--r--  kernel/sched_fair.c | 598
-rw-r--r--  kernel/sched_features.h | 6
-rw-r--r--  kernel/sched_idletask.c | 30
-rw-r--r--  kernel/sched_rt.c | 114
-rw-r--r--  kernel/sched_stoptask.c | 14
-rw-r--r--  kernel/signal.c | 869
-rw-r--r--  kernel/smp.c | 152
-rw-r--r--  kernel/softirq.c | 33
-rw-r--r--  kernel/stop_machine.c | 6
-rw-r--r--  kernel/sys.c | 86
-rw-r--r--  kernel/sys_ni.c | 14
-rw-r--r--  kernel/sysctl.c | 86
-rw-r--r--  kernel/sysctl_binary.c | 19
-rw-r--r--  kernel/sysctl_check.c | 10
-rw-r--r--  kernel/taskstats.c | 2
-rw-r--r--  kernel/time.c | 35
-rw-r--r--  kernel/time/Makefile | 3
-rw-r--r--  kernel/time/alarmtimer.c | 702
-rw-r--r--  kernel/time/clockevents.c | 65
-rw-r--r--  kernel/time/clocksource.c | 42
-rw-r--r--  kernel/time/jiffies.c | 22
-rw-r--r--  kernel/time/ntp.c | 15
-rw-r--r--  kernel/time/posix-clock.c | 445
-rw-r--r--  kernel/time/tick-broadcast.c | 39
-rw-r--r--  kernel/time/tick-common.c | 7
-rw-r--r--  kernel/time/tick-internal.h | 12
-rw-r--r--  kernel/time/tick-oneshot.c | 1
-rw-r--r--  kernel/time/tick-sched.c | 1
-rw-r--r--  kernel/time/timekeeping.c | 241
-rw-r--r--  kernel/time/timer_list.c | 4
-rw-r--r--  kernel/time/timer_stats.c | 2
-rw-r--r--  kernel/timer.c | 50
-rw-r--r--  kernel/trace/Kconfig | 6
-rw-r--r--  kernel/trace/blktrace.c | 71
-rw-r--r--  kernel/trace/ftrace.c | 1318
-rw-r--r--  kernel/trace/ring_buffer.c | 30
-rw-r--r--  kernel/trace/trace.c | 56
-rw-r--r--  kernel/trace/trace.h | 43
-rw-r--r--  kernel/trace/trace_clock.c | 2
-rw-r--r--  kernel/trace/trace_entries.h | 8
-rw-r--r--  kernel/trace/trace_events.c | 3
-rw-r--r--  kernel/trace/trace_events_filter.c | 885
-rw-r--r--  kernel/trace/trace_functions.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 2
-rw-r--r--  kernel/trace/trace_irqsoff.c | 3
-rw-r--r--  kernel/trace/trace_kprobe.c | 114
-rw-r--r--  kernel/trace/trace_output.c | 39
-rw-r--r--  kernel/trace/trace_printk.c | 120
-rw-r--r--  kernel/trace/trace_sched_switch.c | 48
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 1
-rw-r--r--  kernel/trace/trace_selftest.c | 214
-rw-r--r--  kernel/trace/trace_selftest_dynamic.c | 6
-rw-r--r--  kernel/trace/trace_stack.c | 1
-rw-r--r--  kernel/trace/trace_syscalls.c | 42
-rw-r--r--  kernel/tracepoint.c | 23
-rw-r--r--  kernel/uid16.c | 2
-rw-r--r--  kernel/user-return-notifier.c | 2
-rw-r--r--  kernel/user.c | 8
-rw-r--r--  kernel/utsname.c | 51
-rw-r--r--  kernel/wait.c | 2
-rw-r--r--  kernel/watchdog.c | 92
-rw-r--r--  kernel/workqueue.c | 67
167 files changed, 13748 insertions, 6341 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 88c92fb44618..5068e2a4e75f 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -199,4 +199,4 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
 	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
 
 config MUTEX_SPIN_ON_OWNER
-	def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES
+	def_bool SMP && !DEBUG_MUTEXES
diff --git a/kernel/Makefile b/kernel/Makefile
index 353d3fe8ba33..2d64cfcc8b42 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_perf_event.o = -pg
 CFLAGS_REMOVE_irq_work.o = -pg
 endif
 
@@ -62,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
-obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
@@ -103,10 +101,12 @@ obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
-obj-$(CONFIG_PERF_EVENTS) += perf_event.o
-obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+
+obj-$(CONFIG_PERF_EVENTS) += events/
+
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 obj-$(CONFIG_PADATA) += padata.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit.c b/kernel/audit.c
index e4956244ae50..939500317066 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -74,6 +74,8 @@ static int audit_initialized;
 int audit_enabled;
 int audit_ever_enabled;
 
+EXPORT_SYMBOL_GPL(audit_enabled);
+
 /* Default state when kernel boots without any parameters. */
 static int audit_default;
 
@@ -671,9 +673,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 	pid = NETLINK_CREDS(skb)->pid;
 	uid = NETLINK_CREDS(skb)->uid;
-	loginuid = NETLINK_CB(skb).loginuid;
-	sessionid = NETLINK_CB(skb).sessionid;
-	sid = NETLINK_CB(skb).sid;
+	loginuid = audit_get_loginuid(current);
+	sessionid = audit_get_sessionid(current);
+	security_task_getsecid(current, &sid);
 	seq = nlh->nlmsg_seq;
 	data = NLMSG_DATA(nlh);
 
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 37b2bea170c8..e99dda04b126 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -607,7 +607,7 @@ void audit_trim_trees(void)
 		spin_lock(&hash_lock);
 		list_for_each_entry(node, &tree->chunks, list) {
 			struct audit_chunk *chunk = find_chunk(node);
-			/* this could be NULL if the watch is dieing else where... */
+			/* this could be NULL if the watch is dying else where... */
 			struct inode *inode = chunk->mark.i.inode;
 			node->index |= 1U<<31;
 			if (iterate_mounts(compare_root, inode, root_mnt))
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index d2e3c7866460..e683869365d9 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
144} 144}
145 145
146/* Initialize a parent watch entry. */ 146/* Initialize a parent watch entry. */
147static struct audit_parent *audit_init_parent(struct nameidata *ndp) 147static struct audit_parent *audit_init_parent(struct path *path)
148{ 148{
149 struct inode *inode = ndp->path.dentry->d_inode; 149 struct inode *inode = path->dentry->d_inode;
150 struct audit_parent *parent; 150 struct audit_parent *parent;
151 int ret; 151 int ret;
152 152
@@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
353} 353}
354 354
355/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
356static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) 356static int audit_get_nd(struct audit_watch *watch, struct path *parent)
357{ 357{
358 struct nameidata *ndparent, *ndwatch; 358 struct nameidata nd;
359 struct dentry *d;
359 int err; 360 int err;
360 361
361 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); 362 err = kern_path_parent(watch->path, &nd);
362 if (unlikely(!ndparent)) 363 if (err)
363 return -ENOMEM; 364 return err;
364 365
365 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); 366 if (nd.last_type != LAST_NORM) {
366 if (unlikely(!ndwatch)) { 367 path_put(&nd.path);
367 kfree(ndparent); 368 return -EINVAL;
368 return -ENOMEM;
369 } 369 }
370 370
371 err = path_lookup(path, LOOKUP_PARENT, ndparent); 371 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
372 if (err) { 372 d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
373 kfree(ndparent); 373 if (IS_ERR(d)) {
374 kfree(ndwatch); 374 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
375 return err; 375 path_put(&nd.path);
376 return PTR_ERR(d);
376 } 377 }
377 378 if (d->d_inode) {
378 err = path_lookup(path, 0, ndwatch); 379 /* update watch filter fields */
379 if (err) { 380 watch->dev = d->d_inode->i_sb->s_dev;
380 kfree(ndwatch); 381 watch->ino = d->d_inode->i_ino;
381 ndwatch = NULL;
382 } 382 }
383 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
383 384
384 *ndp = ndparent; 385 *parent = nd.path;
385 *ndw = ndwatch; 386 dput(d);
386
387 return 0; 387 return 0;
388} 388}
389 389
390/* Release resources used for watch path information. */
391static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
392{
393 if (ndp) {
394 path_put(&ndp->path);
395 kfree(ndp);
396 }
397 if (ndw) {
398 path_put(&ndw->path);
399 kfree(ndw);
400 }
401}
402
403/* Associate the given rule with an existing parent. 390/* Associate the given rule with an existing parent.
404 * Caller must hold audit_filter_mutex. */ 391 * Caller must hold audit_filter_mutex. */
405static void audit_add_to_parent(struct audit_krule *krule, 392static void audit_add_to_parent(struct audit_krule *krule,
@@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
440{ 427{
441 struct audit_watch *watch = krule->watch; 428 struct audit_watch *watch = krule->watch;
442 struct audit_parent *parent; 429 struct audit_parent *parent;
443 struct nameidata *ndp = NULL, *ndw = NULL; 430 struct path parent_path;
444 int h, ret = 0; 431 int h, ret = 0;
445 432
446 mutex_unlock(&audit_filter_mutex); 433 mutex_unlock(&audit_filter_mutex);
447 434
448 /* Avoid calling path_lookup under audit_filter_mutex. */ 435 /* Avoid calling path_lookup under audit_filter_mutex. */
449 ret = audit_get_nd(watch->path, &ndp, &ndw); 436 ret = audit_get_nd(watch, &parent_path);
450 if (ret) {
451 /* caller expects mutex locked */
452 mutex_lock(&audit_filter_mutex);
453 goto error;
454 }
455 437
438 /* caller expects mutex locked */
456 mutex_lock(&audit_filter_mutex); 439 mutex_lock(&audit_filter_mutex);
457 440
458 /* update watch filter fields */ 441 if (ret)
459 if (ndw) { 442 return ret;
460 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
461 watch->ino = ndw->path.dentry->d_inode->i_ino;
462 }
463 443
464 /* either find an old parent or attach a new one */ 444 /* either find an old parent or attach a new one */
465 parent = audit_find_parent(ndp->path.dentry->d_inode); 445 parent = audit_find_parent(parent_path.dentry->d_inode);
466 if (!parent) { 446 if (!parent) {
467 parent = audit_init_parent(ndp); 447 parent = audit_init_parent(&parent_path);
468 if (IS_ERR(parent)) { 448 if (IS_ERR(parent)) {
469 ret = PTR_ERR(parent); 449 ret = PTR_ERR(parent);
470 goto error; 450 goto error;
@@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
479 h = audit_hash_ino((u32)watch->ino); 459 h = audit_hash_ino((u32)watch->ino);
480 *list = &audit_inode_hash[h]; 460 *list = &audit_inode_hash[h];
481error: 461error:
482 audit_put_nd(ndp, ndw); /* NULL args OK */ 462 path_put(&parent_path);
483 return ret; 463 return ret;
484
485} 464}
486 465
487void audit_remove_watch_rule(struct audit_krule *krule) 466void audit_remove_watch_rule(struct audit_krule *krule)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index add2819af71b..f8277c80d678 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1238,6 +1238,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1238 for (i = 0; i < rule->field_count; i++) { 1238 for (i = 0; i < rule->field_count; i++) {
1239 struct audit_field *f = &rule->fields[i]; 1239 struct audit_field *f = &rule->fields[i];
1240 int result = 0; 1240 int result = 0;
1241 u32 sid;
1241 1242
1242 switch (f->type) { 1243 switch (f->type) {
1243 case AUDIT_PID: 1244 case AUDIT_PID:
@@ -1250,19 +1251,22 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1250 result = audit_comparator(cb->creds.gid, f->op, f->val); 1251 result = audit_comparator(cb->creds.gid, f->op, f->val);
1251 break; 1252 break;
1252 case AUDIT_LOGINUID: 1253 case AUDIT_LOGINUID:
1253 result = audit_comparator(cb->loginuid, f->op, f->val); 1254 result = audit_comparator(audit_get_loginuid(current),
1255 f->op, f->val);
1254 break; 1256 break;
1255 case AUDIT_SUBJ_USER: 1257 case AUDIT_SUBJ_USER:
1256 case AUDIT_SUBJ_ROLE: 1258 case AUDIT_SUBJ_ROLE:
1257 case AUDIT_SUBJ_TYPE: 1259 case AUDIT_SUBJ_TYPE:
1258 case AUDIT_SUBJ_SEN: 1260 case AUDIT_SUBJ_SEN:
1259 case AUDIT_SUBJ_CLR: 1261 case AUDIT_SUBJ_CLR:
1260 if (f->lsm_rule) 1262 if (f->lsm_rule) {
1261 result = security_audit_rule_match(cb->sid, 1263 security_task_getsecid(current, &sid);
1264 result = security_audit_rule_match(sid,
1262 f->type, 1265 f->type,
1263 f->op, 1266 f->op,
1264 f->lsm_rule, 1267 f->lsm_rule,
1265 NULL); 1268 NULL);
1269 }
1266 break; 1270 break;
1267 } 1271 }
1268 1272
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f49a0318c2ed..00d79df03e76 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -443,17 +443,25 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
443 443
444/* Determine if any context name data matches a rule's watch data */ 444/* Determine if any context name data matches a rule's watch data */
445/* Compare a task_struct with an audit_rule. Return 1 on match, 0 445/* Compare a task_struct with an audit_rule. Return 1 on match, 0
446 * otherwise. */ 446 * otherwise.
447 *
448 * If task_creation is true, this is an explicit indication that we are
449 * filtering a task rule at task creation time. This and tsk == current are
450 * the only situations where tsk->cred may be accessed without an rcu read lock.
451 */
447static int audit_filter_rules(struct task_struct *tsk, 452static int audit_filter_rules(struct task_struct *tsk,
448 struct audit_krule *rule, 453 struct audit_krule *rule,
449 struct audit_context *ctx, 454 struct audit_context *ctx,
450 struct audit_names *name, 455 struct audit_names *name,
451 enum audit_state *state) 456 enum audit_state *state,
457 bool task_creation)
452{ 458{
453 const struct cred *cred = get_task_cred(tsk); 459 const struct cred *cred;
454 int i, j, need_sid = 1; 460 int i, j, need_sid = 1;
455 u32 sid; 461 u32 sid;
456 462
463 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
464
457 for (i = 0; i < rule->field_count; i++) { 465 for (i = 0; i < rule->field_count; i++) {
458 struct audit_field *f = &rule->fields[i]; 466 struct audit_field *f = &rule->fields[i];
459 int result = 0; 467 int result = 0;
@@ -637,10 +645,8 @@ static int audit_filter_rules(struct task_struct *tsk,
637 break; 645 break;
638 } 646 }
639 647
640 if (!result) { 648 if (!result)
641 put_cred(cred);
642 return 0; 649 return 0;
643 }
644 } 650 }
645 651
646 if (ctx) { 652 if (ctx) {
@@ -656,7 +662,6 @@ static int audit_filter_rules(struct task_struct *tsk,
656 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 662 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
657 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 663 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
658 } 664 }
659 put_cred(cred);
660 return 1; 665 return 1;
661} 666}
662 667
@@ -671,7 +676,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
671 676
672 rcu_read_lock(); 677 rcu_read_lock();
673 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { 678 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
674 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { 679 if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
680 &state, true)) {
675 if (state == AUDIT_RECORD_CONTEXT) 681 if (state == AUDIT_RECORD_CONTEXT)
676 *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); 682 *key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
677 rcu_read_unlock(); 683 rcu_read_unlock();
@@ -705,7 +711,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
705 list_for_each_entry_rcu(e, list, list) { 711 list_for_each_entry_rcu(e, list, list) {
706 if ((e->rule.mask[word] & bit) == bit && 712 if ((e->rule.mask[word] & bit) == bit &&
707 audit_filter_rules(tsk, &e->rule, ctx, NULL, 713 audit_filter_rules(tsk, &e->rule, ctx, NULL,
708 &state)) { 714 &state, false)) {
709 rcu_read_unlock(); 715 rcu_read_unlock();
710 ctx->current_state = state; 716 ctx->current_state = state;
711 return state; 717 return state;
@@ -743,7 +749,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
743 749
744 list_for_each_entry_rcu(e, list, list) { 750 list_for_each_entry_rcu(e, list, list) {
745 if ((e->rule.mask[word] & bit) == bit && 751 if ((e->rule.mask[word] & bit) == bit &&
746 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { 752 audit_filter_rules(tsk, &e->rule, ctx, n,
753 &state, false)) {
747 rcu_read_unlock(); 754 rcu_read_unlock();
748 ctx->current_state = state; 755 ctx->current_state = state;
749 return; 756 return;
@@ -1011,7 +1018,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1011/* 1018/*
1012 * to_send and len_sent accounting are very loose estimates. We aren't 1019 * to_send and len_sent accounting are very loose estimates. We aren't
1013 * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being 1020 * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
1014 * within about 500 bytes (next page boundry) 1021 * within about 500 bytes (next page boundary)
1015 * 1022 *
1016 * why snprintf? an int is up to 12 digits long. if we just assumed when 1023 * why snprintf? an int is up to 12 digits long. if we just assumed when
1017 * logging that a[%d]= was going to be 16 characters long we would be wasting 1024 * logging that a[%d]= was going to be 16 characters long we would be wasting
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 98a51f26c136..0c9b862292b2 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,11 +9,13 @@
 #include <linux/page-flags.h>
 #include <linux/mmzone.h>
 #include <linux/kbuild.h>
+#include <linux/page_cgroup.h>
 
 void foo(void)
 {
 	/* The enum constants to put into include/generated/bounds.h */
 	DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
 	DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
+	DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
 	/* End of constants */
 }
diff --git a/kernel/capability.c b/kernel/capability.c
index 2f05303715a5..283c529f8b1c 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@
14#include <linux/security.h> 14#include <linux/security.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
17#include <linux/user_namespace.h>
17#include <asm/uaccess.h> 18#include <asm/uaccess.h>
18 19
19/* 20/*
@@ -21,12 +22,8 @@
21 */ 22 */
22 23
23const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; 24const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
24const kernel_cap_t __cap_full_set = CAP_FULL_SET;
25const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
26 25
27EXPORT_SYMBOL(__cap_empty_set); 26EXPORT_SYMBOL(__cap_empty_set);
28EXPORT_SYMBOL(__cap_full_set);
29EXPORT_SYMBOL(__cap_init_eff_set);
30 27
31int file_caps_enabled = 1; 28int file_caps_enabled = 1;
32 29
@@ -290,6 +287,60 @@ error:
290} 287}
291 288
292/** 289/**
290 * has_capability - Does a task have a capability in init_user_ns
291 * @t: The task in question
292 * @cap: The capability to be tested for
293 *
294 * Return true if the specified task has the given superior capability
295 * currently in effect to the initial user namespace, false if not.
296 *
297 * Note that this does not set PF_SUPERPRIV on the task.
298 */
299bool has_capability(struct task_struct *t, int cap)
300{
301 int ret = security_real_capable(t, &init_user_ns, cap);
302
303 return (ret == 0);
304}
305
306/**
307 * has_capability - Does a task have a capability in a specific user ns
308 * @t: The task in question
309 * @ns: target user namespace
310 * @cap: The capability to be tested for
311 *
312 * Return true if the specified task has the given superior capability
313 * currently in effect to the specified user namespace, false if not.
314 *
315 * Note that this does not set PF_SUPERPRIV on the task.
316 */
317bool has_ns_capability(struct task_struct *t,
318 struct user_namespace *ns, int cap)
319{
320 int ret = security_real_capable(t, ns, cap);
321
322 return (ret == 0);
323}
324
325/**
326 * has_capability_noaudit - Does a task have a capability (unaudited)
327 * @t: The task in question
328 * @cap: The capability to be tested for
329 *
330 * Return true if the specified task has the given superior capability
331 * currently in effect to init_user_ns, false if not. Don't write an
332 * audit message for the check.
333 *
334 * Note that this does not set PF_SUPERPRIV on the task.
335 */
336bool has_capability_noaudit(struct task_struct *t, int cap)
337{
338 int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
339
340 return (ret == 0);
341}
342
343/**
293 * capable - Determine if the current task has a superior capability in effect 344 * capable - Determine if the current task has a superior capability in effect
294 * @cap: The capability to be tested for 345 * @cap: The capability to be tested for
295 * 346 *
@@ -299,17 +350,60 @@ error:
299 * This sets PF_SUPERPRIV on the task if the capability is available on the 350 * This sets PF_SUPERPRIV on the task if the capability is available on the
300 * assumption that it's about to be used. 351 * assumption that it's about to be used.
301 */ 352 */
302int capable(int cap) 353bool capable(int cap)
354{
355 return ns_capable(&init_user_ns, cap);
356}
357EXPORT_SYMBOL(capable);
358
359/**
360 * ns_capable - Determine if the current task has a superior capability in effect
361 * @ns: The usernamespace we want the capability in
362 * @cap: The capability to be tested for
363 *
364 * Return true if the current task has the given superior capability currently
365 * available for use, false if not.
366 *
367 * This sets PF_SUPERPRIV on the task if the capability is available on the
368 * assumption that it's about to be used.
369 */
370bool ns_capable(struct user_namespace *ns, int cap)
303{ 371{
304 if (unlikely(!cap_valid(cap))) { 372 if (unlikely(!cap_valid(cap))) {
305 printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); 373 printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
306 BUG(); 374 BUG();
307 } 375 }
308 376
309 if (security_capable(cap) == 0) { 377 if (security_capable(ns, current_cred(), cap) == 0) {
310 current->flags |= PF_SUPERPRIV; 378 current->flags |= PF_SUPERPRIV;
311 return 1; 379 return true;
312 } 380 }
313 return 0; 381 return false;
382}
383EXPORT_SYMBOL(ns_capable);
384
385/**
386 * task_ns_capable - Determine whether current task has a superior
387 * capability targeted at a specific task's user namespace.
388 * @t: The task whose user namespace is targeted.
389 * @cap: The capability in question.
390 *
391 * Return true if it does, false otherwise.
392 */
393bool task_ns_capable(struct task_struct *t, int cap)
394{
395 return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
396}
397EXPORT_SYMBOL(task_ns_capable);
398
399/**
400 * nsown_capable - Check superior capability to one's own user_ns
401 * @cap: The capability in question
402 *
403 * Return true if the current task has the given superior capability
404 * targeted at its own user namespace.
405 */
406bool nsown_capable(int cap)
407{
408 return ns_capable(current_user_ns(), cap);
314} 409}
315EXPORT_SYMBOL(capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b24d7027b83c..2731d115d725 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,6 +57,7 @@
57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
58#include <linux/eventfd.h> 58#include <linux/eventfd.h>
59#include <linux/poll.h> 59#include <linux/poll.h>
60#include <linux/flex_array.h> /* used in cgroup_attach_proc */
60 61
61#include <asm/atomic.h> 62#include <asm/atomic.h>
62 63
@@ -157,7 +158,7 @@ struct css_id {
157}; 158};
158 159
159/* 160/*
160 * cgroup_event represents events which userspace want to recieve. 161 * cgroup_event represents events which userspace want to receive.
161 */ 162 */
162struct cgroup_event { 163struct cgroup_event {
163 /* 164 /*
@@ -326,12 +327,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
326 return &css_set_table[index]; 327 return &css_set_table[index];
327} 328}
328 329
329static void free_css_set_rcu(struct rcu_head *obj)
330{
331 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
332 kfree(cg);
333}
334
335/* We don't maintain the lists running through each css_set to its 330/* We don't maintain the lists running through each css_set to its
336 * task until after the first call to cgroup_iter_start(). This 331 * task until after the first call to cgroup_iter_start(). This
337 * reduces the fork()/exit() overhead for people who have cgroups 332 * reduces the fork()/exit() overhead for people who have cgroups
@@ -375,7 +370,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
375 } 370 }
376 371
377 write_unlock(&css_set_lock); 372 write_unlock(&css_set_lock);
378 call_rcu(&cg->rcu_head, free_css_set_rcu); 373 kfree_rcu(cg, rcu_head);
379} 374}
380 375
381/* 376/*
@@ -812,13 +807,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
812 return ret; 807 return ret;
813} 808}
814 809
815static void free_cgroup_rcu(struct rcu_head *obj)
816{
817 struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
818
819 kfree(cgrp);
820}
821
822static void cgroup_diput(struct dentry *dentry, struct inode *inode) 810static void cgroup_diput(struct dentry *dentry, struct inode *inode)
823{ 811{
824 /* is dentry a directory ? if so, kfree() associated cgroup */ 812 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -856,7 +844,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
856 */ 844 */
857 BUG_ON(!list_empty(&cgrp->pidlists)); 845 BUG_ON(!list_empty(&cgrp->pidlists));
858 846
859 call_rcu(&cgrp->rcu_head, free_cgroup_rcu); 847 kfree_rcu(cgrp, rcu_head);
860 } 848 }
861 iput(inode); 849 iput(inode);
862} 850}
@@ -1748,6 +1736,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1748} 1736}
1749EXPORT_SYMBOL_GPL(cgroup_path); 1737EXPORT_SYMBOL_GPL(cgroup_path);
1750 1738
1739/*
1740 * cgroup_task_migrate - move a task from one cgroup to another.
1741 *
1742 * 'guarantee' is set if the caller promises that a new css_set for the task
1743 * will already exist. If not set, this function might sleep, and can fail with
1744 * -ENOMEM. Otherwise, it can only fail with -ESRCH.
1745 */
1746static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1747 struct task_struct *tsk, bool guarantee)
1748{
1749 struct css_set *oldcg;
1750 struct css_set *newcg;
1751
1752 /*
1753 * get old css_set. we need to take task_lock and refcount it, because
1754 * an exiting task can change its css_set to init_css_set and drop its
1755 * old one without taking cgroup_mutex.
1756 */
1757 task_lock(tsk);
1758 oldcg = tsk->cgroups;
1759 get_css_set(oldcg);
1760 task_unlock(tsk);
1761
1762 /* locate or allocate a new css_set for this task. */
1763 if (guarantee) {
1764 /* we know the css_set we want already exists. */
1765 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1766 read_lock(&css_set_lock);
1767 newcg = find_existing_css_set(oldcg, cgrp, template);
1768 BUG_ON(!newcg);
1769 get_css_set(newcg);
1770 read_unlock(&css_set_lock);
1771 } else {
1772 might_sleep();
1773 /* find_css_set will give us newcg already referenced. */
1774 newcg = find_css_set(oldcg, cgrp);
1775 if (!newcg) {
1776 put_css_set(oldcg);
1777 return -ENOMEM;
1778 }
1779 }
1780 put_css_set(oldcg);
1781
1782 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
1783 task_lock(tsk);
1784 if (tsk->flags & PF_EXITING) {
1785 task_unlock(tsk);
1786 put_css_set(newcg);
1787 return -ESRCH;
1788 }
1789 rcu_assign_pointer(tsk->cgroups, newcg);
1790 task_unlock(tsk);
1791
1792 /* Update the css_set linked lists if we're using them */
1793 write_lock(&css_set_lock);
1794 if (!list_empty(&tsk->cg_list))
1795 list_move(&tsk->cg_list, &newcg->tasks);
1796 write_unlock(&css_set_lock);
1797
1798 /*
1799 * We just gained a reference on oldcg by taking it from the task. As
1800 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
1801 * it here; it will be freed under RCU.
1802 */
1803 put_css_set(oldcg);
1804
1805 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1806 return 0;
1807}
1808
1751/** 1809/**
1752 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1810 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
1753 * @cgrp: the cgroup the task is attaching to 1811 * @cgrp: the cgroup the task is attaching to
@@ -1758,11 +1816,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
1758 */ 1816 */
1759int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1817int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1760{ 1818{
1761 int retval = 0; 1819 int retval;
1762 struct cgroup_subsys *ss, *failed_ss = NULL; 1820 struct cgroup_subsys *ss, *failed_ss = NULL;
1763 struct cgroup *oldcgrp; 1821 struct cgroup *oldcgrp;
1764 struct css_set *cg;
1765 struct css_set *newcg;
1766 struct cgroupfs_root *root = cgrp->root; 1822 struct cgroupfs_root *root = cgrp->root;
1767 1823
1768 /* Nothing to do if the task is already in that cgroup */ 1824 /* Nothing to do if the task is already in that cgroup */
@@ -1772,7 +1828,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1772 1828
1773 for_each_subsys(root, ss) { 1829 for_each_subsys(root, ss) {
1774 if (ss->can_attach) { 1830 if (ss->can_attach) {
1775 retval = ss->can_attach(ss, cgrp, tsk, false); 1831 retval = ss->can_attach(ss, cgrp, tsk);
1776 if (retval) { 1832 if (retval) {
1777 /* 1833 /*
1778 * Remember on which subsystem the can_attach() 1834 * Remember on which subsystem the can_attach()
@@ -1784,48 +1840,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1784 goto out; 1840 goto out;
1785 } 1841 }
1786 } 1842 }
1843 if (ss->can_attach_task) {
1844 retval = ss->can_attach_task(cgrp, tsk);
1845 if (retval) {
1846 failed_ss = ss;
1847 goto out;
1848 }
1849 }
1787 } 1850 }
1788 1851
1789 task_lock(tsk); 1852 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
1790 cg = tsk->cgroups; 1853 if (retval)
1791 get_css_set(cg);
1792 task_unlock(tsk);
1793 /*
1794 * Locate or allocate a new css_set for this task,
1795 * based on its final set of cgroups
1796 */
1797 newcg = find_css_set(cg, cgrp);
1798 put_css_set(cg);
1799 if (!newcg) {
1800 retval = -ENOMEM;
1801 goto out;
1802 }
1803
1804 task_lock(tsk);
1805 if (tsk->flags & PF_EXITING) {
1806 task_unlock(tsk);
1807 put_css_set(newcg);
1808 retval = -ESRCH;
1809 goto out; 1854 goto out;
1810 }
1811 rcu_assign_pointer(tsk->cgroups, newcg);
1812 task_unlock(tsk);
1813
1814 /* Update the css_set linked lists if we're using them */
1815 write_lock(&css_set_lock);
1816 if (!list_empty(&tsk->cg_list)) {
1817 list_del(&tsk->cg_list);
1818 list_add(&tsk->cg_list, &newcg->tasks);
1819 }
1820 write_unlock(&css_set_lock);
1821 1855
1822 for_each_subsys(root, ss) { 1856 for_each_subsys(root, ss) {
1857 if (ss->pre_attach)
1858 ss->pre_attach(cgrp);
1859 if (ss->attach_task)
1860 ss->attach_task(cgrp, tsk);
1823 if (ss->attach) 1861 if (ss->attach)
1824 ss->attach(ss, cgrp, oldcgrp, tsk, false); 1862 ss->attach(ss, cgrp, oldcgrp, tsk);
1825 } 1863 }
1826 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1864
1827 synchronize_rcu(); 1865 synchronize_rcu();
1828 put_css_set(cg);
1829 1866
1830 /* 1867 /*
1831 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1868 * wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1844,7 +1881,7 @@ out:
1844 */ 1881 */
1845 break; 1882 break;
1846 if (ss->cancel_attach) 1883 if (ss->cancel_attach)
1847 ss->cancel_attach(ss, cgrp, tsk, false); 1884 ss->cancel_attach(ss, cgrp, tsk);
1848 } 1885 }
1849 } 1886 }
1850 return retval; 1887 return retval;
@@ -1875,49 +1912,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1875EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 1912EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
1876 1913
1877/* 1914/*
1878 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex 1915 * cgroup_attach_proc works in two stages, the first of which prefetches all
1879 * held. May take task_lock of task 1916 * new css_sets needed (to make sure we have enough memory before committing
1917 * to the move) and stores them in a list of entries of the following type.
1918 * TODO: possible optimization: use css_set->rcu_head for chaining instead
1919 */
1920struct cg_list_entry {
1921 struct css_set *cg;
1922 struct list_head links;
1923};
1924
1925static bool css_set_check_fetched(struct cgroup *cgrp,
1926 struct task_struct *tsk, struct css_set *cg,
1927 struct list_head *newcg_list)
1928{
1929 struct css_set *newcg;
1930 struct cg_list_entry *cg_entry;
1931 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1932
1933 read_lock(&css_set_lock);
1934 newcg = find_existing_css_set(cg, cgrp, template);
1935 if (newcg)
1936 get_css_set(newcg);
1937 read_unlock(&css_set_lock);
1938
1939 /* doesn't exist at all? */
1940 if (!newcg)
1941 return false;
1942 /* see if it's already in the list */
1943 list_for_each_entry(cg_entry, newcg_list, links) {
1944 if (cg_entry->cg == newcg) {
1945 put_css_set(newcg);
1946 return true;
1947 }
1948 }
1949
1950 /* not found */
1951 put_css_set(newcg);
1952 return false;
1953}
1954
1955/*
1956 * Find the new css_set and store it in the list in preparation for moving the
1957 * given task to the given cgroup. Returns 0 or -ENOMEM.
1958 */
1959static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
1960 struct list_head *newcg_list)
1961{
1962 struct css_set *newcg;
1963 struct cg_list_entry *cg_entry;
1964
1965 /* ensure a new css_set will exist for this thread */
1966 newcg = find_css_set(cg, cgrp);
1967 if (!newcg)
1968 return -ENOMEM;
1969 /* add it to the list */
1970 cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
1971 if (!cg_entry) {
1972 put_css_set(newcg);
1973 return -ENOMEM;
1974 }
1975 cg_entry->cg = newcg;
1976 list_add(&cg_entry->links, newcg_list);
1977 return 0;
1978}
1979
1980/**
1981 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
1982 * @cgrp: the cgroup to attach to
1983 * @leader: the threadgroup leader task_struct of the group to be attached
1984 *
1985 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
1986 * take task_lock of each thread in leader's threadgroup individually in turn.
1987 */
1988int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
1989{
1990 int retval, i, group_size;
1991 struct cgroup_subsys *ss, *failed_ss = NULL;
1992 bool cancel_failed_ss = false;
1993 /* guaranteed to be initialized later, but the compiler needs this */
1994 struct cgroup *oldcgrp = NULL;
1995 struct css_set *oldcg;
1996 struct cgroupfs_root *root = cgrp->root;
1997 /* threadgroup list cursor and array */
1998 struct task_struct *tsk;
1999 struct flex_array *group;
2000 /*
2001 * we need to make sure we have css_sets for all the tasks we're
2002 * going to move -before- we actually start moving them, so that in
2003 * case we get an ENOMEM we can bail out before making any changes.
2004 */
2005 struct list_head newcg_list;
2006 struct cg_list_entry *cg_entry, *temp_nobe;
2007
2008 /*
2009 * step 0: in order to do expensive, possibly blocking operations for
2010 * every thread, we cannot iterate the thread group list, since it needs
2011 * rcu or tasklist locked. instead, build an array of all threads in the
2012 * group - threadgroup_fork_lock prevents new threads from appearing,
2013 * and if threads exit, this will just be an over-estimate.
2014 */
2015 group_size = get_nr_threads(leader);
2016 /* flex_array supports very large thread-groups better than kmalloc. */
2017 group = flex_array_alloc(sizeof(struct task_struct *), group_size,
2018 GFP_KERNEL);
2019 if (!group)
2020 return -ENOMEM;
2021 /* pre-allocate to guarantee space while iterating in rcu read-side. */
2022 retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
2023 if (retval)
2024 goto out_free_group_list;
2025
2026 /* prevent changes to the threadgroup list while we take a snapshot. */
2027 rcu_read_lock();
2028 if (!thread_group_leader(leader)) {
2029 /*
2030 * a race with de_thread from another thread's exec() may strip
2031 * us of our leadership, making while_each_thread unsafe to use
2032 * on this task. if this happens, there is no choice but to
2033 * throw this task away and try again (from cgroup_procs_write);
2034 * this is "double-double-toil-and-trouble-check locking".
2035 */
2036 rcu_read_unlock();
2037 retval = -EAGAIN;
2038 goto out_free_group_list;
2039 }
2040 /* take a reference on each task in the group to go in the array. */
2041 tsk = leader;
2042 i = 0;
2043 do {
2044 /* as per above, nr_threads may decrease, but not increase. */
2045 BUG_ON(i >= group_size);
2046 get_task_struct(tsk);
2047 /*
2048 * saying GFP_ATOMIC has no effect here because we did prealloc
2049 * earlier, but it's good form to communicate our expectations.
2050 */
2051 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
2052 BUG_ON(retval != 0);
2053 i++;
2054 } while_each_thread(leader, tsk);
2055 /* remember the number of threads in the array for later. */
2056 group_size = i;
2057 rcu_read_unlock();
2058
2059 /*
2060 * step 1: check that we can legitimately attach to the cgroup.
2061 */
2062 for_each_subsys(root, ss) {
2063 if (ss->can_attach) {
2064 retval = ss->can_attach(ss, cgrp, leader);
2065 if (retval) {
2066 failed_ss = ss;
2067 goto out_cancel_attach;
2068 }
2069 }
2070 /* a callback to be run on every thread in the threadgroup. */
2071 if (ss->can_attach_task) {
2072 /* run on each task in the threadgroup. */
2073 for (i = 0; i < group_size; i++) {
2074 tsk = flex_array_get_ptr(group, i);
2075 retval = ss->can_attach_task(cgrp, tsk);
2076 if (retval) {
2077 failed_ss = ss;
2078 cancel_failed_ss = true;
2079 goto out_cancel_attach;
2080 }
2081 }
2082 }
2083 }
2084
2085 /*
2086 * step 2: make sure css_sets exist for all threads to be migrated.
2087 * we use find_css_set, which allocates a new one if necessary.
2088 */
2089 INIT_LIST_HEAD(&newcg_list);
2090 for (i = 0; i < group_size; i++) {
2091 tsk = flex_array_get_ptr(group, i);
2092 /* nothing to do if this task is already in the cgroup */
2093 oldcgrp = task_cgroup_from_root(tsk, root);
2094 if (cgrp == oldcgrp)
2095 continue;
2096 /* get old css_set pointer */
2097 task_lock(tsk);
2098 if (tsk->flags & PF_EXITING) {
2099 /* ignore this task if it's going away */
2100 task_unlock(tsk);
2101 continue;
2102 }
2103 oldcg = tsk->cgroups;
2104 get_css_set(oldcg);
2105 task_unlock(tsk);
2106 /* see if the new one for us is already in the list? */
2107 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2108 /* was already there, nothing to do. */
2109 put_css_set(oldcg);
2110 } else {
2111 /* we don't already have it. get new one. */
2112 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2113 put_css_set(oldcg);
2114 if (retval)
2115 goto out_list_teardown;
2116 }
2117 }
2118
2119 /*
2120 * step 3: now that we're guaranteed success wrt the css_sets, proceed
2121 * to move all tasks to the new cgroup, calling ss->attach_task for each
2122 * one along the way. there are no failure cases after here, so this is
2123 * the commit point.
2124 */
2125 for_each_subsys(root, ss) {
2126 if (ss->pre_attach)
2127 ss->pre_attach(cgrp);
2128 }
2129 for (i = 0; i < group_size; i++) {
2130 tsk = flex_array_get_ptr(group, i);
2131 /* leave current thread as it is if it's already there */
2132 oldcgrp = task_cgroup_from_root(tsk, root);
2133 if (cgrp == oldcgrp)
2134 continue;
2135 /* attach each task to each subsystem */
2136 for_each_subsys(root, ss) {
2137 if (ss->attach_task)
2138 ss->attach_task(cgrp, tsk);
2139 }
2140 /* if the thread is PF_EXITING, it can just get skipped. */
2141 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2142 BUG_ON(retval != 0 && retval != -ESRCH);
2143 }
2144 /* nothing is sensitive to fork() after this point. */
2145
2146 /*
2147 * step 4: do expensive, non-thread-specific subsystem callbacks.
2148 * TODO: if ever a subsystem needs to know the oldcgrp for each task
2149 * being moved, this call will need to be reworked to communicate that.
2150 */
2151 for_each_subsys(root, ss) {
2152 if (ss->attach)
2153 ss->attach(ss, cgrp, oldcgrp, leader);
2154 }
2155
2156 /*
2157 * step 5: success! and cleanup
2158 */
2159 synchronize_rcu();
2160 cgroup_wakeup_rmdir_waiter(cgrp);
2161 retval = 0;
2162out_list_teardown:
2163 /* clean up the list of prefetched css_sets. */
2164 list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
2165 list_del(&cg_entry->links);
2166 put_css_set(cg_entry->cg);
2167 kfree(cg_entry);
2168 }
2169out_cancel_attach:
2170 /* same deal as in cgroup_attach_task */
2171 if (retval) {
2172 for_each_subsys(root, ss) {
2173 if (ss == failed_ss) {
2174 if (cancel_failed_ss && ss->cancel_attach)
2175 ss->cancel_attach(ss, cgrp, leader);
2176 break;
2177 }
2178 if (ss->cancel_attach)
2179 ss->cancel_attach(ss, cgrp, leader);
2180 }
2181 }
2182 /* clean up the array of referenced threads in the group. */
2183 for (i = 0; i < group_size; i++) {
2184 tsk = flex_array_get_ptr(group, i);
2185 put_task_struct(tsk);
2186 }
2187out_free_group_list:
2188 flex_array_free(group);
2189 return retval;
2190}
2191
2192/*
2193 * Find the task_struct of the task to attach by vpid and pass it along to the
2194 * function to attach either it or all tasks in its threadgroup. Will take
2195 * cgroup_mutex; may take task_lock of task.
1880 */ 2196 */
1881static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) 2197static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
1882{ 2198{
1883 struct task_struct *tsk; 2199 struct task_struct *tsk;
1884 const struct cred *cred = current_cred(), *tcred; 2200 const struct cred *cred = current_cred(), *tcred;
1885 int ret; 2201 int ret;
1886 2202
2203 if (!cgroup_lock_live_group(cgrp))
2204 return -ENODEV;
2205
1887 if (pid) { 2206 if (pid) {
1888 rcu_read_lock(); 2207 rcu_read_lock();
1889 tsk = find_task_by_vpid(pid); 2208 tsk = find_task_by_vpid(pid);
1890 if (!tsk || tsk->flags & PF_EXITING) { 2209 if (!tsk) {
2210 rcu_read_unlock();
2211 cgroup_unlock();
2212 return -ESRCH;
2213 }
2214 if (threadgroup) {
2215 /*
2216 * RCU protects this access, since tsk was found in the
2217 * tid map. a race with de_thread may cause group_leader
2218 * to stop being the leader, but cgroup_attach_proc will
2219 * detect it later.
2220 */
2221 tsk = tsk->group_leader;
2222 } else if (tsk->flags & PF_EXITING) {
2223 /* optimization for the single-task-only case */
1891 rcu_read_unlock(); 2224 rcu_read_unlock();
2225 cgroup_unlock();
1892 return -ESRCH; 2226 return -ESRCH;
1893 } 2227 }
1894 2228
2229 /*
2230 * even if we're attaching all tasks in the thread group, we
2231 * only need to check permissions on one of them.
2232 */
1895 tcred = __task_cred(tsk); 2233 tcred = __task_cred(tsk);
1896 if (cred->euid && 2234 if (cred->euid &&
1897 cred->euid != tcred->uid && 2235 cred->euid != tcred->uid &&
1898 cred->euid != tcred->suid) { 2236 cred->euid != tcred->suid) {
1899 rcu_read_unlock(); 2237 rcu_read_unlock();
2238 cgroup_unlock();
1900 return -EACCES; 2239 return -EACCES;
1901 } 2240 }
1902 get_task_struct(tsk); 2241 get_task_struct(tsk);
1903 rcu_read_unlock(); 2242 rcu_read_unlock();
1904 } else { 2243 } else {
1905 tsk = current; 2244 if (threadgroup)
2245 tsk = current->group_leader;
2246 else
2247 tsk = current;
1906 get_task_struct(tsk); 2248 get_task_struct(tsk);
1907 } 2249 }
1908 2250
1909 ret = cgroup_attach_task(cgrp, tsk); 2251 if (threadgroup) {
2252 threadgroup_fork_write_lock(tsk);
2253 ret = cgroup_attach_proc(cgrp, tsk);
2254 threadgroup_fork_write_unlock(tsk);
2255 } else {
2256 ret = cgroup_attach_task(cgrp, tsk);
2257 }
1910 put_task_struct(tsk); 2258 put_task_struct(tsk);
2259 cgroup_unlock();
1911 return ret; 2260 return ret;
1912} 2261}
1913 2262
1914static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2263static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1915{ 2264{
2265 return attach_task_by_pid(cgrp, pid, false);
2266}
2267
2268static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2269{
1916 int ret; 2270 int ret;
1917 if (!cgroup_lock_live_group(cgrp)) 2271 do {
1918 return -ENODEV; 2272 /*
1919 ret = attach_task_by_pid(cgrp, pid); 2273 * attach_proc fails with -EAGAIN if threadgroup leadership
1920 cgroup_unlock(); 2274 * changes in the middle of the operation, in which case we need
2275 * to find the task_struct for the new leader and start over.
2276 */
2277 ret = attach_task_by_pid(cgrp, tgid, true);
2278 } while (ret == -EAGAIN);
1921 return ret; 2279 return ret;
1922} 2280}
1923 2281
@@ -3274,9 +3632,9 @@ static struct cftype files[] = {
3274 { 3632 {
3275 .name = CGROUP_FILE_GENERIC_PREFIX "procs", 3633 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
3276 .open = cgroup_procs_open, 3634 .open = cgroup_procs_open,
3277 /* .write_u64 = cgroup_procs_write, TODO */ 3635 .write_u64 = cgroup_procs_write,
3278 .release = cgroup_pidlist_release, 3636 .release = cgroup_pidlist_release,
3279 .mode = S_IRUGO, 3637 .mode = S_IRUGO | S_IWUSR,
3280 }, 3638 },
3281 { 3639 {
3282 .name = "notify_on_release", 3640 .name = "notify_on_release",
@@ -3655,12 +4013,12 @@ again:
3655 spin_lock(&release_list_lock); 4013 spin_lock(&release_list_lock);
3656 set_bit(CGRP_REMOVED, &cgrp->flags); 4014 set_bit(CGRP_REMOVED, &cgrp->flags);
3657 if (!list_empty(&cgrp->release_list)) 4015 if (!list_empty(&cgrp->release_list))
3658 list_del(&cgrp->release_list); 4016 list_del_init(&cgrp->release_list);
3659 spin_unlock(&release_list_lock); 4017 spin_unlock(&release_list_lock);
3660 4018
3661 cgroup_lock_hierarchy(cgrp->root); 4019 cgroup_lock_hierarchy(cgrp->root);
3662 /* delete this cgroup from parent->children */ 4020 /* delete this cgroup from parent->children */
3663 list_del(&cgrp->sibling); 4021 list_del_init(&cgrp->sibling);
3664 cgroup_unlock_hierarchy(cgrp->root); 4022 cgroup_unlock_hierarchy(cgrp->root);
3665 4023
3666 d = dget(cgrp->dentry); 4024 d = dget(cgrp->dentry);
@@ -3879,7 +4237,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
3879 subsys[ss->subsys_id] = NULL; 4237 subsys[ss->subsys_id] = NULL;
3880 4238
3881 /* remove subsystem from rootnode's list of subsystems */ 4239 /* remove subsystem from rootnode's list of subsystems */
3882 list_del(&ss->sibling); 4240 list_del_init(&ss->sibling);
3883 4241
3884 /* 4242 /*
3885 * disentangle the css from all css_sets attached to the dummytop. as 4243 * disentangle the css from all css_sets attached to the dummytop. as
@@ -4230,20 +4588,8 @@ void cgroup_post_fork(struct task_struct *child)
4230 */ 4588 */
4231void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4589void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4232{ 4590{
4233 int i;
4234 struct css_set *cg; 4591 struct css_set *cg;
4235 4592 int i;
4236 if (run_callbacks && need_forkexit_callback) {
4237 /*
4238 * modular subsystems can't use callbacks, so no need to lock
4239 * the subsys array
4240 */
4241 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4242 struct cgroup_subsys *ss = subsys[i];
4243 if (ss->exit)
4244 ss->exit(ss, tsk);
4245 }
4246 }
4247 4593
4248 /* 4594 /*
4249 * Unlink from the css_set task list if necessary. 4595 * Unlink from the css_set task list if necessary.
@@ -4253,7 +4599,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4253 if (!list_empty(&tsk->cg_list)) { 4599 if (!list_empty(&tsk->cg_list)) {
4254 write_lock(&css_set_lock); 4600 write_lock(&css_set_lock);
4255 if (!list_empty(&tsk->cg_list)) 4601 if (!list_empty(&tsk->cg_list))
4256 list_del(&tsk->cg_list); 4602 list_del_init(&tsk->cg_list);
4257 write_unlock(&css_set_lock); 4603 write_unlock(&css_set_lock);
4258 } 4604 }
4259 4605
@@ -4261,125 +4607,26 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4261 task_lock(tsk); 4607 task_lock(tsk);
4262 cg = tsk->cgroups; 4608 cg = tsk->cgroups;
4263 tsk->cgroups = &init_css_set; 4609 tsk->cgroups = &init_css_set;
4264 task_unlock(tsk);
4265 if (cg)
4266 put_css_set_taskexit(cg);
4267}
4268
4269/**
4270 * cgroup_clone - clone the cgroup the given subsystem is attached to
4271 * @tsk: the task to be moved
4272 * @subsys: the given subsystem
4273 * @nodename: the name for the new cgroup
4274 *
4275 * Duplicate the current cgroup in the hierarchy that the given
4276 * subsystem is attached to, and move this task into the new
4277 * child.
4278 */
4279int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
4280 char *nodename)
4281{
4282 struct dentry *dentry;
4283 int ret = 0;
4284 struct cgroup *parent, *child;
4285 struct inode *inode;
4286 struct css_set *cg;
4287 struct cgroupfs_root *root;
4288 struct cgroup_subsys *ss;
4289
4290 /* We shouldn't be called by an unregistered subsystem */
4291 BUG_ON(!subsys->active);
4292
4293 /* First figure out what hierarchy and cgroup we're dealing
4294 * with, and pin them so we can drop cgroup_mutex */
4295 mutex_lock(&cgroup_mutex);
4296 again:
4297 root = subsys->root;
4298 if (root == &rootnode) {
4299 mutex_unlock(&cgroup_mutex);
4300 return 0;
4301 }
4302 4610
4303 /* Pin the hierarchy */ 4611 if (run_callbacks && need_forkexit_callback) {
4304 if (!atomic_inc_not_zero(&root->sb->s_active)) { 4612 /*
4305 /* We race with the final deactivate_super() */ 4613 * modular subsystems can't use callbacks, so no need to lock
4306 mutex_unlock(&cgroup_mutex); 4614 * the subsys array
4307 return 0; 4615 */
4616 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4617 struct cgroup_subsys *ss = subsys[i];
4618 if (ss->exit) {
4619 struct cgroup *old_cgrp =
4620 rcu_dereference_raw(cg->subsys[i])->cgroup;
4621 struct cgroup *cgrp = task_cgroup(tsk, i);
4622 ss->exit(ss, cgrp, old_cgrp, tsk);
4623 }
4624 }
4308 } 4625 }
4309
4310 /* Keep the cgroup alive */
4311 task_lock(tsk);
4312 parent = task_cgroup(tsk, subsys->subsys_id);
4313 cg = tsk->cgroups;
4314 get_css_set(cg);
4315 task_unlock(tsk); 4626 task_unlock(tsk);
4316 4627
4317 mutex_unlock(&cgroup_mutex); 4628 if (cg)
4318 4629 put_css_set_taskexit(cg);
4319 /* Now do the VFS work to create a cgroup */
4320 inode = parent->dentry->d_inode;
4321
4322 /* Hold the parent directory mutex across this operation to
4323 * stop anyone else deleting the new cgroup */
4324 mutex_lock(&inode->i_mutex);
4325 dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
4326 if (IS_ERR(dentry)) {
4327 printk(KERN_INFO
4328 "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
4329 PTR_ERR(dentry));
4330 ret = PTR_ERR(dentry);
4331 goto out_release;
4332 }
4333
4334 /* Create the cgroup directory, which also creates the cgroup */
4335 ret = vfs_mkdir(inode, dentry, 0755);
4336 child = __d_cgrp(dentry);
4337 dput(dentry);
4338 if (ret) {
4339 printk(KERN_INFO
4340 "Failed to create cgroup %s: %d\n", nodename,
4341 ret);
4342 goto out_release;
4343 }
4344
4345 /* The cgroup now exists. Retake cgroup_mutex and check
4346 * that we're still in the same state that we thought we
4347 * were. */
4348 mutex_lock(&cgroup_mutex);
4349 if ((root != subsys->root) ||
4350 (parent != task_cgroup(tsk, subsys->subsys_id))) {
4351 /* Aargh, we raced ... */
4352 mutex_unlock(&inode->i_mutex);
4353 put_css_set(cg);
4354
4355 deactivate_super(root->sb);
4356 /* The cgroup is still accessible in the VFS, but
4357 * we're not going to try to rmdir() it at this
4358 * point. */
4359 printk(KERN_INFO
4360 "Race in cgroup_clone() - leaking cgroup %s\n",
4361 nodename);
4362 goto again;
4363 }
4364
4365 /* do any required auto-setup */
4366 for_each_subsys(root, ss) {
4367 if (ss->post_clone)
4368 ss->post_clone(ss, child);
4369 }
4370
4371 /* All seems fine. Finish by moving the task into the new cgroup */
4372 ret = cgroup_attach_task(child, tsk);
4373 mutex_unlock(&cgroup_mutex);
4374
4375 out_release:
4376 mutex_unlock(&inode->i_mutex);
4377
4378 mutex_lock(&cgroup_mutex);
4379 put_css_set(cg);
4380 mutex_unlock(&cgroup_mutex);
4381 deactivate_super(root->sb);
4382 return ret;
4383} 4630}
4384 4631
4385/** 4632/**
@@ -4620,14 +4867,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
4620 return ret; 4867 return ret;
4621} 4868}
4622 4869
4623static void __free_css_id_cb(struct rcu_head *head)
4624{
4625 struct css_id *id;
4626
4627 id = container_of(head, struct css_id, rcu_head);
4628 kfree(id);
4629}
4630
4631void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 4870void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4632{ 4871{
4633 struct css_id *id = css->id; 4872 struct css_id *id = css->id;
@@ -4642,7 +4881,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4642 spin_lock(&ss->id_lock); 4881 spin_lock(&ss->id_lock);
4643 idr_remove(&ss->idr, id->id); 4882 idr_remove(&ss->idr, id->id);
4644 spin_unlock(&ss->id_lock); 4883 spin_unlock(&ss->id_lock);
4645 call_rcu(&id->rcu_head, __free_css_id_cb); 4884 kfree_rcu(id, rcu_head);
4646} 4885}
4647EXPORT_SYMBOL_GPL(free_css_id); 4886EXPORT_SYMBOL_GPL(free_css_id);
4648 4887
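The free_css_id() hunk above replaces a call_rcu() callback whose only job was to kfree() the object with kfree_rcu(), which expresses exactly that pattern without a dedicated callback. A minimal sketch of the two equivalent forms, using a made-up struct foo rather than the real struct css_id:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
        int data;
        struct rcu_head rcu_head;
};

/* Old form: an RCU callback that only resolves the container and frees it. */
static void foo_free_cb(struct rcu_head *head)
{
        kfree(container_of(head, struct foo, rcu_head));
}

static void foo_release_old(struct foo *f)
{
        call_rcu(&f->rcu_head, foo_free_cb);
}

/* New form: kfree_rcu() takes the pointer and the name of its rcu_head member. */
static void foo_release_new(struct foo *f)
{
        kfree_rcu(f, rcu_head);
}

Both defer the kfree() until after an RCU grace period; kfree_rcu() just removes the boilerplate callback, which is what this diff does for struct css_id here and for struct perf_event_context further down.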
@@ -4813,6 +5052,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
4813 return ret; 5052 return ret;
4814} 5053}
4815 5054
5055/*
5056 * get corresponding css from file open on cgroupfs directory
5057 */
5058struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5059{
5060 struct cgroup *cgrp;
5061 struct inode *inode;
5062 struct cgroup_subsys_state *css;
5063
5064 inode = f->f_dentry->d_inode;
5065 /* check in cgroup filesystem dir */
5066 if (inode->i_op != &cgroup_dir_inode_operations)
5067 return ERR_PTR(-EBADF);
5068
5069 if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
5070 return ERR_PTR(-EINVAL);
5071
5072 /* get cgroup */
5073 cgrp = __d_cgrp(f->f_dentry);
5074 css = cgrp->subsys[id];
5075 return css ? css : ERR_PTR(-ENOENT);
5076}
5077
4816#ifdef CONFIG_CGROUP_DEBUG 5078#ifdef CONFIG_CGROUP_DEBUG
4817static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 5079static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
4818 struct cgroup *cont) 5080 struct cgroup *cont)
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e7bebb7c6c38..e691818d7e45 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss,
160 */ 160 */
161static int freezer_can_attach(struct cgroup_subsys *ss, 161static int freezer_can_attach(struct cgroup_subsys *ss,
162 struct cgroup *new_cgroup, 162 struct cgroup *new_cgroup,
163 struct task_struct *task, bool threadgroup) 163 struct task_struct *task)
164{ 164{
165 struct freezer *freezer; 165 struct freezer *freezer;
166 166
@@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
172 if (freezer->state != CGROUP_THAWED) 172 if (freezer->state != CGROUP_THAWED)
173 return -EBUSY; 173 return -EBUSY;
174 174
175 return 0;
176}
177
178static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
179{
175 rcu_read_lock(); 180 rcu_read_lock();
176 if (__cgroup_freezing_or_frozen(task)) { 181 if (__cgroup_freezing_or_frozen(tsk)) {
177 rcu_read_unlock(); 182 rcu_read_unlock();
178 return -EBUSY; 183 return -EBUSY;
179 } 184 }
180 rcu_read_unlock(); 185 rcu_read_unlock();
181
182 if (threadgroup) {
183 struct task_struct *c;
184
185 rcu_read_lock();
186 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
187 if (__cgroup_freezing_or_frozen(c)) {
188 rcu_read_unlock();
189 return -EBUSY;
190 }
191 }
192 rcu_read_unlock();
193 }
194
195 return 0; 186 return 0;
196} 187}
197 188
@@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = {
390 .populate = freezer_populate, 381 .populate = freezer_populate,
391 .subsys_id = freezer_subsys_id, 382 .subsys_id = freezer_subsys_id,
392 .can_attach = freezer_can_attach, 383 .can_attach = freezer_can_attach,
384 .can_attach_task = freezer_can_attach_task,
385 .pre_attach = NULL,
386 .attach_task = NULL,
393 .attach = NULL, 387 .attach = NULL,
394 .fork = freezer_fork, 388 .fork = freezer_fork,
395 .exit = NULL, 389 .exit = NULL,
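The freezer (and cpuset further down) lose their open-coded thread_group loops because the cgroup core now performs the per-thread validation itself through the new ->can_attach_task() callback, while ->can_attach() keeps the once-per-cgroup checks. The following is only a rough sketch of that calling convention under the post-patch cgroup_subsys layout, not the real cgroup_attach_proc(), whose locking and error unwinding are more involved:

#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/sched.h>

static int attach_threadgroup_sketch(struct cgroup_subsys *ss,
                                     struct cgroup *cgrp,
                                     struct task_struct *leader)
{
        struct task_struct *t;
        int ret = 0;

        /* One cgroup-wide check ... */
        if (ss->can_attach) {
                ret = ss->can_attach(ss, cgrp, leader);
                if (ret)
                        return ret;
        }

        if (!ss->can_attach_task)
                return 0;

        /* ... then one per-thread check for the leader and every other thread. */
        ret = ss->can_attach_task(cgrp, leader);
        if (ret)
                return ret;

        rcu_read_lock();
        list_for_each_entry_rcu(t, &leader->thread_group, thread_group) {
                ret = ss->can_attach_task(cgrp, t);
                if (ret)
                        break;
        }
        rcu_read_unlock();
        return ret;
}

With this split, a subsystem that only cares about per-thread state, like the freezer check above, no longer needs to know whether a single thread or a whole threadgroup is being moved.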
diff --git a/kernel/compat.c b/kernel/compat.c
index c9e2ec0b34a8..fc9eb093acd5 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o,
52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; 52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
53} 53}
54 54
55static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
56{
57 memset(txc, 0, sizeof(struct timex));
58
59 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
60 __get_user(txc->modes, &utp->modes) ||
61 __get_user(txc->offset, &utp->offset) ||
62 __get_user(txc->freq, &utp->freq) ||
63 __get_user(txc->maxerror, &utp->maxerror) ||
64 __get_user(txc->esterror, &utp->esterror) ||
65 __get_user(txc->status, &utp->status) ||
66 __get_user(txc->constant, &utp->constant) ||
67 __get_user(txc->precision, &utp->precision) ||
68 __get_user(txc->tolerance, &utp->tolerance) ||
69 __get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
70 __get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
71 __get_user(txc->tick, &utp->tick) ||
72 __get_user(txc->ppsfreq, &utp->ppsfreq) ||
73 __get_user(txc->jitter, &utp->jitter) ||
74 __get_user(txc->shift, &utp->shift) ||
75 __get_user(txc->stabil, &utp->stabil) ||
76 __get_user(txc->jitcnt, &utp->jitcnt) ||
77 __get_user(txc->calcnt, &utp->calcnt) ||
78 __get_user(txc->errcnt, &utp->errcnt) ||
79 __get_user(txc->stbcnt, &utp->stbcnt))
80 return -EFAULT;
81
82 return 0;
83}
84
85static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
86{
87 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
88 __put_user(txc->modes, &utp->modes) ||
89 __put_user(txc->offset, &utp->offset) ||
90 __put_user(txc->freq, &utp->freq) ||
91 __put_user(txc->maxerror, &utp->maxerror) ||
92 __put_user(txc->esterror, &utp->esterror) ||
93 __put_user(txc->status, &utp->status) ||
94 __put_user(txc->constant, &utp->constant) ||
95 __put_user(txc->precision, &utp->precision) ||
96 __put_user(txc->tolerance, &utp->tolerance) ||
97 __put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
98 __put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
99 __put_user(txc->tick, &utp->tick) ||
100 __put_user(txc->ppsfreq, &utp->ppsfreq) ||
101 __put_user(txc->jitter, &utp->jitter) ||
102 __put_user(txc->shift, &utp->shift) ||
103 __put_user(txc->stabil, &utp->stabil) ||
104 __put_user(txc->jitcnt, &utp->jitcnt) ||
105 __put_user(txc->calcnt, &utp->calcnt) ||
106 __put_user(txc->errcnt, &utp->errcnt) ||
107 __put_user(txc->stbcnt, &utp->stbcnt) ||
108 __put_user(txc->tai, &utp->tai))
109 return -EFAULT;
110 return 0;
111}
112
55asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, 113asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
56 struct timezone __user *tz) 114 struct timezone __user *tz)
57{ 115{
@@ -235,6 +293,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
235 return compat_jiffies_to_clock_t(jiffies); 293 return compat_jiffies_to_clock_t(jiffies);
236} 294}
237 295
296#ifdef __ARCH_WANT_SYS_SIGPENDING
297
238/* 298/*
239 * Assumption: old_sigset_t and compat_old_sigset_t are both 299 * Assumption: old_sigset_t and compat_old_sigset_t are both
240 * types that can be passed to put_user()/get_user(). 300 * types that can be passed to put_user()/get_user().
@@ -254,6 +314,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
254 return ret; 314 return ret;
255} 315}
256 316
317#endif
318
319#ifdef __ARCH_WANT_SYS_SIGPROCMASK
320
257asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, 321asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
258 compat_old_sigset_t __user *oset) 322 compat_old_sigset_t __user *oset)
259{ 323{
@@ -275,6 +339,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
275 return ret; 339 return ret;
276} 340}
277 341
342#endif
343
278asmlinkage long compat_sys_setrlimit(unsigned int resource, 344asmlinkage long compat_sys_setrlimit(unsigned int resource,
279 struct compat_rlimit __user *rlim) 345 struct compat_rlimit __user *rlim)
280{ 346{
@@ -617,6 +683,29 @@ long compat_sys_clock_gettime(clockid_t which_clock,
617 return err; 683 return err;
618} 684}
619 685
686long compat_sys_clock_adjtime(clockid_t which_clock,
687 struct compat_timex __user *utp)
688{
689 struct timex txc;
690 mm_segment_t oldfs;
691 int err, ret;
692
693 err = compat_get_timex(&txc, utp);
694 if (err)
695 return err;
696
697 oldfs = get_fs();
698 set_fs(KERNEL_DS);
699 ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
700 set_fs(oldfs);
701
702 err = compat_put_timex(utp, &txc);
703 if (err)
704 return err;
705
706 return ret;
707}
708
620long compat_sys_clock_getres(clockid_t which_clock, 709long compat_sys_clock_getres(clockid_t which_clock,
621 struct compat_timespec __user *tp) 710 struct compat_timespec __user *tp)
622{ 711{
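compat_sys_clock_adjtime() above is the 32-bit wrapper for the then-new clock_adjtime() system call, which applies adjtimex()-style corrections to one specific clock (for example a PTP hardware clock) rather than only the system clock. A small userspace sketch of a read-only query; it assumes a glibc new enough to expose clock_adjtime() in <sys/timex.h>, otherwise the call would go through syscall(2) instead:

#include <stdio.h>
#include <sys/timex.h>
#include <time.h>

int main(void)
{
        struct timex tx = { .modes = 0 };       /* no mode bits set: query only */
        int state = clock_adjtime(CLOCK_REALTIME, &tx);

        if (state < 0) {
                perror("clock_adjtime");
                return 1;
        }
        /* state is TIME_OK, TIME_INS, ...; tx now holds the kernel's view */
        printf("clock state %d, offset %ld, freq %ld\n",
               state, (long)tx.offset, (long)tx.freq);
        return 0;
}

The compat path in the hunk does the same thing on behalf of a 32-bit caller: it copies struct compat_timex into a native struct timex with compat_get_timex(), calls the native syscall under KERNEL_DS, and copies the result back with compat_put_timex().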
@@ -809,10 +898,9 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
809{ 898{
810 compat_sigset_t s32; 899 compat_sigset_t s32;
811 sigset_t s; 900 sigset_t s;
812 int sig;
813 struct timespec t; 901 struct timespec t;
814 siginfo_t info; 902 siginfo_t info;
815 long ret, timeout = 0; 903 long ret;
816 904
817 if (sigsetsize != sizeof(sigset_t)) 905 if (sigsetsize != sizeof(sigset_t))
818 return -EINVAL; 906 return -EINVAL;
@@ -820,51 +908,19 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
820 if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) 908 if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
821 return -EFAULT; 909 return -EFAULT;
822 sigset_from_compat(&s, &s32); 910 sigset_from_compat(&s, &s32);
823 sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP));
824 signotset(&s);
825 911
826 if (uts) { 912 if (uts) {
827 if (get_compat_timespec (&t, uts)) 913 if (get_compat_timespec(&t, uts))
828 return -EFAULT; 914 return -EFAULT;
829 if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
830 || t.tv_sec < 0)
831 return -EINVAL;
832 } 915 }
833 916
834 spin_lock_irq(&current->sighand->siglock); 917 ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
835 sig = dequeue_signal(current, &s, &info);
836 if (!sig) {
837 timeout = MAX_SCHEDULE_TIMEOUT;
838 if (uts)
839 timeout = timespec_to_jiffies(&t)
840 +(t.tv_sec || t.tv_nsec);
841 if (timeout) {
842 current->real_blocked = current->blocked;
843 sigandsets(&current->blocked, &current->blocked, &s);
844
845 recalc_sigpending();
846 spin_unlock_irq(&current->sighand->siglock);
847
848 timeout = schedule_timeout_interruptible(timeout);
849
850 spin_lock_irq(&current->sighand->siglock);
851 sig = dequeue_signal(current, &s, &info);
852 current->blocked = current->real_blocked;
853 siginitset(&current->real_blocked, 0);
854 recalc_sigpending();
855 }
856 }
857 spin_unlock_irq(&current->sighand->siglock);
858 918
859 if (sig) { 919 if (ret > 0 && uinfo) {
860 ret = sig; 920 if (copy_siginfo_to_user32(uinfo, &info))
861 if (uinfo) { 921 ret = -EFAULT;
862 if (copy_siginfo_to_user32(uinfo, &info))
863 ret = -EFAULT;
864 }
865 }else {
866 ret = timeout?-EINTR:-EAGAIN;
867 } 922 }
923
868 return ret; 924 return ret;
869 925
870} 926}
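The rewritten compat_sys_rt_sigtimedwait() now delegates the blocking and timeout handling to do_sigtimedwait() instead of open-coding the dequeue/schedule_timeout dance. Seen from userspace the semantics are unchanged; a minimal, runnable example of the syscall it backs:

#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        sigset_t set;
        siginfo_t info;
        struct timespec timeout = { .tv_sec = 2, .tv_nsec = 0 };
        int sig;

        sigemptyset(&set);
        sigaddset(&set, SIGUSR1);
        /* Block SIGUSR1 so it is consumed by sigtimedwait() rather than a handler. */
        sigprocmask(SIG_BLOCK, &set, NULL);

        sig = sigtimedwait(&set, &info, &timeout);
        if (sig < 0)
                perror("sigtimedwait");         /* EAGAIN on timeout, EINTR if interrupted */
        else
                printf("got signal %d from pid %d\n", sig, (int)info.si_pid);
        return 0;
}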
@@ -951,58 +1007,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
951asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) 1007asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
952{ 1008{
953 struct timex txc; 1009 struct timex txc;
954 int ret; 1010 int err, ret;
955
956 memset(&txc, 0, sizeof(struct timex));
957 1011
958 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || 1012 err = compat_get_timex(&txc, utp);
959 __get_user(txc.modes, &utp->modes) || 1013 if (err)
960 __get_user(txc.offset, &utp->offset) || 1014 return err;
961 __get_user(txc.freq, &utp->freq) ||
962 __get_user(txc.maxerror, &utp->maxerror) ||
963 __get_user(txc.esterror, &utp->esterror) ||
964 __get_user(txc.status, &utp->status) ||
965 __get_user(txc.constant, &utp->constant) ||
966 __get_user(txc.precision, &utp->precision) ||
967 __get_user(txc.tolerance, &utp->tolerance) ||
968 __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
969 __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
970 __get_user(txc.tick, &utp->tick) ||
971 __get_user(txc.ppsfreq, &utp->ppsfreq) ||
972 __get_user(txc.jitter, &utp->jitter) ||
973 __get_user(txc.shift, &utp->shift) ||
974 __get_user(txc.stabil, &utp->stabil) ||
975 __get_user(txc.jitcnt, &utp->jitcnt) ||
976 __get_user(txc.calcnt, &utp->calcnt) ||
977 __get_user(txc.errcnt, &utp->errcnt) ||
978 __get_user(txc.stbcnt, &utp->stbcnt))
979 return -EFAULT;
980 1015
981 ret = do_adjtimex(&txc); 1016 ret = do_adjtimex(&txc);
982 1017
983 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || 1018 err = compat_put_timex(utp, &txc);
984 __put_user(txc.modes, &utp->modes) || 1019 if (err)
985 __put_user(txc.offset, &utp->offset) || 1020 return err;
986 __put_user(txc.freq, &utp->freq) ||
987 __put_user(txc.maxerror, &utp->maxerror) ||
988 __put_user(txc.esterror, &utp->esterror) ||
989 __put_user(txc.status, &utp->status) ||
990 __put_user(txc.constant, &utp->constant) ||
991 __put_user(txc.precision, &utp->precision) ||
992 __put_user(txc.tolerance, &utp->tolerance) ||
993 __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
994 __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
995 __put_user(txc.tick, &utp->tick) ||
996 __put_user(txc.ppsfreq, &utp->ppsfreq) ||
997 __put_user(txc.jitter, &utp->jitter) ||
998 __put_user(txc.shift, &utp->shift) ||
999 __put_user(txc.stabil, &utp->stabil) ||
1000 __put_user(txc.jitcnt, &utp->jitcnt) ||
1001 __put_user(txc.calcnt, &utp->calcnt) ||
1002 __put_user(txc.errcnt, &utp->errcnt) ||
1003 __put_user(txc.stbcnt, &utp->stbcnt) ||
1004 __put_user(txc.tai, &utp->tai))
1005 ret = -EFAULT;
1006 1021
1007 return ret; 1022 return ret;
1008} 1023}

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 156cc5556140..12b7458f23b1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -126,7 +126,7 @@ static void cpu_hotplug_done(void)
126#else /* #if CONFIG_HOTPLUG_CPU */ 126#else /* #if CONFIG_HOTPLUG_CPU */
127static void cpu_hotplug_begin(void) {} 127static void cpu_hotplug_begin(void) {}
128static void cpu_hotplug_done(void) {} 128static void cpu_hotplug_done(void) {}
129#endif /* #esle #if CONFIG_HOTPLUG_CPU */ 129#endif /* #else #if CONFIG_HOTPLUG_CPU */
130 130
131/* Need to know about CPUs going up/down? */ 131/* Need to know about CPUs going up/down? */
132int __ref register_cpu_notifier(struct notifier_block *nb) 132int __ref register_cpu_notifier(struct notifier_block *nb)
@@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v)
160{ 160{
161 BUG_ON(cpu_notify(val, v)); 161 BUG_ON(cpu_notify(val, v));
162} 162}
163
164EXPORT_SYMBOL(register_cpu_notifier); 163EXPORT_SYMBOL(register_cpu_notifier);
165 164
166void __ref unregister_cpu_notifier(struct notifier_block *nb) 165void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param)
205 return err; 204 return err;
206 205
207 cpu_notify(CPU_DYING | param->mod, param->hcpu); 206 cpu_notify(CPU_DYING | param->mod, param->hcpu);
208
209 return 0; 207 return 0;
210} 208}
211 209
@@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
227 return -EINVAL; 225 return -EINVAL;
228 226
229 cpu_hotplug_begin(); 227 cpu_hotplug_begin();
228
230 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); 229 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
231 if (err) { 230 if (err) {
232 nr_calls--; 231 nr_calls--;
@@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
304 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 303 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
305 if (ret) { 304 if (ret) {
306 nr_calls--; 305 nr_calls--;
307 printk("%s: attempt to bring up CPU %u failed\n", 306 printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
308 __func__, cpu); 307 __func__, cpu);
309 goto out_notify; 308 goto out_notify;
310 } 309 }
@@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void)
450 if (cpumask_empty(frozen_cpus)) 449 if (cpumask_empty(frozen_cpus))
451 goto out; 450 goto out;
452 451
453 printk("Enabling non-boot CPUs ...\n"); 452 printk(KERN_INFO "Enabling non-boot CPUs ...\n");
454 453
455 arch_enable_nonboot_cpus_begin(); 454 arch_enable_nonboot_cpus_begin();
456 455
457 for_each_cpu(cpu, frozen_cpus) { 456 for_each_cpu(cpu, frozen_cpus) {
458 error = _cpu_up(cpu, 1); 457 error = _cpu_up(cpu, 1);
459 if (!error) { 458 if (!error) {
460 printk("CPU%d is up\n", cpu); 459 printk(KERN_INFO "CPU%d is up\n", cpu);
461 continue; 460 continue;
462 } 461 }
463 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 462 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
@@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
509 */ 508 */
510 509
511/* cpu_bit_bitmap[0] is empty - so we can back into it */ 510/* cpu_bit_bitmap[0] is empty - so we can back into it */
512#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) 511#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x))
513#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) 512#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
514#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) 513#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
515#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) 514#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
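The MASK_DECLARE_* change above only adds parentheses around the shift, but the table it builds is worth spelling out: cpu_bit_bitmap is laid out so that entry [cpu + 1] is a mask with only bit cpu set, letting get_cpu_mask() hand back a pointer into the table instead of building masks at run time. A userspace sketch of the designated-initializer trick (single-word masks only; the real table also backs multi-word masks into the unused entry [0]):

#include <stdio.h>

#define MASK_DECLARE_1(x) [x + 1][0] = (1UL << (x))
#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x + 1)
#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x + 2)
#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x + 4)

/* Entry [i + 1] has only bit i set; entry [0] is deliberately left all-zero. */
static const unsigned long bit_bitmap[9][1] = {
        MASK_DECLARE_8(0),
};

int main(void)
{
        int cpu;

        for (cpu = 0; cpu < 8; cpu++)
                printf("cpu %d -> mask %#lx\n", cpu, bit_bitmap[cpu + 1][0]);
        return 0;
}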
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4349935c2ad8..1ceeb049c827 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
1015 struct cpuset *cs; 1015 struct cpuset *cs;
1016 int migrate; 1016 int migrate;
1017 const nodemask_t *oldmem = scan->data; 1017 const nodemask_t *oldmem = scan->data;
1018 NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); 1018 static nodemask_t newmems; /* protected by cgroup_mutex */
1019
1020 if (!newmems)
1021 return;
1022 1019
1023 cs = cgroup_cs(scan->cg); 1020 cs = cgroup_cs(scan->cg);
1024 guarantee_online_mems(cs, newmems); 1021 guarantee_online_mems(cs, &newmems);
1025 1022
1026 cpuset_change_task_nodemask(p, newmems); 1023 cpuset_change_task_nodemask(p, &newmems);
1027
1028 NODEMASK_FREE(newmems);
1029 1024
1030 mm = get_task_mm(p); 1025 mm = get_task_mm(p);
1031 if (!mm) 1026 if (!mm)
@@ -1164,7 +1159,7 @@ int current_cpuset_is_being_rebound(void)
1164static int update_relax_domain_level(struct cpuset *cs, s64 val) 1159static int update_relax_domain_level(struct cpuset *cs, s64 val)
1165{ 1160{
1166#ifdef CONFIG_SMP 1161#ifdef CONFIG_SMP
1167 if (val < -1 || val >= SD_LV_MAX) 1162 if (val < -1 || val >= sched_domain_level_max)
1168 return -EINVAL; 1163 return -EINVAL;
1169#endif 1164#endif
1170 1165
@@ -1372,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1372 return val; 1367 return val;
1373} 1368}
1374 1369
1375/* Protected by cgroup_lock */
1376static cpumask_var_t cpus_attach;
1377
1378/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1370/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1379static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1371static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1380 struct task_struct *tsk, bool threadgroup) 1372 struct task_struct *tsk)
1381{ 1373{
1382 int ret;
1383 struct cpuset *cs = cgroup_cs(cont); 1374 struct cpuset *cs = cgroup_cs(cont);
1384 1375
1385 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1376 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1396,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1396 if (tsk->flags & PF_THREAD_BOUND) 1387 if (tsk->flags & PF_THREAD_BOUND)
1397 return -EINVAL; 1388 return -EINVAL;
1398 1389
1399 ret = security_task_setscheduler(tsk);
1400 if (ret)
1401 return ret;
1402 if (threadgroup) {
1403 struct task_struct *c;
1404
1405 rcu_read_lock();
1406 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1407 ret = security_task_setscheduler(c);
1408 if (ret) {
1409 rcu_read_unlock();
1410 return ret;
1411 }
1412 }
1413 rcu_read_unlock();
1414 }
1415 return 0; 1390 return 0;
1416} 1391}
1417 1392
1418static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, 1393static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
1419 struct cpuset *cs) 1394{
1395 return security_task_setscheduler(task);
1396}
1397
1398/*
1399 * Protected by cgroup_lock. The nodemasks must be stored globally because
1400 * dynamically allocating them is not allowed in pre_attach, and they must
1401 * persist among pre_attach, attach_task, and attach.
1402 */
1403static cpumask_var_t cpus_attach;
1404static nodemask_t cpuset_attach_nodemask_from;
1405static nodemask_t cpuset_attach_nodemask_to;
1406
1407/* Set-up work for before attaching each task. */
1408static void cpuset_pre_attach(struct cgroup *cont)
1409{
1410 struct cpuset *cs = cgroup_cs(cont);
1411
1412 if (cs == &top_cpuset)
1413 cpumask_copy(cpus_attach, cpu_possible_mask);
1414 else
1415 guarantee_online_cpus(cs, cpus_attach);
1416
1417 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1418}
1419
1420/* Per-thread attachment work. */
1421static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
1420{ 1422{
1421 int err; 1423 int err;
1424 struct cpuset *cs = cgroup_cs(cont);
1425
1422 /* 1426 /*
1423 * can_attach beforehand should guarantee that this doesn't fail. 1427 * can_attach beforehand should guarantee that this doesn't fail.
1424 * TODO: have a better way to handle failure here 1428 * TODO: have a better way to handle failure here
@@ -1426,56 +1430,31 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1426 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1430 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1427 WARN_ON_ONCE(err); 1431 WARN_ON_ONCE(err);
1428 1432
1429 cpuset_change_task_nodemask(tsk, to); 1433 cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
1430 cpuset_update_task_spread_flag(cs, tsk); 1434 cpuset_update_task_spread_flag(cs, tsk);
1431
1432} 1435}
1433 1436
1434static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1437static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1435 struct cgroup *oldcont, struct task_struct *tsk, 1438 struct cgroup *oldcont, struct task_struct *tsk)
1436 bool threadgroup)
1437{ 1439{
1438 struct mm_struct *mm; 1440 struct mm_struct *mm;
1439 struct cpuset *cs = cgroup_cs(cont); 1441 struct cpuset *cs = cgroup_cs(cont);
1440 struct cpuset *oldcs = cgroup_cs(oldcont); 1442 struct cpuset *oldcs = cgroup_cs(oldcont);
1441 NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
1442 NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
1443
1444 if (from == NULL || to == NULL)
1445 goto alloc_fail;
1446 1443
1447 if (cs == &top_cpuset) { 1444 /*
1448 cpumask_copy(cpus_attach, cpu_possible_mask); 1445 * Change mm, possibly for multiple threads in a threadgroup. This is
1449 } else { 1446 * expensive and may sleep.
1450 guarantee_online_cpus(cs, cpus_attach); 1447 */
1451 } 1448 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1452 guarantee_online_mems(cs, to); 1449 cpuset_attach_nodemask_to = cs->mems_allowed;
1453
1454 /* do per-task migration stuff possibly for each in the threadgroup */
1455 cpuset_attach_task(tsk, to, cs);
1456 if (threadgroup) {
1457 struct task_struct *c;
1458 rcu_read_lock();
1459 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1460 cpuset_attach_task(c, to, cs);
1461 }
1462 rcu_read_unlock();
1463 }
1464
1465 /* change mm; only needs to be done once even if threadgroup */
1466 *from = oldcs->mems_allowed;
1467 *to = cs->mems_allowed;
1468 mm = get_task_mm(tsk); 1450 mm = get_task_mm(tsk);
1469 if (mm) { 1451 if (mm) {
1470 mpol_rebind_mm(mm, to); 1452 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1471 if (is_memory_migrate(cs)) 1453 if (is_memory_migrate(cs))
1472 cpuset_migrate_mm(mm, from, to); 1454 cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
1455 &cpuset_attach_nodemask_to);
1473 mmput(mm); 1456 mmput(mm);
1474 } 1457 }
1475
1476alloc_fail:
1477 NODEMASK_FREE(from);
1478 NODEMASK_FREE(to);
1479} 1458}
1480 1459
1481/* The various types of files and directories in a cpuset file system */ 1460/* The various types of files and directories in a cpuset file system */
@@ -1575,8 +1554,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1575 return -ENODEV; 1554 return -ENODEV;
1576 1555
1577 trialcs = alloc_trial_cpuset(cs); 1556 trialcs = alloc_trial_cpuset(cs);
1578 if (!trialcs) 1557 if (!trialcs) {
1579 return -ENOMEM; 1558 retval = -ENOMEM;
1559 goto out;
1560 }
1580 1561
1581 switch (cft->private) { 1562 switch (cft->private) {
1582 case FILE_CPULIST: 1563 case FILE_CPULIST:
@@ -1591,6 +1572,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1591 } 1572 }
1592 1573
1593 free_trial_cpuset(trialcs); 1574 free_trial_cpuset(trialcs);
1575out:
1594 cgroup_unlock(); 1576 cgroup_unlock();
1595 return retval; 1577 return retval;
1596} 1578}
@@ -1607,34 +1589,26 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1607 * across a page fault. 1589 * across a page fault.
1608 */ 1590 */
1609 1591
1610static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) 1592static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1611{ 1593{
1612 int ret; 1594 size_t count;
1613 1595
1614 mutex_lock(&callback_mutex); 1596 mutex_lock(&callback_mutex);
1615 ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); 1597 count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
1616 mutex_unlock(&callback_mutex); 1598 mutex_unlock(&callback_mutex);
1617 1599
1618 return ret; 1600 return count;
1619} 1601}
1620 1602
1621static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1603static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1622{ 1604{
1623 NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); 1605 size_t count;
1624 int retval;
1625
1626 if (mask == NULL)
1627 return -ENOMEM;
1628 1606
1629 mutex_lock(&callback_mutex); 1607 mutex_lock(&callback_mutex);
1630 *mask = cs->mems_allowed; 1608 count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
1631 mutex_unlock(&callback_mutex); 1609 mutex_unlock(&callback_mutex);
1632 1610
1633 retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); 1611 return count;
1634
1635 NODEMASK_FREE(mask);
1636
1637 return retval;
1638} 1612}
1639 1613
1640static ssize_t cpuset_common_file_read(struct cgroup *cont, 1614static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1828,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1828} 1802}
1829 1803
1830/* 1804/*
1831 * post_clone() is called at the end of cgroup_clone(). 1805 * post_clone() is called during cgroup_create() when the
1832 * 'cgroup' was just created automatically as a result of 1806 * clone_children mount argument was specified. The cgroup
1833 * a cgroup_clone(), and the current task is about to 1807 * can not yet have any tasks.
1834 * be moved into 'cgroup'.
1835 * 1808 *
1836 * Currently we refuse to set up the cgroup - thereby 1809 * Currently we refuse to set up the cgroup - thereby
1837 * refusing the task to be entered, and as a result refusing 1810 * refusing the task to be entered, and as a result refusing
@@ -1859,8 +1832,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1859 cs = cgroup_cs(cgroup); 1832 cs = cgroup_cs(cgroup);
1860 parent_cs = cgroup_cs(parent); 1833 parent_cs = cgroup_cs(parent);
1861 1834
1835 mutex_lock(&callback_mutex);
1862 cs->mems_allowed = parent_cs->mems_allowed; 1836 cs->mems_allowed = parent_cs->mems_allowed;
1863 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); 1837 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1838 mutex_unlock(&callback_mutex);
1864 return; 1839 return;
1865} 1840}
1866 1841
@@ -1928,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = {
1928 .create = cpuset_create, 1903 .create = cpuset_create,
1929 .destroy = cpuset_destroy, 1904 .destroy = cpuset_destroy,
1930 .can_attach = cpuset_can_attach, 1905 .can_attach = cpuset_can_attach,
1906 .can_attach_task = cpuset_can_attach_task,
1907 .pre_attach = cpuset_pre_attach,
1908 .attach_task = cpuset_attach_task,
1931 .attach = cpuset_attach, 1909 .attach = cpuset_attach,
1932 .populate = cpuset_populate, 1910 .populate = cpuset_populate,
1933 .post_clone = cpuset_post_clone, 1911 .post_clone = cpuset_post_clone,
@@ -2063,10 +2041,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2063 struct cpuset *cp; /* scans cpusets being updated */ 2041 struct cpuset *cp; /* scans cpusets being updated */
2064 struct cpuset *child; /* scans child cpusets of cp */ 2042 struct cpuset *child; /* scans child cpusets of cp */
2065 struct cgroup *cont; 2043 struct cgroup *cont;
2066 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); 2044 static nodemask_t oldmems; /* protected by cgroup_mutex */
2067
2068 if (oldmems == NULL)
2069 return;
2070 2045
2071 list_add_tail((struct list_head *)&root->stack_list, &queue); 2046 list_add_tail((struct list_head *)&root->stack_list, &queue);
2072 2047
@@ -2083,7 +2058,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2083 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2058 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2084 continue; 2059 continue;
2085 2060
2086 *oldmems = cp->mems_allowed; 2061 oldmems = cp->mems_allowed;
2087 2062
2088 /* Remove offline cpus and mems from this cpuset. */ 2063 /* Remove offline cpus and mems from this cpuset. */
2089 mutex_lock(&callback_mutex); 2064 mutex_lock(&callback_mutex);
@@ -2099,10 +2074,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2099 remove_tasks_in_empty_cpuset(cp); 2074 remove_tasks_in_empty_cpuset(cp);
2100 else { 2075 else {
2101 update_tasks_cpumask(cp, NULL); 2076 update_tasks_cpumask(cp, NULL);
2102 update_tasks_nodemask(cp, oldmems, NULL); 2077 update_tasks_nodemask(cp, &oldmems, NULL);
2103 } 2078 }
2104 } 2079 }
2105 NODEMASK_FREE(oldmems);
2106} 2080}
2107 2081
2108/* 2082/*
@@ -2144,19 +2118,16 @@ void cpuset_update_active_cpus(void)
2144static int cpuset_track_online_nodes(struct notifier_block *self, 2118static int cpuset_track_online_nodes(struct notifier_block *self,
2145 unsigned long action, void *arg) 2119 unsigned long action, void *arg)
2146{ 2120{
2147 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); 2121 static nodemask_t oldmems; /* protected by cgroup_mutex */
2148
2149 if (oldmems == NULL)
2150 return NOTIFY_DONE;
2151 2122
2152 cgroup_lock(); 2123 cgroup_lock();
2153 switch (action) { 2124 switch (action) {
2154 case MEM_ONLINE: 2125 case MEM_ONLINE:
2155 *oldmems = top_cpuset.mems_allowed; 2126 oldmems = top_cpuset.mems_allowed;
2156 mutex_lock(&callback_mutex); 2127 mutex_lock(&callback_mutex);
2157 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2128 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2158 mutex_unlock(&callback_mutex); 2129 mutex_unlock(&callback_mutex);
2159 update_tasks_nodemask(&top_cpuset, oldmems, NULL); 2130 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2160 break; 2131 break;
2161 case MEM_OFFLINE: 2132 case MEM_OFFLINE:
2162 /* 2133 /*
@@ -2170,7 +2141,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2170 } 2141 }
2171 cgroup_unlock(); 2142 cgroup_unlock();
2172 2143
2173 NODEMASK_FREE(oldmems);
2174 return NOTIFY_OK; 2144 return NOTIFY_OK;
2175} 2145}
2176#endif 2146#endif
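All of the cpuset hooks wired up above (can_attach, can_attach_task, pre_attach, attach_task, attach) fire on the kernel side of a single userspace write of a PID into the cgroup's tasks file. A minimal sketch of that trigger; the mount point is an assumption and depends on where the cpuset hierarchy is mounted on a given system:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Hypothetical path: a cpuset hierarchy mounted under /sys/fs/cgroup. */
        const char *tasks = "/sys/fs/cgroup/cpuset/demo/tasks";
        FILE *f = fopen(tasks, "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        fprintf(f, "%d\n", (int)getpid());
        /* The attach callbacks can veto the move; the error surfaces on the write. */
        if (fclose(f) != 0) {
                perror("write tasks");
                return 1;
        }
        return 0;
}

The new per-thread callbacks matter when a whole threadgroup is moved in one operation rather than one TID at a time, which is the case the removed thread_group loops used to handle inside each subsystem.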
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..5f85690285d4
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,34 @@
1#include <linux/kernel.h>
2#include <linux/crash_dump.h>
3#include <linux/init.h>
4#include <linux/errno.h>
5#include <linux/module.h>
6
7/*
8 * If we have booted due to a crash, max_pfn will be a very low value. We need
9 * to know the amount of memory that the previous kernel used.
10 */
11unsigned long saved_max_pfn;
12
13/*
14 * stores the physical address of elf header of crash image
15 *
16 * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
17 * is_kdump_kernel() to determine if we are booting after a panic. Hence put
18 * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
19 */
20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
21
22/*
23 * elfcorehdr= specifies the location of elf core header stored by the crashed
24 * kernel. This option will be passed by kexec loader to the capture kernel.
25 */
26static int __init setup_elfcorehdr(char *arg)
27{
28 char *end;
29 if (!arg)
30 return -EINVAL;
31 elfcorehdr_addr = memparse(arg, &end);
32 return end > arg ? 0 : -EINVAL;
33}
34early_param("elfcorehdr", setup_elfcorehdr);
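setup_elfcorehdr() leans on memparse() to accept the usual kernel address/size syntax (hex or decimal, with optional K/M/G-style suffixes) for the elfcorehdr= parameter passed by the kexec loader. A userspace approximation of what memparse() does, for illustration only (the real helper also handles T, P and E suffixes):

#include <stdio.h>
#include <stdlib.h>

static unsigned long long parse_mem(const char *s, char **end)
{
        unsigned long long val = strtoull(s, end, 0);   /* base 0: 0x... or decimal */

        switch (**end) {
        case 'G': case 'g':
                val <<= 10;     /* fall through */
        case 'M': case 'm':
                val <<= 10;     /* fall through */
        case 'K': case 'k':
                val <<= 10;
                (*end)++;
                break;
        }
        return val;
}

int main(void)
{
        char *end;

        printf("%llu\n", parse_mem("16M", &end));       /* 16777216 */
        printf("%#llx\n", parse_mem("0x2f000000", &end));
        return 0;
}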
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a9d6dd53a6c..e12c8af793f8 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar;
35static struct thread_group_cred init_tgcred = { 35static struct thread_group_cred init_tgcred = {
36 .usage = ATOMIC_INIT(2), 36 .usage = ATOMIC_INIT(2),
37 .tgid = 0, 37 .tgid = 0,
38 .lock = SPIN_LOCK_UNLOCKED, 38 .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
39}; 39};
40#endif 40#endif
41 41
@@ -49,11 +49,12 @@ struct cred init_cred = {
49 .magic = CRED_MAGIC, 49 .magic = CRED_MAGIC,
50#endif 50#endif
51 .securebits = SECUREBITS_DEFAULT, 51 .securebits = SECUREBITS_DEFAULT,
52 .cap_inheritable = CAP_INIT_INH_SET, 52 .cap_inheritable = CAP_EMPTY_SET,
53 .cap_permitted = CAP_FULL_SET, 53 .cap_permitted = CAP_FULL_SET,
54 .cap_effective = CAP_INIT_EFF_SET, 54 .cap_effective = CAP_FULL_SET,
55 .cap_bset = CAP_INIT_BSET, 55 .cap_bset = CAP_FULL_SET,
56 .user = INIT_USER, 56 .user = INIT_USER,
57 .user_ns = &init_user_ns,
57 .group_info = &init_groups, 58 .group_info = &init_groups,
58#ifdef CONFIG_KEYS 59#ifdef CONFIG_KEYS
59 .tgcred = &init_tgcred, 60 .tgcred = &init_tgcred,
@@ -410,6 +411,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
410 goto error_put; 411 goto error_put;
411 } 412 }
412 413
414 /* cache user_ns in cred. Doesn't need a refcount because it will
415 * stay pinned by cred->user
416 */
417 new->user_ns = new->user->user_ns;
418
413#ifdef CONFIG_KEYS 419#ifdef CONFIG_KEYS
414 /* new threads get their own thread keyrings if their parent already 420 /* new threads get their own thread keyrings if their parent already
415 * had one */ 421 * had one */
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index cefd4a11f6d9..bad6786dee88 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -538,7 +538,7 @@ return_normal:
538 538
539 /* 539 /*
540 * For single stepping, try to only enter on the processor 540 * For single stepping, try to only enter on the processor
541 * that was single stepping. To gaurd against a deadlock, the 541 * that was single stepping. To guard against a deadlock, the
542 * kernel will only try for the value of sstep_tries before 542 * kernel will only try for the value of sstep_tries before
543 * giving up and continuing on. 543 * giving up and continuing on.
544 */ 544 */
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 481a7bd2dfe7..a11db956dd62 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
1093 put_packet(remcom_out_buffer); 1093 put_packet(remcom_out_buffer);
1094 return 0; 1094 return 0;
1095} 1095}
1096
1097/**
1098 * gdbstub_exit - Send an exit message to GDB
1099 * @status: The exit code to report.
1100 */
1101void gdbstub_exit(int status)
1102{
1103 unsigned char checksum, ch, buffer[3];
1104 int loop;
1105
1106 buffer[0] = 'W';
1107 buffer[1] = hex_asc_hi(status);
1108 buffer[2] = hex_asc_lo(status);
1109
1110 dbg_io_ops->write_char('$');
1111 checksum = 0;
1112
1113 for (loop = 0; loop < 3; loop++) {
1114 ch = buffer[loop];
1115 checksum += ch;
1116 dbg_io_ops->write_char(ch);
1117 }
1118
1119 dbg_io_ops->write_char('#');
1120 dbg_io_ops->write_char(hex_asc_hi(checksum));
1121 dbg_io_ops->write_char(hex_asc_lo(checksum));
1122
1123 /* make sure the output is flushed, lest the bootloader clobber it */
1124 dbg_io_ops->flush();
1125}
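gdbstub_exit() emits a GDB remote serial protocol exit-status packet: the payload is 'W' plus two hex digits, framed as $<payload>#<two-digit checksum>, where the checksum is the modulo-256 sum of the payload bytes. A standalone sketch that builds the same frame on stdout instead of a debug I/O driver:

#include <stdio.h>

static const char hex_asc[] = "0123456789abcdef";
#define hex_asc_hi(x) hex_asc[((x) >> 4) & 0x0f]
#define hex_asc_lo(x) hex_asc[(x) & 0x0f]

int main(void)
{
        int status = 0;                 /* exit code to report */
        unsigned char checksum = 0, buf[3];
        int i;

        buf[0] = 'W';
        buf[1] = hex_asc_hi(status);
        buf[2] = hex_asc_lo(status);

        putchar('$');
        for (i = 0; i < 3; i++) {
                checksum += buf[i];
                putchar(buf[i]);
        }
        putchar('#');
        putchar(hex_asc_hi(checksum));
        putchar(hex_asc_lo(checksum));
        putchar('\n');                  /* prints "$W00#b7" for status 0 */
        return 0;
}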
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index bd3e8e29caa3..be14779bcef6 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -78,7 +78,7 @@ static unsigned int kdb_continue_catastrophic;
78static kdbtab_t *kdb_commands; 78static kdbtab_t *kdb_commands;
79#define KDB_BASE_CMD_MAX 50 79#define KDB_BASE_CMD_MAX 50
80static int kdb_max_commands = KDB_BASE_CMD_MAX; 80static int kdb_max_commands = KDB_BASE_CMD_MAX;
81static kdbtab_t kdb_base_commands[50]; 81static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX];
82#define for_each_kdbcmd(cmd, num) \ 82#define for_each_kdbcmd(cmd, num) \
83 for ((cmd) = kdb_base_commands, (num) = 0; \ 83 for ((cmd) = kdb_base_commands, (num) = 0; \
84 num < kdb_max_commands; \ 84 num < kdb_max_commands; \
@@ -441,9 +441,9 @@ static int kdb_check_regs(void)
441 * symbol name, and offset to the caller. 441 * symbol name, and offset to the caller.
442 * 442 *
443 * The argument may consist of a numeric value (decimal or 443 * The argument may consist of a numeric value (decimal or
444 * hexidecimal), a symbol name, a register name (preceeded by the 444 * hexidecimal), a symbol name, a register name (preceded by the
445 * percent sign), an environment variable with a numeric value 445 * percent sign), an environment variable with a numeric value
446 * (preceeded by a dollar sign) or a simple arithmetic expression 446 * (preceded by a dollar sign) or a simple arithmetic expression
447 * consisting of a symbol name, +/-, and a numeric constant value 447 * consisting of a symbol name, +/-, and a numeric constant value
448 * (offset). 448 * (offset).
449 * Parameters: 449 * Parameters:
@@ -1335,7 +1335,7 @@ void kdb_print_state(const char *text, int value)
1335 * error The hardware-defined error code 1335 * error The hardware-defined error code
1336 * reason2 kdb's current reason code. 1336 * reason2 kdb's current reason code.
1337 * Initially error but can change 1337 * Initially error but can change
1338 * acording to kdb state. 1338 * according to kdb state.
1339 * db_result Result code from break or debug point. 1339 * db_result Result code from break or debug point.
1340 * regs The exception frame at time of fault/breakpoint. 1340 * regs The exception frame at time of fault/breakpoint.
1341 * should always be valid. 1341 * should always be valid.
@@ -2892,7 +2892,7 @@ static void __init kdb_inittab(void)
2892 "Send a signal to a process", 0, KDB_REPEAT_NONE); 2892 "Send a signal to a process", 0, KDB_REPEAT_NONE);
2893 kdb_register_repeat("summary", kdb_summary, "", 2893 kdb_register_repeat("summary", kdb_summary, "",
2894 "Summarize the system", 4, KDB_REPEAT_NONE); 2894 "Summarize the system", 4, KDB_REPEAT_NONE);
2895 kdb_register_repeat("per_cpu", kdb_per_cpu, "", 2895 kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
2896 "Display per_cpu variables", 3, KDB_REPEAT_NONE); 2896 "Display per_cpu variables", 3, KDB_REPEAT_NONE);
2897 kdb_register_repeat("grephelp", kdb_grep_help, "", 2897 kdb_register_repeat("grephelp", kdb_grep_help, "",
2898 "Display help on | grep", 0, KDB_REPEAT_NONE); 2898 "Display help on | grep", 0, KDB_REPEAT_NONE);
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 6b2485dcb050..5532dd37aa86 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -545,7 +545,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
545 * Mask for process state. 545 * Mask for process state.
546 * Notes: 546 * Notes:
547 * The mask folds data from several sources into a single long value, so 547 * The mask folds data from several sources into a single long value, so
548 * be carefull not to overlap the bits. TASK_* bits are in the LSB, 548 * be careful not to overlap the bits. TASK_* bits are in the LSB,
549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there 549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
550 * is no overlap between TASK_* and EXIT_* but that may not always be 550 * is no overlap between TASK_* and EXIT_* but that may not always be
551 * true, so EXIT_* bits are shifted left 16 bits before being stored in 551 * true, so EXIT_* bits are shifted left 16 bits before being stored in
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
new file mode 100644
index 000000000000..1ce23d3d8394
--- /dev/null
+++ b/kernel/events/Makefile
@@ -0,0 +1,6 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg
3endif
4
5obj-y := core.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/perf_event.c b/kernel/events/core.c
index 999835b6112b..c09767f7db3e 100644
--- a/kernel/perf_event.c
+++ b/kernel/events/core.c
@@ -2,8 +2,8 @@
2 * Performance events core code: 2 * Performance events core code:
3 * 3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
@@ -38,13 +38,96 @@
38 38
39#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
40 40
41struct remote_function_call {
42 struct task_struct *p;
43 int (*func)(void *info);
44 void *info;
45 int ret;
46};
47
48static void remote_function(void *data)
49{
50 struct remote_function_call *tfc = data;
51 struct task_struct *p = tfc->p;
52
53 if (p) {
54 tfc->ret = -EAGAIN;
55 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
56 return;
57 }
58
59 tfc->ret = tfc->func(tfc->info);
60}
61
62/**
63 * task_function_call - call a function on the cpu on which a task runs
64 * @p: the task to evaluate
65 * @func: the function to be called
66 * @info: the function call argument
67 *
68 * Calls the function @func when the task is currently running. This might
69 * be on the current CPU, which just calls the function directly
70 *
71 * returns: @func return value, or
72 * -ESRCH - when the process isn't running
73 * -EAGAIN - when the process moved away
74 */
75static int
76task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
77{
78 struct remote_function_call data = {
79 .p = p,
80 .func = func,
81 .info = info,
82 .ret = -ESRCH, /* No such (running) process */
83 };
84
85 if (task_curr(p))
86 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
87
88 return data.ret;
89}
90
91/**
92 * cpu_function_call - call a function on the cpu
93 * @func: the function to be called
94 * @info: the function call argument
95 *
96 * Calls the function @func on the remote cpu.
97 *
98 * returns: @func return value or -ENXIO when the cpu is offline
99 */
100static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
101{
102 struct remote_function_call data = {
103 .p = NULL,
104 .func = func,
105 .info = info,
106 .ret = -ENXIO, /* No such CPU */
107 };
108
109 smp_call_function_single(cpu, remote_function, &data, 1);
110
111 return data.ret;
112}
113
114#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
115 PERF_FLAG_FD_OUTPUT |\
116 PERF_FLAG_PID_CGROUP)
117
41enum event_type_t { 118enum event_type_t {
42 EVENT_FLEXIBLE = 0x1, 119 EVENT_FLEXIBLE = 0x1,
43 EVENT_PINNED = 0x2, 120 EVENT_PINNED = 0x2,
44 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 121 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
45}; 122};
46 123
47atomic_t perf_task_events __read_mostly; 124/*
125 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */
128struct jump_label_key perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
130
48static atomic_t nr_mmap_events __read_mostly; 131static atomic_t nr_mmap_events __read_mostly;
49static atomic_t nr_comm_events __read_mostly; 132static atomic_t nr_comm_events __read_mostly;
50static atomic_t nr_task_events __read_mostly; 133static atomic_t nr_task_events __read_mostly;
@@ -62,12 +145,30 @@ static struct srcu_struct pmus_srcu;
62 */ 145 */
63int sysctl_perf_event_paranoid __read_mostly = 1; 146int sysctl_perf_event_paranoid __read_mostly = 1;
64 147
65int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 148/* Minimum for 512 kiB + 1 user control page */
149int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
66 150
67/* 151/*
68 * max perf event sample rate 152 * max perf event sample rate
69 */ 153 */
70int sysctl_perf_event_sample_rate __read_mostly = 100000; 154#define DEFAULT_MAX_SAMPLE_RATE 100000
155int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
156static int max_samples_per_tick __read_mostly =
157 DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
158
159int perf_proc_update_handler(struct ctl_table *table, int write,
160 void __user *buffer, size_t *lenp,
161 loff_t *ppos)
162{
163 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
164
165 if (ret || !write)
166 return ret;
167
168 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
169
170 return 0;
171}
71 172
72static atomic64_t perf_event_id; 173static atomic64_t perf_event_id;
73 174
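perf_proc_update_handler() above recomputes max_samples_per_tick each time perf_event_max_sample_rate is written, and the mlock default grows by one page so the 512 kiB quota is not consumed by the user control page. The arithmetic, as a throwaway userspace check (HZ is assumed to be 250 and the page size 4 kiB here; both vary with configuration):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        int hz = 250;                   /* assumed CONFIG_HZ */
        int page_kib = 4;               /* assumed 4 kiB pages */
        int max_sample_rate = 100000;

        printf("mlock budget: %d kiB\n", 512 + page_kib);       /* 516 */
        printf("max samples per tick: %d\n",
               DIV_ROUND_UP(max_sample_rate, hz));              /* 400 */
        return 0;
}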
@@ -75,7 +176,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
75 enum event_type_t event_type); 176 enum event_type_t event_type);
76 177
77static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 178static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
78 enum event_type_t event_type); 179 enum event_type_t event_type,
180 struct task_struct *task);
181
182static void update_context_time(struct perf_event_context *ctx);
183static u64 perf_event_time(struct perf_event *event);
79 184
80void __weak perf_event_print_debug(void) { } 185void __weak perf_event_print_debug(void) { }
81 186
@@ -89,6 +194,361 @@ static inline u64 perf_clock(void)
89 return local_clock(); 194 return local_clock();
90} 195}
91 196
197static inline struct perf_cpu_context *
198__get_cpu_context(struct perf_event_context *ctx)
199{
200 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
201}
202
203#ifdef CONFIG_CGROUP_PERF
204
205/*
206 * Must ensure cgroup is pinned (css_get) before calling
207 * this function. In other words, we cannot call this function
208 * if there is no cgroup event for the current CPU context.
209 */
210static inline struct perf_cgroup *
211perf_cgroup_from_task(struct task_struct *task)
212{
213 return container_of(task_subsys_state(task, perf_subsys_id),
214 struct perf_cgroup, css);
215}
216
217static inline bool
218perf_cgroup_match(struct perf_event *event)
219{
220 struct perf_event_context *ctx = event->ctx;
221 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
222
223 return !event->cgrp || event->cgrp == cpuctx->cgrp;
224}
225
226static inline void perf_get_cgroup(struct perf_event *event)
227{
228 css_get(&event->cgrp->css);
229}
230
231static inline void perf_put_cgroup(struct perf_event *event)
232{
233 css_put(&event->cgrp->css);
234}
235
236static inline void perf_detach_cgroup(struct perf_event *event)
237{
238 perf_put_cgroup(event);
239 event->cgrp = NULL;
240}
241
242static inline int is_cgroup_event(struct perf_event *event)
243{
244 return event->cgrp != NULL;
245}
246
247static inline u64 perf_cgroup_event_time(struct perf_event *event)
248{
249 struct perf_cgroup_info *t;
250
251 t = per_cpu_ptr(event->cgrp->info, event->cpu);
252 return t->time;
253}
254
255static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
256{
257 struct perf_cgroup_info *info;
258 u64 now;
259
260 now = perf_clock();
261
262 info = this_cpu_ptr(cgrp->info);
263
264 info->time += now - info->timestamp;
265 info->timestamp = now;
266}
267
268static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
269{
270 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
271 if (cgrp_out)
272 __update_cgrp_time(cgrp_out);
273}
274
275static inline void update_cgrp_time_from_event(struct perf_event *event)
276{
277 struct perf_cgroup *cgrp;
278
279 /*
280 * ensure we access cgroup data only when needed and
281 * when we know the cgroup is pinned (css_get)
282 */
283 if (!is_cgroup_event(event))
284 return;
285
286 cgrp = perf_cgroup_from_task(current);
287 /*
288 * Do not update time when cgroup is not active
289 */
290 if (cgrp == event->cgrp)
291 __update_cgrp_time(event->cgrp);
292}
293
294static inline void
295perf_cgroup_set_timestamp(struct task_struct *task,
296 struct perf_event_context *ctx)
297{
298 struct perf_cgroup *cgrp;
299 struct perf_cgroup_info *info;
300
301 /*
302 * ctx->lock held by caller
303 * ensure we do not access cgroup data
304 * unless we have the cgroup pinned (css_get)
305 */
306 if (!task || !ctx->nr_cgroups)
307 return;
308
309 cgrp = perf_cgroup_from_task(task);
310 info = this_cpu_ptr(cgrp->info);
311 info->timestamp = ctx->timestamp;
312}
313
314#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
315#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
316
317/*
318 * reschedule events based on the cgroup constraint of task.
319 *
320 * mode SWOUT : schedule out everything
321 * mode SWIN : schedule in based on cgroup for next
322 */
323void perf_cgroup_switch(struct task_struct *task, int mode)
324{
325 struct perf_cpu_context *cpuctx;
326 struct pmu *pmu;
327 unsigned long flags;
328
329 /*
 330 * disable interrupts to avoid getting nr_cgroup
331 * changes via __perf_event_disable(). Also
332 * avoids preemption.
333 */
334 local_irq_save(flags);
335
336 /*
337 * we reschedule only in the presence of cgroup
338 * constrained events.
339 */
340 rcu_read_lock();
341
342 list_for_each_entry_rcu(pmu, &pmus, entry) {
343
344 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
345
346 perf_pmu_disable(cpuctx->ctx.pmu);
347
348 /*
349 * perf_cgroup_events says at least one
350 * context on this CPU has cgroup events.
351 *
352 * ctx->nr_cgroups reports the number of cgroup
353 * events for a context.
354 */
355 if (cpuctx->ctx.nr_cgroups > 0) {
356
357 if (mode & PERF_CGROUP_SWOUT) {
358 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
359 /*
360 * must not be done before ctxswout due
361 * to event_filter_match() in event_sched_out()
362 */
363 cpuctx->cgrp = NULL;
364 }
365
366 if (mode & PERF_CGROUP_SWIN) {
367 WARN_ON_ONCE(cpuctx->cgrp);
368 /* set cgrp before ctxsw in to
369 * allow event_filter_match() to not
370 * have to pass task around
371 */
372 cpuctx->cgrp = perf_cgroup_from_task(task);
373 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
374 }
375 }
376
377 perf_pmu_enable(cpuctx->ctx.pmu);
378 }
379
380 rcu_read_unlock();
381
382 local_irq_restore(flags);
383}
384
385static inline void perf_cgroup_sched_out(struct task_struct *task)
386{
387 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
388}
389
390static inline void perf_cgroup_sched_in(struct task_struct *task)
391{
392 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
393}
394
395static inline int perf_cgroup_connect(int fd, struct perf_event *event,
396 struct perf_event_attr *attr,
397 struct perf_event *group_leader)
398{
399 struct perf_cgroup *cgrp;
400 struct cgroup_subsys_state *css;
401 struct file *file;
402 int ret = 0, fput_needed;
403
404 file = fget_light(fd, &fput_needed);
405 if (!file)
406 return -EBADF;
407
408 css = cgroup_css_from_dir(file, perf_subsys_id);
409 if (IS_ERR(css)) {
410 ret = PTR_ERR(css);
411 goto out;
412 }
413
414 cgrp = container_of(css, struct perf_cgroup, css);
415 event->cgrp = cgrp;
416
417 /* must be done before we fput() the file */
418 perf_get_cgroup(event);
419
420 /*
421 * all events in a group must monitor
422 * the same cgroup because a task belongs
423 * to only one perf cgroup at a time
424 */
425 if (group_leader && group_leader->cgrp != cgrp) {
426 perf_detach_cgroup(event);
427 ret = -EINVAL;
428 }
429out:
430 fput_light(file, fput_needed);
431 return ret;
432}
433
434static inline void
435perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
436{
437 struct perf_cgroup_info *t;
438 t = per_cpu_ptr(event->cgrp->info, event->cpu);
439 event->shadow_ctx_time = now - t->timestamp;
440}
441
442static inline void
443perf_cgroup_defer_enabled(struct perf_event *event)
444{
445 /*
446 * when the current task's perf cgroup does not match
447 * the event's, we need to remember to call the
448 * perf_mark_enable() function the first time a task with
449 * a matching perf cgroup is scheduled in.
450 */
451 if (is_cgroup_event(event) && !perf_cgroup_match(event))
452 event->cgrp_defer_enabled = 1;
453}
454
455static inline void
456perf_cgroup_mark_enabled(struct perf_event *event,
457 struct perf_event_context *ctx)
458{
459 struct perf_event *sub;
460 u64 tstamp = perf_event_time(event);
461
462 if (!event->cgrp_defer_enabled)
463 return;
464
465 event->cgrp_defer_enabled = 0;
466
467 event->tstamp_enabled = tstamp - event->total_time_enabled;
468 list_for_each_entry(sub, &event->sibling_list, group_entry) {
469 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
470 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
471 sub->cgrp_defer_enabled = 0;
472 }
473 }
474}
475#else /* !CONFIG_CGROUP_PERF */
476
477static inline bool
478perf_cgroup_match(struct perf_event *event)
479{
480 return true;
481}
482
483static inline void perf_detach_cgroup(struct perf_event *event)
484{}
485
486static inline int is_cgroup_event(struct perf_event *event)
487{
488 return 0;
489}
490
491static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
492{
493 return 0;
494}
495
496static inline void update_cgrp_time_from_event(struct perf_event *event)
497{
498}
499
500static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
501{
502}
503
504static inline void perf_cgroup_sched_out(struct task_struct *task)
505{
506}
507
508static inline void perf_cgroup_sched_in(struct task_struct *task)
509{
510}
511
512static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
513 struct perf_event_attr *attr,
514 struct perf_event *group_leader)
515{
516 return -EINVAL;
517}
518
519static inline void
520perf_cgroup_set_timestamp(struct task_struct *task,
521 struct perf_event_context *ctx)
522{
523}
524
525void
526perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
527{
528}
529
530static inline void
531perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
532{
533}
534
535static inline u64 perf_cgroup_event_time(struct perf_event *event)
536{
537 return 0;
538}
539
540static inline void
541perf_cgroup_defer_enabled(struct perf_event *event)
542{
543}
544
545static inline void
546perf_cgroup_mark_enabled(struct perf_event *event,
547 struct perf_event_context *ctx)
548{
549}
550#endif
551
92void perf_pmu_disable(struct pmu *pmu) 552void perf_pmu_disable(struct pmu *pmu)
93{ 553{
94 int *count = this_cpu_ptr(pmu->pmu_disable_count); 554 int *count = this_cpu_ptr(pmu->pmu_disable_count);
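perf_cgroup_connect() above resolves a cgroup from a file descriptor that userspace passes as the pid argument of perf_event_open() together with PERF_FLAG_PID_CGROUP, using the new cgroup_css_from_dir() helper added in the cgroup.c part of this diff. A minimal userspace sketch of that interface; the cgroup path is an assumption, and in cgroup mode the cpu argument must name a real CPU:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        struct perf_event_attr attr;
        /* Hypothetical path: a directory in the mounted perf_event cgroup hierarchy. */
        int cgrp_fd = open("/sys/fs/cgroup/perf_event/demo", O_RDONLY);
        int fd;

        if (cgrp_fd < 0) {
                perror("open cgroup dir");
                return 1;
        }

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_CPU_CYCLES;

        /* pid is the cgroup directory fd; one such event per CPU in practice. */
        fd = (int)syscall(__NR_perf_event_open, &attr, cgrp_fd, 0, -1,
                          PERF_FLAG_PID_CGROUP);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }
        printf("counting CPU cycles for the cgroup on CPU 0 (fd %d)\n", fd);
        return 0;
}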
@@ -126,14 +586,6 @@ static void get_ctx(struct perf_event_context *ctx)
126 WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); 586 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
127} 587}
128 588
129static void free_ctx(struct rcu_head *head)
130{
131 struct perf_event_context *ctx;
132
133 ctx = container_of(head, struct perf_event_context, rcu_head);
134 kfree(ctx);
135}
136
137static void put_ctx(struct perf_event_context *ctx) 589static void put_ctx(struct perf_event_context *ctx)
138{ 590{
139 if (atomic_dec_and_test(&ctx->refcount)) { 591 if (atomic_dec_and_test(&ctx->refcount)) {
@@ -141,7 +593,7 @@ static void put_ctx(struct perf_event_context *ctx)
141 put_ctx(ctx->parent_ctx); 593 put_ctx(ctx->parent_ctx);
142 if (ctx->task) 594 if (ctx->task)
143 put_task_struct(ctx->task); 595 put_task_struct(ctx->task);
144 call_rcu(&ctx->rcu_head, free_ctx); 596 kfree_rcu(ctx, rcu_head);
145 } 597 }
146} 598}
147 599
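The put_ctx() hunk above (and the swevent_hlist_release() one later in this file) folds an RCU callback whose only job was kfree() into kfree_rcu(). A minimal sketch of the pattern, using a hypothetical struct foo that embeds a struct rcu_head named rcu_head (the names are assumptions, not from this patch):

	#include <linux/kernel.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct foo {
		int data;
		struct rcu_head rcu_head;
	};

	/* old style: an explicit callback that only calls kfree() */
	static void foo_free_rcu(struct rcu_head *head)
	{
		kfree(container_of(head, struct foo, rcu_head));
	}

	static void foo_release_old(struct foo *f)
	{
		call_rcu(&f->rcu_head, foo_free_rcu);
	}

	/* new style: kfree_rcu() names the rcu_head member, no callback needed */
	static void foo_release_new(struct foo *f)
	{
		kfree_rcu(f, rcu_head);
	}

Both defer the kfree() until after an RCU grace period; the second form simply drops the boilerplate callback.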
@@ -254,7 +706,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
254 raw_spin_lock_irqsave(&ctx->lock, flags); 706 raw_spin_lock_irqsave(&ctx->lock, flags);
255 --ctx->pin_count; 707 --ctx->pin_count;
256 raw_spin_unlock_irqrestore(&ctx->lock, flags); 708 raw_spin_unlock_irqrestore(&ctx->lock, flags);
257 put_ctx(ctx);
258} 709}
259 710
260/* 711/*
@@ -271,6 +722,10 @@ static void update_context_time(struct perf_event_context *ctx)
271static u64 perf_event_time(struct perf_event *event) 722static u64 perf_event_time(struct perf_event *event)
272{ 723{
273 struct perf_event_context *ctx = event->ctx; 724 struct perf_event_context *ctx = event->ctx;
725
726 if (is_cgroup_event(event))
727 return perf_cgroup_event_time(event);
728
274 return ctx ? ctx->time : 0; 729 return ctx ? ctx->time : 0;
275} 730}
276 731
@@ -285,9 +740,20 @@ static void update_event_times(struct perf_event *event)
285 if (event->state < PERF_EVENT_STATE_INACTIVE || 740 if (event->state < PERF_EVENT_STATE_INACTIVE ||
286 event->group_leader->state < PERF_EVENT_STATE_INACTIVE) 741 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
287 return; 742 return;
288 743 /*
289 if (ctx->is_active) 744 * in cgroup mode, time_enabled represents
745 * the time the event was enabled AND active
746 * tasks were in the monitored cgroup. This is
747 * independent of the activity of the context as
748 * there may be a mix of cgroup and non-cgroup events.
749 *
750 * That is why we treat cgroup events differently
751 * here.
752 */
753 if (is_cgroup_event(event))
290 run_end = perf_event_time(event); 754 run_end = perf_event_time(event);
755 else if (ctx->is_active)
756 run_end = ctx->time;
291 else 757 else
292 run_end = event->tstamp_stopped; 758 run_end = event->tstamp_stopped;
293 759
@@ -299,6 +765,7 @@ static void update_event_times(struct perf_event *event)
299 run_end = perf_event_time(event); 765 run_end = perf_event_time(event);
300 766
301 event->total_time_running = run_end - event->tstamp_running; 767 event->total_time_running = run_end - event->tstamp_running;
768
302} 769}
303 770
304/* 771/*
@@ -347,6 +814,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
347 list_add_tail(&event->group_entry, list); 814 list_add_tail(&event->group_entry, list);
348 } 815 }
349 816
817 if (is_cgroup_event(event))
818 ctx->nr_cgroups++;
819
350 list_add_rcu(&event->event_entry, &ctx->event_list); 820 list_add_rcu(&event->event_entry, &ctx->event_list);
351 if (!ctx->nr_events) 821 if (!ctx->nr_events)
352 perf_pmu_rotate_start(ctx->pmu); 822 perf_pmu_rotate_start(ctx->pmu);
@@ -465,6 +935,7 @@ static void perf_group_attach(struct perf_event *event)
465static void 935static void
466list_del_event(struct perf_event *event, struct perf_event_context *ctx) 936list_del_event(struct perf_event *event, struct perf_event_context *ctx)
467{ 937{
938 struct perf_cpu_context *cpuctx;
468 /* 939 /*
469 * We can have double detach due to exit/hot-unplug + close. 940 * We can have double detach due to exit/hot-unplug + close.
470 */ 941 */
@@ -473,6 +944,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
473 944
474 event->attach_state &= ~PERF_ATTACH_CONTEXT; 945 event->attach_state &= ~PERF_ATTACH_CONTEXT;
475 946
947 if (is_cgroup_event(event)) {
948 ctx->nr_cgroups--;
949 cpuctx = __get_cpu_context(ctx);
950 /*
951 * if there are no more cgroup events
 952	 * then clear cgrp to avoid a stale pointer
953 * in update_cgrp_time_from_cpuctx()
954 */
955 if (!ctx->nr_cgroups)
956 cpuctx->cgrp = NULL;
957 }
958
476 ctx->nr_events--; 959 ctx->nr_events--;
477 if (event->attr.inherit_stat) 960 if (event->attr.inherit_stat)
478 ctx->nr_stat--; 961 ctx->nr_stat--;
@@ -544,7 +1027,8 @@ out:
544static inline int 1027static inline int
545event_filter_match(struct perf_event *event) 1028event_filter_match(struct perf_event *event)
546{ 1029{
547 return event->cpu == -1 || event->cpu == smp_processor_id(); 1030 return (event->cpu == -1 || event->cpu == smp_processor_id())
1031 && perf_cgroup_match(event);
548} 1032}
549 1033
550static void 1034static void
@@ -562,7 +1046,7 @@ event_sched_out(struct perf_event *event,
562 */ 1046 */
563 if (event->state == PERF_EVENT_STATE_INACTIVE 1047 if (event->state == PERF_EVENT_STATE_INACTIVE
564 && !event_filter_match(event)) { 1048 && !event_filter_match(event)) {
565 delta = ctx->time - event->tstamp_stopped; 1049 delta = tstamp - event->tstamp_stopped;
566 event->tstamp_running += delta; 1050 event->tstamp_running += delta;
567 event->tstamp_stopped = tstamp; 1051 event->tstamp_stopped = tstamp;
568 } 1052 }
@@ -606,47 +1090,30 @@ group_sched_out(struct perf_event *group_event,
606 cpuctx->exclusive = 0; 1090 cpuctx->exclusive = 0;
607} 1091}
608 1092
609static inline struct perf_cpu_context *
610__get_cpu_context(struct perf_event_context *ctx)
611{
612 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
613}
614
615/* 1093/*
616 * Cross CPU call to remove a performance event 1094 * Cross CPU call to remove a performance event
617 * 1095 *
618 * We disable the event on the hardware level first. After that we 1096 * We disable the event on the hardware level first. After that we
619 * remove it from the context list. 1097 * remove it from the context list.
620 */ 1098 */
621static void __perf_event_remove_from_context(void *info) 1099static int __perf_remove_from_context(void *info)
622{ 1100{
623 struct perf_event *event = info; 1101 struct perf_event *event = info;
624 struct perf_event_context *ctx = event->ctx; 1102 struct perf_event_context *ctx = event->ctx;
625 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1103 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
626 1104
627 /*
628 * If this is a task context, we need to check whether it is
629 * the current task context of this cpu. If not it has been
630 * scheduled out before the smp call arrived.
631 */
632 if (ctx->task && cpuctx->task_ctx != ctx)
633 return;
634
635 raw_spin_lock(&ctx->lock); 1105 raw_spin_lock(&ctx->lock);
636
637 event_sched_out(event, cpuctx, ctx); 1106 event_sched_out(event, cpuctx, ctx);
638
639 list_del_event(event, ctx); 1107 list_del_event(event, ctx);
640
641 raw_spin_unlock(&ctx->lock); 1108 raw_spin_unlock(&ctx->lock);
1109
1110 return 0;
642} 1111}
643 1112
644 1113
645/* 1114/*
646 * Remove the event from a task's (or a CPU's) list of events. 1115 * Remove the event from a task's (or a CPU's) list of events.
647 * 1116 *
648 * Must be called with ctx->mutex held.
649 *
650 * CPU events are removed with a smp call. For task events we only 1117 * CPU events are removed with a smp call. For task events we only
651 * call when the task is on a CPU. 1118 * call when the task is on a CPU.
652 * 1119 *
@@ -657,49 +1124,48 @@ static void __perf_event_remove_from_context(void *info)
657 * When called from perf_event_exit_task, it's OK because the 1124 * When called from perf_event_exit_task, it's OK because the
658 * context has been detached from its task. 1125 * context has been detached from its task.
659 */ 1126 */
660static void perf_event_remove_from_context(struct perf_event *event) 1127static void perf_remove_from_context(struct perf_event *event)
661{ 1128{
662 struct perf_event_context *ctx = event->ctx; 1129 struct perf_event_context *ctx = event->ctx;
663 struct task_struct *task = ctx->task; 1130 struct task_struct *task = ctx->task;
664 1131
1132 lockdep_assert_held(&ctx->mutex);
1133
665 if (!task) { 1134 if (!task) {
666 /* 1135 /*
667 * Per cpu events are removed via an smp call and 1136 * Per cpu events are removed via an smp call and
668 * the removal is always successful. 1137 * the removal is always successful.
669 */ 1138 */
670 smp_call_function_single(event->cpu, 1139 cpu_function_call(event->cpu, __perf_remove_from_context, event);
671 __perf_event_remove_from_context,
672 event, 1);
673 return; 1140 return;
674 } 1141 }
675 1142
676retry: 1143retry:
677 task_oncpu_function_call(task, __perf_event_remove_from_context, 1144 if (!task_function_call(task, __perf_remove_from_context, event))
678 event); 1145 return;
679 1146
680 raw_spin_lock_irq(&ctx->lock); 1147 raw_spin_lock_irq(&ctx->lock);
681 /* 1148 /*
682 * If the context is active we need to retry the smp call. 1149 * If we failed to find a running task, but find the context active now
1150 * that we've acquired the ctx->lock, retry.
683 */ 1151 */
684 if (ctx->nr_active && !list_empty(&event->group_entry)) { 1152 if (ctx->is_active) {
685 raw_spin_unlock_irq(&ctx->lock); 1153 raw_spin_unlock_irq(&ctx->lock);
686 goto retry; 1154 goto retry;
687 } 1155 }
688 1156
689 /* 1157 /*
690	 * The lock prevents that this context is scheduled in so we	1158	 * Since the task isn't running, it's safe to remove the event; our
691 * can remove the event safely, if the call above did not 1159 * holding the ctx->lock ensures the task won't get scheduled in.
692 * succeed.
693 */ 1160 */
694 if (!list_empty(&event->group_entry)) 1161 list_del_event(event, ctx);
695 list_del_event(event, ctx);
696 raw_spin_unlock_irq(&ctx->lock); 1162 raw_spin_unlock_irq(&ctx->lock);
697} 1163}
698 1164
699/* 1165/*
700 * Cross CPU call to disable a performance event 1166 * Cross CPU call to disable a performance event
701 */ 1167 */
702static void __perf_event_disable(void *info) 1168static int __perf_event_disable(void *info)
703{ 1169{
704 struct perf_event *event = info; 1170 struct perf_event *event = info;
705 struct perf_event_context *ctx = event->ctx; 1171 struct perf_event_context *ctx = event->ctx;
@@ -708,9 +1174,12 @@ static void __perf_event_disable(void *info)
708 /* 1174 /*
709 * If this is a per-task event, need to check whether this 1175 * If this is a per-task event, need to check whether this
710 * event's task is the current task on this cpu. 1176 * event's task is the current task on this cpu.
1177 *
1178 * Can trigger due to concurrent perf_event_context_sched_out()
1179 * flipping contexts around.
711 */ 1180 */
712 if (ctx->task && cpuctx->task_ctx != ctx) 1181 if (ctx->task && cpuctx->task_ctx != ctx)
713 return; 1182 return -EINVAL;
714 1183
715 raw_spin_lock(&ctx->lock); 1184 raw_spin_lock(&ctx->lock);
716 1185
@@ -720,6 +1189,7 @@ static void __perf_event_disable(void *info)
720 */ 1189 */
721 if (event->state >= PERF_EVENT_STATE_INACTIVE) { 1190 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
722 update_context_time(ctx); 1191 update_context_time(ctx);
1192 update_cgrp_time_from_event(event);
723 update_group_times(event); 1193 update_group_times(event);
724 if (event == event->group_leader) 1194 if (event == event->group_leader)
725 group_sched_out(event, cpuctx, ctx); 1195 group_sched_out(event, cpuctx, ctx);
@@ -729,6 +1199,8 @@ static void __perf_event_disable(void *info)
729 } 1199 }
730 1200
731 raw_spin_unlock(&ctx->lock); 1201 raw_spin_unlock(&ctx->lock);
1202
1203 return 0;
732} 1204}
733 1205
734/* 1206/*
@@ -753,13 +1225,13 @@ void perf_event_disable(struct perf_event *event)
753 /* 1225 /*
754 * Disable the event on the cpu that it's on 1226 * Disable the event on the cpu that it's on
755 */ 1227 */
756 smp_call_function_single(event->cpu, __perf_event_disable, 1228 cpu_function_call(event->cpu, __perf_event_disable, event);
757 event, 1);
758 return; 1229 return;
759 } 1230 }
760 1231
761retry: 1232retry:
762 task_oncpu_function_call(task, __perf_event_disable, event); 1233 if (!task_function_call(task, __perf_event_disable, event))
1234 return;
763 1235
764 raw_spin_lock_irq(&ctx->lock); 1236 raw_spin_lock_irq(&ctx->lock);
765 /* 1237 /*
@@ -767,6 +1239,11 @@ retry:
767 */ 1239 */
768 if (event->state == PERF_EVENT_STATE_ACTIVE) { 1240 if (event->state == PERF_EVENT_STATE_ACTIVE) {
769 raw_spin_unlock_irq(&ctx->lock); 1241 raw_spin_unlock_irq(&ctx->lock);
1242 /*
 1243	 * Reload the task pointer; it might have been changed by
1244 * a concurrent perf_event_context_sched_out().
1245 */
1246 task = ctx->task;
770 goto retry; 1247 goto retry;
771 } 1248 }
772 1249
@@ -778,10 +1255,48 @@ retry:
778 update_group_times(event); 1255 update_group_times(event);
779 event->state = PERF_EVENT_STATE_OFF; 1256 event->state = PERF_EVENT_STATE_OFF;
780 } 1257 }
781
782 raw_spin_unlock_irq(&ctx->lock); 1258 raw_spin_unlock_irq(&ctx->lock);
783} 1259}
784 1260
1261static void perf_set_shadow_time(struct perf_event *event,
1262 struct perf_event_context *ctx,
1263 u64 tstamp)
1264{
1265 /*
1266 * use the correct time source for the time snapshot
1267 *
1268 * We could get by without this by leveraging the
1269 * fact that to get to this function, the caller
1270 * has most likely already called update_context_time()
1271 * and update_cgrp_time_xx() and thus both timestamp
1272 * are identical (or very close). Given that tstamp is,
1273 * already adjusted for cgroup, we could say that:
1274 * tstamp - ctx->timestamp
1275 * is equivalent to
1276 * tstamp - cgrp->timestamp.
1277 *
1278 * Then, in perf_output_read(), the calculation would
1279 * work with no changes because:
1280 * - event is guaranteed scheduled in
1281 * - no scheduled out in between
1282 * - thus the timestamp would be the same
1283 *
1284 * But this is a bit hairy.
1285 *
1286 * So instead, we have an explicit cgroup call to remain
 1287	 * within the time source all along. We believe it
1288 * is cleaner and simpler to understand.
1289 */
1290 if (is_cgroup_event(event))
1291 perf_cgroup_set_shadow_time(event, tstamp);
1292 else
1293 event->shadow_ctx_time = tstamp - ctx->timestamp;
1294}
1295
1296#define MAX_INTERRUPTS (~0ULL)
1297
1298static void perf_log_throttle(struct perf_event *event, int enable);
1299
785static int 1300static int
786event_sched_in(struct perf_event *event, 1301event_sched_in(struct perf_event *event,
787 struct perf_cpu_context *cpuctx, 1302 struct perf_cpu_context *cpuctx,
@@ -794,6 +1309,17 @@ event_sched_in(struct perf_event *event,
794 1309
795 event->state = PERF_EVENT_STATE_ACTIVE; 1310 event->state = PERF_EVENT_STATE_ACTIVE;
796 event->oncpu = smp_processor_id(); 1311 event->oncpu = smp_processor_id();
1312
1313 /*
 1314	 * Unthrottle events: having just been scheduled in, we might have missed
 1315	 * several ticks already, and for a heavily scheduling task there is little
 1316	 * guarantee it'll get a tick in a timely manner.
1317 */
1318 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1319 perf_log_throttle(event, 1);
1320 event->hw.interrupts = 0;
1321 }
1322
797 /* 1323 /*
798 * The new state must be visible before we turn it on in the hardware: 1324 * The new state must be visible before we turn it on in the hardware:
799 */ 1325 */
@@ -807,7 +1333,7 @@ event_sched_in(struct perf_event *event,
807 1333
808 event->tstamp_running += tstamp - event->tstamp_stopped; 1334 event->tstamp_running += tstamp - event->tstamp_stopped;
809 1335
810 event->shadow_ctx_time = tstamp - ctx->timestamp; 1336 perf_set_shadow_time(event, ctx, tstamp);
811 1337
812 if (!is_software_event(event)) 1338 if (!is_software_event(event))
813 cpuctx->active_oncpu++; 1339 cpuctx->active_oncpu++;
@@ -928,12 +1454,15 @@ static void add_event_to_ctx(struct perf_event *event,
928 event->tstamp_stopped = tstamp; 1454 event->tstamp_stopped = tstamp;
929} 1455}
930 1456
1457static void perf_event_context_sched_in(struct perf_event_context *ctx,
1458 struct task_struct *tsk);
1459
931/* 1460/*
932 * Cross CPU call to install and enable a performance event 1461 * Cross CPU call to install and enable a performance event
933 * 1462 *
934 * Must be called with ctx->mutex held 1463 * Must be called with ctx->mutex held
935 */ 1464 */
936static void __perf_install_in_context(void *info) 1465static int __perf_install_in_context(void *info)
937{ 1466{
938 struct perf_event *event = info; 1467 struct perf_event *event = info;
939 struct perf_event_context *ctx = event->ctx; 1468 struct perf_event_context *ctx = event->ctx;
@@ -942,21 +1471,22 @@ static void __perf_install_in_context(void *info)
942 int err; 1471 int err;
943 1472
944 /* 1473 /*
945 * If this is a task context, we need to check whether it is 1474 * In case we're installing a new context to an already running task,
946 * the current task context of this cpu. If not it has been 1475 * could also happen before perf_event_task_sched_in() on architectures
947 * scheduled out before the smp call arrived. 1476 * which do context switches with IRQs enabled.
948 * Or possibly this is the right context but it isn't
949 * on this cpu because it had no events.
950 */ 1477 */
951 if (ctx->task && cpuctx->task_ctx != ctx) { 1478 if (ctx->task && !cpuctx->task_ctx)
952 if (cpuctx->task_ctx || ctx->task != current) 1479 perf_event_context_sched_in(ctx, ctx->task);
953 return;
954 cpuctx->task_ctx = ctx;
955 }
956 1480
957 raw_spin_lock(&ctx->lock); 1481 raw_spin_lock(&ctx->lock);
958 ctx->is_active = 1; 1482 ctx->is_active = 1;
959 update_context_time(ctx); 1483 update_context_time(ctx);
1484 /*
1485 * update cgrp time only if current cgrp
1486 * matches event->cgrp. Must be done before
1487 * calling add_event_to_ctx()
1488 */
1489 update_cgrp_time_from_event(event);
960 1490
961 add_event_to_ctx(event, ctx); 1491 add_event_to_ctx(event, ctx);
962 1492
@@ -997,6 +1527,8 @@ static void __perf_install_in_context(void *info)
997 1527
998unlock: 1528unlock:
999 raw_spin_unlock(&ctx->lock); 1529 raw_spin_unlock(&ctx->lock);
1530
1531 return 0;
1000} 1532}
1001 1533
1002/* 1534/*
@@ -1008,8 +1540,6 @@ unlock:
1008 * If the event is attached to a task which is on a CPU we use a smp 1540 * If the event is attached to a task which is on a CPU we use a smp
1009 * call to enable it in the task context. The task might have been 1541 * call to enable it in the task context. The task might have been
1010 * scheduled away, but we check this in the smp call again. 1542 * scheduled away, but we check this in the smp call again.
1011 *
1012 * Must be called with ctx->mutex held.
1013 */ 1543 */
1014static void 1544static void
1015perf_install_in_context(struct perf_event_context *ctx, 1545perf_install_in_context(struct perf_event_context *ctx,
@@ -1018,6 +1548,8 @@ perf_install_in_context(struct perf_event_context *ctx,
1018{ 1548{
1019 struct task_struct *task = ctx->task; 1549 struct task_struct *task = ctx->task;
1020 1550
1551 lockdep_assert_held(&ctx->mutex);
1552
1021 event->ctx = ctx; 1553 event->ctx = ctx;
1022 1554
1023 if (!task) { 1555 if (!task) {
@@ -1025,31 +1557,29 @@ perf_install_in_context(struct perf_event_context *ctx,
1025 * Per cpu events are installed via an smp call and 1557 * Per cpu events are installed via an smp call and
1026 * the install is always successful. 1558 * the install is always successful.
1027 */ 1559 */
1028 smp_call_function_single(cpu, __perf_install_in_context, 1560 cpu_function_call(cpu, __perf_install_in_context, event);
1029 event, 1);
1030 return; 1561 return;
1031 } 1562 }
1032 1563
1033retry: 1564retry:
1034 task_oncpu_function_call(task, __perf_install_in_context, 1565 if (!task_function_call(task, __perf_install_in_context, event))
1035 event); 1566 return;
1036 1567
1037 raw_spin_lock_irq(&ctx->lock); 1568 raw_spin_lock_irq(&ctx->lock);
1038 /* 1569 /*
1039 * we need to retry the smp call. 1570 * If we failed to find a running task, but find the context active now
1571 * that we've acquired the ctx->lock, retry.
1040 */ 1572 */
1041 if (ctx->is_active && list_empty(&event->group_entry)) { 1573 if (ctx->is_active) {
1042 raw_spin_unlock_irq(&ctx->lock); 1574 raw_spin_unlock_irq(&ctx->lock);
1043 goto retry; 1575 goto retry;
1044 } 1576 }
1045 1577
1046 /* 1578 /*
1047	 * The lock prevents that this context is scheduled in so we	1579	 * Since the task isn't running, it's safe to add the event; our holding
1048 * can add the event safely, if it the call above did not 1580 * the ctx->lock ensures the task won't get scheduled in.
1049 * succeed.
1050 */ 1581 */
1051 if (list_empty(&event->group_entry)) 1582 add_event_to_ctx(event, ctx);
1052 add_event_to_ctx(event, ctx);
1053 raw_spin_unlock_irq(&ctx->lock); 1583 raw_spin_unlock_irq(&ctx->lock);
1054} 1584}
1055 1585
@@ -1078,7 +1608,7 @@ static void __perf_event_mark_enabled(struct perf_event *event,
1078/* 1608/*
1079 * Cross CPU call to enable a performance event 1609 * Cross CPU call to enable a performance event
1080 */ 1610 */
1081static void __perf_event_enable(void *info) 1611static int __perf_event_enable(void *info)
1082{ 1612{
1083 struct perf_event *event = info; 1613 struct perf_event *event = info;
1084 struct perf_event_context *ctx = event->ctx; 1614 struct perf_event_context *ctx = event->ctx;
@@ -1086,26 +1616,27 @@ static void __perf_event_enable(void *info)
1086 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1616 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1087 int err; 1617 int err;
1088 1618
1089 /* 1619 if (WARN_ON_ONCE(!ctx->is_active))
1090 * If this is a per-task event, need to check whether this 1620 return -EINVAL;
1091 * event's task is the current task on this cpu.
1092 */
1093 if (ctx->task && cpuctx->task_ctx != ctx) {
1094 if (cpuctx->task_ctx || ctx->task != current)
1095 return;
1096 cpuctx->task_ctx = ctx;
1097 }
1098 1621
1099 raw_spin_lock(&ctx->lock); 1622 raw_spin_lock(&ctx->lock);
1100 ctx->is_active = 1;
1101 update_context_time(ctx); 1623 update_context_time(ctx);
1102 1624
1103 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1625 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1104 goto unlock; 1626 goto unlock;
1627
1628 /*
1629 * set current task's cgroup time reference point
1630 */
1631 perf_cgroup_set_timestamp(current, ctx);
1632
1105 __perf_event_mark_enabled(event, ctx); 1633 __perf_event_mark_enabled(event, ctx);
1106 1634
1107 if (!event_filter_match(event)) 1635 if (!event_filter_match(event)) {
1636 if (is_cgroup_event(event))
1637 perf_cgroup_defer_enabled(event);
1108 goto unlock; 1638 goto unlock;
1639 }
1109 1640
1110 /* 1641 /*
1111 * If the event is in a group and isn't the group leader, 1642 * If the event is in a group and isn't the group leader,
@@ -1138,6 +1669,8 @@ static void __perf_event_enable(void *info)
1138 1669
1139unlock: 1670unlock:
1140 raw_spin_unlock(&ctx->lock); 1671 raw_spin_unlock(&ctx->lock);
1672
1673 return 0;
1141} 1674}
1142 1675
1143/* 1676/*
@@ -1158,8 +1691,7 @@ void perf_event_enable(struct perf_event *event)
1158 /* 1691 /*
1159 * Enable the event on the cpu that it's on 1692 * Enable the event on the cpu that it's on
1160 */ 1693 */
1161 smp_call_function_single(event->cpu, __perf_event_enable, 1694 cpu_function_call(event->cpu, __perf_event_enable, event);
1162 event, 1);
1163 return; 1695 return;
1164 } 1696 }
1165 1697
@@ -1178,8 +1710,15 @@ void perf_event_enable(struct perf_event *event)
1178 event->state = PERF_EVENT_STATE_OFF; 1710 event->state = PERF_EVENT_STATE_OFF;
1179 1711
1180retry: 1712retry:
1713 if (!ctx->is_active) {
1714 __perf_event_mark_enabled(event, ctx);
1715 goto out;
1716 }
1717
1181 raw_spin_unlock_irq(&ctx->lock); 1718 raw_spin_unlock_irq(&ctx->lock);
1182 task_oncpu_function_call(task, __perf_event_enable, event); 1719
1720 if (!task_function_call(task, __perf_event_enable, event))
1721 return;
1183 1722
1184 raw_spin_lock_irq(&ctx->lock); 1723 raw_spin_lock_irq(&ctx->lock);
1185 1724
@@ -1187,15 +1726,14 @@ retry:
1187 * If the context is active and the event is still off, 1726 * If the context is active and the event is still off,
1188 * we need to retry the cross-call. 1727 * we need to retry the cross-call.
1189 */ 1728 */
1190 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) 1729 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1730 /*
1731 * task could have been flipped by a concurrent
1732 * perf_event_context_sched_out()
1733 */
1734 task = ctx->task;
1191 goto retry; 1735 goto retry;
1192 1736 }
1193 /*
1194 * Since we have the lock this context can't be scheduled
1195 * in, so we can change the state safely.
1196 */
1197 if (event->state == PERF_EVENT_STATE_OFF)
1198 __perf_event_mark_enabled(event, ctx);
1199 1737
1200out: 1738out:
1201 raw_spin_unlock_irq(&ctx->lock); 1739 raw_spin_unlock_irq(&ctx->lock);
@@ -1227,6 +1765,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1227 if (likely(!ctx->nr_events)) 1765 if (likely(!ctx->nr_events))
1228 goto out; 1766 goto out;
1229 update_context_time(ctx); 1767 update_context_time(ctx);
1768 update_cgrp_time_from_cpuctx(cpuctx);
1230 1769
1231 if (!ctx->nr_active) 1770 if (!ctx->nr_active)
1232 goto out; 1771 goto out;
@@ -1339,8 +1878,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1339 } 1878 }
1340} 1879}
1341 1880
1342void perf_event_context_sched_out(struct task_struct *task, int ctxn, 1881static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1343 struct task_struct *next) 1882 struct task_struct *next)
1344{ 1883{
1345 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 1884 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1346 struct perf_event_context *next_ctx; 1885 struct perf_event_context *next_ctx;
@@ -1416,6 +1955,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
1416 1955
1417 for_each_task_context_nr(ctxn) 1956 for_each_task_context_nr(ctxn)
1418 perf_event_context_sched_out(task, ctxn, next); 1957 perf_event_context_sched_out(task, ctxn, next);
1958
1959 /*
1960 * if cgroup events exist on this CPU, then we need
1961 * to check if we have to switch out PMU state.
 1962	 * cgroup events are in system-wide mode only
1963 */
1964 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1965 perf_cgroup_sched_out(task);
1419} 1966}
1420 1967
1421static void task_ctx_sched_out(struct perf_event_context *ctx, 1968static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1454,6 +2001,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1454 if (!event_filter_match(event)) 2001 if (!event_filter_match(event))
1455 continue; 2002 continue;
1456 2003
2004 /* may need to reset tstamp_enabled */
2005 if (is_cgroup_event(event))
2006 perf_cgroup_mark_enabled(event, ctx);
2007
1457 if (group_can_go_on(event, cpuctx, 1)) 2008 if (group_can_go_on(event, cpuctx, 1))
1458 group_sched_in(event, cpuctx, ctx); 2009 group_sched_in(event, cpuctx, ctx);
1459 2010
@@ -1486,6 +2037,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1486 if (!event_filter_match(event)) 2037 if (!event_filter_match(event))
1487 continue; 2038 continue;
1488 2039
2040 /* may need to reset tstamp_enabled */
2041 if (is_cgroup_event(event))
2042 perf_cgroup_mark_enabled(event, ctx);
2043
1489 if (group_can_go_on(event, cpuctx, can_add_hw)) { 2044 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1490 if (group_sched_in(event, cpuctx, ctx)) 2045 if (group_sched_in(event, cpuctx, ctx))
1491 can_add_hw = 0; 2046 can_add_hw = 0;
@@ -1496,15 +2051,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1496static void 2051static void
1497ctx_sched_in(struct perf_event_context *ctx, 2052ctx_sched_in(struct perf_event_context *ctx,
1498 struct perf_cpu_context *cpuctx, 2053 struct perf_cpu_context *cpuctx,
1499 enum event_type_t event_type) 2054 enum event_type_t event_type,
2055 struct task_struct *task)
1500{ 2056{
2057 u64 now;
2058
1501 raw_spin_lock(&ctx->lock); 2059 raw_spin_lock(&ctx->lock);
1502 ctx->is_active = 1; 2060 ctx->is_active = 1;
1503 if (likely(!ctx->nr_events)) 2061 if (likely(!ctx->nr_events))
1504 goto out; 2062 goto out;
1505 2063
1506 ctx->timestamp = perf_clock(); 2064 now = perf_clock();
1507 2065 ctx->timestamp = now;
2066 perf_cgroup_set_timestamp(task, ctx);
1508 /* 2067 /*
1509 * First go through the list and put on any pinned groups 2068 * First go through the list and put on any pinned groups
1510 * in order to give them the best chance of going on. 2069 * in order to give them the best chance of going on.
@@ -1521,11 +2080,12 @@ out:
1521} 2080}
1522 2081
1523static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2082static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1524 enum event_type_t event_type) 2083 enum event_type_t event_type,
2084 struct task_struct *task)
1525{ 2085{
1526 struct perf_event_context *ctx = &cpuctx->ctx; 2086 struct perf_event_context *ctx = &cpuctx->ctx;
1527 2087
1528 ctx_sched_in(ctx, cpuctx, event_type); 2088 ctx_sched_in(ctx, cpuctx, event_type, task);
1529} 2089}
1530 2090
1531static void task_ctx_sched_in(struct perf_event_context *ctx, 2091static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1533,15 +2093,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
1533{ 2093{
1534 struct perf_cpu_context *cpuctx; 2094 struct perf_cpu_context *cpuctx;
1535 2095
1536 cpuctx = __get_cpu_context(ctx); 2096 cpuctx = __get_cpu_context(ctx);
1537 if (cpuctx->task_ctx == ctx) 2097 if (cpuctx->task_ctx == ctx)
1538 return; 2098 return;
1539 2099
1540 ctx_sched_in(ctx, cpuctx, event_type); 2100 ctx_sched_in(ctx, cpuctx, event_type, NULL);
1541 cpuctx->task_ctx = ctx; 2101 cpuctx->task_ctx = ctx;
1542} 2102}
1543 2103
1544void perf_event_context_sched_in(struct perf_event_context *ctx) 2104static void perf_event_context_sched_in(struct perf_event_context *ctx,
2105 struct task_struct *task)
1545{ 2106{
1546 struct perf_cpu_context *cpuctx; 2107 struct perf_cpu_context *cpuctx;
1547 2108
@@ -1557,9 +2118,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
1557 */ 2118 */
1558 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2119 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1559 2120
1560 ctx_sched_in(ctx, cpuctx, EVENT_PINNED); 2121 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1561 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2122 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1562 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 2123 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1563 2124
1564 cpuctx->task_ctx = ctx; 2125 cpuctx->task_ctx = ctx;
1565 2126
@@ -1592,14 +2153,17 @@ void __perf_event_task_sched_in(struct task_struct *task)
1592 if (likely(!ctx)) 2153 if (likely(!ctx))
1593 continue; 2154 continue;
1594 2155
1595 perf_event_context_sched_in(ctx); 2156 perf_event_context_sched_in(ctx, task);
1596 } 2157 }
2158 /*
2159 * if cgroup events exist on this CPU, then we need
2160 * to check if we have to switch in PMU state.
 2161	 * cgroup events are in system-wide mode only
2162 */
2163 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2164 perf_cgroup_sched_in(task);
1597} 2165}
1598 2166
1599#define MAX_INTERRUPTS (~0ULL)
1600
1601static void perf_log_throttle(struct perf_event *event, int enable);
1602
1603static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2167static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1604{ 2168{
1605 u64 frequency = event->attr.sample_freq; 2169 u64 frequency = event->attr.sample_freq;
@@ -1627,7 +2191,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1627 * Reduce accuracy by one bit such that @a and @b converge 2191 * Reduce accuracy by one bit such that @a and @b converge
1628 * to a similar magnitude. 2192 * to a similar magnitude.
1629 */ 2193 */
1630#define REDUCE_FLS(a, b) \ 2194#define REDUCE_FLS(a, b) \
1631do { \ 2195do { \
1632 if (a##_fls > b##_fls) { \ 2196 if (a##_fls > b##_fls) { \
1633 a >>= 1; \ 2197 a >>= 1; \
@@ -1797,7 +2361,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1797 if (ctx) 2361 if (ctx)
1798 rotate_ctx(ctx); 2362 rotate_ctx(ctx);
1799 2363
1800 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2364 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
1801 if (ctx) 2365 if (ctx)
1802 task_ctx_sched_in(ctx, EVENT_FLEXIBLE); 2366 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1803 2367
@@ -1852,6 +2416,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1852 if (!ctx || !ctx->nr_events) 2416 if (!ctx || !ctx->nr_events)
1853 goto out; 2417 goto out;
1854 2418
2419 /*
2420 * We must ctxsw out cgroup events to avoid conflict
2421 * when invoking perf_task_event_sched_in() later on
2422 * in this function. Otherwise we end up trying to
2423 * ctxswin cgroup events which are already scheduled
2424 * in.
2425 */
2426 perf_cgroup_sched_out(current);
1855 task_ctx_sched_out(ctx, EVENT_ALL); 2427 task_ctx_sched_out(ctx, EVENT_ALL);
1856 2428
1857 raw_spin_lock(&ctx->lock); 2429 raw_spin_lock(&ctx->lock);
@@ -1876,7 +2448,10 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1876 2448
1877 raw_spin_unlock(&ctx->lock); 2449 raw_spin_unlock(&ctx->lock);
1878 2450
1879 perf_event_context_sched_in(ctx); 2451 /*
2452 * Also calls ctxswin for cgroup events, if any:
2453 */
2454 perf_event_context_sched_in(ctx, ctx->task);
1880out: 2455out:
1881 local_irq_restore(flags); 2456 local_irq_restore(flags);
1882} 2457}
@@ -1901,8 +2476,10 @@ static void __perf_event_read(void *info)
1901 return; 2476 return;
1902 2477
1903 raw_spin_lock(&ctx->lock); 2478 raw_spin_lock(&ctx->lock);
1904 if (ctx->is_active) 2479 if (ctx->is_active) {
1905 update_context_time(ctx); 2480 update_context_time(ctx);
2481 update_cgrp_time_from_event(event);
2482 }
1906 update_event_times(event); 2483 update_event_times(event);
1907 if (event->state == PERF_EVENT_STATE_ACTIVE) 2484 if (event->state == PERF_EVENT_STATE_ACTIVE)
1908 event->pmu->read(event); 2485 event->pmu->read(event);
@@ -1933,8 +2510,10 @@ static u64 perf_event_read(struct perf_event *event)
1933 * (e.g., thread is blocked), in that case 2510 * (e.g., thread is blocked), in that case
1934 * we cannot update context time 2511 * we cannot update context time
1935 */ 2512 */
1936 if (ctx->is_active) 2513 if (ctx->is_active) {
1937 update_context_time(ctx); 2514 update_context_time(ctx);
2515 update_cgrp_time_from_event(event);
2516 }
1938 update_event_times(event); 2517 update_event_times(event);
1939 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2518 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1940 } 2519 }
@@ -2213,6 +2792,9 @@ errout:
2213 2792
2214} 2793}
2215 2794
2795/*
2796 * Returns a matching context with refcount and pincount.
2797 */
2216static struct perf_event_context * 2798static struct perf_event_context *
2217find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) 2799find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2218{ 2800{
@@ -2237,6 +2819,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2237 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 2819 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2238 ctx = &cpuctx->ctx; 2820 ctx = &cpuctx->ctx;
2239 get_ctx(ctx); 2821 get_ctx(ctx);
2822 ++ctx->pin_count;
2240 2823
2241 return ctx; 2824 return ctx;
2242 } 2825 }
@@ -2250,6 +2833,7 @@ retry:
2250 ctx = perf_lock_task_context(task, ctxn, &flags); 2833 ctx = perf_lock_task_context(task, ctxn, &flags);
2251 if (ctx) { 2834 if (ctx) {
2252 unclone_ctx(ctx); 2835 unclone_ctx(ctx);
2836 ++ctx->pin_count;
2253 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2837 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2254 } 2838 }
2255 2839
@@ -2271,8 +2855,10 @@ retry:
2271 err = -ESRCH; 2855 err = -ESRCH;
2272 else if (task->perf_event_ctxp[ctxn]) 2856 else if (task->perf_event_ctxp[ctxn])
2273 err = -EAGAIN; 2857 err = -EAGAIN;
2274 else 2858 else {
2859 ++ctx->pin_count;
2275 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 2860 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2861 }
2276 mutex_unlock(&task->perf_event_mutex); 2862 mutex_unlock(&task->perf_event_mutex);
2277 2863
2278 if (unlikely(err)) { 2864 if (unlikely(err)) {
@@ -2312,7 +2898,7 @@ static void free_event(struct perf_event *event)
2312 2898
2313 if (!event->parent) { 2899 if (!event->parent) {
2314 if (event->attach_state & PERF_ATTACH_TASK) 2900 if (event->attach_state & PERF_ATTACH_TASK)
2315 jump_label_dec(&perf_task_events); 2901 jump_label_dec(&perf_sched_events);
2316 if (event->attr.mmap || event->attr.mmap_data) 2902 if (event->attr.mmap || event->attr.mmap_data)
2317 atomic_dec(&nr_mmap_events); 2903 atomic_dec(&nr_mmap_events);
2318 if (event->attr.comm) 2904 if (event->attr.comm)
@@ -2321,6 +2907,10 @@ static void free_event(struct perf_event *event)
2321 atomic_dec(&nr_task_events); 2907 atomic_dec(&nr_task_events);
2322 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 2908 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2323 put_callchain_buffers(); 2909 put_callchain_buffers();
2910 if (is_cgroup_event(event)) {
2911 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2912 jump_label_dec(&perf_sched_events);
2913 }
2324 } 2914 }
2325 2915
2326 if (event->buffer) { 2916 if (event->buffer) {
@@ -2328,6 +2918,9 @@ static void free_event(struct perf_event *event)
2328 event->buffer = NULL; 2918 event->buffer = NULL;
2329 } 2919 }
2330 2920
2921 if (is_cgroup_event(event))
2922 perf_detach_cgroup(event);
2923
2331 if (event->destroy) 2924 if (event->destroy)
2332 event->destroy(event); 2925 event->destroy(event);
2333 2926
@@ -4395,26 +4988,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4395 if (unlikely(!is_sampling_event(event))) 4988 if (unlikely(!is_sampling_event(event)))
4396 return 0; 4989 return 0;
4397 4990
4398 if (!throttle) { 4991 if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
4399 hwc->interrupts++; 4992 if (throttle) {
4400 } else { 4993 hwc->interrupts = MAX_INTERRUPTS;
4401 if (hwc->interrupts != MAX_INTERRUPTS) { 4994 perf_log_throttle(event, 0);
4402 hwc->interrupts++;
4403 if (HZ * hwc->interrupts >
4404 (u64)sysctl_perf_event_sample_rate) {
4405 hwc->interrupts = MAX_INTERRUPTS;
4406 perf_log_throttle(event, 0);
4407 ret = 1;
4408 }
4409 } else {
4410 /*
4411 * Keep re-disabling events even though on the previous
4412 * pass we disabled it - just in case we raced with a
4413 * sched-in and the event got enabled again:
4414 */
4415 ret = 1; 4995 ret = 1;
4416 } 4996 }
4417 } 4997 } else
4998 hwc->interrupts++;
4418 4999
4419 if (event->attr.freq) { 5000 if (event->attr.freq) {
4420 u64 now = perf_clock(); 5001 u64 now = perf_clock();
@@ -4556,7 +5137,7 @@ static int perf_exclude_event(struct perf_event *event,
4556 struct pt_regs *regs) 5137 struct pt_regs *regs)
4557{ 5138{
4558 if (event->hw.state & PERF_HES_STOPPED) 5139 if (event->hw.state & PERF_HES_STOPPED)
4559 return 0; 5140 return 1;
4560 5141
4561 if (regs) { 5142 if (regs) {
4562 if (event->attr.exclude_user && user_mode(regs)) 5143 if (event->attr.exclude_user && user_mode(regs))
@@ -4742,14 +5323,6 @@ swevent_hlist_deref(struct swevent_htable *swhash)
4742 lockdep_is_held(&swhash->hlist_mutex)); 5323 lockdep_is_held(&swhash->hlist_mutex));
4743} 5324}
4744 5325
4745static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4746{
4747 struct swevent_hlist *hlist;
4748
4749 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
4750 kfree(hlist);
4751}
4752
4753static void swevent_hlist_release(struct swevent_htable *swhash) 5326static void swevent_hlist_release(struct swevent_htable *swhash)
4754{ 5327{
4755 struct swevent_hlist *hlist = swevent_hlist_deref(swhash); 5328 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
@@ -4758,7 +5331,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
4758 return; 5331 return;
4759 5332
4760 rcu_assign_pointer(swhash->swevent_hlist, NULL); 5333 rcu_assign_pointer(swhash->swevent_hlist, NULL);
4761 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 5334 kfree_rcu(hlist, rcu_head);
4762} 5335}
4763 5336
4764static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 5337static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
@@ -4840,7 +5413,7 @@ fail:
4840 return err; 5413 return err;
4841} 5414}
4842 5415
4843atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 5416struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
4844 5417
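perf_swevent_enabled (and perf_sched_events elsewhere in this diff) now use the jump-label machinery instead of plain atomics. A minimal sketch of that usage pattern, assuming the struct jump_label_key / static_branch() / jump_label_inc() API this series is built on, and using a hypothetical demo_key that is not part of the patch:

	#include <linux/jump_label.h>
	#include <linux/kernel.h>

	/* hypothetical key, for illustration only */
	static struct jump_label_key demo_key;

	static inline void demo_hot_path(void)
	{
		/*
		 * static_branch() is patched to a no-op branch while the key's
		 * count is zero, so the slow path costs nothing when unused.
		 */
		if (static_branch(&demo_key))
			pr_info("demo slow path\n");
	}

	/* first user enables the branch, last user disables it again */
	static void demo_get(void) { jump_label_inc(&demo_key); }
	static void demo_put(void) { jump_label_dec(&demo_key); }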
4845static void sw_perf_event_destroy(struct perf_event *event) 5418static void sw_perf_event_destroy(struct perf_event *event)
4846{ 5419{
@@ -4912,6 +5485,8 @@ static int perf_tp_event_match(struct perf_event *event,
4912 struct perf_sample_data *data, 5485 struct perf_sample_data *data,
4913 struct pt_regs *regs) 5486 struct pt_regs *regs)
4914{ 5487{
5488 if (event->hw.state & PERF_HES_STOPPED)
5489 return 0;
4915 /* 5490 /*
4916 * All tracepoints are from kernel-space. 5491 * All tracepoints are from kernel-space.
4917 */ 5492 */
@@ -5051,6 +5626,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5051 u64 period; 5626 u64 period;
5052 5627
5053 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 5628 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
5629
5630 if (event->state != PERF_EVENT_STATE_ACTIVE)
5631 return HRTIMER_NORESTART;
5632
5054 event->pmu->read(event); 5633 event->pmu->read(event);
5055 5634
5056 perf_sample_data_init(&data, 0); 5635 perf_sample_data_init(&data, 0);
@@ -5077,9 +5656,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
5077 if (!is_sampling_event(event)) 5656 if (!is_sampling_event(event))
5078 return; 5657 return;
5079 5658
5080 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5081 hwc->hrtimer.function = perf_swevent_hrtimer;
5082
5083 period = local64_read(&hwc->period_left); 5659 period = local64_read(&hwc->period_left);
5084 if (period) { 5660 if (period) {
5085 if (period < 0) 5661 if (period < 0)
@@ -5106,6 +5682,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5106 } 5682 }
5107} 5683}
5108 5684
5685static void perf_swevent_init_hrtimer(struct perf_event *event)
5686{
5687 struct hw_perf_event *hwc = &event->hw;
5688
5689 if (!is_sampling_event(event))
5690 return;
5691
5692 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5693 hwc->hrtimer.function = perf_swevent_hrtimer;
5694
5695 /*
5696 * Since hrtimers have a fixed rate, we can do a static freq->period
5697 * mapping and avoid the whole period adjust feedback stuff.
5698 */
5699 if (event->attr.freq) {
5700 long freq = event->attr.sample_freq;
5701
5702 event->attr.sample_period = NSEC_PER_SEC / freq;
5703 hwc->sample_period = event->attr.sample_period;
5704 local64_set(&hwc->period_left, hwc->sample_period);
5705 event->attr.freq = 0;
5706 }
5707}
5708
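As a worked illustration with made-up numbers: a request of attr.sample_freq = 4000 is converted once, here, into attr.sample_period = NSEC_PER_SEC / 4000 = 250000 ns, i.e. a fixed 250 µs hrtimer period, and attr.freq is cleared so the adaptive period-adjustment feedback never runs for hrtimer-based software events.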
5109/* 5709/*
5110 * Software event: cpu wall time clock 5710 * Software event: cpu wall time clock
5111 */ 5711 */
@@ -5158,6 +5758,8 @@ static int cpu_clock_event_init(struct perf_event *event)
5158 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 5758 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5159 return -ENOENT; 5759 return -ENOENT;
5160 5760
5761 perf_swevent_init_hrtimer(event);
5762
5161 return 0; 5763 return 0;
5162} 5764}
5163 5765
@@ -5213,16 +5815,9 @@ static void task_clock_event_del(struct perf_event *event, int flags)
5213 5815
5214static void task_clock_event_read(struct perf_event *event) 5816static void task_clock_event_read(struct perf_event *event)
5215{ 5817{
5216 u64 time; 5818 u64 now = perf_clock();
5217 5819 u64 delta = now - event->ctx->timestamp;
5218 if (!in_nmi()) { 5820 u64 time = event->ctx->time + delta;
5219 update_context_time(event->ctx);
5220 time = event->ctx->time;
5221 } else {
5222 u64 now = perf_clock();
5223 u64 delta = now - event->ctx->timestamp;
5224 time = event->ctx->time + delta;
5225 }
5226 5821
5227 task_clock_event_update(event, time); 5822 task_clock_event_update(event, time);
5228} 5823}
@@ -5235,6 +5830,8 @@ static int task_clock_event_init(struct perf_event *event)
5235 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 5830 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5236 return -ENOENT; 5831 return -ENOENT;
5237 5832
5833 perf_swevent_init_hrtimer(event);
5834
5238 return 0; 5835 return 0;
5239} 5836}
5240 5837
@@ -5506,17 +6103,22 @@ struct pmu *perf_init_event(struct perf_event *event)
5506{ 6103{
5507 struct pmu *pmu = NULL; 6104 struct pmu *pmu = NULL;
5508 int idx; 6105 int idx;
6106 int ret;
5509 6107
5510 idx = srcu_read_lock(&pmus_srcu); 6108 idx = srcu_read_lock(&pmus_srcu);
5511 6109
5512 rcu_read_lock(); 6110 rcu_read_lock();
5513 pmu = idr_find(&pmu_idr, event->attr.type); 6111 pmu = idr_find(&pmu_idr, event->attr.type);
5514 rcu_read_unlock(); 6112 rcu_read_unlock();
5515 if (pmu) 6113 if (pmu) {
6114 ret = pmu->event_init(event);
6115 if (ret)
6116 pmu = ERR_PTR(ret);
5516 goto unlock; 6117 goto unlock;
6118 }
5517 6119
5518 list_for_each_entry_rcu(pmu, &pmus, entry) { 6120 list_for_each_entry_rcu(pmu, &pmus, entry) {
5519 int ret = pmu->event_init(event); 6121 ret = pmu->event_init(event);
5520 if (!ret) 6122 if (!ret)
5521 goto unlock; 6123 goto unlock;
5522 6124
@@ -5642,7 +6244,7 @@ done:
5642 6244
5643 if (!event->parent) { 6245 if (!event->parent) {
5644 if (event->attach_state & PERF_ATTACH_TASK) 6246 if (event->attach_state & PERF_ATTACH_TASK)
5645 jump_label_inc(&perf_task_events); 6247 jump_label_inc(&perf_sched_events);
5646 if (event->attr.mmap || event->attr.mmap_data) 6248 if (event->attr.mmap || event->attr.mmap_data)
5647 atomic_inc(&nr_mmap_events); 6249 atomic_inc(&nr_mmap_events);
5648 if (event->attr.comm) 6250 if (event->attr.comm)
@@ -5817,7 +6419,7 @@ SYSCALL_DEFINE5(perf_event_open,
5817 int err; 6419 int err;
5818 6420
5819 /* for future expandability... */ 6421 /* for future expandability... */
5820 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) 6422 if (flags & ~PERF_FLAG_ALL)
5821 return -EINVAL; 6423 return -EINVAL;
5822 6424
5823 err = perf_copy_attr(attr_uptr, &attr); 6425 err = perf_copy_attr(attr_uptr, &attr);
@@ -5834,6 +6436,15 @@ SYSCALL_DEFINE5(perf_event_open,
5834 return -EINVAL; 6436 return -EINVAL;
5835 } 6437 }
5836 6438
6439 /*
6440 * In cgroup mode, the pid argument is used to pass the fd
6441 * opened to the cgroup directory in cgroupfs. The cpu argument
6442 * designates the cpu on which to monitor threads from that
6443 * cgroup.
6444 */
6445 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6446 return -EINVAL;
6447
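From userspace, the new cgroup mode is driven entirely through these two arguments: pid carries a file descriptor for the cgroup directory and cpu must name a real CPU. A minimal sketch (the cgroup path, event choice, wrapper name and error handling are illustrative assumptions, not part of this patch):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <sys/types.h>
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
					int cpu, int group_fd, unsigned long flags)
	{
		return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
	}

	/* count cycles for one cgroup on one CPU; the path is an assumed mount point */
	int open_cgroup_cycles(const char *cgrp_dir, int cpu)
	{
		struct perf_event_attr attr;
		int cgrp_fd, fd;

		cgrp_fd = open(cgrp_dir, O_RDONLY);	/* e.g. ".../perf_event/mygrp" */
		if (cgrp_fd < 0)
			return -1;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;

		/* pid carries the cgroup fd; cpu must be a real CPU, not -1 */
		fd = sys_perf_event_open(&attr, cgrp_fd, cpu, -1, PERF_FLAG_PID_CGROUP);
		close(cgrp_fd);
		return fd;
	}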
5837 event_fd = get_unused_fd_flags(O_RDWR); 6448 event_fd = get_unused_fd_flags(O_RDWR);
5838 if (event_fd < 0) 6449 if (event_fd < 0)
5839 return event_fd; 6450 return event_fd;
@@ -5851,7 +6462,7 @@ SYSCALL_DEFINE5(perf_event_open,
5851 group_leader = NULL; 6462 group_leader = NULL;
5852 } 6463 }
5853 6464
5854 if (pid != -1) { 6465 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
5855 task = find_lively_task_by_vpid(pid); 6466 task = find_lively_task_by_vpid(pid);
5856 if (IS_ERR(task)) { 6467 if (IS_ERR(task)) {
5857 err = PTR_ERR(task); 6468 err = PTR_ERR(task);
@@ -5865,6 +6476,19 @@ SYSCALL_DEFINE5(perf_event_open,
5865 goto err_task; 6476 goto err_task;
5866 } 6477 }
5867 6478
6479 if (flags & PERF_FLAG_PID_CGROUP) {
6480 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6481 if (err)
6482 goto err_alloc;
6483 /*
6484 * one more event:
6485 * - that has cgroup constraint on event->cpu
6486 * - that may need work on context switch
6487 */
6488 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6489 jump_label_inc(&perf_sched_events);
6490 }
6491
5868 /* 6492 /*
5869 * Special case software events and allow them to be part of 6493 * Special case software events and allow them to be part of
5870 * any hardware group. 6494 * any hardware group.
@@ -5903,6 +6527,11 @@ SYSCALL_DEFINE5(perf_event_open,
5903 goto err_alloc; 6527 goto err_alloc;
5904 } 6528 }
5905 6529
6530 if (task) {
6531 put_task_struct(task);
6532 task = NULL;
6533 }
6534
5906 /* 6535 /*
5907 * Look up the group leader (we will attach this event to it): 6536 * Look up the group leader (we will attach this event to it):
5908 */ 6537 */
@@ -5950,10 +6579,10 @@ SYSCALL_DEFINE5(perf_event_open,
5950 struct perf_event_context *gctx = group_leader->ctx; 6579 struct perf_event_context *gctx = group_leader->ctx;
5951 6580
5952 mutex_lock(&gctx->mutex); 6581 mutex_lock(&gctx->mutex);
5953 perf_event_remove_from_context(group_leader); 6582 perf_remove_from_context(group_leader);
5954 list_for_each_entry(sibling, &group_leader->sibling_list, 6583 list_for_each_entry(sibling, &group_leader->sibling_list,
5955 group_entry) { 6584 group_entry) {
5956 perf_event_remove_from_context(sibling); 6585 perf_remove_from_context(sibling);
5957 put_ctx(gctx); 6586 put_ctx(gctx);
5958 } 6587 }
5959 mutex_unlock(&gctx->mutex); 6588 mutex_unlock(&gctx->mutex);
@@ -5976,6 +6605,7 @@ SYSCALL_DEFINE5(perf_event_open,
5976 6605
5977 perf_install_in_context(ctx, event, cpu); 6606 perf_install_in_context(ctx, event, cpu);
5978 ++ctx->generation; 6607 ++ctx->generation;
6608 perf_unpin_context(ctx);
5979 mutex_unlock(&ctx->mutex); 6609 mutex_unlock(&ctx->mutex);
5980 6610
5981 event->owner = current; 6611 event->owner = current;
@@ -6001,6 +6631,7 @@ SYSCALL_DEFINE5(perf_event_open,
6001 return event_fd; 6631 return event_fd;
6002 6632
6003err_context: 6633err_context:
6634 perf_unpin_context(ctx);
6004 put_ctx(ctx); 6635 put_ctx(ctx);
6005err_alloc: 6636err_alloc:
6006 free_event(event); 6637 free_event(event);
@@ -6051,6 +6682,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6051 mutex_lock(&ctx->mutex); 6682 mutex_lock(&ctx->mutex);
6052 perf_install_in_context(ctx, event, cpu); 6683 perf_install_in_context(ctx, event, cpu);
6053 ++ctx->generation; 6684 ++ctx->generation;
6685 perf_unpin_context(ctx);
6054 mutex_unlock(&ctx->mutex); 6686 mutex_unlock(&ctx->mutex);
6055 6687
6056 return event; 6688 return event;
@@ -6102,17 +6734,20 @@ __perf_event_exit_task(struct perf_event *child_event,
6102 struct perf_event_context *child_ctx, 6734 struct perf_event_context *child_ctx,
6103 struct task_struct *child) 6735 struct task_struct *child)
6104{ 6736{
6105 struct perf_event *parent_event; 6737 if (child_event->parent) {
6738 raw_spin_lock_irq(&child_ctx->lock);
6739 perf_group_detach(child_event);
6740 raw_spin_unlock_irq(&child_ctx->lock);
6741 }
6106 6742
6107 perf_event_remove_from_context(child_event); 6743 perf_remove_from_context(child_event);
6108 6744
6109 parent_event = child_event->parent;
6110 /* 6745 /*
6111 * It can happen that parent exits first, and has events 6746 * It can happen that the parent exits first, and has events
6112 * that are still around due to the child reference. These 6747 * that are still around due to the child reference. These
6113 * events need to be zapped - but otherwise linger. 6748 * events need to be zapped.
6114 */ 6749 */
6115 if (parent_event) { 6750 if (child_event->parent) {
6116 sync_child_event(child_event, child); 6751 sync_child_event(child_event, child);
6117 free_event(child_event); 6752 free_event(child_event);
6118 } 6753 }
@@ -6411,7 +7046,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
6411 return 0; 7046 return 0;
6412 } 7047 }
6413 7048
6414 child_ctx = child->perf_event_ctxp[ctxn]; 7049 child_ctx = child->perf_event_ctxp[ctxn];
6415 if (!child_ctx) { 7050 if (!child_ctx) {
6416 /* 7051 /*
6417 * This is executed from the parent task context, so 7052 * This is executed from the parent task context, so
@@ -6526,6 +7161,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6526 mutex_unlock(&parent_ctx->mutex); 7161 mutex_unlock(&parent_ctx->mutex);
6527 7162
6528 perf_unpin_context(parent_ctx); 7163 perf_unpin_context(parent_ctx);
7164 put_ctx(parent_ctx);
6529 7165
6530 return ret; 7166 return ret;
6531} 7167}
@@ -6595,9 +7231,9 @@ static void __perf_event_exit_context(void *__info)
6595 perf_pmu_rotate_stop(ctx->pmu); 7231 perf_pmu_rotate_stop(ctx->pmu);
6596 7232
6597 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 7233 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
6598 __perf_event_remove_from_context(event); 7234 __perf_remove_from_context(event);
6599 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 7235 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
6600 __perf_event_remove_from_context(event); 7236 __perf_remove_from_context(event);
6601} 7237}
6602 7238
6603static void perf_event_exit_cpu_context(int cpu) 7239static void perf_event_exit_cpu_context(int cpu)
@@ -6721,3 +7357,83 @@ unlock:
6721 return ret; 7357 return ret;
6722} 7358}
6723device_initcall(perf_event_sysfs_init); 7359device_initcall(perf_event_sysfs_init);
7360
7361#ifdef CONFIG_CGROUP_PERF
7362static struct cgroup_subsys_state *perf_cgroup_create(
7363 struct cgroup_subsys *ss, struct cgroup *cont)
7364{
7365 struct perf_cgroup *jc;
7366
7367 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
7368 if (!jc)
7369 return ERR_PTR(-ENOMEM);
7370
7371 jc->info = alloc_percpu(struct perf_cgroup_info);
7372 if (!jc->info) {
7373 kfree(jc);
7374 return ERR_PTR(-ENOMEM);
7375 }
7376
7377 return &jc->css;
7378}
7379
7380static void perf_cgroup_destroy(struct cgroup_subsys *ss,
7381 struct cgroup *cont)
7382{
7383 struct perf_cgroup *jc;
7384 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
7385 struct perf_cgroup, css);
7386 free_percpu(jc->info);
7387 kfree(jc);
7388}
7389
7390static int __perf_cgroup_move(void *info)
7391{
7392 struct task_struct *task = info;
7393 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
7394 return 0;
7395}
7396
7397static void perf_cgroup_move(struct task_struct *task)
7398{
7399 task_function_call(task, __perf_cgroup_move, task);
7400}
7401
7402static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7403 struct cgroup *old_cgrp, struct task_struct *task,
7404 bool threadgroup)
7405{
7406 perf_cgroup_move(task);
7407 if (threadgroup) {
7408 struct task_struct *c;
7409 rcu_read_lock();
7410 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7411 perf_cgroup_move(c);
7412 }
7413 rcu_read_unlock();
7414 }
7415}
7416
7417static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7418 struct cgroup *old_cgrp, struct task_struct *task)
7419{
7420 /*
7421 * cgroup_exit() is called in the copy_process() failure path.
 7422	 * Ignore this case since the task hasn't run yet; this avoids
 7423	 * trying to poke a half-freed task state from generic code.
7424 */
7425 if (!(task->flags & PF_EXITING))
7426 return;
7427
7428 perf_cgroup_move(task);
7429}
7430
7431struct cgroup_subsys perf_subsys = {
7432 .name = "perf_event",
7433 .subsys_id = perf_subsys_id,
7434 .create = perf_cgroup_create,
7435 .destroy = perf_cgroup_destroy,
7436 .exit = perf_cgroup_exit,
7437 .attach = perf_cgroup_attach,
7438};
7439#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 086adf25a55e..086adf25a55e 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45ebcc7b1..20a406471525 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -841,7 +841,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
841 /* Let father know we died 841 /* Let father know we died
842 * 842 *
843 * Thread signals are configurable, but you aren't going to use 843 * Thread signals are configurable, but you aren't going to use
844 * that to send signals to arbitary processes. 844 * that to send signals to arbitrary processes.
845 * That stops right now. 845 * That stops right now.
846 * 846 *
847 * If the parent exec id doesn't match the exec id we saved 847 * If the parent exec id doesn't match the exec id we saved
@@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code)
908 profile_task_exit(tsk); 908 profile_task_exit(tsk);
909 909
910 WARN_ON(atomic_read(&tsk->fs_excl)); 910 WARN_ON(atomic_read(&tsk->fs_excl));
911 WARN_ON(blk_needs_flush_plug(tsk));
911 912
912 if (unlikely(in_interrupt())) 913 if (unlikely(in_interrupt()))
913 panic("Aiee, killing interrupt handler!"); 914 panic("Aiee, killing interrupt handler!");
@@ -1015,7 +1016,7 @@ NORET_TYPE void do_exit(long code)
1015 /* 1016 /*
1016 * FIXME: do that only when needed, using sched_exit tracepoint 1017 * FIXME: do that only when needed, using sched_exit tracepoint
1017 */ 1018 */
1018 flush_ptrace_hw_breakpoint(tsk); 1019 ptrace_put_breakpoints(tsk);
1019 1020
1020 exit_notify(tsk, group_dead); 1021 exit_notify(tsk, group_dead);
1021#ifdef CONFIG_NUMA 1022#ifdef CONFIG_NUMA
@@ -1376,11 +1377,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
1376 return NULL; 1377 return NULL;
1377} 1378}
1378 1379
1379/* 1380/**
1380 * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold 1381 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1381 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1382 * @wo: wait options
1382 * the lock and this task is uninteresting. If we return nonzero, we have 1383 * @ptrace: is the wait for ptrace
1383 * released the lock and the system call should return. 1384 * @p: task to wait for
1385 *
1386 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1387 *
1388 * CONTEXT:
1389 * read_lock(&tasklist_lock), which is released if return value is
1390 * non-zero. Also, grabs and releases @p->sighand->siglock.
1391 *
1392 * RETURNS:
1393 * 0 if wait condition didn't exist and search for other wait conditions
1394 * should continue. Non-zero return, -errno on failure and @p's pid on
1395 * success, implies that tasklist_lock is released and wait condition
1396 * search should terminate.
1384 */ 1397 */
1385static int wait_task_stopped(struct wait_opts *wo, 1398static int wait_task_stopped(struct wait_opts *wo,
1386 int ptrace, struct task_struct *p) 1399 int ptrace, struct task_struct *p)
@@ -1396,6 +1409,9 @@ static int wait_task_stopped(struct wait_opts *wo,
1396 if (!ptrace && !(wo->wo_flags & WUNTRACED)) 1409 if (!ptrace && !(wo->wo_flags & WUNTRACED))
1397 return 0; 1410 return 0;
1398 1411
1412 if (!task_stopped_code(p, ptrace))
1413 return 0;
1414
1399 exit_code = 0; 1415 exit_code = 0;
1400 spin_lock_irq(&p->sighand->siglock); 1416 spin_lock_irq(&p->sighand->siglock);
1401 1417
@@ -1537,33 +1553,84 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1537 return 0; 1553 return 0;
1538 } 1554 }
1539 1555
1540 if (likely(!ptrace) && unlikely(task_ptrace(p))) { 1556 /* dead body doesn't have much to contribute */
1557 if (p->exit_state == EXIT_DEAD)
1558 return 0;
1559
1560 /* slay zombie? */
1561 if (p->exit_state == EXIT_ZOMBIE) {
1562 /*
1563 * A zombie ptracee is only visible to its ptracer.
1564 * Notification and reaping will be cascaded to the real
1565 * parent when the ptracer detaches.
1566 */
1567 if (likely(!ptrace) && unlikely(task_ptrace(p))) {
1568 /* it will become visible, clear notask_error */
1569 wo->notask_error = 0;
1570 return 0;
1571 }
1572
1573 /* we don't reap group leaders with subthreads */
1574 if (!delay_group_leader(p))
1575 return wait_task_zombie(wo, p);
1576
1541 /* 1577 /*
1542 * This child is hidden by ptrace. 1578 * Allow access to stopped/continued state via zombie by
1543 * We aren't allowed to see it now, but eventually we will. 1579 * falling through. Clearing of notask_error is complex.
1580 *
1581 * When !@ptrace:
1582 *
1583 * If WEXITED is set, notask_error should naturally be
1584 * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
1585 * so, if there are live subthreads, there are events to
1586 * wait for. If all subthreads are dead, it's still safe
1587	 * to clear - this function will be called again in a finite
1588	 * amount of time once all the subthreads are released and
1589 * will then return without clearing.
1590 *
1591 * When @ptrace:
1592 *
1593 * Stopped state is per-task and thus can't change once the
1594 * target task dies. Only continued and exited can happen.
1595 * Clear notask_error if WCONTINUED | WEXITED.
1596 */
1597 if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1598 wo->notask_error = 0;
1599 } else {
1600 /*
1601 * If @p is ptraced by a task in its real parent's group,
1602 * hide group stop/continued state when looking at @p as
1603 * the real parent; otherwise, a single stop can be
1604 * reported twice as group and ptrace stops.
1605 *
1606 * If a ptracer wants to distinguish the two events for its
1607 * own children, it should create a separate process which
1608 * takes the role of real parent.
1609 */
1610 if (likely(!ptrace) && task_ptrace(p) &&
1611 same_thread_group(p->parent, p->real_parent))
1612 return 0;
1613
1614 /*
1615 * @p is alive and it's gonna stop, continue or exit, so
1616 * there always is something to wait for.
1544 */ 1617 */
1545 wo->notask_error = 0; 1618 wo->notask_error = 0;
1546 return 0;
1547 } 1619 }
1548 1620
1549 if (p->exit_state == EXIT_DEAD)
1550 return 0;
1551
1552 /* 1621 /*
1553 * We don't reap group leaders with subthreads. 1622 * Wait for stopped. Depending on @ptrace, different stopped state
1623 * is used and the two don't interact with each other.
1554 */ 1624 */
1555 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) 1625 ret = wait_task_stopped(wo, ptrace, p);
1556 return wait_task_zombie(wo, p); 1626 if (ret)
1627 return ret;
1557 1628
1558 /* 1629 /*
1559 * It's stopped or running now, so it might 1630 * Wait for continued. There's only one continued state and the
1560 * later continue, exit, or stop again. 1631 * ptracer can consume it which can confuse the real parent. Don't
1632 * use WCONTINUED from ptracer. You don't need or want it.
1561 */ 1633 */
1562 wo->notask_error = 0;
1563
1564 if (task_stopped_code(p, ptrace))
1565 return wait_task_stopped(wo, ptrace, p);
1566
1567 return wait_task_continued(wo, p); 1634 return wait_task_continued(wo, p);
1568} 1635}
1569 1636
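The reworked wait_consider_task() decides, in this order, whether a child is reaped as a zombie, reported as stopped, or reported as continued. The same three outcomes reach userspace through waitpid() status decoding; a minimal sketch using only standard POSIX calls (nothing specific to this patch):

    #include <stdio.h>
    #include <signal.h>
    #include <unistd.h>
    #include <sys/wait.h>

    int main(void)
    {
            pid_t pid = fork();

            if (pid == 0) {                 /* child: stop itself, then exit */
                    raise(SIGSTOP);
                    _exit(7);
            }

            for (;;) {
                    int status;

                    /* Ask for stopped, continued and exited events alike. */
                    if (waitpid(pid, &status, WUNTRACED | WCONTINUED) < 0) {
                            perror("waitpid");
                            return 1;
                    }
                    if (WIFSTOPPED(status)) {
                            printf("child stopped by signal %d\n", WSTOPSIG(status));
                            kill(pid, SIGCONT);
                    } else if (WIFCONTINUED(status)) {
                            printf("child continued\n");
                    } else if (WIFEXITED(status)) {
                            printf("child exited with status %d\n", WEXITSTATUS(status));
                            break;
                    }
            }
            return 0;
    }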
diff --git a/kernel/extable.c b/kernel/extable.c
index 7f8f263f8524..5339705b8241 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -72,6 +72,24 @@ int core_kernel_text(unsigned long addr)
72 return 0; 72 return 0;
73} 73}
74 74
75/**
76 * core_kernel_data - tell if addr points to kernel data
77 * @addr: address to test
78 *
79 * Returns true if @addr passed in is from the core kernel data
80 * section.
81 *
82 * Note: On some archs it may return true for core RODATA, and false
83 * for others. But will always be true for core RW data.
84 */
85int core_kernel_data(unsigned long addr)
86{
87 if (addr >= (unsigned long)_sdata &&
88 addr < (unsigned long)_edata)
89 return 1;
90 return 0;
91}
92
75int __kernel_text_address(unsigned long addr) 93int __kernel_text_address(unsigned long addr)
76{ 94{
77 if (core_kernel_text(addr)) 95 if (core_kernel_text(addr))
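The new core_kernel_data() is a plain address-range test against the linker-provided _sdata/_edata symbols. The same pattern is available in ordinary ELF programs via the __start_SECTION/__stop_SECTION symbols GNU ld emits for named sections; a hedged sketch (the section name "mydata" is made up for the example):

    #include <stdio.h>

    /* Put two variables in a custom section so the linker brackets it. */
    static int a __attribute__((section("mydata"))) = 1;
    static int b __attribute__((section("mydata"))) = 2;
    static int elsewhere = 3;

    /* GNU ld provides these for any section whose name is a C identifier. */
    extern char __start_mydata[], __stop_mydata[];

    static int in_mydata(const void *addr)
    {
            return (const char *)addr >= __start_mydata &&
                   (const char *)addr <  __stop_mydata;
    }

    int main(void)
    {
            printf("a: %d  b: %d  elsewhere: %d\n",
                   in_mydata(&a), in_mydata(&b), in_mydata(&elsewhere));
            return 0;
    }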
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e429152ddc..ca406d916713 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
40#include <linux/tracehook.h> 40#include <linux/tracehook.h>
41#include <linux/futex.h> 41#include <linux/futex.h>
42#include <linux/compat.h> 42#include <linux/compat.h>
43#include <linux/kthread.h>
43#include <linux/task_io_accounting_ops.h> 44#include <linux/task_io_accounting_ops.h>
44#include <linux/rcupdate.h> 45#include <linux/rcupdate.h>
45#include <linux/ptrace.h> 46#include <linux/ptrace.h>
@@ -58,7 +59,6 @@
58#include <linux/taskstats_kern.h> 59#include <linux/taskstats_kern.h>
59#include <linux/random.h> 60#include <linux/random.h>
60#include <linux/tty.h> 61#include <linux/tty.h>
61#include <linux/proc_fs.h>
62#include <linux/blkdev.h> 62#include <linux/blkdev.h>
63#include <linux/fs_struct.h> 63#include <linux/fs_struct.h>
64#include <linux/magic.h> 64#include <linux/magic.h>
@@ -109,20 +109,25 @@ int nr_processes(void)
109} 109}
110 110
111#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 111#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
112# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) 112# define alloc_task_struct_node(node) \
113# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) 113 kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
114# define free_task_struct(tsk) \
115 kmem_cache_free(task_struct_cachep, (tsk))
114static struct kmem_cache *task_struct_cachep; 116static struct kmem_cache *task_struct_cachep;
115#endif 117#endif
116 118
117#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR 119#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
118static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) 120static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
121 int node)
119{ 122{
120#ifdef CONFIG_DEBUG_STACK_USAGE 123#ifdef CONFIG_DEBUG_STACK_USAGE
121 gfp_t mask = GFP_KERNEL | __GFP_ZERO; 124 gfp_t mask = GFP_KERNEL | __GFP_ZERO;
122#else 125#else
123 gfp_t mask = GFP_KERNEL; 126 gfp_t mask = GFP_KERNEL;
124#endif 127#endif
125 return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); 128 struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
129
130 return page ? page_address(page) : NULL;
126} 131}
127 132
128static inline void free_thread_info(struct thread_info *ti) 133static inline void free_thread_info(struct thread_info *ti)
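This hunk makes task_struct and thread_info allocation node-aware so a child's kernel stack ends up on the NUMA node picked by tsk_fork_get_node(). A loose userspace analog is libnuma's node-preferring allocator; the sketch below assumes the numactl development headers are installed and the program is linked with -lnuma, and node 0 is only an example:

    #include <stdio.h>
    #include <numa.h>               /* libnuma; link with -lnuma */

    int main(void)
    {
            if (numa_available() < 0) {
                    fprintf(stderr, "NUMA not supported on this system\n");
                    return 1;
            }

            /* Allocate 1 MiB backed by pages on node 0, if possible. */
            size_t len = 1 << 20;
            void *buf = numa_alloc_onnode(len, 0);

            if (!buf) {
                    perror("numa_alloc_onnode");
                    return 1;
            }
            printf("allocated %zu bytes preferring node 0 at %p\n", len, buf);
            numa_free(buf, len);
            return 0;
    }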
@@ -193,6 +198,7 @@ void __put_task_struct(struct task_struct *tsk)
193 if (!profile_handoff_task(tsk)) 198 if (!profile_handoff_task(tsk))
194 free_task(tsk); 199 free_task(tsk);
195} 200}
201EXPORT_SYMBOL_GPL(__put_task_struct);
196 202
197/* 203/*
198 * macro override instead of weak attribute alias, to workaround 204 * macro override instead of weak attribute alias, to workaround
@@ -248,16 +254,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
248 struct task_struct *tsk; 254 struct task_struct *tsk;
249 struct thread_info *ti; 255 struct thread_info *ti;
250 unsigned long *stackend; 256 unsigned long *stackend;
251 257 int node = tsk_fork_get_node(orig);
252 int err; 258 int err;
253 259
254 prepare_to_copy(orig); 260 prepare_to_copy(orig);
255 261
256 tsk = alloc_task_struct(); 262 tsk = alloc_task_struct_node(node);
257 if (!tsk) 263 if (!tsk)
258 return NULL; 264 return NULL;
259 265
260 ti = alloc_thread_info(tsk); 266 ti = alloc_thread_info_node(tsk, node);
261 if (!ti) { 267 if (!ti) {
262 free_task_struct(tsk); 268 free_task_struct(tsk);
263 return NULL; 269 return NULL;
@@ -376,15 +382,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
376 get_file(file); 382 get_file(file);
377 if (tmp->vm_flags & VM_DENYWRITE) 383 if (tmp->vm_flags & VM_DENYWRITE)
378 atomic_dec(&inode->i_writecount); 384 atomic_dec(&inode->i_writecount);
379 spin_lock(&mapping->i_mmap_lock); 385 mutex_lock(&mapping->i_mmap_mutex);
380 if (tmp->vm_flags & VM_SHARED) 386 if (tmp->vm_flags & VM_SHARED)
381 mapping->i_mmap_writable++; 387 mapping->i_mmap_writable++;
382 tmp->vm_truncate_count = mpnt->vm_truncate_count;
383 flush_dcache_mmap_lock(mapping); 388 flush_dcache_mmap_lock(mapping);
384 /* insert tmp into the share list, just after mpnt */ 389 /* insert tmp into the share list, just after mpnt */
385 vma_prio_tree_add(tmp, mpnt); 390 vma_prio_tree_add(tmp, mpnt);
386 flush_dcache_mmap_unlock(mapping); 391 flush_dcache_mmap_unlock(mapping);
387 spin_unlock(&mapping->i_mmap_lock); 392 mutex_unlock(&mapping->i_mmap_mutex);
388 } 393 }
389 394
390 /* 395 /*
@@ -479,6 +484,20 @@ static void mm_init_aio(struct mm_struct *mm)
479#endif 484#endif
480} 485}
481 486
487int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm)
488{
489#ifdef CONFIG_CPUMASK_OFFSTACK
490 if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL))
491 return -ENOMEM;
492
493 if (oldmm)
494 cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm));
495 else
496 memset(mm_cpumask(mm), 0, cpumask_size());
497#endif
498 return 0;
499}
500
482static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 501static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
483{ 502{
484 atomic_set(&mm->mm_users, 1); 503 atomic_set(&mm->mm_users, 1);
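mm_init_cpumask() only does real work under CONFIG_CPUMASK_OFFSTACK, i.e. when the cpumask is too big to embed and lives behind a pointer instead, which is why it can fail with -ENOMEM. A rough userspace sketch of that "embedded bitmap or separately allocated bitmap, same accessor" pattern follows; the names are invented for the illustration and this is not the kernel's cpumask API:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define NBITS   1024
    #define NWORDS  (NBITS / (8 * sizeof(unsigned long)))

    /* Flip this to mimic CONFIG_CPUMASK_OFFSTACK=y vs =n. */
    #define MASK_OFFSTACK 1

    struct ctx {
    #if MASK_OFFSTACK
            unsigned long *mask;            /* bitmap in a separate allocation */
    #else
            unsigned long mask[NWORDS];     /* bitmap embedded in the struct */
    #endif
    };

    /* Callers always go through the accessor, much like mm_cpumask(). */
    static unsigned long *ctx_mask(struct ctx *c)
    {
            return c->mask;                 /* array decays to a pointer when embedded */
    }

    static int ctx_init_mask(struct ctx *c, struct ctx *old)
    {
    #if MASK_OFFSTACK
            c->mask = calloc(NWORDS, sizeof(unsigned long));
            if (!c->mask)
                    return -1;              /* mirrors the -ENOMEM case above */
    #endif
            if (old)
                    memcpy(ctx_mask(c), ctx_mask(old), NWORDS * sizeof(unsigned long));
            else
                    memset(ctx_mask(c), 0, NWORDS * sizeof(unsigned long));
            return 0;
    }

    int main(void)
    {
            struct ctx parent, child;

            if (ctx_init_mask(&parent, NULL) || ctx_init_mask(&child, &parent))
                    return 1;
            ctx_mask(&child)[0] |= 1UL;     /* mark "CPU 0" in the child only */
            printf("parent word0=%lx child word0=%lx\n",
                   ctx_mask(&parent)[0], ctx_mask(&child)[0]);
            return 0;
    }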
@@ -515,10 +534,20 @@ struct mm_struct * mm_alloc(void)
515 struct mm_struct * mm; 534 struct mm_struct * mm;
516 535
517 mm = allocate_mm(); 536 mm = allocate_mm();
518 if (mm) { 537 if (!mm)
519 memset(mm, 0, sizeof(*mm)); 538 return NULL;
520 mm = mm_init(mm, current); 539
540 memset(mm, 0, sizeof(*mm));
541 mm = mm_init(mm, current);
542 if (!mm)
543 return NULL;
544
545 if (mm_init_cpumask(mm, NULL)) {
546 mm_free_pgd(mm);
547 free_mm(mm);
548 return NULL;
521 } 549 }
550
522 return mm; 551 return mm;
523} 552}
524 553
@@ -530,6 +559,7 @@ struct mm_struct * mm_alloc(void)
530void __mmdrop(struct mm_struct *mm) 559void __mmdrop(struct mm_struct *mm)
531{ 560{
532 BUG_ON(mm == &init_mm); 561 BUG_ON(mm == &init_mm);
562 free_cpumask_var(mm->cpu_vm_mask_var);
533 mm_free_pgd(mm); 563 mm_free_pgd(mm);
534 destroy_context(mm); 564 destroy_context(mm);
535 mmu_notifier_mm_destroy(mm); 565 mmu_notifier_mm_destroy(mm);
@@ -566,6 +596,57 @@ void mmput(struct mm_struct *mm)
566} 596}
567EXPORT_SYMBOL_GPL(mmput); 597EXPORT_SYMBOL_GPL(mmput);
568 598
599/*
600 * We added or removed a vma mapping the executable. The vmas are only mapped
601 * during exec and are not mapped with the mmap system call.
602 * Callers must hold down_write() on the mm's mmap_sem for these
603 */
604void added_exe_file_vma(struct mm_struct *mm)
605{
606 mm->num_exe_file_vmas++;
607}
608
609void removed_exe_file_vma(struct mm_struct *mm)
610{
611 mm->num_exe_file_vmas--;
612 if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
613 fput(mm->exe_file);
614 mm->exe_file = NULL;
615 }
616
617}
618
619void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
620{
621 if (new_exe_file)
622 get_file(new_exe_file);
623 if (mm->exe_file)
624 fput(mm->exe_file);
625 mm->exe_file = new_exe_file;
626 mm->num_exe_file_vmas = 0;
627}
628
629struct file *get_mm_exe_file(struct mm_struct *mm)
630{
631 struct file *exe_file;
632
633 /* We need mmap_sem to protect against races with removal of
634 * VM_EXECUTABLE vmas */
635 down_read(&mm->mmap_sem);
636 exe_file = mm->exe_file;
637 if (exe_file)
638 get_file(exe_file);
639 up_read(&mm->mmap_sem);
640 return exe_file;
641}
642
643static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
644{
645 /* It's safe to write the exe_file pointer without exe_file_lock because
646 * this is called during fork when the task is not yet in /proc */
647 newmm->exe_file = get_mm_exe_file(oldmm);
648}
649
569/** 650/**
570 * get_task_mm - acquire a reference to the task's mm 651 * get_task_mm - acquire a reference to the task's mm
571 * 652 *
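set_mm_exe_file() takes the reference on the new file before dropping the one on the old file, so the slot never points at a freed object even if old and new happen to be the same. A small userspace refcount sketch of that handoff (the helper names are invented and this is plain C, not the kernel's struct file API):

    #include <stdio.h>
    #include <stdlib.h>

    struct handle {
            int refs;
            const char *name;
    };

    static struct handle *handle_get(struct handle *h)
    {
            if (h)
                    h->refs++;
            return h;
    }

    static void handle_put(struct handle *h)
    {
            if (h && --h->refs == 0) {
                    printf("freeing %s\n", h->name);
                    free(h);
            }
    }

    /* Replace *slot: grab the new reference first, then drop the old one. */
    static void handle_replace(struct handle **slot, struct handle *new)
    {
            struct handle *old = *slot;

            handle_get(new);
            *slot = new;
            handle_put(old);
    }

    int main(void)
    {
            struct handle *a = malloc(sizeof(*a));
            struct handle *slot = NULL;

            a->refs = 1;
            a->name = "a";

            handle_replace(&slot, a);       /* slot now holds its own reference */
            handle_put(a);                  /* drop the creator's reference */
            handle_replace(&slot, NULL);    /* clears the slot and frees "a" */
            return 0;
    }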
@@ -684,6 +765,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
684 if (!mm_init(mm, tsk)) 765 if (!mm_init(mm, tsk))
685 goto fail_nomem; 766 goto fail_nomem;
686 767
768 if (mm_init_cpumask(mm, oldmm))
769 goto fail_nocpumask;
770
687 if (init_new_context(tsk, mm)) 771 if (init_new_context(tsk, mm))
688 goto fail_nocontext; 772 goto fail_nocontext;
689 773
@@ -710,6 +794,9 @@ fail_nomem:
710 return NULL; 794 return NULL;
711 795
712fail_nocontext: 796fail_nocontext:
797 free_cpumask_var(mm->cpu_vm_mask_var);
798
799fail_nocpumask:
713 /* 800 /*
714 * If init_new_context() failed, we cannot use mmput() to free the mm 801 * If init_new_context() failed, we cannot use mmput() to free the mm
715 * because it calls destroy_context() 802 * because it calls destroy_context()
@@ -920,6 +1007,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
920 tty_audit_fork(sig); 1007 tty_audit_fork(sig);
921 sched_autogroup_fork(sig); 1008 sched_autogroup_fork(sig);
922 1009
1010#ifdef CONFIG_CGROUPS
1011 init_rwsem(&sig->threadgroup_fork_lock);
1012#endif
1013
923 sig->oom_adj = current->signal->oom_adj; 1014 sig->oom_adj = current->signal->oom_adj;
924 sig->oom_score_adj = current->signal->oom_score_adj; 1015 sig->oom_score_adj = current->signal->oom_score_adj;
925 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1016 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1096,12 +1187,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1096 1187
1097 posix_cpu_timers_init(p); 1188 posix_cpu_timers_init(p);
1098 1189
1099 p->lock_depth = -1; /* -1 = no lock */
1100 do_posix_clock_monotonic_gettime(&p->start_time); 1190 do_posix_clock_monotonic_gettime(&p->start_time);
1101 p->real_start_time = p->start_time; 1191 p->real_start_time = p->start_time;
1102 monotonic_to_bootbased(&p->real_start_time); 1192 monotonic_to_bootbased(&p->real_start_time);
1103 p->io_context = NULL; 1193 p->io_context = NULL;
1104 p->audit_context = NULL; 1194 p->audit_context = NULL;
1195 if (clone_flags & CLONE_THREAD)
1196 threadgroup_fork_read_lock(current);
1105 cgroup_fork(p); 1197 cgroup_fork(p);
1106#ifdef CONFIG_NUMA 1198#ifdef CONFIG_NUMA
1107 p->mempolicy = mpol_dup(p->mempolicy); 1199 p->mempolicy = mpol_dup(p->mempolicy);
@@ -1146,7 +1238,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1146#endif 1238#endif
1147 1239
1148 /* Perform scheduler related setup. Assign this task to a CPU. */ 1240 /* Perform scheduler related setup. Assign this task to a CPU. */
1149 sched_fork(p, clone_flags); 1241 sched_fork(p);
1150 1242
1151 retval = perf_event_init_task(p); 1243 retval = perf_event_init_task(p);
1152 if (retval) 1244 if (retval)
@@ -1180,12 +1272,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1180 pid = alloc_pid(p->nsproxy->pid_ns); 1272 pid = alloc_pid(p->nsproxy->pid_ns);
1181 if (!pid) 1273 if (!pid)
1182 goto bad_fork_cleanup_io; 1274 goto bad_fork_cleanup_io;
1183
1184 if (clone_flags & CLONE_NEWPID) {
1185 retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
1186 if (retval < 0)
1187 goto bad_fork_free_pid;
1188 }
1189 } 1275 }
1190 1276
1191 p->pid = pid_nr(pid); 1277 p->pid = pid_nr(pid);
@@ -1193,17 +1279,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1193 if (clone_flags & CLONE_THREAD) 1279 if (clone_flags & CLONE_THREAD)
1194 p->tgid = current->tgid; 1280 p->tgid = current->tgid;
1195 1281
1196 if (current->nsproxy != p->nsproxy) {
1197 retval = ns_cgroup_clone(p, pid);
1198 if (retval)
1199 goto bad_fork_free_pid;
1200 }
1201
1202 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1282 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1203 /* 1283 /*
1204 * Clear TID on mm_release()? 1284 * Clear TID on mm_release()?
1205 */ 1285 */
1206 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1286 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1287#ifdef CONFIG_BLOCK
1288 p->plug = NULL;
1289#endif
1207#ifdef CONFIG_FUTEX 1290#ifdef CONFIG_FUTEX
1208 p->robust_list = NULL; 1291 p->robust_list = NULL;
1209#ifdef CONFIG_COMPAT 1292#ifdef CONFIG_COMPAT
@@ -1289,7 +1372,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1289 tracehook_finish_clone(p, clone_flags, trace); 1372 tracehook_finish_clone(p, clone_flags, trace);
1290 1373
1291 if (thread_group_leader(p)) { 1374 if (thread_group_leader(p)) {
1292 if (clone_flags & CLONE_NEWPID) 1375 if (is_child_reaper(pid))
1293 p->nsproxy->pid_ns->child_reaper = p; 1376 p->nsproxy->pid_ns->child_reaper = p;
1294 1377
1295 p->signal->leader_pid = pid; 1378 p->signal->leader_pid = pid;
@@ -1309,6 +1392,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1309 write_unlock_irq(&tasklist_lock); 1392 write_unlock_irq(&tasklist_lock);
1310 proc_fork_connector(p); 1393 proc_fork_connector(p);
1311 cgroup_post_fork(p); 1394 cgroup_post_fork(p);
1395 if (clone_flags & CLONE_THREAD)
1396 threadgroup_fork_read_unlock(current);
1312 perf_event_fork(p); 1397 perf_event_fork(p);
1313 return p; 1398 return p;
1314 1399
@@ -1347,6 +1432,8 @@ bad_fork_cleanup_policy:
1347 mpol_put(p->mempolicy); 1432 mpol_put(p->mempolicy);
1348bad_fork_cleanup_cgroup: 1433bad_fork_cleanup_cgroup:
1349#endif 1434#endif
1435 if (clone_flags & CLONE_THREAD)
1436 threadgroup_fork_read_unlock(current);
1350 cgroup_exit(p, cgroup_callbacks_done); 1437 cgroup_exit(p, cgroup_callbacks_done);
1351 delayacct_tsk_free(p); 1438 delayacct_tsk_free(p);
1352 module_put(task_thread_info(p)->exec_domain->module); 1439 module_put(task_thread_info(p)->exec_domain->module);
@@ -1460,7 +1547,7 @@ long do_fork(unsigned long clone_flags,
1460 */ 1547 */
1461 p->flags &= ~PF_STARTING; 1548 p->flags &= ~PF_STARTING;
1462 1549
1463 wake_up_new_task(p, clone_flags); 1550 wake_up_new_task(p);
1464 1551
1465 tracehook_report_clone_complete(trace, regs, 1552 tracehook_report_clone_complete(trace, regs,
1466 clone_flags, nr, p); 1553 clone_flags, nr, p);
@@ -1512,38 +1599,24 @@ void __init proc_caches_init(void)
1512} 1599}
1513 1600
1514/* 1601/*
1515 * Check constraints on flags passed to the unshare system call and 1602 * Check constraints on flags passed to the unshare system call.
1516 * force unsharing of additional process context as appropriate.
1517 */ 1603 */
1518static void check_unshare_flags(unsigned long *flags_ptr) 1604static int check_unshare_flags(unsigned long unshare_flags)
1519{ 1605{
1606 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1607 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1608 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1609 return -EINVAL;
1520 /* 1610 /*
1521 * If unsharing a thread from a thread group, must also 1611 * Not implemented, but pretend it works if there is nothing to
1522 * unshare vm. 1612 * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
1523 */ 1613 * needs to unshare vm.
1524 if (*flags_ptr & CLONE_THREAD)
1525 *flags_ptr |= CLONE_VM;
1526
1527 /*
1528 * If unsharing vm, must also unshare signal handlers.
1529 */
1530 if (*flags_ptr & CLONE_VM)
1531 *flags_ptr |= CLONE_SIGHAND;
1532
1533 /*
1534 * If unsharing namespace, must also unshare filesystem information.
1535 */ 1614 */
1536 if (*flags_ptr & CLONE_NEWNS) 1615 if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
1537 *flags_ptr |= CLONE_FS; 1616 /* FIXME: get_task_mm() increments ->mm_users */
1538} 1617 if (atomic_read(&current->mm->mm_users) > 1)
1539 1618 return -EINVAL;
1540/* 1619 }
1541 * Unsharing of tasks created with CLONE_THREAD is not supported yet
1542 */
1543static int unshare_thread(unsigned long unshare_flags)
1544{
1545 if (unshare_flags & CLONE_THREAD)
1546 return -EINVAL;
1547 1620
1548 return 0; 1621 return 0;
1549} 1622}
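check_unshare_flags() now simply rejects unsupported combinations with -EINVAL instead of silently widening the flag set. The resulting behaviour is visible from userspace through the unshare(2) syscall; a short demonstration (CLONE_FS and CLONE_FILES normally succeed unprivileged, while CLONE_NEWNS usually needs CAP_SYS_ADMIN and fails with EPERM otherwise):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <errno.h>
    #include <string.h>

    static void try_unshare(const char *what, int flags)
    {
            if (unshare(flags) == 0)
                    printf("%-14s ok\n", what);
            else
                    printf("%-14s failed: %s\n", what, strerror(errno));
    }

    int main(void)
    {
            try_unshare("CLONE_FS", CLONE_FS);
            try_unshare("CLONE_FILES", CLONE_FILES);
            try_unshare("CLONE_NEWNS", CLONE_NEWNS);    /* typically needs privilege */
            return 0;
    }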
@@ -1570,34 +1643,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1570} 1643}
1571 1644
1572/* 1645/*
1573 * Unsharing of sighand is not supported yet
1574 */
1575static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
1576{
1577 struct sighand_struct *sigh = current->sighand;
1578
1579 if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
1580 return -EINVAL;
1581 else
1582 return 0;
1583}
1584
1585/*
1586 * Unshare vm if it is being shared
1587 */
1588static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
1589{
1590 struct mm_struct *mm = current->mm;
1591
1592 if ((unshare_flags & CLONE_VM) &&
1593 (mm && atomic_read(&mm->mm_users) > 1)) {
1594 return -EINVAL;
1595 }
1596
1597 return 0;
1598}
1599
1600/*
1601 * Unshare file descriptor table if it is being shared 1646 * Unshare file descriptor table if it is being shared
1602 */ 1647 */
1603static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) 1648static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
@@ -1625,45 +1670,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
1625 */ 1670 */
1626SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) 1671SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1627{ 1672{
1628 int err = 0;
1629 struct fs_struct *fs, *new_fs = NULL; 1673 struct fs_struct *fs, *new_fs = NULL;
1630 struct sighand_struct *new_sigh = NULL;
1631 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1632 struct files_struct *fd, *new_fd = NULL; 1674 struct files_struct *fd, *new_fd = NULL;
1633 struct nsproxy *new_nsproxy = NULL; 1675 struct nsproxy *new_nsproxy = NULL;
1634 int do_sysvsem = 0; 1676 int do_sysvsem = 0;
1677 int err;
1635 1678
1636 check_unshare_flags(&unshare_flags); 1679 err = check_unshare_flags(unshare_flags);
1637 1680 if (err)
1638 /* Return -EINVAL for all unsupported flags */
1639 err = -EINVAL;
1640 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1641 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1642 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1643 goto bad_unshare_out; 1681 goto bad_unshare_out;
1644 1682
1645 /* 1683 /*
1684 * If unsharing namespace, must also unshare filesystem information.
1685 */
1686 if (unshare_flags & CLONE_NEWNS)
1687 unshare_flags |= CLONE_FS;
1688 /*
1646 * CLONE_NEWIPC must also detach from the undolist: after switching 1689 * CLONE_NEWIPC must also detach from the undolist: after switching
1647 * to a new ipc namespace, the semaphore arrays from the old 1690 * to a new ipc namespace, the semaphore arrays from the old
1648 * namespace are unreachable. 1691 * namespace are unreachable.
1649 */ 1692 */
1650 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) 1693 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1651 do_sysvsem = 1; 1694 do_sysvsem = 1;
1652 if ((err = unshare_thread(unshare_flags)))
1653 goto bad_unshare_out;
1654 if ((err = unshare_fs(unshare_flags, &new_fs))) 1695 if ((err = unshare_fs(unshare_flags, &new_fs)))
1655 goto bad_unshare_cleanup_thread; 1696 goto bad_unshare_out;
1656 if ((err = unshare_sighand(unshare_flags, &new_sigh)))
1657 goto bad_unshare_cleanup_fs;
1658 if ((err = unshare_vm(unshare_flags, &new_mm)))
1659 goto bad_unshare_cleanup_sigh;
1660 if ((err = unshare_fd(unshare_flags, &new_fd))) 1697 if ((err = unshare_fd(unshare_flags, &new_fd)))
1661 goto bad_unshare_cleanup_vm; 1698 goto bad_unshare_cleanup_fs;
1662 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1699 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1663 new_fs))) 1700 new_fs)))
1664 goto bad_unshare_cleanup_fd; 1701 goto bad_unshare_cleanup_fd;
1665 1702
1666 if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { 1703 if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
1667 if (do_sysvsem) { 1704 if (do_sysvsem) {
1668 /* 1705 /*
1669 * CLONE_SYSVSEM is equivalent to sys_exit(). 1706 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1689,19 +1726,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1689 spin_unlock(&fs->lock); 1726 spin_unlock(&fs->lock);
1690 } 1727 }
1691 1728
1692 if (new_mm) {
1693 mm = current->mm;
1694 active_mm = current->active_mm;
1695 current->mm = new_mm;
1696 current->active_mm = new_mm;
1697 if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
1698 atomic_dec(&mm->oom_disable_count);
1699 atomic_inc(&new_mm->oom_disable_count);
1700 }
1701 activate_mm(active_mm, new_mm);
1702 new_mm = mm;
1703 }
1704
1705 if (new_fd) { 1729 if (new_fd) {
1706 fd = current->files; 1730 fd = current->files;
1707 current->files = new_fd; 1731 current->files = new_fd;
@@ -1718,20 +1742,10 @@ bad_unshare_cleanup_fd:
1718 if (new_fd) 1742 if (new_fd)
1719 put_files_struct(new_fd); 1743 put_files_struct(new_fd);
1720 1744
1721bad_unshare_cleanup_vm:
1722 if (new_mm)
1723 mmput(new_mm);
1724
1725bad_unshare_cleanup_sigh:
1726 if (new_sigh)
1727 if (atomic_dec_and_test(&new_sigh->count))
1728 kmem_cache_free(sighand_cachep, new_sigh);
1729
1730bad_unshare_cleanup_fs: 1745bad_unshare_cleanup_fs:
1731 if (new_fs) 1746 if (new_fs)
1732 free_fs_struct(new_fs); 1747 free_fs_struct(new_fs);
1733 1748
1734bad_unshare_cleanup_thread:
1735bad_unshare_out: 1749bad_unshare_out:
1736 return err; 1750 return err;
1737} 1751}
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 66ecd2ead215..7b01de98bb6a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -17,7 +17,7 @@ static inline void frozen_process(void)
17{ 17{
18 if (!unlikely(current->flags & PF_NOFREEZE)) { 18 if (!unlikely(current->flags & PF_NOFREEZE)) {
19 current->flags |= PF_FROZEN; 19 current->flags |= PF_FROZEN;
20 wmb(); 20 smp_wmb();
21 } 21 }
22 clear_freeze_flag(current); 22 clear_freeze_flag(current);
23} 23}
@@ -93,7 +93,7 @@ bool freeze_task(struct task_struct *p, bool sig_only)
93 * the task as frozen and next clears its TIF_FREEZE. 93 * the task as frozen and next clears its TIF_FREEZE.
94 */ 94 */
95 if (!freezing(p)) { 95 if (!freezing(p)) {
96 rmb(); 96 smp_rmb();
97 if (frozen(p)) 97 if (frozen(p))
98 return false; 98 return false;
99 99
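The freezer change replaces the mandatory wmb()/rmb() pair with the SMP-only variants: the PF_FROZEN store and the frozen() check only need to be ordered against each other across CPUs. In portable C11 the equivalent publish/observe pairing is usually written with release/acquire atomics; a hedged sketch (plain C11 plus pthreads, compile with -pthread, not the kernel's barrier API):

    #include <stdatomic.h>
    #include <pthread.h>
    #include <stdio.h>

    static int payload;                     /* ordinary data */
    static atomic_int frozen = 0;           /* the flag that publishes it */

    static void *writer(void *arg)
    {
            (void)arg;
            payload = 42;                   /* prepare the data ... */
            /* ... then publish; release orders the payload store before the flag. */
            atomic_store_explicit(&frozen, 1, memory_order_release);
            return NULL;
    }

    int main(void)
    {
            pthread_t t;

            pthread_create(&t, NULL, writer, NULL);

            /*
             * Acquire pairs with the release above: once the flag is seen,
             * the payload store is guaranteed to be visible as well.
             */
            while (!atomic_load_explicit(&frozen, memory_order_acquire))
                    ;
            printf("payload = %d\n", payload);

            pthread_join(t, NULL);
            return 0;
    }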
diff --git a/kernel/futex.c b/kernel/futex.c
index b766d28accd6..fe28dc282eae 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
381 return NULL; 381 return NULL;
382} 382}
383 383
384static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 384static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
385 u32 uval, u32 newval)
385{ 386{
386 u32 curval; 387 int ret;
387 388
388 pagefault_disable(); 389 pagefault_disable();
389 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 390 ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
390 pagefault_enable(); 391 pagefault_enable();
391 392
392 return curval; 393 return ret;
393} 394}
394 395
395static int get_futex_value_locked(u32 *dest, u32 __user *from) 396static int get_futex_value_locked(u32 *dest, u32 __user *from)
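This signature change is the heart of the futex series in this diff: cmpxchg_futex_value_locked() now returns 0 or -EFAULT and hands the observed value back through a pointer, instead of overloading a u32 return with -EFAULT. The GCC/Clang __atomic builtins already use the same "success flag plus value out-parameter" shape; a small userspace illustration:

    #include <stdio.h>
    #include <stdint.h>

    /*
     * Returns 1 on a successful exchange, 0 otherwise; either way *expected
     * is updated to the value actually found, so the caller never has to
     * guess whether a return value is data or an error code.
     */
    static int cmpxchg_u32(uint32_t *addr, uint32_t *expected, uint32_t newval)
    {
            return __atomic_compare_exchange_n(addr, expected, newval,
                                               0 /* strong */,
                                               __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
    }

    int main(void)
    {
            uint32_t futex_word = 0;
            uint32_t expected = 0;

            if (cmpxchg_u32(&futex_word, &expected, 1234))
                    printf("acquired, word is now %u\n", futex_word);

            expected = 0;                   /* stale expectation: will fail */
            if (!cmpxchg_u32(&futex_word, &expected, 5678))
                    printf("lost the race, current value is %u\n", expected);
            return 0;
    }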
@@ -674,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
674 struct task_struct *task, int set_waiters) 675 struct task_struct *task, int set_waiters)
675{ 676{
676 int lock_taken, ret, ownerdied = 0; 677 int lock_taken, ret, ownerdied = 0;
677 u32 uval, newval, curval; 678 u32 uval, newval, curval, vpid = task_pid_vnr(task);
678 679
679retry: 680retry:
680 ret = lock_taken = 0; 681 ret = lock_taken = 0;
@@ -684,19 +685,17 @@ retry:
684 * (by doing a 0 -> TID atomic cmpxchg), while holding all 685 * (by doing a 0 -> TID atomic cmpxchg), while holding all
685 * the locks. It will most likely not succeed. 686 * the locks. It will most likely not succeed.
686 */ 687 */
687 newval = task_pid_vnr(task); 688 newval = vpid;
688 if (set_waiters) 689 if (set_waiters)
689 newval |= FUTEX_WAITERS; 690 newval |= FUTEX_WAITERS;
690 691
691 curval = cmpxchg_futex_value_locked(uaddr, 0, newval); 692 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
692
693 if (unlikely(curval == -EFAULT))
694 return -EFAULT; 693 return -EFAULT;
695 694
696 /* 695 /*
697 * Detect deadlocks. 696 * Detect deadlocks.
698 */ 697 */
699 if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) 698 if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
700 return -EDEADLK; 699 return -EDEADLK;
701 700
702 /* 701 /*
@@ -723,14 +722,12 @@ retry:
723 */ 722 */
724 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { 723 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
725 /* Keep the OWNER_DIED bit */ 724 /* Keep the OWNER_DIED bit */
726 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); 725 newval = (curval & ~FUTEX_TID_MASK) | vpid;
727 ownerdied = 0; 726 ownerdied = 0;
728 lock_taken = 1; 727 lock_taken = 1;
729 } 728 }
730 729
731 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 730 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
732
733 if (unlikely(curval == -EFAULT))
734 return -EFAULT; 731 return -EFAULT;
735 if (unlikely(curval != uval)) 732 if (unlikely(curval != uval))
736 goto retry; 733 goto retry;
@@ -775,6 +772,24 @@ retry:
775 return ret; 772 return ret;
776} 773}
777 774
775/**
776 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
777 * @q: The futex_q to unqueue
778 *
779 * The q->lock_ptr must not be NULL and must be held by the caller.
780 */
781static void __unqueue_futex(struct futex_q *q)
782{
783 struct futex_hash_bucket *hb;
784
785 if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
786 || WARN_ON(plist_node_empty(&q->list)))
787 return;
788
789 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
790 plist_del(&q->list, &hb->chain);
791}
792
778/* 793/*
779 * The hash bucket lock must be held when this is called. 794 * The hash bucket lock must be held when this is called.
780 * Afterwards, the futex_q must not be accessed. 795 * Afterwards, the futex_q must not be accessed.
@@ -792,7 +807,7 @@ static void wake_futex(struct futex_q *q)
792 */ 807 */
793 get_task_struct(p); 808 get_task_struct(p);
794 809
795 plist_del(&q->list, &q->list.plist); 810 __unqueue_futex(q);
796 /* 811 /*
797 * The waiting task can free the futex_q as soon as 812 * The waiting task can free the futex_q as soon as
798 * q->lock_ptr = NULL is written, without taking any locks. A 813 * q->lock_ptr = NULL is written, without taking any locks. A
@@ -843,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
843 858
844 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 859 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
845 860
846 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 861 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
847
848 if (curval == -EFAULT)
849 ret = -EFAULT; 862 ret = -EFAULT;
850 else if (curval != uval) 863 else if (curval != uval)
851 ret = -EINVAL; 864 ret = -EINVAL;
@@ -880,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
880 * There is no waiter, so we unlock the futex. The owner died 893 * There is no waiter, so we unlock the futex. The owner died
881 * bit has not to be preserved here. We are the owner: 894 * bit has not to be preserved here. We are the owner:
882 */ 895 */
883 oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); 896 if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
884 897 return -EFAULT;
885 if (oldval == -EFAULT)
886 return oldval;
887 if (oldval != uval) 898 if (oldval != uval)
888 return -EAGAIN; 899 return -EAGAIN;
889 900
@@ -1071,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1071 plist_del(&q->list, &hb1->chain); 1082 plist_del(&q->list, &hb1->chain);
1072 plist_add(&q->list, &hb2->chain); 1083 plist_add(&q->list, &hb2->chain);
1073 q->lock_ptr = &hb2->lock; 1084 q->lock_ptr = &hb2->lock;
1074#ifdef CONFIG_DEBUG_PI_LIST
1075 q->list.plist.spinlock = &hb2->lock;
1076#endif
1077 } 1085 }
1078 get_futex_key_refs(key2); 1086 get_futex_key_refs(key2);
1079 q->key = *key2; 1087 q->key = *key2;
@@ -1100,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1100 get_futex_key_refs(key); 1108 get_futex_key_refs(key);
1101 q->key = *key; 1109 q->key = *key;
1102 1110
1103 WARN_ON(plist_node_empty(&q->list)); 1111 __unqueue_futex(q);
1104 plist_del(&q->list, &q->list.plist);
1105 1112
1106 WARN_ON(!q->rt_waiter); 1113 WARN_ON(!q->rt_waiter);
1107 q->rt_waiter = NULL; 1114 q->rt_waiter = NULL;
1108 1115
1109 q->lock_ptr = &hb->lock; 1116 q->lock_ptr = &hb->lock;
1110#ifdef CONFIG_DEBUG_PI_LIST
1111 q->list.plist.spinlock = &hb->lock;
1112#endif
1113 1117
1114 wake_up_state(q->task, TASK_NORMAL); 1118 wake_up_state(q->task, TASK_NORMAL);
1115} 1119}
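__unqueue_futex(), introduced a few hunks up and reused here, recovers the futex_hash_bucket from q->lock_ptr with container_of(), i.e. it walks back from a member pointer to the enclosing structure. A self-contained illustration of that macro (a minimal definition, slightly simpler than the kernel's):

    #include <stdio.h>
    #include <stddef.h>

    /* Minimal container_of: subtract the member's offset from its address. */
    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct hash_bucket {
            int lock;               /* stand-in for the spinlock */
            int chain;              /* stand-in for the plist head */
    };

    int main(void)
    {
            struct hash_bucket hb = { .lock = 1, .chain = 2 };
            int *lock_ptr = &hb.lock;

            /* Walk back from the member pointer to the enclosing structure. */
            struct hash_bucket *owner =
                    container_of(lock_ptr, struct hash_bucket, lock);

            printf("recovered bucket is %s, chain = %d\n",
                   owner == &hb ? "correct" : "wrong", owner->chain);
            return 0;
    }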
@@ -1457,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1457 prio = min(current->normal_prio, MAX_RT_PRIO); 1461 prio = min(current->normal_prio, MAX_RT_PRIO);
1458 1462
1459 plist_node_init(&q->list, prio); 1463 plist_node_init(&q->list, prio);
1460#ifdef CONFIG_DEBUG_PI_LIST
1461 q->list.plist.spinlock = &hb->lock;
1462#endif
1463 plist_add(&q->list, &hb->chain); 1464 plist_add(&q->list, &hb->chain);
1464 q->task = current; 1465 q->task = current;
1465 spin_unlock(&hb->lock); 1466 spin_unlock(&hb->lock);
@@ -1504,8 +1505,7 @@ retry:
1504 spin_unlock(lock_ptr); 1505 spin_unlock(lock_ptr);
1505 goto retry; 1506 goto retry;
1506 } 1507 }
1507 WARN_ON(plist_node_empty(&q->list)); 1508 __unqueue_futex(q);
1508 plist_del(&q->list, &q->list.plist);
1509 1509
1510 BUG_ON(q->pi_state); 1510 BUG_ON(q->pi_state);
1511 1511
@@ -1525,8 +1525,7 @@ retry:
1525static void unqueue_me_pi(struct futex_q *q) 1525static void unqueue_me_pi(struct futex_q *q)
1526 __releases(q->lock_ptr) 1526 __releases(q->lock_ptr)
1527{ 1527{
1528 WARN_ON(plist_node_empty(&q->list)); 1528 __unqueue_futex(q);
1529 plist_del(&q->list, &q->list.plist);
1530 1529
1531 BUG_ON(!q->pi_state); 1530 BUG_ON(!q->pi_state);
1532 free_pi_state(q->pi_state); 1531 free_pi_state(q->pi_state);
@@ -1556,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1556 1555
1557 /* 1556 /*
1558 * We are here either because we stole the rtmutex from the 1557 * We are here either because we stole the rtmutex from the
1559 * pending owner or we are the pending owner which failed to 1558 * previous highest priority waiter or we are the highest priority
1560 * get the rtmutex. We have to replace the pending owner TID 1559 * waiter but failed to get the rtmutex the first time.
1561 * in the user space variable. This must be atomic as we have 1560 * We have to replace the newowner TID in the user space variable.
1562 * to preserve the owner died bit here. 1561 * This must be atomic as we have to preserve the owner died bit here.
1563 * 1562 *
1564 * Note: We write the user space value _before_ changing the pi_state 1563 * Note: We write the user space value _before_ changing the pi_state
1565 * because we can fault here. Imagine swapped out pages or a fork 1564 * because we can fault here. Imagine swapped out pages or a fork
@@ -1578,9 +1577,7 @@ retry:
1578 while (1) { 1577 while (1) {
1579 newval = (uval & FUTEX_OWNER_DIED) | newtid; 1578 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1580 1579
1581 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 1580 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1582
1583 if (curval == -EFAULT)
1584 goto handle_fault; 1581 goto handle_fault;
1585 if (curval == uval) 1582 if (curval == uval)
1586 break; 1583 break;
@@ -1608,8 +1605,8 @@ retry:
1608 1605
1609 /* 1606 /*
1610 * To handle the page fault we need to drop the hash bucket 1607 * To handle the page fault we need to drop the hash bucket
1611 * lock here. That gives the other task (either the pending 1608 * lock here. That gives the other task (either the highest priority
1612 * owner itself or the task which stole the rtmutex) the 1609 * waiter itself or the task which stole the rtmutex) the
1613 * chance to try the fixup of the pi_state. So once we are 1610 * chance to try the fixup of the pi_state. So once we are
1614 * back from handling the fault we need to check the pi_state 1611 * back from handling the fault we need to check the pi_state
1615 * after reacquiring the hash bucket lock and before trying to 1612 * after reacquiring the hash bucket lock and before trying to
@@ -1685,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1685 /* 1682 /*
1686 * pi_state is incorrect, some other task did a lock steal and 1683 * pi_state is incorrect, some other task did a lock steal and
1687 * we returned due to timeout or signal without taking the 1684 * we returned due to timeout or signal without taking the
1688 * rt_mutex. Too late. We can access the rt_mutex_owner without 1685 * rt_mutex. Too late.
1689 * locking, as the other task is now blocked on the hash bucket
1690 * lock. Fix the state up.
1691 */ 1686 */
1687 raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1692 owner = rt_mutex_owner(&q->pi_state->pi_mutex); 1688 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1689 if (!owner)
1690 owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1691 raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1693 ret = fixup_pi_state_owner(uaddr, q, owner); 1692 ret = fixup_pi_state_owner(uaddr, q, owner);
1694 goto out; 1693 goto out;
1695 } 1694 }
1696 1695
1697 /* 1696 /*
1698 * Paranoia check. If we did not take the lock, then we should not be 1697 * Paranoia check. If we did not take the lock, then we should not be
1699 * the owner, nor the pending owner, of the rt_mutex. 1698 * the owner of the rt_mutex.
1700 */ 1699 */
1701 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) 1700 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1702 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " 1701 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
@@ -1781,13 +1780,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1781 * 1780 *
1782 * The basic logical guarantee of a futex is that it blocks ONLY 1781 * The basic logical guarantee of a futex is that it blocks ONLY
1783 * if cond(var) is known to be true at the time of blocking, for 1782 * if cond(var) is known to be true at the time of blocking, for
1784 * any cond. If we queued after testing *uaddr, that would open 1783 * any cond. If we locked the hash-bucket after testing *uaddr, that
1785 * a race condition where we could block indefinitely with 1784 * would open a race condition where we could block indefinitely with
1786 * cond(var) false, which would violate the guarantee. 1785 * cond(var) false, which would violate the guarantee.
1787 * 1786 *
1788 * A consequence is that futex_wait() can return zero and absorb 1787 * On the other hand, we insert q and release the hash-bucket only
1789 * a wakeup when *uaddr != val on entry to the syscall. This is 1788 * after testing *uaddr. This guarantees that futex_wait() will NOT
1790 * rare, but normal. 1789 * absorb a wakeup if *uaddr does not match the desired values
1790 * while the syscall executes.
1791 */ 1791 */
1792retry: 1792retry:
1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); 1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
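The rewritten comment spells out the ordering guarantee futex_wait_setup() gives userspace: a waiter only blocks if the futex word still holds the expected value once it is queued. The matching userspace idiom is to pass that expected value to FUTEX_WAIT and treat EAGAIN as "the value already changed, re-evaluate"; a bare-bones sketch using the raw syscall:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <errno.h>
    #include <string.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/futex.h>

    static uint32_t futex_word = 1;         /* pretend another thread already set it */

    static long futex_wait(uint32_t *addr, uint32_t expected)
    {
            return syscall(SYS_futex, addr, FUTEX_WAIT_PRIVATE, expected,
                           NULL, NULL, 0);
    }

    int main(void)
    {
            /*
             * We claim to expect 0, but the word is already 1: the kernel
             * re-checks under its own locks and returns EAGAIN instead of
             * blocking, which is exactly the guarantee described above.
             */
            if (futex_wait(&futex_word, 0) < 0)
                    printf("futex_wait: %s\n", strerror(errno));
            return 0;
    }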
@@ -1886,7 +1886,7 @@ retry:
1886 restart->futex.val = val; 1886 restart->futex.val = val;
1887 restart->futex.time = abs_time->tv64; 1887 restart->futex.time = abs_time->tv64;
1888 restart->futex.bitset = bitset; 1888 restart->futex.bitset = bitset;
1889 restart->futex.flags = flags; 1889 restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
1890 1890
1891 ret = -ERESTART_RESTARTBLOCK; 1891 ret = -ERESTART_RESTARTBLOCK;
1892 1892
@@ -2046,9 +2046,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2046{ 2046{
2047 struct futex_hash_bucket *hb; 2047 struct futex_hash_bucket *hb;
2048 struct futex_q *this, *next; 2048 struct futex_q *this, *next;
2049 u32 uval;
2050 struct plist_head *head; 2049 struct plist_head *head;
2051 union futex_key key = FUTEX_KEY_INIT; 2050 union futex_key key = FUTEX_KEY_INIT;
2051 u32 uval, vpid = task_pid_vnr(current);
2052 int ret; 2052 int ret;
2053 2053
2054retry: 2054retry:
@@ -2057,7 +2057,7 @@ retry:
2057 /* 2057 /*
2058 * We release only a lock we actually own: 2058 * We release only a lock we actually own:
2059 */ 2059 */
2060 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2060 if ((uval & FUTEX_TID_MASK) != vpid)
2061 return -EPERM; 2061 return -EPERM;
2062 2062
2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); 2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
@@ -2072,17 +2072,14 @@ retry:
2072 * again. If it succeeds then we can return without waking 2072 * again. If it succeeds then we can return without waking
2073 * anyone else up: 2073 * anyone else up:
2074 */ 2074 */
2075 if (!(uval & FUTEX_OWNER_DIED)) 2075 if (!(uval & FUTEX_OWNER_DIED) &&
2076 uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); 2076 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2077
2078
2079 if (unlikely(uval == -EFAULT))
2080 goto pi_faulted; 2077 goto pi_faulted;
2081 /* 2078 /*
2082 * Rare case: we managed to release the lock atomically, 2079 * Rare case: we managed to release the lock atomically,
2083 * no need to wake anyone else up: 2080 * no need to wake anyone else up:
2084 */ 2081 */
2085 if (unlikely(uval == task_pid_vnr(current))) 2082 if (unlikely(uval == vpid))
2086 goto out_unlock; 2083 goto out_unlock;
2087 2084
2088 /* 2085 /*
@@ -2167,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2167 * We were woken prior to requeue by a timeout or a signal. 2164 * We were woken prior to requeue by a timeout or a signal.
2168 * Unqueue the futex_q and determine which it was. 2165 * Unqueue the futex_q and determine which it was.
2169 */ 2166 */
2170 plist_del(&q->list, &q->list.plist); 2167 plist_del(&q->list, &hb->chain);
2171 2168
2172 /* Handle spurious wakeups gracefully */ 2169 /* Handle spurious wakeups gracefully */
2173 ret = -EWOULDBLOCK; 2170 ret = -EWOULDBLOCK;
@@ -2421,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
2421 goto err_unlock; 2418 goto err_unlock;
2422 ret = -EPERM; 2419 ret = -EPERM;
2423 pcred = __task_cred(p); 2420 pcred = __task_cred(p);
2421 /* If victim is in different user_ns, then uids are not
2422 comparable, so we must have CAP_SYS_PTRACE */
2423 if (cred->user->user_ns != pcred->user->user_ns) {
2424 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2425 goto err_unlock;
2426 goto ok;
2427 }
2428 /* If victim is in same user_ns, then uids are comparable */
2424 if (cred->euid != pcred->euid && 2429 if (cred->euid != pcred->euid &&
2425 cred->euid != pcred->uid && 2430 cred->euid != pcred->uid &&
2426 !capable(CAP_SYS_PTRACE)) 2431 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2427 goto err_unlock; 2432 goto err_unlock;
2433ok:
2428 head = p->robust_list; 2434 head = p->robust_list;
2429 rcu_read_unlock(); 2435 rcu_read_unlock();
2430 } 2436 }
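The hunk above gates sys_get_robust_list() on ns_capable() when the target task lives in a different user namespace. The syscall has no glibc wrapper, so callers go through syscall(2); a minimal self-inspection example (pid 0 means the calling thread, so no extra privilege is required):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stddef.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/futex.h>

    int main(void)
    {
            struct robust_list_head *head = NULL;
            size_t len = 0;

            /* pid 0 queries the current thread's registered robust list. */
            if (syscall(SYS_get_robust_list, 0, &head, &len) < 0) {
                    perror("get_robust_list");
                    return 1;
            }
            printf("robust list head: %p, head size: %zu\n", (void *)head, len);
            return 0;
    }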
@@ -2463,11 +2469,20 @@ retry:
2463 * userspace. 2469 * userspace.
2464 */ 2470 */
2465 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 2471 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2466 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); 2472 /*
2467 2473 * We are not holding a lock here, but we want to have
2468 if (nval == -EFAULT) 2474 * the pagefault_disable/enable() protection because
2469 return -1; 2475 * we want to handle the fault gracefully. If the
2470 2476 * access fails we try to fault in the futex with R/W
2477 * verification via get_user_pages. get_user() above
2478 * does not guarantee R/W access. If that fails we
2479 * give up and leave the futex locked.
2480 */
2481 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2482 if (fault_in_user_writeable(uaddr))
2483 return -1;
2484 goto retry;
2485 }
2471 if (nval != uval) 2486 if (nval != uval)
2472 goto retry; 2487 goto retry;
2473 2488
@@ -2678,8 +2693,7 @@ static int __init futex_init(void)
2678 * implementation, the non-functional ones will return 2693 * implementation, the non-functional ones will return
2679 * -ENOSYS. 2694 * -ENOSYS.
2680 */ 2695 */
2681 curval = cmpxchg_futex_value_locked(NULL, 0, 0); 2696 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2682 if (curval == -EFAULT)
2683 futex_cmpxchg_enabled = 1; 2697 futex_cmpxchg_enabled = 1;
2684 2698
2685 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2699 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
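futex_init() detects whether the architecture provides the atomic cmpxchg path by issuing one operation and inspecting the error code. Userspace can probe the futex multiplexer the same way, by issuing a call that cannot block and checking for ENOSYS; the sketch below uses FUTEX_WAIT_BITSET purely as an example op:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <errno.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/futex.h>

    int main(void)
    {
            uint32_t word = 1;

            /*
             * The expected value 0 never matches the word's value of 1, so a
             * kernel that implements FUTEX_WAIT_BITSET answers EAGAIN right
             * away, while a kernel without the op answers ENOSYS.
             */
            long ret = syscall(SYS_futex, &word, FUTEX_WAIT_BITSET_PRIVATE, 0,
                               NULL, NULL, FUTEX_BITSET_MATCH_ANY);

            if (ret < 0 && errno == ENOSYS)
                    printf("FUTEX_WAIT_BITSET not supported\n");
            else
                    printf("FUTEX_WAIT_BITSET supported\n");
            return 0;
    }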
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index a7934ac75e5b..5f9e689dc8f0 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
153 goto err_unlock; 153 goto err_unlock;
154 ret = -EPERM; 154 ret = -EPERM;
155 pcred = __task_cred(p); 155 pcred = __task_cred(p);
156 /* If victim is in different user_ns, then uids are not
157 comparable, so we must have CAP_SYS_PTRACE */
158 if (cred->user->user_ns != pcred->user->user_ns) {
159 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
160 goto err_unlock;
161 goto ok;
162 }
163 /* If victim is in same user_ns, then uids are comparable */
156 if (cred->euid != pcred->euid && 164 if (cred->euid != pcred->euid &&
157 cred->euid != pcred->uid && 165 cred->euid != pcred->uid &&
158 !capable(CAP_SYS_PTRACE)) 166 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
159 goto err_unlock; 167 goto err_unlock;
168ok:
160 head = p->compat_robust_list; 169 head = p->compat_robust_list;
161 rcu_read_unlock(); 170 rcu_read_unlock();
162 } 171 }
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 70a298d6da71..b8cadf70b1fb 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
34config GCOV_PROFILE_ALL 34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel" 35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL 36 depends on GCOV_KERNEL
37 depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE 37 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
38 default n 38 default n
39 ---help--- 39 ---help---
40 This options activates profiling for the entire kernel. 40 This options activates profiling for the entire kernel.
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 3f761001d517..e97ca59e2520 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,3 @@
1EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' 1ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
2 2
3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o 3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/groups.c b/kernel/groups.c
index 253dc0f35cf4..1cc476d52dd3 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
233 struct group_info *group_info; 233 struct group_info *group_info;
234 int retval; 234 int retval;
235 235
236 if (!capable(CAP_SETGID)) 236 if (!nsown_capable(CAP_SETGID))
237 return -EPERM; 237 return -EPERM;
238 if ((unsigned)gidsetsize > NGROUPS_MAX) 238 if ((unsigned)gidsetsize > NGROUPS_MAX)
239 return -EINVAL; 239 return -EINVAL;
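The setgroups() path now checks CAP_SETGID with nsown_capable(), i.e. against the caller's own user namespace. From userspace the call itself is unchanged and unprivileged callers simply get EPERM; a quick check (standard glibc setgroups(2), nothing namespace-specific):

    #define _DEFAULT_SOURCE
    #include <stdio.h>
    #include <errno.h>
    #include <string.h>
    #include <grp.h>
    #include <unistd.h>

    int main(void)
    {
            gid_t groups[] = { getgid() };

            /* Even shrinking the supplementary list requires CAP_SETGID. */
            if (setgroups(1, groups) < 0)
                    printf("setgroups failed: %s\n", strerror(errno));
            else
                    printf("supplementary groups set\n");
            return 0;
    }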
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0c8d7c048615..a9205e32a059 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -53,11 +53,10 @@
53/* 53/*
54 * The timer bases: 54 * The timer bases:
55 * 55 *
56 * Note: If we want to add new timer bases, we have to skip the two 56 * There are more clockids then hrtimer bases. Thus, we index
57 * clock ids captured by the cpu-timers. We do this by holding empty 57 * into the timer bases by the hrtimer_base_type enum. When trying
58 * entries rather than doing math adjustment of the clock ids. 58 * to reach a base using a clockid, hrtimer_clockid_to_base()
59 * This ensures that we capture erroneous accesses to these clock ids 59 * is used to convert from clockid to the proper hrtimer_base_type.
60 * rather than moving them into the range of valid clock id's.
61 */ 60 */
62DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = 61DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
63{ 62{
@@ -65,39 +64,55 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
65 .clock_base = 64 .clock_base =
66 { 65 {
67 { 66 {
68 .index = CLOCK_REALTIME, 67 .index = HRTIMER_BASE_MONOTONIC,
68 .clockid = CLOCK_MONOTONIC,
69 .get_time = &ktime_get,
70 .resolution = KTIME_LOW_RES,
71 },
72 {
73 .index = HRTIMER_BASE_REALTIME,
74 .clockid = CLOCK_REALTIME,
69 .get_time = &ktime_get_real, 75 .get_time = &ktime_get_real,
70 .resolution = KTIME_LOW_RES, 76 .resolution = KTIME_LOW_RES,
71 }, 77 },
72 { 78 {
73 .index = CLOCK_MONOTONIC, 79 .index = HRTIMER_BASE_BOOTTIME,
74 .get_time = &ktime_get, 80 .clockid = CLOCK_BOOTTIME,
81 .get_time = &ktime_get_boottime,
75 .resolution = KTIME_LOW_RES, 82 .resolution = KTIME_LOW_RES,
76 }, 83 },
77 } 84 }
78}; 85};
79 86
87static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
88 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
89 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
90 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
91};
92
93static inline int hrtimer_clockid_to_base(clockid_t clock_id)
94{
95 return hrtimer_clock_to_base_table[clock_id];
96}
97
98
80/* 99/*
81 * Get the coarse grained time at the softirq based on xtime and 100 * Get the coarse grained time at the softirq based on xtime and
82 * wall_to_monotonic. 101 * wall_to_monotonic.
83 */ 102 */
84static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) 103static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
85{ 104{
86 ktime_t xtim, tomono; 105 ktime_t xtim, mono, boot;
87 struct timespec xts, tom; 106 struct timespec xts, tom, slp;
88 unsigned long seq;
89 107
90 do { 108 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
91 seq = read_seqbegin(&xtime_lock);
92 xts = __current_kernel_time();
93 tom = __get_wall_to_monotonic();
94 } while (read_seqretry(&xtime_lock, seq));
95 109
96 xtim = timespec_to_ktime(xts); 110 xtim = timespec_to_ktime(xts);
97 tomono = timespec_to_ktime(tom); 111 mono = ktime_add(xtim, timespec_to_ktime(tom));
98 base->clock_base[CLOCK_REALTIME].softirq_time = xtim; 112 boot = ktime_add(mono, timespec_to_ktime(slp));
99 base->clock_base[CLOCK_MONOTONIC].softirq_time = 113 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
100 ktime_add(xtim, tomono); 114 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
115 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
101} 116}
102 117
103/* 118/*
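The clock_base array now carries a CLOCK_BOOTTIME base next to CLOCK_REALTIME and CLOCK_MONOTONIC, with hrtimer_clockid_to_base() translating clockids into array slots. All three clocks are directly readable from userspace; a quick comparison (CLOCK_BOOTTIME only drifts from CLOCK_MONOTONIC once the machine has been suspended, and very old toolchains may lack the constant or need -lrt for clock_gettime):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <time.h>

    static void show(const char *name, clockid_t id)
    {
            struct timespec ts;

            if (clock_gettime(id, &ts) == 0)
                    printf("%-16s %lld.%09ld\n", name,
                           (long long)ts.tv_sec, ts.tv_nsec);
            else
                    perror(name);
    }

    int main(void)
    {
            show("CLOCK_REALTIME", CLOCK_REALTIME);
            show("CLOCK_MONOTONIC", CLOCK_MONOTONIC);
            show("CLOCK_BOOTTIME", CLOCK_BOOTTIME);
            return 0;
    }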
@@ -184,10 +199,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
184 struct hrtimer_cpu_base *new_cpu_base; 199 struct hrtimer_cpu_base *new_cpu_base;
185 int this_cpu = smp_processor_id(); 200 int this_cpu = smp_processor_id();
186 int cpu = hrtimer_get_target(this_cpu, pinned); 201 int cpu = hrtimer_get_target(this_cpu, pinned);
202 int basenum = base->index;
187 203
188again: 204again:
189 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 205 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
190 new_base = &new_cpu_base->clock_base[base->index]; 206 new_base = &new_cpu_base->clock_base[basenum];
191 207
192 if (base != new_base) { 208 if (base != new_base) {
193 /* 209 /*
@@ -334,6 +350,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe);
334 350
335static struct debug_obj_descr hrtimer_debug_descr; 351static struct debug_obj_descr hrtimer_debug_descr;
336 352
353static void *hrtimer_debug_hint(void *addr)
354{
355 return ((struct hrtimer *) addr)->function;
356}
357
337/* 358/*
338 * fixup_init is called when: 359 * fixup_init is called when:
339 * - an active object is initialized 360 * - an active object is initialized
@@ -393,6 +414,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
393 414
394static struct debug_obj_descr hrtimer_debug_descr = { 415static struct debug_obj_descr hrtimer_debug_descr = {
395 .name = "hrtimer", 416 .name = "hrtimer",
417 .debug_hint = hrtimer_debug_hint,
396 .fixup_init = hrtimer_fixup_init, 418 .fixup_init = hrtimer_fixup_init,
397 .fixup_activate = hrtimer_fixup_activate, 419 .fixup_activate = hrtimer_fixup_activate,
398 .fixup_free = hrtimer_fixup_free, 420 .fixup_free = hrtimer_fixup_free,
@@ -602,67 +624,6 @@ static int hrtimer_reprogram(struct hrtimer *timer,
602 return res; 624 return res;
603} 625}
604 626
605
606/*
607 * Retrigger next event is called after clock was set
608 *
609 * Called with interrupts disabled via on_each_cpu()
610 */
611static void retrigger_next_event(void *arg)
612{
613 struct hrtimer_cpu_base *base;
614 struct timespec realtime_offset, wtm;
615 unsigned long seq;
616
617 if (!hrtimer_hres_active())
618 return;
619
620 do {
621 seq = read_seqbegin(&xtime_lock);
622 wtm = __get_wall_to_monotonic();
623 } while (read_seqretry(&xtime_lock, seq));
624 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
625
626 base = &__get_cpu_var(hrtimer_bases);
627
628 /* Adjust CLOCK_REALTIME offset */
629 raw_spin_lock(&base->lock);
630 base->clock_base[CLOCK_REALTIME].offset =
631 timespec_to_ktime(realtime_offset);
632
633 hrtimer_force_reprogram(base, 0);
634 raw_spin_unlock(&base->lock);
635}
636
637/*
638 * Clock realtime was set
639 *
640 * Change the offset of the realtime clock vs. the monotonic
641 * clock.
642 *
643 * We might have to reprogram the high resolution timer interrupt. On
644 * SMP we call the architecture specific code to retrigger _all_ high
645 * resolution timer interrupts. On UP we just disable interrupts and
646 * call the high resolution interrupt code.
647 */
648void clock_was_set(void)
649{
650 /* Retrigger the CPU local events everywhere */
651 on_each_cpu(retrigger_next_event, NULL, 1);
652}
653
654/*
655 * During resume we might have to reprogram the high resolution timer
656 * interrupt (on the local CPU):
657 */
658void hres_timers_resume(void)
659{
660 WARN_ONCE(!irqs_disabled(),
661 KERN_INFO "hres_timers_resume() called with IRQs enabled!");
662
663 retrigger_next_event(NULL);
664}
665
666/* 627/*
667 * Initialize the high resolution related parts of cpu_base 628 * Initialize the high resolution related parts of cpu_base
668 */ 629 */
@@ -673,14 +634,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
673} 634}
674 635
675/* 636/*
676 * Initialize the high resolution related parts of a hrtimer
677 */
678static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
679{
680}
681
682
683/*
684 * When High resolution timers are active, try to reprogram. Note, that in case 637 * When High resolution timers are active, try to reprogram. Note, that in case
685 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry 638 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
686 * check happens. The timer gets enqueued into the rbtree. The reprogramming 639 * check happens. The timer gets enqueued into the rbtree. The reprogramming
@@ -705,11 +658,39 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
705} 658}
706 659
707/* 660/*
661 * Retrigger next event is called after clock was set
662 *
663 * Called with interrupts disabled via on_each_cpu()
664 */
665static void retrigger_next_event(void *arg)
666{
667 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
668 struct timespec realtime_offset, xtim, wtm, sleep;
669
670 if (!hrtimer_hres_active())
671 return;
672
673 /* Optimized out for !HIGH_RES */
674 get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
675 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
676
677 /* Adjust CLOCK_REALTIME offset */
678 raw_spin_lock(&base->lock);
679 base->clock_base[HRTIMER_BASE_REALTIME].offset =
680 timespec_to_ktime(realtime_offset);
681 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
682 timespec_to_ktime(sleep);
683
684 hrtimer_force_reprogram(base, 0);
685 raw_spin_unlock(&base->lock);
686}
687
688/*
708 * Switch to high resolution mode 689 * Switch to high resolution mode
709 */ 690 */
710static int hrtimer_switch_to_hres(void) 691static int hrtimer_switch_to_hres(void)
711{ 692{
712 int cpu = smp_processor_id(); 693 int i, cpu = smp_processor_id();
713 struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); 694 struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
714 unsigned long flags; 695 unsigned long flags;
715 696
@@ -725,8 +706,8 @@ static int hrtimer_switch_to_hres(void)
725 return 0; 706 return 0;
726 } 707 }
727 base->hres_active = 1; 708 base->hres_active = 1;
728 base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; 709 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
729 base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; 710 base->clock_base[i].resolution = KTIME_HIGH_RES;
730 711
731 tick_setup_sched_timer(); 712 tick_setup_sched_timer();
732 713
@@ -750,10 +731,43 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
750 return 0; 731 return 0;
751} 732}
752static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 733static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
753static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } 734static inline void retrigger_next_event(void *arg) { }
754 735
755#endif /* CONFIG_HIGH_RES_TIMERS */ 736#endif /* CONFIG_HIGH_RES_TIMERS */
756 737
738/*
739 * Clock realtime was set
740 *
741 * Change the offset of the realtime clock vs. the monotonic
742 * clock.
743 *
744 * We might have to reprogram the high resolution timer interrupt. On
745 * SMP we call the architecture specific code to retrigger _all_ high
746 * resolution timer interrupts. On UP we just disable interrupts and
747 * call the high resolution interrupt code.
748 */
749void clock_was_set(void)
750{
751#ifdef CONFIG_HIGH_RES_TIMERS
752 /* Retrigger the CPU local events everywhere */
753 on_each_cpu(retrigger_next_event, NULL, 1);
754#endif
755 timerfd_clock_was_set();
756}
757
758/*
759 * During resume we might have to reprogram the high resolution timer
760 * interrupt (on the local CPU):
761 */
762void hrtimers_resume(void)
763{
764 WARN_ONCE(!irqs_disabled(),
765 KERN_INFO "hrtimers_resume() called with IRQs enabled!");
766
767 retrigger_next_event(NULL);
768 timerfd_clock_was_set();
769}
770
757static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) 771static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
758{ 772{
759#ifdef CONFIG_TIMER_STATS 773#ifdef CONFIG_TIMER_STATS
@@ -846,6 +860,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
846 debug_activate(timer); 860 debug_activate(timer);
847 861
848 timerqueue_add(&base->active, &timer->node); 862 timerqueue_add(&base->active, &timer->node);
863 base->cpu_base->active_bases |= 1 << base->index;
849 864
850 /* 865 /*
851 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the 866 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
@@ -887,6 +902,8 @@ static void __remove_hrtimer(struct hrtimer *timer,
887#endif 902#endif
888 } 903 }
889 timerqueue_del(&base->active, &timer->node); 904 timerqueue_del(&base->active, &timer->node);
905 if (!timerqueue_getnext(&base->active))
906 base->cpu_base->active_bases &= ~(1 << base->index);
890out: 907out:
891 timer->state = newstate; 908 timer->state = newstate;
892} 909}
@@ -1121,6 +1138,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1121 enum hrtimer_mode mode) 1138 enum hrtimer_mode mode)
1122{ 1139{
1123 struct hrtimer_cpu_base *cpu_base; 1140 struct hrtimer_cpu_base *cpu_base;
1141 int base;
1124 1142
1125 memset(timer, 0, sizeof(struct hrtimer)); 1143 memset(timer, 0, sizeof(struct hrtimer));
1126 1144
@@ -1129,8 +1147,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1129 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) 1147 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
1130 clock_id = CLOCK_MONOTONIC; 1148 clock_id = CLOCK_MONOTONIC;
1131 1149
1132 timer->base = &cpu_base->clock_base[clock_id]; 1150 base = hrtimer_clockid_to_base(clock_id);
1133 hrtimer_init_timer_hres(timer); 1151 timer->base = &cpu_base->clock_base[base];
1134 timerqueue_init(&timer->node); 1152 timerqueue_init(&timer->node);
1135 1153
1136#ifdef CONFIG_TIMER_STATS 1154#ifdef CONFIG_TIMER_STATS
@@ -1165,9 +1183,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
1165int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) 1183int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1166{ 1184{
1167 struct hrtimer_cpu_base *cpu_base; 1185 struct hrtimer_cpu_base *cpu_base;
1186 int base = hrtimer_clockid_to_base(which_clock);
1168 1187
1169 cpu_base = &__raw_get_cpu_var(hrtimer_bases); 1188 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
1170 *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); 1189 *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
1171 1190
1172 return 0; 1191 return 0;
1173} 1192}
@@ -1222,7 +1241,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1222void hrtimer_interrupt(struct clock_event_device *dev) 1241void hrtimer_interrupt(struct clock_event_device *dev)
1223{ 1242{
1224 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1243 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1225 struct hrtimer_clock_base *base;
1226 ktime_t expires_next, now, entry_time, delta; 1244 ktime_t expires_next, now, entry_time, delta;
1227 int i, retries = 0; 1245 int i, retries = 0;
1228 1246
@@ -1244,12 +1262,15 @@ retry:
1244 */ 1262 */
1245 cpu_base->expires_next.tv64 = KTIME_MAX; 1263 cpu_base->expires_next.tv64 = KTIME_MAX;
1246 1264
1247 base = cpu_base->clock_base;
1248
1249 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1265 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1250 ktime_t basenow; 1266 struct hrtimer_clock_base *base;
1251 struct timerqueue_node *node; 1267 struct timerqueue_node *node;
1268 ktime_t basenow;
1269
1270 if (!(cpu_base->active_bases & (1 << i)))
1271 continue;
1252 1272
1273 base = cpu_base->clock_base + i;
1253 basenow = ktime_add(now, base->offset); 1274 basenow = ktime_add(now, base->offset);
1254 1275
1255 while ((node = timerqueue_getnext(&base->active))) { 1276 while ((node = timerqueue_getnext(&base->active))) {
@@ -1282,7 +1303,6 @@ retry:
1282 1303
1283 __run_hrtimer(timer, &basenow); 1304 __run_hrtimer(timer, &basenow);
1284 } 1305 }
1285 base++;
1286 } 1306 }
1287 1307
1288 /* 1308 /*
@@ -1513,7 +1533,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1513 struct timespec __user *rmtp; 1533 struct timespec __user *rmtp;
1514 int ret = 0; 1534 int ret = 0;
1515 1535
1516 hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, 1536 hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
1517 HRTIMER_MODE_ABS); 1537 HRTIMER_MODE_ABS);
1518 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); 1538 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
1519 1539
@@ -1565,7 +1585,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1565 1585
1566 restart = &current_thread_info()->restart_block; 1586 restart = &current_thread_info()->restart_block;
1567 restart->fn = hrtimer_nanosleep_restart; 1587 restart->fn = hrtimer_nanosleep_restart;
1568 restart->nanosleep.index = t.timer.base->index; 1588 restart->nanosleep.clockid = t.timer.base->clockid;
1569 restart->nanosleep.rmtp = rmtp; 1589 restart->nanosleep.rmtp = rmtp;
1570 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); 1590 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
1571 1591
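
The hrtimer hunks above boil down to two things: clockids are now translated to clock-base indices via hrtimer_clockid_to_base(), and cpu_base->active_bases tracks which bases actually have queued timers so hrtimer_interrupt() can skip empty ones. A minimal, hypothetical caller of that path is sketched below; demo_timer, demo_expire and demo_arm_timer are invented names, not part of this patch.

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/time.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_expire(struct hrtimer *t)
{
	/* Invoked from hrtimer_interrupt() -> __run_hrtimer() above. */
	return HRTIMER_NORESTART;
}

static void demo_arm_timer(void)
{
	/*
	 * __hrtimer_init() maps CLOCK_MONOTONIC to its clock base via
	 * hrtimer_clockid_to_base() instead of indexing by clockid.
	 */
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	demo_timer.function = demo_expire;

	/* enqueue_hrtimer() sets this base's bit in active_bases. */
	hrtimer_start(&demo_timer, ktime_set(0, 10 * NSEC_PER_MSEC),
		      HRTIMER_MODE_REL);
}
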
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 53ead174da2f..ea640120ab86 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -33,7 +33,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
33/* 33/*
34 * Zero means infinite timeout - no checking done: 34 * Zero means infinite timeout - no checking done:
35 */ 35 */
36unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; 36unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
37 37
38unsigned long __read_mostly sysctl_hung_task_warnings = 10; 38unsigned long __read_mostly sysctl_hung_task_warnings = 10;
39 39
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 8e42fec7686d..d1d051b38e0b 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,5 +1,6 @@
1# Select this to activate the generic irq options below
1config HAVE_GENERIC_HARDIRQS 2config HAVE_GENERIC_HARDIRQS
2 def_bool n 3 bool
3 4
4if HAVE_GENERIC_HARDIRQS 5if HAVE_GENERIC_HARDIRQS
5menu "IRQ subsystem" 6menu "IRQ subsystem"
@@ -9,28 +10,51 @@ menu "IRQ subsystem"
9config GENERIC_HARDIRQS 10config GENERIC_HARDIRQS
10 def_bool y 11 def_bool y
11 12
12# Select this to disable the deprecated stuff
13config GENERIC_HARDIRQS_NO_DEPRECATED
14 def_bool n
15
16# Options selectable by the architecture code 13# Options selectable by the architecture code
14
15# Make sparse irq Kconfig switch below available
17config HAVE_SPARSE_IRQ 16config HAVE_SPARSE_IRQ
18 def_bool n 17 bool
19 18
19# Enable the generic irq autoprobe mechanism
20config GENERIC_IRQ_PROBE 20config GENERIC_IRQ_PROBE
21 def_bool n 21 bool
22
23# Use the generic /proc/interrupts implementation
24config GENERIC_IRQ_SHOW
25 bool
22 26
27# Print level/edge extra information
28config GENERIC_IRQ_SHOW_LEVEL
29 bool
30
31# Support for delayed migration from interrupt context
23config GENERIC_PENDING_IRQ 32config GENERIC_PENDING_IRQ
24 def_bool n 33 bool
25 34
35# Alpha specific irq affinity mechanism
26config AUTO_IRQ_AFFINITY 36config AUTO_IRQ_AFFINITY
27 def_bool n 37 bool
28
29config IRQ_PER_CPU
30 def_bool n
31 38
39# Tasklet based software resend for pending interrupts on enable_irq()
32config HARDIRQS_SW_RESEND 40config HARDIRQS_SW_RESEND
33 def_bool n 41 bool
42
43# Preflow handler support for fasteoi (sparc64)
44config IRQ_PREFLOW_FASTEOI
45 bool
46
47# Edge style eoi based handler (cell)
48config IRQ_EDGE_EOI_HANDLER
49 bool
50
51# Generic configurable interrupt chip implementation
52config GENERIC_IRQ_CHIP
53 bool
54
55# Support forced irq threading
56config IRQ_FORCED_THREADING
57 bool
34 58
35config SPARSE_IRQ 59config SPARSE_IRQ
36 bool "Support sparse irq numbering" 60 bool "Support sparse irq numbering"
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 54329cd7b3ee..73290056cfb6 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,6 @@
1 1
2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o 2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 4obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 5obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 6obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 505798f86c36..342d8f44e401 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -17,7 +17,7 @@
17/* 17/*
18 * Autodetection depends on the fact that any interrupt that 18 * Autodetection depends on the fact that any interrupt that
19 * comes in on to an unassigned handler will get stuck with 19 * comes in on to an unassigned handler will get stuck with
20 * "IRQ_WAITING" cleared and the interrupt disabled. 20 * "IRQS_WAITING" cleared and the interrupt disabled.
21 */ 21 */
22static DEFINE_MUTEX(probing_active); 22static DEFINE_MUTEX(probing_active);
23 23
@@ -32,7 +32,6 @@ unsigned long probe_irq_on(void)
32{ 32{
33 struct irq_desc *desc; 33 struct irq_desc *desc;
34 unsigned long mask = 0; 34 unsigned long mask = 0;
35 unsigned int status;
36 int i; 35 int i;
37 36
38 /* 37 /*
@@ -46,13 +45,7 @@ unsigned long probe_irq_on(void)
46 */ 45 */
47 for_each_irq_desc_reverse(i, desc) { 46 for_each_irq_desc_reverse(i, desc) {
48 raw_spin_lock_irq(&desc->lock); 47 raw_spin_lock_irq(&desc->lock);
49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 48 if (!desc->action && irq_settings_can_probe(desc)) {
50 /*
51 * An old-style architecture might still have
52 * the handle_bad_irq handler there:
53 */
54 compat_irq_chip_set_default_handler(desc);
55
56 /* 49 /*
57 * Some chips need to know about probing in 50 * Some chips need to know about probing in
58 * progress: 51 * progress:
@@ -60,7 +53,7 @@ unsigned long probe_irq_on(void)
60 if (desc->irq_data.chip->irq_set_type) 53 if (desc->irq_data.chip->irq_set_type)
61 desc->irq_data.chip->irq_set_type(&desc->irq_data, 54 desc->irq_data.chip->irq_set_type(&desc->irq_data,
62 IRQ_TYPE_PROBE); 55 IRQ_TYPE_PROBE);
63 desc->irq_data.chip->irq_startup(&desc->irq_data); 56 irq_startup(desc);
64 } 57 }
65 raw_spin_unlock_irq(&desc->lock); 58 raw_spin_unlock_irq(&desc->lock);
66 } 59 }
@@ -75,10 +68,10 @@ unsigned long probe_irq_on(void)
75 */ 68 */
76 for_each_irq_desc_reverse(i, desc) { 69 for_each_irq_desc_reverse(i, desc) {
77 raw_spin_lock_irq(&desc->lock); 70 raw_spin_lock_irq(&desc->lock);
78 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 71 if (!desc->action && irq_settings_can_probe(desc)) {
79 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
80 if (desc->irq_data.chip->irq_startup(&desc->irq_data)) 73 if (irq_startup(desc))
81 desc->status |= IRQ_PENDING; 74 desc->istate |= IRQS_PENDING;
82 } 75 }
83 raw_spin_unlock_irq(&desc->lock); 76 raw_spin_unlock_irq(&desc->lock);
84 } 77 }
@@ -93,13 +86,12 @@ unsigned long probe_irq_on(void)
93 */ 86 */
94 for_each_irq_desc(i, desc) { 87 for_each_irq_desc(i, desc) {
95 raw_spin_lock_irq(&desc->lock); 88 raw_spin_lock_irq(&desc->lock);
96 status = desc->status;
97 89
98 if (status & IRQ_AUTODETECT) { 90 if (desc->istate & IRQS_AUTODETECT) {
99 /* It triggered already - consider it spurious. */ 91 /* It triggered already - consider it spurious. */
100 if (!(status & IRQ_WAITING)) { 92 if (!(desc->istate & IRQS_WAITING)) {
101 desc->status = status & ~IRQ_AUTODETECT; 93 desc->istate &= ~IRQS_AUTODETECT;
102 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 94 irq_shutdown(desc);
103 } else 95 } else
104 if (i < 32) 96 if (i < 32)
105 mask |= 1 << i; 97 mask |= 1 << i;
@@ -125,20 +117,18 @@ EXPORT_SYMBOL(probe_irq_on);
125 */ 117 */
126unsigned int probe_irq_mask(unsigned long val) 118unsigned int probe_irq_mask(unsigned long val)
127{ 119{
128 unsigned int status, mask = 0; 120 unsigned int mask = 0;
129 struct irq_desc *desc; 121 struct irq_desc *desc;
130 int i; 122 int i;
131 123
132 for_each_irq_desc(i, desc) { 124 for_each_irq_desc(i, desc) {
133 raw_spin_lock_irq(&desc->lock); 125 raw_spin_lock_irq(&desc->lock);
134 status = desc->status; 126 if (desc->istate & IRQS_AUTODETECT) {
135 127 if (i < 16 && !(desc->istate & IRQS_WAITING))
136 if (status & IRQ_AUTODETECT) {
137 if (i < 16 && !(status & IRQ_WAITING))
138 mask |= 1 << i; 128 mask |= 1 << i;
139 129
140 desc->status = status & ~IRQ_AUTODETECT; 130 desc->istate &= ~IRQS_AUTODETECT;
141 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 131 irq_shutdown(desc);
142 } 132 }
143 raw_spin_unlock_irq(&desc->lock); 133 raw_spin_unlock_irq(&desc->lock);
144 } 134 }
@@ -169,20 +159,18 @@ int probe_irq_off(unsigned long val)
169{ 159{
170 int i, irq_found = 0, nr_of_irqs = 0; 160 int i, irq_found = 0, nr_of_irqs = 0;
171 struct irq_desc *desc; 161 struct irq_desc *desc;
172 unsigned int status;
173 162
174 for_each_irq_desc(i, desc) { 163 for_each_irq_desc(i, desc) {
175 raw_spin_lock_irq(&desc->lock); 164 raw_spin_lock_irq(&desc->lock);
176 status = desc->status;
177 165
178 if (status & IRQ_AUTODETECT) { 166 if (desc->istate & IRQS_AUTODETECT) {
179 if (!(status & IRQ_WAITING)) { 167 if (!(desc->istate & IRQS_WAITING)) {
180 if (!nr_of_irqs) 168 if (!nr_of_irqs)
181 irq_found = i; 169 irq_found = i;
182 nr_of_irqs++; 170 nr_of_irqs++;
183 } 171 }
184 desc->status = status & ~IRQ_AUTODETECT; 172 desc->istate &= ~IRQS_AUTODETECT;
185 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 173 irq_shutdown(desc);
186 } 174 }
187 raw_spin_unlock_irq(&desc->lock); 175 raw_spin_unlock_irq(&desc->lock);
188 } 176 }
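
The IRQS_AUTODETECT/IRQS_WAITING bookkeeping rewritten above backs the old-style probing API exposed to drivers. A hedged sketch of the driver side follows; trigger_demo_device() is a hypothetical stand-in for whatever makes the hardware assert its interrupt line.

#include <linux/interrupt.h>
#include <linux/delay.h>

static void trigger_demo_device(void)
{
	/* Hypothetical: poke the device so it raises its irq. */
}

static int demo_find_irq(void)
{
	unsigned long mask;
	int irq;

	mask = probe_irq_on();		/* arm all unclaimed lines */
	trigger_demo_device();
	mdelay(10);			/* give the interrupt time to arrive */
	irq = probe_irq_off(mask);	/* >0: found, 0: none, <0: several fired */

	return irq;
}
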
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index baa5c4acad83..d5a3009da71a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -19,140 +19,115 @@
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21/**
22 * set_irq_chip - set the irq chip for an irq 22 * irq_set_chip - set the irq chip for an irq
23 * @irq: irq number 23 * @irq: irq number
24 * @chip: pointer to irq chip description structure 24 * @chip: pointer to irq chip description structure
25 */ 25 */
26int set_irq_chip(unsigned int irq, struct irq_chip *chip) 26int irq_set_chip(unsigned int irq, struct irq_chip *chip)
27{ 27{
28 struct irq_desc *desc = irq_to_desc(irq);
29 unsigned long flags; 28 unsigned long flags;
29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
30 30
31 if (!desc) { 31 if (!desc)
32 WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
33 return -EINVAL; 32 return -EINVAL;
34 }
35 33
36 if (!chip) 34 if (!chip)
37 chip = &no_irq_chip; 35 chip = &no_irq_chip;
38 36
39 raw_spin_lock_irqsave(&desc->lock, flags);
40 irq_chip_set_defaults(chip);
41 desc->irq_data.chip = chip; 37 desc->irq_data.chip = chip;
42 raw_spin_unlock_irqrestore(&desc->lock, flags); 38 irq_put_desc_unlock(desc, flags);
43 39 /*
40 * For !CONFIG_SPARSE_IRQ make the irq show up in
41 * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is
42 * already marked, and this call is harmless.
43 */
44 irq_reserve_irq(irq);
44 return 0; 45 return 0;
45} 46}
46EXPORT_SYMBOL(set_irq_chip); 47EXPORT_SYMBOL(irq_set_chip);
47 48
48/** 49/**
49 * set_irq_type - set the irq trigger type for an irq 50 * irq_set_type - set the irq trigger type for an irq
50 * @irq: irq number 51 * @irq: irq number
51 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h 52 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
52 */ 53 */
53int set_irq_type(unsigned int irq, unsigned int type) 54int irq_set_irq_type(unsigned int irq, unsigned int type)
54{ 55{
55 struct irq_desc *desc = irq_to_desc(irq);
56 unsigned long flags; 56 unsigned long flags;
57 int ret = -ENXIO; 57 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
58 int ret = 0;
58 59
59 if (!desc) { 60 if (!desc)
60 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); 61 return -EINVAL;
61 return -ENODEV;
62 }
63 62
64 type &= IRQ_TYPE_SENSE_MASK; 63 type &= IRQ_TYPE_SENSE_MASK;
65 if (type == IRQ_TYPE_NONE) 64 if (type != IRQ_TYPE_NONE)
66 return 0; 65 ret = __irq_set_trigger(desc, irq, type);
67 66 irq_put_desc_busunlock(desc, flags);
68 raw_spin_lock_irqsave(&desc->lock, flags);
69 ret = __irq_set_trigger(desc, irq, type);
70 raw_spin_unlock_irqrestore(&desc->lock, flags);
71 return ret; 67 return ret;
72} 68}
73EXPORT_SYMBOL(set_irq_type); 69EXPORT_SYMBOL(irq_set_irq_type);
74 70
75/** 71/**
76 * set_irq_data - set irq type data for an irq 72 * irq_set_handler_data - set irq handler data for an irq
77 * @irq: Interrupt number 73 * @irq: Interrupt number
78 * @data: Pointer to interrupt specific data 74 * @data: Pointer to interrupt specific data
79 * 75 *
80 * Set the hardware irq controller data for an irq 76 * Set the hardware irq controller data for an irq
81 */ 77 */
82int set_irq_data(unsigned int irq, void *data) 78int irq_set_handler_data(unsigned int irq, void *data)
83{ 79{
84 struct irq_desc *desc = irq_to_desc(irq);
85 unsigned long flags; 80 unsigned long flags;
81 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
86 82
87 if (!desc) { 83 if (!desc)
88 printk(KERN_ERR
89 "Trying to install controller data for IRQ%d\n", irq);
90 return -EINVAL; 84 return -EINVAL;
91 }
92
93 raw_spin_lock_irqsave(&desc->lock, flags);
94 desc->irq_data.handler_data = data; 85 desc->irq_data.handler_data = data;
95 raw_spin_unlock_irqrestore(&desc->lock, flags); 86 irq_put_desc_unlock(desc, flags);
96 return 0; 87 return 0;
97} 88}
98EXPORT_SYMBOL(set_irq_data); 89EXPORT_SYMBOL(irq_set_handler_data);
99 90
100/** 91/**
101 * set_irq_msi - set MSI descriptor data for an irq 92 * irq_set_msi_desc - set MSI descriptor data for an irq
102 * @irq: Interrupt number 93 * @irq: Interrupt number
103 * @entry: Pointer to MSI descriptor data 94 * @entry: Pointer to MSI descriptor data
104 * 95 *
105 * Set the MSI descriptor entry for an irq 96 * Set the MSI descriptor entry for an irq
106 */ 97 */
107int set_irq_msi(unsigned int irq, struct msi_desc *entry) 98int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
108{ 99{
109 struct irq_desc *desc = irq_to_desc(irq);
110 unsigned long flags; 100 unsigned long flags;
101 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
111 102
112 if (!desc) { 103 if (!desc)
113 printk(KERN_ERR
114 "Trying to install msi data for IRQ%d\n", irq);
115 return -EINVAL; 104 return -EINVAL;
116 }
117
118 raw_spin_lock_irqsave(&desc->lock, flags);
119 desc->irq_data.msi_desc = entry; 105 desc->irq_data.msi_desc = entry;
120 if (entry) 106 if (entry)
121 entry->irq = irq; 107 entry->irq = irq;
122 raw_spin_unlock_irqrestore(&desc->lock, flags); 108 irq_put_desc_unlock(desc, flags);
123 return 0; 109 return 0;
124} 110}
125 111
126/** 112/**
127 * set_irq_chip_data - set irq chip data for an irq 113 * irq_set_chip_data - set irq chip data for an irq
128 * @irq: Interrupt number 114 * @irq: Interrupt number
129 * @data: Pointer to chip specific data 115 * @data: Pointer to chip specific data
130 * 116 *
131 * Set the hardware irq chip data for an irq 117 * Set the hardware irq chip data for an irq
132 */ 118 */
133int set_irq_chip_data(unsigned int irq, void *data) 119int irq_set_chip_data(unsigned int irq, void *data)
134{ 120{
135 struct irq_desc *desc = irq_to_desc(irq);
136 unsigned long flags; 121 unsigned long flags;
122 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
137 123
138 if (!desc) { 124 if (!desc)
139 printk(KERN_ERR
140 "Trying to install chip data for IRQ%d\n", irq);
141 return -EINVAL;
142 }
143
144 if (!desc->irq_data.chip) {
145 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
146 return -EINVAL; 125 return -EINVAL;
147 }
148
149 raw_spin_lock_irqsave(&desc->lock, flags);
150 desc->irq_data.chip_data = data; 126 desc->irq_data.chip_data = data;
151 raw_spin_unlock_irqrestore(&desc->lock, flags); 127 irq_put_desc_unlock(desc, flags);
152
153 return 0; 128 return 0;
154} 129}
155EXPORT_SYMBOL(set_irq_chip_data); 130EXPORT_SYMBOL(irq_set_chip_data);
156 131
157struct irq_data *irq_get_irq_data(unsigned int irq) 132struct irq_data *irq_get_irq_data(unsigned int irq)
158{ 133{
@@ -162,221 +137,71 @@ struct irq_data *irq_get_irq_data(unsigned int irq)
162} 137}
163EXPORT_SYMBOL_GPL(irq_get_irq_data); 138EXPORT_SYMBOL_GPL(irq_get_irq_data);
164 139
165/** 140static void irq_state_clr_disabled(struct irq_desc *desc)
166 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
167 *
168 * @irq: Interrupt number
169 * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
170 *
171 * The IRQ_NESTED_THREAD flag indicates that on
172 * request_threaded_irq() no separate interrupt thread should be
173 * created for the irq as the handler are called nested in the
174 * context of a demultiplexing interrupt handler thread.
175 */
176void set_irq_nested_thread(unsigned int irq, int nest)
177{
178 struct irq_desc *desc = irq_to_desc(irq);
179 unsigned long flags;
180
181 if (!desc)
182 return;
183
184 raw_spin_lock_irqsave(&desc->lock, flags);
185 if (nest)
186 desc->status |= IRQ_NESTED_THREAD;
187 else
188 desc->status &= ~IRQ_NESTED_THREAD;
189 raw_spin_unlock_irqrestore(&desc->lock, flags);
190}
191EXPORT_SYMBOL_GPL(set_irq_nested_thread);
192
193/*
194 * default enable function
195 */
196static void default_enable(struct irq_data *data)
197{ 141{
198 struct irq_desc *desc = irq_data_to_desc(data); 142 irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED);
199
200 desc->irq_data.chip->irq_unmask(&desc->irq_data);
201 desc->status &= ~IRQ_MASKED;
202} 143}
203 144
204/* 145static void irq_state_set_disabled(struct irq_desc *desc)
205 * default disable function
206 */
207static void default_disable(struct irq_data *data)
208{
209}
210
211/*
212 * default startup function
213 */
214static unsigned int default_startup(struct irq_data *data)
215{ 146{
216 struct irq_desc *desc = irq_data_to_desc(data); 147 irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
217
218 desc->irq_data.chip->irq_enable(data);
219 return 0;
220} 148}
221 149
222/* 150static void irq_state_clr_masked(struct irq_desc *desc)
223 * default shutdown function
224 */
225static void default_shutdown(struct irq_data *data)
226{ 151{
227 struct irq_desc *desc = irq_data_to_desc(data); 152 irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED);
228
229 desc->irq_data.chip->irq_mask(&desc->irq_data);
230 desc->status |= IRQ_MASKED;
231} 153}
232 154
233#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED 155static void irq_state_set_masked(struct irq_desc *desc)
234/* Temporary migration helpers */
235static void compat_irq_mask(struct irq_data *data)
236{ 156{
237 data->chip->mask(data->irq); 157 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
238} 158}
239 159
240static void compat_irq_unmask(struct irq_data *data) 160int irq_startup(struct irq_desc *desc)
241{ 161{
242 data->chip->unmask(data->irq); 162 irq_state_clr_disabled(desc);
243} 163 desc->depth = 0;
244 164
245static void compat_irq_ack(struct irq_data *data) 165 if (desc->irq_data.chip->irq_startup) {
246{ 166 int ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
247 data->chip->ack(data->irq); 167 irq_state_clr_masked(desc);
248} 168 return ret;
249 169 }
250static void compat_irq_mask_ack(struct irq_data *data)
251{
252 data->chip->mask_ack(data->irq);
253}
254
255static void compat_irq_eoi(struct irq_data *data)
256{
257 data->chip->eoi(data->irq);
258}
259
260static void compat_irq_enable(struct irq_data *data)
261{
262 data->chip->enable(data->irq);
263}
264
265static void compat_irq_disable(struct irq_data *data)
266{
267 data->chip->disable(data->irq);
268}
269
270static void compat_irq_shutdown(struct irq_data *data)
271{
272 data->chip->shutdown(data->irq);
273}
274
275static unsigned int compat_irq_startup(struct irq_data *data)
276{
277 return data->chip->startup(data->irq);
278}
279
280static int compat_irq_set_affinity(struct irq_data *data,
281 const struct cpumask *dest, bool force)
282{
283 return data->chip->set_affinity(data->irq, dest);
284}
285
286static int compat_irq_set_type(struct irq_data *data, unsigned int type)
287{
288 return data->chip->set_type(data->irq, type);
289}
290
291static int compat_irq_set_wake(struct irq_data *data, unsigned int on)
292{
293 return data->chip->set_wake(data->irq, on);
294}
295 170
296static int compat_irq_retrigger(struct irq_data *data) 171 irq_enable(desc);
297{ 172 return 0;
298 return data->chip->retrigger(data->irq);
299} 173}
300 174
301static void compat_bus_lock(struct irq_data *data) 175void irq_shutdown(struct irq_desc *desc)
302{ 176{
303 data->chip->bus_lock(data->irq); 177 irq_state_set_disabled(desc);
178 desc->depth = 1;
179 if (desc->irq_data.chip->irq_shutdown)
180 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
181 if (desc->irq_data.chip->irq_disable)
182 desc->irq_data.chip->irq_disable(&desc->irq_data);
183 else
184 desc->irq_data.chip->irq_mask(&desc->irq_data);
185 irq_state_set_masked(desc);
304} 186}
305 187
306static void compat_bus_sync_unlock(struct irq_data *data) 188void irq_enable(struct irq_desc *desc)
307{ 189{
308 data->chip->bus_sync_unlock(data->irq); 190 irq_state_clr_disabled(desc);
191 if (desc->irq_data.chip->irq_enable)
192 desc->irq_data.chip->irq_enable(&desc->irq_data);
193 else
194 desc->irq_data.chip->irq_unmask(&desc->irq_data);
195 irq_state_clr_masked(desc);
309} 196}
310#endif
311 197
312/* 198void irq_disable(struct irq_desc *desc)
313 * Fixup enable/disable function pointers
314 */
315void irq_chip_set_defaults(struct irq_chip *chip)
316{ 199{
317#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED 200 irq_state_set_disabled(desc);
318 /* 201 if (desc->irq_data.chip->irq_disable) {
319 * Compat fixup functions need to be before we set the 202 desc->irq_data.chip->irq_disable(&desc->irq_data);
320 * defaults for enable/disable/startup/shutdown 203 irq_state_set_masked(desc);
321 */ 204 }
322 if (chip->enable)
323 chip->irq_enable = compat_irq_enable;
324 if (chip->disable)
325 chip->irq_disable = compat_irq_disable;
326 if (chip->shutdown)
327 chip->irq_shutdown = compat_irq_shutdown;
328 if (chip->startup)
329 chip->irq_startup = compat_irq_startup;
330#endif
331 /*
332 * The real defaults
333 */
334 if (!chip->irq_enable)
335 chip->irq_enable = default_enable;
336 if (!chip->irq_disable)
337 chip->irq_disable = default_disable;
338 if (!chip->irq_startup)
339 chip->irq_startup = default_startup;
340 /*
341 * We use chip->irq_disable, when the user provided its own. When
342 * we have default_disable set for chip->irq_disable, then we need
343 * to use default_shutdown, otherwise the irq line is not
344 * disabled on free_irq():
345 */
346 if (!chip->irq_shutdown)
347 chip->irq_shutdown = chip->irq_disable != default_disable ?
348 chip->irq_disable : default_shutdown;
349
350#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
351 if (!chip->end)
352 chip->end = dummy_irq_chip.end;
353
354 /*
355 * Now fix up the remaining compat handlers
356 */
357 if (chip->bus_lock)
358 chip->irq_bus_lock = compat_bus_lock;
359 if (chip->bus_sync_unlock)
360 chip->irq_bus_sync_unlock = compat_bus_sync_unlock;
361 if (chip->mask)
362 chip->irq_mask = compat_irq_mask;
363 if (chip->unmask)
364 chip->irq_unmask = compat_irq_unmask;
365 if (chip->ack)
366 chip->irq_ack = compat_irq_ack;
367 if (chip->mask_ack)
368 chip->irq_mask_ack = compat_irq_mask_ack;
369 if (chip->eoi)
370 chip->irq_eoi = compat_irq_eoi;
371 if (chip->set_affinity)
372 chip->irq_set_affinity = compat_irq_set_affinity;
373 if (chip->set_type)
374 chip->irq_set_type = compat_irq_set_type;
375 if (chip->set_wake)
376 chip->irq_set_wake = compat_irq_set_wake;
377 if (chip->retrigger)
378 chip->irq_retrigger = compat_irq_retrigger;
379#endif
380} 205}
381 206
382static inline void mask_ack_irq(struct irq_desc *desc) 207static inline void mask_ack_irq(struct irq_desc *desc)
@@ -388,22 +213,22 @@ static inline void mask_ack_irq(struct irq_desc *desc)
388 if (desc->irq_data.chip->irq_ack) 213 if (desc->irq_data.chip->irq_ack)
389 desc->irq_data.chip->irq_ack(&desc->irq_data); 214 desc->irq_data.chip->irq_ack(&desc->irq_data);
390 } 215 }
391 desc->status |= IRQ_MASKED; 216 irq_state_set_masked(desc);
392} 217}
393 218
394static inline void mask_irq(struct irq_desc *desc) 219void mask_irq(struct irq_desc *desc)
395{ 220{
396 if (desc->irq_data.chip->irq_mask) { 221 if (desc->irq_data.chip->irq_mask) {
397 desc->irq_data.chip->irq_mask(&desc->irq_data); 222 desc->irq_data.chip->irq_mask(&desc->irq_data);
398 desc->status |= IRQ_MASKED; 223 irq_state_set_masked(desc);
399 } 224 }
400} 225}
401 226
402static inline void unmask_irq(struct irq_desc *desc) 227void unmask_irq(struct irq_desc *desc)
403{ 228{
404 if (desc->irq_data.chip->irq_unmask) { 229 if (desc->irq_data.chip->irq_unmask) {
405 desc->irq_data.chip->irq_unmask(&desc->irq_data); 230 desc->irq_data.chip->irq_unmask(&desc->irq_data);
406 desc->status &= ~IRQ_MASKED; 231 irq_state_clr_masked(desc);
407 } 232 }
408} 233}
409 234
@@ -428,10 +253,10 @@ void handle_nested_irq(unsigned int irq)
428 kstat_incr_irqs_this_cpu(irq, desc); 253 kstat_incr_irqs_this_cpu(irq, desc);
429 254
430 action = desc->action; 255 action = desc->action;
431 if (unlikely(!action || (desc->status & IRQ_DISABLED))) 256 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data)))
432 goto out_unlock; 257 goto out_unlock;
433 258
434 desc->status |= IRQ_INPROGRESS; 259 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
435 raw_spin_unlock_irq(&desc->lock); 260 raw_spin_unlock_irq(&desc->lock);
436 261
437 action_ret = action->thread_fn(action->irq, action->dev_id); 262 action_ret = action->thread_fn(action->irq, action->dev_id);
@@ -439,13 +264,20 @@ void handle_nested_irq(unsigned int irq)
439 note_interrupt(irq, desc, action_ret); 264 note_interrupt(irq, desc, action_ret);
440 265
441 raw_spin_lock_irq(&desc->lock); 266 raw_spin_lock_irq(&desc->lock);
442 desc->status &= ~IRQ_INPROGRESS; 267 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
443 268
444out_unlock: 269out_unlock:
445 raw_spin_unlock_irq(&desc->lock); 270 raw_spin_unlock_irq(&desc->lock);
446} 271}
447EXPORT_SYMBOL_GPL(handle_nested_irq); 272EXPORT_SYMBOL_GPL(handle_nested_irq);
448 273
274static bool irq_check_poll(struct irq_desc *desc)
275{
276 if (!(desc->istate & IRQS_POLL_INPROGRESS))
277 return false;
278 return irq_wait_for_poll(desc);
279}
280
449/** 281/**
450 * handle_simple_irq - Simple and software-decoded IRQs. 282 * handle_simple_irq - Simple and software-decoded IRQs.
451 * @irq: the interrupt number 283 * @irq: the interrupt number
@@ -461,32 +293,24 @@ EXPORT_SYMBOL_GPL(handle_nested_irq);
461void 293void
462handle_simple_irq(unsigned int irq, struct irq_desc *desc) 294handle_simple_irq(unsigned int irq, struct irq_desc *desc)
463{ 295{
464 struct irqaction *action;
465 irqreturn_t action_ret;
466
467 raw_spin_lock(&desc->lock); 296 raw_spin_lock(&desc->lock);
468 297
469 if (unlikely(desc->status & IRQ_INPROGRESS)) 298 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
470 goto out_unlock; 299 if (!irq_check_poll(desc))
471 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 300 goto out_unlock;
301
302 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
472 kstat_incr_irqs_this_cpu(irq, desc); 303 kstat_incr_irqs_this_cpu(irq, desc);
473 304
474 action = desc->action; 305 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
475 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
476 goto out_unlock; 306 goto out_unlock;
477 307
478 desc->status |= IRQ_INPROGRESS; 308 handle_irq_event(desc);
479 raw_spin_unlock(&desc->lock);
480 309
481 action_ret = handle_IRQ_event(irq, action);
482 if (!noirqdebug)
483 note_interrupt(irq, desc, action_ret);
484
485 raw_spin_lock(&desc->lock);
486 desc->status &= ~IRQ_INPROGRESS;
487out_unlock: 310out_unlock:
488 raw_spin_unlock(&desc->lock); 311 raw_spin_unlock(&desc->lock);
489} 312}
313EXPORT_SYMBOL_GPL(handle_simple_irq);
490 314
491/** 315/**
492 * handle_level_irq - Level type irq handler 316 * handle_level_irq - Level type irq handler
@@ -501,42 +325,42 @@ out_unlock:
501void 325void
502handle_level_irq(unsigned int irq, struct irq_desc *desc) 326handle_level_irq(unsigned int irq, struct irq_desc *desc)
503{ 327{
504 struct irqaction *action;
505 irqreturn_t action_ret;
506
507 raw_spin_lock(&desc->lock); 328 raw_spin_lock(&desc->lock);
508 mask_ack_irq(desc); 329 mask_ack_irq(desc);
509 330
510 if (unlikely(desc->status & IRQ_INPROGRESS)) 331 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
511 goto out_unlock; 332 if (!irq_check_poll(desc))
512 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 333 goto out_unlock;
334
335 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
513 kstat_incr_irqs_this_cpu(irq, desc); 336 kstat_incr_irqs_this_cpu(irq, desc);
514 337
515 /* 338 /*
516 * If its disabled or no action available 339 * If its disabled or no action available
517 * keep it masked and get out of here 340 * keep it masked and get out of here
518 */ 341 */
519 action = desc->action; 342 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
520 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
521 goto out_unlock; 343 goto out_unlock;
522 344
523 desc->status |= IRQ_INPROGRESS; 345 handle_irq_event(desc);
524 raw_spin_unlock(&desc->lock);
525
526 action_ret = handle_IRQ_event(irq, action);
527 if (!noirqdebug)
528 note_interrupt(irq, desc, action_ret);
529
530 raw_spin_lock(&desc->lock);
531 desc->status &= ~IRQ_INPROGRESS;
532 346
533 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) 347 if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT))
534 unmask_irq(desc); 348 unmask_irq(desc);
535out_unlock: 349out_unlock:
536 raw_spin_unlock(&desc->lock); 350 raw_spin_unlock(&desc->lock);
537} 351}
538EXPORT_SYMBOL_GPL(handle_level_irq); 352EXPORT_SYMBOL_GPL(handle_level_irq);
539 353
354#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
355static inline void preflow_handler(struct irq_desc *desc)
356{
357 if (desc->preflow_handler)
358 desc->preflow_handler(&desc->irq_data);
359}
360#else
361static inline void preflow_handler(struct irq_desc *desc) { }
362#endif
363
540/** 364/**
541 * handle_fasteoi_irq - irq handler for transparent controllers 365 * handle_fasteoi_irq - irq handler for transparent controllers
542 * @irq: the interrupt number 366 * @irq: the interrupt number
@@ -550,42 +374,40 @@ EXPORT_SYMBOL_GPL(handle_level_irq);
550void 374void
551handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) 375handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
552{ 376{
553 struct irqaction *action;
554 irqreturn_t action_ret;
555
556 raw_spin_lock(&desc->lock); 377 raw_spin_lock(&desc->lock);
557 378
558 if (unlikely(desc->status & IRQ_INPROGRESS)) 379 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
559 goto out; 380 if (!irq_check_poll(desc))
381 goto out;
560 382
561 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 383 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
562 kstat_incr_irqs_this_cpu(irq, desc); 384 kstat_incr_irqs_this_cpu(irq, desc);
563 385
564 /* 386 /*
565 * If its disabled or no action available 387 * If its disabled or no action available
566 * then mask it and get out of here: 388 * then mask it and get out of here:
567 */ 389 */
568 action = desc->action; 390 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
569 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 391 desc->istate |= IRQS_PENDING;
570 desc->status |= IRQ_PENDING;
571 mask_irq(desc); 392 mask_irq(desc);
572 goto out; 393 goto out;
573 } 394 }
574 395
575 desc->status |= IRQ_INPROGRESS; 396 if (desc->istate & IRQS_ONESHOT)
576 desc->status &= ~IRQ_PENDING; 397 mask_irq(desc);
577 raw_spin_unlock(&desc->lock);
578 398
579 action_ret = handle_IRQ_event(irq, action); 399 preflow_handler(desc);
580 if (!noirqdebug) 400 handle_irq_event(desc);
581 note_interrupt(irq, desc, action_ret);
582 401
583 raw_spin_lock(&desc->lock); 402out_eoi:
584 desc->status &= ~IRQ_INPROGRESS;
585out:
586 desc->irq_data.chip->irq_eoi(&desc->irq_data); 403 desc->irq_data.chip->irq_eoi(&desc->irq_data);
587 404out_unlock:
588 raw_spin_unlock(&desc->lock); 405 raw_spin_unlock(&desc->lock);
406 return;
407out:
408 if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED))
409 goto out_eoi;
410 goto out_unlock;
589} 411}
590 412
591/** 413/**
@@ -594,7 +416,7 @@ out:
594 * @desc: the interrupt description structure for this irq 416 * @desc: the interrupt description structure for this irq
595 * 417 *
596 * Interrupt occures on the falling and/or rising edge of a hardware 418 * Interrupt occures on the falling and/or rising edge of a hardware
597 * signal. The occurence is latched into the irq controller hardware 419 * signal. The occurrence is latched into the irq controller hardware
598 * and must be acked in order to be reenabled. After the ack another 420 * and must be acked in order to be reenabled. After the ack another
599 * interrupt can happen on the same source even before the first one 421 * interrupt can happen on the same source even before the first one
600 * is handled by the associated event handler. If this happens it 422 * is handled by the associated event handler. If this happens it
@@ -609,32 +431,27 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
609{ 431{
610 raw_spin_lock(&desc->lock); 432 raw_spin_lock(&desc->lock);
611 433
612 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 434 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
613
614 /* 435 /*
615 * If we're currently running this IRQ, or its disabled, 436 * If we're currently running this IRQ, or its disabled,
616 * we shouldn't process the IRQ. Mark it pending, handle 437 * we shouldn't process the IRQ. Mark it pending, handle
617 * the necessary masking and go out 438 * the necessary masking and go out
618 */ 439 */
619 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || 440 if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
620 !desc->action)) { 441 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
621 desc->status |= (IRQ_PENDING | IRQ_MASKED); 442 if (!irq_check_poll(desc)) {
622 mask_ack_irq(desc); 443 desc->istate |= IRQS_PENDING;
623 goto out_unlock; 444 mask_ack_irq(desc);
445 goto out_unlock;
446 }
624 } 447 }
625 kstat_incr_irqs_this_cpu(irq, desc); 448 kstat_incr_irqs_this_cpu(irq, desc);
626 449
627 /* Start handling the irq */ 450 /* Start handling the irq */
628 desc->irq_data.chip->irq_ack(&desc->irq_data); 451 desc->irq_data.chip->irq_ack(&desc->irq_data);
629 452
630 /* Mark the IRQ currently in progress.*/
631 desc->status |= IRQ_INPROGRESS;
632
633 do { 453 do {
634 struct irqaction *action = desc->action; 454 if (unlikely(!desc->action)) {
635 irqreturn_t action_ret;
636
637 if (unlikely(!action)) {
638 mask_irq(desc); 455 mask_irq(desc);
639 goto out_unlock; 456 goto out_unlock;
640 } 457 }
@@ -644,26 +461,66 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
644 * one, we could have masked the irq. 461 * one, we could have masked the irq.
645 * Renable it, if it was not disabled in meantime. 462 * Renable it, if it was not disabled in meantime.
646 */ 463 */
647 if (unlikely((desc->status & 464 if (unlikely(desc->istate & IRQS_PENDING)) {
648 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 465 if (!irqd_irq_disabled(&desc->irq_data) &&
649 (IRQ_PENDING | IRQ_MASKED))) { 466 irqd_irq_masked(&desc->irq_data))
650 unmask_irq(desc); 467 unmask_irq(desc);
651 } 468 }
652 469
653 desc->status &= ~IRQ_PENDING; 470 handle_irq_event(desc);
654 raw_spin_unlock(&desc->lock);
655 action_ret = handle_IRQ_event(irq, action);
656 if (!noirqdebug)
657 note_interrupt(irq, desc, action_ret);
658 raw_spin_lock(&desc->lock);
659 471
660 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); 472 } while ((desc->istate & IRQS_PENDING) &&
473 !irqd_irq_disabled(&desc->irq_data));
661 474
662 desc->status &= ~IRQ_INPROGRESS;
663out_unlock: 475out_unlock:
664 raw_spin_unlock(&desc->lock); 476 raw_spin_unlock(&desc->lock);
665} 477}
666 478
479#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
480/**
481 * handle_edge_eoi_irq - edge eoi type IRQ handler
482 * @irq: the interrupt number
483 * @desc: the interrupt description structure for this irq
484 *
485 * Similar to handle_edge_irq above, but using eoi and without the
486 * mask/unmask logic.
487 */
488void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
489{
490 struct irq_chip *chip = irq_desc_get_chip(desc);
491
492 raw_spin_lock(&desc->lock);
493
494 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
495 /*
496 * If we're currently running this IRQ, or it's disabled,
497 * we shouldn't process the IRQ. Mark it pending, handle
498 * the necessary masking and go out
499 */
500 if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
501 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
502 if (!irq_check_poll(desc)) {
503 desc->istate |= IRQS_PENDING;
504 goto out_eoi;
505 }
506 }
507 kstat_incr_irqs_this_cpu(irq, desc);
508
509 do {
510 if (unlikely(!desc->action))
511 goto out_eoi;
512
513 handle_irq_event(desc);
514
515 } while ((desc->istate & IRQS_PENDING) &&
516 !irqd_irq_disabled(&desc->irq_data));
517
518out_eoi:
519 chip->irq_eoi(&desc->irq_data);
520 raw_spin_unlock(&desc->lock);
521}
522#endif
523
667/** 524/**
668 * handle_percpu_irq - Per CPU local irq handler 525 * handle_percpu_irq - Per CPU local irq handler
669 * @irq: the interrupt number 526 * @irq: the interrupt number
@@ -674,103 +531,147 @@ out_unlock:
674void 531void
675handle_percpu_irq(unsigned int irq, struct irq_desc *desc) 532handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
676{ 533{
677 irqreturn_t action_ret; 534 struct irq_chip *chip = irq_desc_get_chip(desc);
678 535
679 kstat_incr_irqs_this_cpu(irq, desc); 536 kstat_incr_irqs_this_cpu(irq, desc);
680 537
681 if (desc->irq_data.chip->irq_ack) 538 if (chip->irq_ack)
682 desc->irq_data.chip->irq_ack(&desc->irq_data); 539 chip->irq_ack(&desc->irq_data);
683 540
684 action_ret = handle_IRQ_event(irq, desc->action); 541 handle_irq_event_percpu(desc, desc->action);
685 if (!noirqdebug)
686 note_interrupt(irq, desc, action_ret);
687 542
688 if (desc->irq_data.chip->irq_eoi) 543 if (chip->irq_eoi)
689 desc->irq_data.chip->irq_eoi(&desc->irq_data); 544 chip->irq_eoi(&desc->irq_data);
690} 545}
691 546
692void 547void
693__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 548__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
694 const char *name) 549 const char *name)
695{ 550{
696 struct irq_desc *desc = irq_to_desc(irq);
697 unsigned long flags; 551 unsigned long flags;
552 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
698 553
699 if (!desc) { 554 if (!desc)
700 printk(KERN_ERR
701 "Trying to install type control for IRQ%d\n", irq);
702 return; 555 return;
703 }
704 556
705 if (!handle) 557 if (!handle) {
706 handle = handle_bad_irq; 558 handle = handle_bad_irq;
707 else if (desc->irq_data.chip == &no_irq_chip) { 559 } else {
708 printk(KERN_WARNING "Trying to install %sinterrupt handler " 560 if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
709 "for IRQ%d\n", is_chained ? "chained " : "", irq); 561 goto out;
710 /*
711 * Some ARM implementations install a handler for really dumb
712 * interrupt hardware without setting an irq_chip. This worked
713 * with the ARM no_irq_chip but the check in setup_irq would
714 * prevent us to setup the interrupt at all. Switch it to
715 * dummy_irq_chip for easy transition.
716 */
717 desc->irq_data.chip = &dummy_irq_chip;
718 } 562 }
719 563
720 chip_bus_lock(desc);
721 raw_spin_lock_irqsave(&desc->lock, flags);
722
723 /* Uninstall? */ 564 /* Uninstall? */
724 if (handle == handle_bad_irq) { 565 if (handle == handle_bad_irq) {
725 if (desc->irq_data.chip != &no_irq_chip) 566 if (desc->irq_data.chip != &no_irq_chip)
726 mask_ack_irq(desc); 567 mask_ack_irq(desc);
727 desc->status |= IRQ_DISABLED; 568 irq_state_set_disabled(desc);
728 desc->depth = 1; 569 desc->depth = 1;
729 } 570 }
730 desc->handle_irq = handle; 571 desc->handle_irq = handle;
731 desc->name = name; 572 desc->name = name;
732 573
733 if (handle != handle_bad_irq && is_chained) { 574 if (handle != handle_bad_irq && is_chained) {
734 desc->status &= ~IRQ_DISABLED; 575 irq_settings_set_noprobe(desc);
735 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; 576 irq_settings_set_norequest(desc);
736 desc->depth = 0; 577 irq_settings_set_nothread(desc);
737 desc->irq_data.chip->irq_startup(&desc->irq_data); 578 irq_startup(desc);
738 } 579 }
739 raw_spin_unlock_irqrestore(&desc->lock, flags); 580out:
740 chip_bus_sync_unlock(desc); 581 irq_put_desc_busunlock(desc, flags);
741}
742EXPORT_SYMBOL_GPL(__set_irq_handler);
743
744void
745set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
746 irq_flow_handler_t handle)
747{
748 set_irq_chip(irq, chip);
749 __set_irq_handler(irq, handle, 0, NULL);
750} 582}
583EXPORT_SYMBOL_GPL(__irq_set_handler);
751 584
752void 585void
753set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, 586irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
754 irq_flow_handler_t handle, const char *name) 587 irq_flow_handler_t handle, const char *name)
755{ 588{
756 set_irq_chip(irq, chip); 589 irq_set_chip(irq, chip);
757 __set_irq_handler(irq, handle, 0, name); 590 __irq_set_handler(irq, handle, 0, name);
758} 591}
759 592
760void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) 593void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
761{ 594{
762 struct irq_desc *desc = irq_to_desc(irq);
763 unsigned long flags; 595 unsigned long flags;
596 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
764 597
765 if (!desc) 598 if (!desc)
766 return; 599 return;
600 irq_settings_clr_and_set(desc, clr, set);
601
602 irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
603 IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
604 if (irq_settings_has_no_balance_set(desc))
605 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
606 if (irq_settings_is_per_cpu(desc))
607 irqd_set(&desc->irq_data, IRQD_PER_CPU);
608 if (irq_settings_can_move_pcntxt(desc))
609 irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
610 if (irq_settings_is_level(desc))
611 irqd_set(&desc->irq_data, IRQD_LEVEL);
612
613 irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
614
615 irq_put_desc_unlock(desc, flags);
616}
617EXPORT_SYMBOL_GPL(irq_modify_status);
618
619/**
620 * irq_cpu_online - Invoke all irq_cpu_online functions.
621 *
622 * Iterate through all irqs and invoke the chip.irq_cpu_online()
623 * for each.
624 */
625void irq_cpu_online(void)
626{
627 struct irq_desc *desc;
628 struct irq_chip *chip;
629 unsigned long flags;
630 unsigned int irq;
631
632 for_each_active_irq(irq) {
633 desc = irq_to_desc(irq);
634 if (!desc)
635 continue;
767 636
768 /* Sanitize flags */ 637 raw_spin_lock_irqsave(&desc->lock, flags);
769 set &= IRQF_MODIFY_MASK;
770 clr &= IRQF_MODIFY_MASK;
771 638
772 raw_spin_lock_irqsave(&desc->lock, flags); 639 chip = irq_data_get_irq_chip(&desc->irq_data);
773 desc->status &= ~clr; 640 if (chip && chip->irq_cpu_online &&
774 desc->status |= set; 641 (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
775 raw_spin_unlock_irqrestore(&desc->lock, flags); 642 !irqd_irq_disabled(&desc->irq_data)))
643 chip->irq_cpu_online(&desc->irq_data);
644
645 raw_spin_unlock_irqrestore(&desc->lock, flags);
646 }
647}
648
649/**
650 * irq_cpu_offline - Invoke all irq_cpu_offline functions.
651 *
652 * Iterate through all irqs and invoke the chip.irq_cpu_offline()
653 * for each.
654 */
655void irq_cpu_offline(void)
656{
657 struct irq_desc *desc;
658 struct irq_chip *chip;
659 unsigned long flags;
660 unsigned int irq;
661
662 for_each_active_irq(irq) {
663 desc = irq_to_desc(irq);
664 if (!desc)
665 continue;
666
667 raw_spin_lock_irqsave(&desc->lock, flags);
668
669 chip = irq_data_get_irq_chip(&desc->irq_data);
670 if (chip && chip->irq_cpu_offline &&
671 (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
672 !irqd_irq_disabled(&desc->irq_data)))
673 chip->irq_cpu_offline(&desc->irq_data);
674
675 raw_spin_unlock_irqrestore(&desc->lock, flags);
676 }
776} 677}
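
Taken together, the renames above replace the set_irq_* setters with irq_set_* variants that take the descriptor lock via irq_get_desc_lock()/irq_get_desc_buslock(). Below is a minimal, hypothetical irqchip-driver sketch using only functions visible in this hunk; demo_chip and demo_map_irq are invented for illustration and the real chip callbacks are omitted.

#include <linux/irq.h>

static struct irq_chip demo_chip;	/* callbacks would be filled in here */

static void demo_map_irq(unsigned int irq, void *chip_priv)
{
	irq_set_chip_data(irq, chip_priv);
	irq_set_chip_and_handler_name(irq, &demo_chip,
				      handle_level_irq, "level");
	irq_set_irq_type(irq, IRQ_TYPE_LEVEL_HIGH);

	/* Clear IRQ_NOREQUEST so drivers may request_irq() this line. */
	irq_modify_status(irq, IRQ_NOREQUEST, 0);
}
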
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
new file mode 100644
index 000000000000..97a8bfadc88a
--- /dev/null
+++ b/kernel/irq/debug.h
@@ -0,0 +1,45 @@
1/*
2 * Debugging printout:
3 */
4
5#include <linux/kallsyms.h>
6
7#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
8#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
9/* FIXME */
10#define PD(f) do { } while (0)
11
12static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
13{
14 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
15 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
16 printk("->handle_irq(): %p, ", desc->handle_irq);
17 print_symbol("%s\n", (unsigned long)desc->handle_irq);
18 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
19 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
20 printk("->action(): %p\n", desc->action);
21 if (desc->action) {
22 printk("->action->handler(): %p, ", desc->action->handler);
23 print_symbol("%s\n", (unsigned long)desc->action->handler);
24 }
25
26 P(IRQ_LEVEL);
27 P(IRQ_PER_CPU);
28 P(IRQ_NOPROBE);
29 P(IRQ_NOREQUEST);
30 P(IRQ_NOTHREAD);
31 P(IRQ_NOAUTOEN);
32
33 PS(IRQS_AUTODETECT);
34 PS(IRQS_REPLAY);
35 PS(IRQS_WAITING);
36 PS(IRQS_PENDING);
37
38 PD(IRQS_INPROGRESS);
39 PD(IRQS_DISABLED);
40 PD(IRQS_MASKED);
41}
42
43#undef P
44#undef PS
45#undef PD
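
debug.h is a private helper header, so print_irq_desc() is only reachable from other files in kernel/irq/, which (assuming they follow the in-tree pattern) pull in internals.h first for the IRQS_* bits it tests. A hedged sketch of how the dump might be wired up while chasing a misbehaving line; demo_dump_irq is invented.

#include <linux/irq.h>

#include "internals.h"
#include "debug.h"

static void demo_dump_irq(unsigned int irq)
{
	struct irq_desc *desc = irq_to_desc(irq);

	if (desc)
		print_irq_desc(irq, desc);
}
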
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 20dc5474947e..b5fcd96c7102 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -31,13 +31,6 @@ static unsigned int noop_ret(struct irq_data *data)
31 return 0; 31 return 0;
32} 32}
33 33
34#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
35static void compat_noop(unsigned int irq) { }
36#define END_INIT .end = compat_noop
37#else
38#define END_INIT
39#endif
40
41/* 34/*
42 * Generic no controller implementation 35 * Generic no controller implementation
43 */ 36 */
@@ -48,7 +41,6 @@ struct irq_chip no_irq_chip = {
48 .irq_enable = noop, 41 .irq_enable = noop,
49 .irq_disable = noop, 42 .irq_disable = noop,
50 .irq_ack = ack_bad, 43 .irq_ack = ack_bad,
51 END_INIT
52}; 44};
53 45
54/* 46/*
@@ -64,5 +56,4 @@ struct irq_chip dummy_irq_chip = {
64 .irq_ack = noop, 56 .irq_ack = noop,
65 .irq_mask = noop, 57 .irq_mask = noop,
66 .irq_unmask = noop, 58 .irq_unmask = noop,
67 END_INIT
68}; 59};
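
With the compat .end machinery gone, dummy_irq_chip is just a set of no-op irq_* callbacks. The chip.c hunk above also drops the automatic fallback to it, so a platform that wants a line parked on the dummy chip now has to say so explicitly. A hedged sketch, assuming dummy_irq_chip is still declared in <linux/irq.h>; demo_irq is an invented parameter.

#include <linux/irq.h>

static void demo_attach_dummy_chip(unsigned int demo_irq)
{
	/* No-op chip plus the simple flow handler exported above. */
	irq_set_chip_and_handler_name(demo_irq, &dummy_irq_chip,
				      handle_simple_irq, "dummy");
}
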
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
new file mode 100644
index 000000000000..31a9db711906
--- /dev/null
+++ b/kernel/irq/generic-chip.c
@@ -0,0 +1,354 @@
1/*
2 * Library implementing the most common irq chip callback functions
3 *
4 * Copyright (C) 2011, Thomas Gleixner
5 */
6#include <linux/io.h>
7#include <linux/irq.h>
8#include <linux/slab.h>
9#include <linux/interrupt.h>
10#include <linux/kernel_stat.h>
11#include <linux/syscore_ops.h>
12
13#include "internals.h"
14
15static LIST_HEAD(gc_list);
16static DEFINE_RAW_SPINLOCK(gc_lock);
17
18static inline struct irq_chip_regs *cur_regs(struct irq_data *d)
19{
20 return &container_of(d->chip, struct irq_chip_type, chip)->regs;
21}
22
23/**
24 * irq_gc_noop - NOOP function
25 * @d: irq_data
26 */
27void irq_gc_noop(struct irq_data *d)
28{
29}
30
31/**
32 * irq_gc_mask_disable_reg - Mask chip via disable register
33 * @d: irq_data
34 *
35 * Chip has separate enable/disable registers instead of a single mask
36 * register.
37 */
38void irq_gc_mask_disable_reg(struct irq_data *d)
39{
40 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
41 u32 mask = 1 << (d->irq - gc->irq_base);
42
43 irq_gc_lock(gc);
44 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable);
45 gc->mask_cache &= ~mask;
46 irq_gc_unlock(gc);
47}
48
49/**
50 * irq_gc_mask_set_bit - Mask chip via setting bit in mask register
51 * @d: irq_data
52 *
53 * Chip has a single mask register. Values of this register are cached
54 * and protected by gc->lock
55 */
56void irq_gc_mask_set_bit(struct irq_data *d)
57{
58 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
59 u32 mask = 1 << (d->irq - gc->irq_base);
60
61 irq_gc_lock(gc);
62 gc->mask_cache |= mask;
63 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
64 irq_gc_unlock(gc);
65}
66
67/**
68 * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register
69 * @d: irq_data
70 *
71 * Chip has a single mask register. Values of this register are cached
72 * and protected by gc->lock
73 */
74void irq_gc_mask_clr_bit(struct irq_data *d)
75{
76 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
77 u32 mask = 1 << (d->irq - gc->irq_base);
78
79 irq_gc_lock(gc);
80 gc->mask_cache &= ~mask;
81 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
82 irq_gc_unlock(gc);
83}
84
85/**
86 * irq_gc_unmask_enable_reg - Unmask chip via enable register
87 * @d: irq_data
88 *
89 * Chip has separate enable/disable registers instead of a single mask
90 * register.
91 */
92void irq_gc_unmask_enable_reg(struct irq_data *d)
93{
94 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
95 u32 mask = 1 << (d->irq - gc->irq_base);
96
97 irq_gc_lock(gc);
98 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable);
99 gc->mask_cache |= mask;
100 irq_gc_unlock(gc);
101}
102
103/**
104 * irq_gc_ack - Ack pending interrupt
105 * @d: irq_data
106 */
107void irq_gc_ack(struct irq_data *d)
108{
109 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
110 u32 mask = 1 << (d->irq - gc->irq_base);
111
112 irq_gc_lock(gc);
113 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
114 irq_gc_unlock(gc);
115}
116
117/**
 118 * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt
119 * @d: irq_data
120 */
121void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
122{
123 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
124 u32 mask = 1 << (d->irq - gc->irq_base);
125
126 irq_gc_lock(gc);
127 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask);
128 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
129 irq_gc_unlock(gc);
130}
131
132/**
133 * irq_gc_eoi - EOI interrupt
134 * @d: irq_data
135 */
136void irq_gc_eoi(struct irq_data *d)
137{
138 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
139 u32 mask = 1 << (d->irq - gc->irq_base);
140
141 irq_gc_lock(gc);
142 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi);
143 irq_gc_unlock(gc);
144}
145
146/**
147 * irq_gc_set_wake - Set/clr wake bit for an interrupt
148 * @d: irq_data
149 *
150 * For chips where the wake from suspend functionality is not
151 * configured in a separate register and the wakeup active state is
152 * just stored in a bitmask.
153 */
154int irq_gc_set_wake(struct irq_data *d, unsigned int on)
155{
156 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
157 u32 mask = 1 << (d->irq - gc->irq_base);
158
159 if (!(mask & gc->wake_enabled))
160 return -EINVAL;
161
162 irq_gc_lock(gc);
163 if (on)
164 gc->wake_active |= mask;
165 else
166 gc->wake_active &= ~mask;
167 irq_gc_unlock(gc);
168 return 0;
169}
170
171/**
172 * irq_alloc_generic_chip - Allocate a generic chip and initialize it
173 * @name: Name of the irq chip
 174 * @num_ct: Number of irq_chip_type instances associated with this chip
175 * @irq_base: Interrupt base nr for this chip
176 * @reg_base: Register base address (virtual)
177 * @handler: Default flow handler associated with this chip
178 *
179 * Returns an initialized irq_chip_generic structure. The chip defaults
180 * to the primary (index 0) irq_chip_type and @handler
181 */
182struct irq_chip_generic *
183irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
184 void __iomem *reg_base, irq_flow_handler_t handler)
185{
186 struct irq_chip_generic *gc;
187 unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
188
189 gc = kzalloc(sz, GFP_KERNEL);
190 if (gc) {
191 raw_spin_lock_init(&gc->lock);
192 gc->num_ct = num_ct;
193 gc->irq_base = irq_base;
194 gc->reg_base = reg_base;
195 gc->chip_types->chip.name = name;
196 gc->chip_types->handler = handler;
197 }
198 return gc;
199}
200
201/*
202 * Separate lockdep class for interrupt chip which can nest irq_desc
203 * lock.
204 */
205static struct lock_class_key irq_nested_lock_class;
206
207/**
208 * irq_setup_generic_chip - Setup a range of interrupts with a generic chip
209 * @gc: Generic irq chip holding all data
210 * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
211 * @flags: Flags for initialization
212 * @clr: IRQ_* bits to clear
213 * @set: IRQ_* bits to set
214 *
215 * Set up max. 32 interrupts starting from gc->irq_base. Note, this
216 * initializes all interrupts to the primary irq_chip_type and its
217 * associated handler.
218 */
219void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
220 enum irq_gc_flags flags, unsigned int clr,
221 unsigned int set)
222{
223 struct irq_chip_type *ct = gc->chip_types;
224 unsigned int i;
225
226 raw_spin_lock(&gc_lock);
227 list_add_tail(&gc->list, &gc_list);
228 raw_spin_unlock(&gc_lock);
229
230 /* Init mask cache ? */
231 if (flags & IRQ_GC_INIT_MASK_CACHE)
232 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
233
234 for (i = gc->irq_base; msk; msk >>= 1, i++) {
 235 if (!(msk & 0x01))
236 continue;
237
238 if (flags & IRQ_GC_INIT_NESTED_LOCK)
239 irq_set_lockdep_class(i, &irq_nested_lock_class);
240
241 irq_set_chip_and_handler(i, &ct->chip, ct->handler);
242 irq_set_chip_data(i, gc);
243 irq_modify_status(i, clr, set);
244 }
245 gc->irq_cnt = i - gc->irq_base;
246}
247
248/**
249 * irq_setup_alt_chip - Switch to alternative chip
250 * @d: irq_data for this interrupt
 251 * @type: Flow type to be initialized
252 *
253 * Only to be called from chip->irq_set_type() callbacks.
254 */
255int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
256{
257 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
258 struct irq_chip_type *ct = gc->chip_types;
259 unsigned int i;
260
261 for (i = 0; i < gc->num_ct; i++, ct++) {
262 if (ct->type & type) {
263 d->chip = &ct->chip;
264 irq_data_to_desc(d)->handle_irq = ct->handler;
265 return 0;
266 }
267 }
268 return -EINVAL;
269}
270
271/**
272 * irq_remove_generic_chip - Remove a chip
273 * @gc: Generic irq chip holding all data
 274 * @msk: Bitmask holding the irqs to remove relative to gc->irq_base
275 * @clr: IRQ_* bits to clear
276 * @set: IRQ_* bits to set
277 *
278 * Remove up to 32 interrupts starting from gc->irq_base.
279 */
280void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
281 unsigned int clr, unsigned int set)
282{
283 unsigned int i = gc->irq_base;
284
285 raw_spin_lock(&gc_lock);
286 list_del(&gc->list);
287 raw_spin_unlock(&gc_lock);
288
289 for (; msk; msk >>= 1, i++) {
 290 if (!(msk & 0x01))
291 continue;
292
293 /* Remove handler first. That will mask the irq line */
294 irq_set_handler(i, NULL);
295 irq_set_chip(i, &no_irq_chip);
296 irq_set_chip_data(i, NULL);
297 irq_modify_status(i, clr, set);
298 }
299}
300
301#ifdef CONFIG_PM
302static int irq_gc_suspend(void)
303{
304 struct irq_chip_generic *gc;
305
306 list_for_each_entry(gc, &gc_list, list) {
307 struct irq_chip_type *ct = gc->chip_types;
308
309 if (ct->chip.irq_suspend)
310 ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base));
311 }
312 return 0;
313}
314
315static void irq_gc_resume(void)
316{
317 struct irq_chip_generic *gc;
318
319 list_for_each_entry(gc, &gc_list, list) {
320 struct irq_chip_type *ct = gc->chip_types;
321
322 if (ct->chip.irq_resume)
323 ct->chip.irq_resume(irq_get_irq_data(gc->irq_base));
324 }
325}
326#else
327#define irq_gc_suspend NULL
328#define irq_gc_resume NULL
329#endif
330
331static void irq_gc_shutdown(void)
332{
333 struct irq_chip_generic *gc;
334
335 list_for_each_entry(gc, &gc_list, list) {
336 struct irq_chip_type *ct = gc->chip_types;
337
338 if (ct->chip.irq_pm_shutdown)
339 ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base));
340 }
341}
342
343static struct syscore_ops irq_gc_syscore_ops = {
344 .suspend = irq_gc_suspend,
345 .resume = irq_gc_resume,
346 .shutdown = irq_gc_shutdown,
347};
348
349static int __init irq_gc_init_ops(void)
350{
351 register_syscore_ops(&irq_gc_syscore_ops);
352 return 0;
353}
354device_initcall(irq_gc_init_ops);
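
The library above is meant to be consumed by irqchip/platform setup code rather than by ordinary drivers. A minimal, hypothetical consumer — the chip name, register offsets and the 32-interrupt range below are invented placeholders, while the helpers are exactly the ones defined in this new file — would look roughly like this:

#include <linux/init.h>
#include <linux/irq.h>

/* Hypothetical controller with a single mask register and an ack register. */
static void __init foo_intc_init(void __iomem *reg_base, unsigned int irq_base)
{
        struct irq_chip_generic *gc;
        struct irq_chip_type *ct;

        gc = irq_alloc_generic_chip("FOO-INTC", 1, irq_base, reg_base,
                                    handle_level_irq);
        if (!gc)
                return;

        ct = gc->chip_types;
        /* Single mask register: setting a bit masks the line */
        ct->chip.irq_mask   = irq_gc_mask_set_bit;
        ct->chip.irq_unmask = irq_gc_mask_clr_bit;
        ct->chip.irq_ack    = irq_gc_ack;
        ct->regs.mask = 0x10;           /* assumed register offsets */
        ct->regs.ack  = 0x14;

        /* Initialize 32 interrupts and prime the mask cache from hardware */
        irq_setup_generic_chip(gc, 0xffffffff, IRQ_GC_INIT_MASK_CACHE,
                               IRQ_NOREQUEST, 0);
}

Mask, unmask and ack then go through the cached-register helpers above without any per-controller irq_chip boilerplate.
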
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3540a7190122..90cb55f6d7eb 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -51,30 +51,92 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
51 "but no thread function available.", irq, action->name); 51 "but no thread function available.", irq, action->name);
52} 52}
53 53
54/** 54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
55 * handle_IRQ_event - irq action chain handler 55{
56 * @irq: the interrupt number 56 /*
57 * @action: the interrupt action chain for this irq 57 * Wake up the handler thread for this action. In case the
58 * 58 * thread crashed and was killed we just pretend that we
59 * Handles the action chain of an irq event 59 * handled the interrupt. The hardirq handler has disabled the
60 */ 60 * device interrupt, so no irq storm is lurking. If the
61irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) 61 * RUNTHREAD bit is already set, nothing to do.
62 */
63 if (test_bit(IRQTF_DIED, &action->thread_flags) ||
64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
65 return;
66
67 /*
68 * It's safe to OR the mask lockless here. We have only two
69 * places which write to threads_oneshot: This code and the
70 * irq thread.
71 *
72 * This code is the hard irq context and can never run on two
73 * cpus in parallel. If it ever does we have more serious
74 * problems than this bitmask.
75 *
76 * The irq threads of this irq which clear their "running" bit
77 * in threads_oneshot are serialized via desc->lock against
78 * each other and they are serialized against this code by
79 * IRQS_INPROGRESS.
80 *
81 * Hard irq handler:
82 *
83 * spin_lock(desc->lock);
84 * desc->state |= IRQS_INPROGRESS;
85 * spin_unlock(desc->lock);
86 * set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
87 * desc->threads_oneshot |= mask;
88 * spin_lock(desc->lock);
89 * desc->state &= ~IRQS_INPROGRESS;
90 * spin_unlock(desc->lock);
91 *
92 * irq thread:
93 *
94 * again:
95 * spin_lock(desc->lock);
96 * if (desc->state & IRQS_INPROGRESS) {
97 * spin_unlock(desc->lock);
98 * while(desc->state & IRQS_INPROGRESS)
99 * cpu_relax();
100 * goto again;
101 * }
102 * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
103 * desc->threads_oneshot &= ~mask;
104 * spin_unlock(desc->lock);
105 *
106 * So either the thread waits for us to clear IRQS_INPROGRESS
107 * or we are waiting in the flow handler for desc->lock to be
108 * released before we reach this point. The thread also checks
109 * IRQTF_RUNTHREAD under desc->lock. If set it leaves
110 * threads_oneshot untouched and runs the thread another time.
111 */
112 desc->threads_oneshot |= action->thread_mask;
113 wake_up_process(action->thread);
114}
115
116irqreturn_t
117handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
62{ 118{
63 irqreturn_t ret, retval = IRQ_NONE; 119 irqreturn_t retval = IRQ_NONE;
64 unsigned int status = 0; 120 unsigned int random = 0, irq = desc->irq_data.irq;
65 121
66 do { 122 do {
123 irqreturn_t res;
124
67 trace_irq_handler_entry(irq, action); 125 trace_irq_handler_entry(irq, action);
68 ret = action->handler(irq, action->dev_id); 126 res = action->handler(irq, action->dev_id);
69 trace_irq_handler_exit(irq, action, ret); 127 trace_irq_handler_exit(irq, action, res);
128
129 if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
130 irq, action->handler))
131 local_irq_disable();
70 132
71 switch (ret) { 133 switch (res) {
72 case IRQ_WAKE_THREAD: 134 case IRQ_WAKE_THREAD:
73 /* 135 /*
74 * Set result to handled so the spurious check 136 * Set result to handled so the spurious check
75 * does not trigger. 137 * does not trigger.
76 */ 138 */
77 ret = IRQ_HANDLED; 139 res = IRQ_HANDLED;
78 140
79 /* 141 /*
80 * Catch drivers which return WAKE_THREAD but 142 * Catch drivers which return WAKE_THREAD but
@@ -85,36 +147,41 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
85 break; 147 break;
86 } 148 }
87 149
88 /* 150 irq_wake_thread(desc, action);
89 * Wake up the handler thread for this
90 * action. In case the thread crashed and was
91 * killed we just pretend that we handled the
92 * interrupt. The hardirq handler above has
93 * disabled the device interrupt, so no irq
94 * storm is lurking.
95 */
96 if (likely(!test_bit(IRQTF_DIED,
97 &action->thread_flags))) {
98 set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
99 wake_up_process(action->thread);
100 }
101 151
102 /* Fall through to add to randomness */ 152 /* Fall through to add to randomness */
103 case IRQ_HANDLED: 153 case IRQ_HANDLED:
104 status |= action->flags; 154 random |= action->flags;
105 break; 155 break;
106 156
107 default: 157 default:
108 break; 158 break;
109 } 159 }
110 160
111 retval |= ret; 161 retval |= res;
112 action = action->next; 162 action = action->next;
113 } while (action); 163 } while (action);
114 164
115 if (status & IRQF_SAMPLE_RANDOM) 165 if (random & IRQF_SAMPLE_RANDOM)
116 add_interrupt_randomness(irq); 166 add_interrupt_randomness(irq);
117 local_irq_disable();
118 167
168 if (!noirqdebug)
169 note_interrupt(irq, desc, retval);
119 return retval; 170 return retval;
120} 171}
172
173irqreturn_t handle_irq_event(struct irq_desc *desc)
174{
175 struct irqaction *action = desc->action;
176 irqreturn_t ret;
177
178 desc->istate &= ~IRQS_PENDING;
179 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
180 raw_spin_unlock(&desc->lock);
181
182 ret = handle_irq_event_percpu(desc, action);
183
184 raw_spin_lock(&desc->lock);
185 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
186 return ret;
187}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 4571ae7e085a..6546431447d7 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,27 +1,87 @@
1/* 1/*
2 * IRQ subsystem internal functions and variables: 2 * IRQ subsystem internal functions and variables:
3 *
4 * Do not ever include this file from anything else than
5 * kernel/irq/. Do not even think about using any information outside
6 * of this file for your non core code.
3 */ 7 */
4#include <linux/irqdesc.h> 8#include <linux/irqdesc.h>
5 9
10#ifdef CONFIG_SPARSE_IRQ
11# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
12#else
13# define IRQ_BITMAP_BITS NR_IRQS
14#endif
15
16#define istate core_internal_state__do_not_mess_with_it
17
6extern int noirqdebug; 18extern int noirqdebug;
7 19
8#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) 20/*
21 * Bits used by threaded handlers:
22 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
23 * IRQTF_DIED - handler thread died
24 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
25 * IRQTF_AFFINITY - irq thread is requested to adjust affinity
26 * IRQTF_FORCED_THREAD - irq action is force threaded
27 */
28enum {
29 IRQTF_RUNTHREAD,
30 IRQTF_DIED,
31 IRQTF_WARNED,
32 IRQTF_AFFINITY,
33 IRQTF_FORCED_THREAD,
34};
9 35
10/* Set default functions for irq_chip structures: */ 36/*
11extern void irq_chip_set_defaults(struct irq_chip *chip); 37 * Bit masks for desc->state
38 *
39 * IRQS_AUTODETECT - autodetection in progress
40 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
41 * detection
42 * IRQS_POLL_INPROGRESS - polling in progress
43 * IRQS_ONESHOT - irq is not unmasked in primary handler
44 * IRQS_REPLAY - irq is replayed
45 * IRQS_WAITING - irq is waiting
46 * IRQS_PENDING - irq is pending and replayed later
47 * IRQS_SUSPENDED - irq is suspended
48 */
49enum {
50 IRQS_AUTODETECT = 0x00000001,
51 IRQS_SPURIOUS_DISABLED = 0x00000002,
52 IRQS_POLL_INPROGRESS = 0x00000008,
53 IRQS_ONESHOT = 0x00000020,
54 IRQS_REPLAY = 0x00000040,
55 IRQS_WAITING = 0x00000080,
56 IRQS_PENDING = 0x00000200,
57 IRQS_SUSPENDED = 0x00000800,
58};
59
60#include "debug.h"
61#include "settings.h"
12 62
13/* Set default handler: */ 63#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
14extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
15 64
16extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 65extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
17 unsigned long flags); 66 unsigned long flags);
18extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 67extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
19extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 68extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
20 69
70extern int irq_startup(struct irq_desc *desc);
71extern void irq_shutdown(struct irq_desc *desc);
72extern void irq_enable(struct irq_desc *desc);
73extern void irq_disable(struct irq_desc *desc);
74extern void mask_irq(struct irq_desc *desc);
75extern void unmask_irq(struct irq_desc *desc);
76
21extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 77extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
22 78
79irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
80irqreturn_t handle_irq_event(struct irq_desc *desc);
81
23/* Resending of interrupts :*/ 82/* Resending of interrupts :*/
24void check_irq_resend(struct irq_desc *desc, unsigned int irq); 83void check_irq_resend(struct irq_desc *desc, unsigned int irq);
84bool irq_wait_for_poll(struct irq_desc *desc);
25 85
26#ifdef CONFIG_PROC_FS 86#ifdef CONFIG_PROC_FS
27extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 87extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
@@ -37,20 +97,10 @@ static inline void unregister_handler_proc(unsigned int irq,
37 struct irqaction *action) { } 97 struct irqaction *action) { }
38#endif 98#endif
39 99
40extern int irq_select_affinity_usr(unsigned int irq); 100extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
41 101
42extern void irq_set_thread_affinity(struct irq_desc *desc); 102extern void irq_set_thread_affinity(struct irq_desc *desc);
43 103
44#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
45static inline void irq_end(unsigned int irq, struct irq_desc *desc)
46{
47 if (desc->irq_data.chip && desc->irq_data.chip->end)
48 desc->irq_data.chip->end(irq);
49}
50#else
51static inline void irq_end(unsigned int irq, struct irq_desc *desc) { }
52#endif
53
54/* Inline functions for support of irq chips on slow busses */ 104/* Inline functions for support of irq chips on slow busses */
55static inline void chip_bus_lock(struct irq_desc *desc) 105static inline void chip_bus_lock(struct irq_desc *desc)
56{ 106{
@@ -64,43 +114,58 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
64 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); 114 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
65} 115}
66 116
117struct irq_desc *
118__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus);
119void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
120
121static inline struct irq_desc *
122irq_get_desc_buslock(unsigned int irq, unsigned long *flags)
123{
124 return __irq_get_desc_lock(irq, flags, true);
125}
126
127static inline void
128irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
129{
130 __irq_put_desc_unlock(desc, flags, true);
131}
132
133static inline struct irq_desc *
134irq_get_desc_lock(unsigned int irq, unsigned long *flags)
135{
136 return __irq_get_desc_lock(irq, flags, false);
137}
138
139static inline void
140irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
141{
142 __irq_put_desc_unlock(desc, flags, false);
143}
144
67/* 145/*
68 * Debugging printout: 146 * Manipulation functions for irq_data.state
69 */ 147 */
148static inline void irqd_set_move_pending(struct irq_data *d)
149{
150 d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
151}
70 152
71#include <linux/kallsyms.h> 153static inline void irqd_clr_move_pending(struct irq_data *d)
72 154{
73#define P(f) if (desc->status & f) printk("%14s set\n", #f) 155 d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
156}
74 157
75static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) 158static inline void irqd_clear(struct irq_data *d, unsigned int mask)
76{ 159{
77 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", 160 d->state_use_accessors &= ~mask;
78 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
79 printk("->handle_irq(): %p, ", desc->handle_irq);
80 print_symbol("%s\n", (unsigned long)desc->handle_irq);
81 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
82 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
83 printk("->action(): %p\n", desc->action);
84 if (desc->action) {
85 printk("->action->handler(): %p, ", desc->action->handler);
86 print_symbol("%s\n", (unsigned long)desc->action->handler);
87 }
88
89 P(IRQ_INPROGRESS);
90 P(IRQ_DISABLED);
91 P(IRQ_PENDING);
92 P(IRQ_REPLAY);
93 P(IRQ_AUTODETECT);
94 P(IRQ_WAITING);
95 P(IRQ_LEVEL);
96 P(IRQ_MASKED);
97#ifdef CONFIG_IRQ_PER_CPU
98 P(IRQ_PER_CPU);
99#endif
100 P(IRQ_NOPROBE);
101 P(IRQ_NOREQUEST);
102 P(IRQ_NOAUTOEN);
103} 161}
104 162
105#undef P 163static inline void irqd_set(struct irq_data *d, unsigned int mask)
164{
165 d->state_use_accessors |= mask;
166}
106 167
168static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
169{
170 return d->state_use_accessors & mask;
171}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 282f20230e67..886e80347b32 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -22,7 +22,7 @@
22 */ 22 */
23static struct lock_class_key irq_desc_lock_class; 23static struct lock_class_key irq_desc_lock_class;
24 24
25#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) 25#if defined(CONFIG_SMP)
26static void __init init_irq_default_affinity(void) 26static void __init init_irq_default_affinity(void)
27{ 27{
28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); 28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
@@ -79,7 +79,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
79 desc->irq_data.chip_data = NULL; 79 desc->irq_data.chip_data = NULL;
80 desc->irq_data.handler_data = NULL; 80 desc->irq_data.handler_data = NULL;
81 desc->irq_data.msi_desc = NULL; 81 desc->irq_data.msi_desc = NULL;
82 desc->status = IRQ_DEFAULT_INIT_FLAGS; 82 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
83 irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
83 desc->handle_irq = handle_bad_irq; 84 desc->handle_irq = handle_bad_irq;
84 desc->depth = 1; 85 desc->depth = 1;
85 desc->irq_count = 0; 86 desc->irq_count = 0;
@@ -94,7 +95,7 @@ int nr_irqs = NR_IRQS;
94EXPORT_SYMBOL_GPL(nr_irqs); 95EXPORT_SYMBOL_GPL(nr_irqs);
95 96
96static DEFINE_MUTEX(sparse_irq_lock); 97static DEFINE_MUTEX(sparse_irq_lock);
97static DECLARE_BITMAP(allocated_irqs, NR_IRQS); 98static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
98 99
99#ifdef CONFIG_SPARSE_IRQ 100#ifdef CONFIG_SPARSE_IRQ
100 101
@@ -197,13 +198,12 @@ err:
197 return -ENOMEM; 198 return -ENOMEM;
198} 199}
199 200
200struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) 201static int irq_expand_nr_irqs(unsigned int nr)
201{ 202{
202 int res = irq_alloc_descs(irq, irq, 1, node); 203 if (nr > IRQ_BITMAP_BITS)
203 204 return -ENOMEM;
204 if (res == -EEXIST || res == irq) 205 nr_irqs = nr;
205 return irq_to_desc(irq); 206 return 0;
206 return NULL;
207} 207}
208 208
209int __init early_irq_init(void) 209int __init early_irq_init(void)
@@ -217,6 +217,15 @@ int __init early_irq_init(void)
217 initcnt = arch_probe_nr_irqs(); 217 initcnt = arch_probe_nr_irqs();
218 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); 218 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
219 219
220 if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
221 nr_irqs = IRQ_BITMAP_BITS;
222
223 if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
224 initcnt = IRQ_BITMAP_BITS;
225
226 if (initcnt > nr_irqs)
227 nr_irqs = initcnt;
228
220 for (i = 0; i < initcnt; i++) { 229 for (i = 0; i < initcnt; i++) {
221 desc = alloc_desc(i, node); 230 desc = alloc_desc(i, node);
222 set_bit(i, allocated_irqs); 231 set_bit(i, allocated_irqs);
@@ -229,7 +238,6 @@ int __init early_irq_init(void)
229 238
230struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { 239struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
231 [0 ... NR_IRQS-1] = { 240 [0 ... NR_IRQS-1] = {
232 .status = IRQ_DEFAULT_INIT_FLAGS,
233 .handle_irq = handle_bad_irq, 241 .handle_irq = handle_bad_irq,
234 .depth = 1, 242 .depth = 1,
235 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), 243 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
@@ -251,8 +259,8 @@ int __init early_irq_init(void)
251 for (i = 0; i < count; i++) { 259 for (i = 0; i < count; i++) {
252 desc[i].irq_data.irq = i; 260 desc[i].irq_data.irq = i;
253 desc[i].irq_data.chip = &no_irq_chip; 261 desc[i].irq_data.chip = &no_irq_chip;
254 /* TODO : do this allocation on-demand ... */
255 desc[i].kstat_irqs = alloc_percpu(unsigned int); 262 desc[i].kstat_irqs = alloc_percpu(unsigned int);
263 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
256 alloc_masks(desc + i, GFP_KERNEL, node); 264 alloc_masks(desc + i, GFP_KERNEL, node);
257 desc_smp_init(desc + i, node); 265 desc_smp_init(desc + i, node);
258 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 266 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -265,11 +273,6 @@ struct irq_desc *irq_to_desc(unsigned int irq)
265 return (irq < NR_IRQS) ? irq_desc + irq : NULL; 273 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
266} 274}
267 275
268struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
269{
270 return irq_to_desc(irq);
271}
272
273static void free_desc(unsigned int irq) 276static void free_desc(unsigned int irq)
274{ 277{
275 dynamic_irq_cleanup(irq); 278 dynamic_irq_cleanup(irq);
@@ -277,26 +280,32 @@ static void free_desc(unsigned int irq)
277 280
278static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 281static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
279{ 282{
280#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
281 struct irq_desc *desc;
282 unsigned int i;
283
284 for (i = 0; i < cnt; i++) {
285 desc = irq_to_desc(start + i);
286 if (desc && !desc->kstat_irqs) {
287 unsigned int __percpu *stats = alloc_percpu(unsigned int);
288
289 if (!stats)
290 return -1;
291 if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
292 free_percpu(stats);
293 }
294 }
295#endif
296 return start; 283 return start;
297} 284}
285
286static int irq_expand_nr_irqs(unsigned int nr)
287{
288 return -ENOMEM;
289}
290
298#endif /* !CONFIG_SPARSE_IRQ */ 291#endif /* !CONFIG_SPARSE_IRQ */
299 292
293/**
294 * generic_handle_irq - Invoke the handler for a particular irq
295 * @irq: The irq number to handle
296 *
297 */
298int generic_handle_irq(unsigned int irq)
299{
300 struct irq_desc *desc = irq_to_desc(irq);
301
302 if (!desc)
303 return -EINVAL;
304 generic_handle_irq_desc(irq, desc);
305 return 0;
306}
307EXPORT_SYMBOL_GPL(generic_handle_irq);
308
300/* Dynamic interrupt handling */ 309/* Dynamic interrupt handling */
301 310
302/** 311/**
@@ -318,6 +327,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
318 bitmap_clear(allocated_irqs, from, cnt); 327 bitmap_clear(allocated_irqs, from, cnt);
319 mutex_unlock(&sparse_irq_lock); 328 mutex_unlock(&sparse_irq_lock);
320} 329}
330EXPORT_SYMBOL_GPL(irq_free_descs);
321 331
322/** 332/**
323 * irq_alloc_descs - allocate and initialize a range of irq descriptors 333 * irq_alloc_descs - allocate and initialize a range of irq descriptors
@@ -338,14 +348,17 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
338 348
339 mutex_lock(&sparse_irq_lock); 349 mutex_lock(&sparse_irq_lock);
340 350
341 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); 351 start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
352 from, cnt, 0);
342 ret = -EEXIST; 353 ret = -EEXIST;
343 if (irq >=0 && start != irq) 354 if (irq >=0 && start != irq)
344 goto err; 355 goto err;
345 356
346 ret = -ENOMEM; 357 if (start + cnt > nr_irqs) {
347 if (start >= nr_irqs) 358 ret = irq_expand_nr_irqs(start + cnt);
348 goto err; 359 if (ret)
360 goto err;
361 }
349 362
350 bitmap_set(allocated_irqs, start, cnt); 363 bitmap_set(allocated_irqs, start, cnt);
351 mutex_unlock(&sparse_irq_lock); 364 mutex_unlock(&sparse_irq_lock);
@@ -355,6 +368,7 @@ err:
355 mutex_unlock(&sparse_irq_lock); 368 mutex_unlock(&sparse_irq_lock);
356 return ret; 369 return ret;
357} 370}
371EXPORT_SYMBOL_GPL(irq_alloc_descs);
358 372
359/** 373/**
360 * irq_reserve_irqs - mark irqs allocated 374 * irq_reserve_irqs - mark irqs allocated
@@ -392,6 +406,26 @@ unsigned int irq_get_next_irq(unsigned int offset)
392 return find_next_bit(allocated_irqs, nr_irqs, offset); 406 return find_next_bit(allocated_irqs, nr_irqs, offset);
393} 407}
394 408
409struct irq_desc *
410__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus)
411{
412 struct irq_desc *desc = irq_to_desc(irq);
413
414 if (desc) {
415 if (bus)
416 chip_bus_lock(desc);
417 raw_spin_lock_irqsave(&desc->lock, *flags);
418 }
419 return desc;
420}
421
422void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
423{
424 raw_spin_unlock_irqrestore(&desc->lock, flags);
425 if (bus)
426 chip_bus_sync_unlock(desc);
427}
428
395/** 429/**
396 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 430 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
397 * @irq: irq number to initialize 431 * @irq: irq number to initialize
@@ -414,7 +448,6 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
414 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; 448 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
415} 449}
416 450
417#ifdef CONFIG_GENERIC_HARDIRQS
418unsigned int kstat_irqs(unsigned int irq) 451unsigned int kstat_irqs(unsigned int irq)
419{ 452{
420 struct irq_desc *desc = irq_to_desc(irq); 453 struct irq_desc *desc = irq_to_desc(irq);
@@ -427,4 +460,3 @@ unsigned int kstat_irqs(unsigned int irq)
427 sum += *per_cpu_ptr(desc->kstat_irqs, cpu); 460 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
428 return sum; 461 return sum;
429} 462}
430#endif /* CONFIG_GENERIC_HARDIRQS */
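
The two new EXPORT_SYMBOL_GPL()s together with generic_handle_irq() are aimed at modular cascaded-interrupt and MSI-style controllers. A hypothetical skeleton — the controller name and the 8-interrupt block are invented — might look like:

#include <linux/irq.h>

/* Allocate a block of descriptors anywhere in the irq space (irq = -1). */
static int foo_controller_probe(int node)
{
        int base = irq_alloc_descs(-1, 0, 8, node);

        if (base < 0)
                return base;
        /* ... irq_set_chip_and_handler()/irq_set_chip_data() for each ... */
        return base;
}

/* Parent flow handler: decode the hardware bit, hand the irq to the core. */
static void foo_demux_one(unsigned int base, unsigned int hwbit)
{
        generic_handle_irq(base + hwbit);
}

static void foo_controller_remove(unsigned int base)
{
        irq_free_descs(base, 8);
}
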
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0caa59f747dd..f7ce0021e1c4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,17 @@
17 17
18#include "internals.h" 18#include "internals.h"
19 19
20#ifdef CONFIG_IRQ_FORCED_THREADING
21__read_mostly bool force_irqthreads;
22
23static int __init setup_forced_irqthreads(char *arg)
24{
25 force_irqthreads = true;
26 return 0;
27}
28early_param("threadirqs", setup_forced_irqthreads);
29#endif
30
20/** 31/**
21 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 32 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
22 * @irq: interrupt number to wait for 33 * @irq: interrupt number to wait for
@@ -30,7 +41,7 @@
30void synchronize_irq(unsigned int irq) 41void synchronize_irq(unsigned int irq)
31{ 42{
32 struct irq_desc *desc = irq_to_desc(irq); 43 struct irq_desc *desc = irq_to_desc(irq);
33 unsigned int status; 44 bool inprogress;
34 45
35 if (!desc) 46 if (!desc)
36 return; 47 return;
@@ -42,16 +53,16 @@ void synchronize_irq(unsigned int irq)
42 * Wait until we're out of the critical section. This might 53 * Wait until we're out of the critical section. This might
43 * give the wrong answer due to the lack of memory barriers. 54 * give the wrong answer due to the lack of memory barriers.
44 */ 55 */
45 while (desc->status & IRQ_INPROGRESS) 56 while (irqd_irq_inprogress(&desc->irq_data))
46 cpu_relax(); 57 cpu_relax();
47 58
48 /* Ok, that indicated we're done: double-check carefully. */ 59 /* Ok, that indicated we're done: double-check carefully. */
49 raw_spin_lock_irqsave(&desc->lock, flags); 60 raw_spin_lock_irqsave(&desc->lock, flags);
50 status = desc->status; 61 inprogress = irqd_irq_inprogress(&desc->irq_data);
51 raw_spin_unlock_irqrestore(&desc->lock, flags); 62 raw_spin_unlock_irqrestore(&desc->lock, flags);
52 63
53 /* Oops, that failed? */ 64 /* Oops, that failed? */
54 } while (status & IRQ_INPROGRESS); 65 } while (inprogress);
55 66
56 /* 67 /*
57 * We made sure that no hardirq handler is running. Now verify 68 * We made sure that no hardirq handler is running. Now verify
@@ -73,8 +84,8 @@ int irq_can_set_affinity(unsigned int irq)
73{ 84{
74 struct irq_desc *desc = irq_to_desc(irq); 85 struct irq_desc *desc = irq_to_desc(irq);
75 86
76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || 87 if (!desc || !irqd_can_balance(&desc->irq_data) ||
77 !desc->irq_data.chip->irq_set_affinity) 88 !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
78 return 0; 89 return 0;
79 90
80 return 1; 91 return 1;
@@ -100,67 +111,180 @@ void irq_set_thread_affinity(struct irq_desc *desc)
100 } 111 }
101} 112}
102 113
114#ifdef CONFIG_GENERIC_PENDING_IRQ
115static inline bool irq_can_move_pcntxt(struct irq_data *data)
116{
117 return irqd_can_move_in_process_context(data);
118}
119static inline bool irq_move_pending(struct irq_data *data)
120{
121 return irqd_is_setaffinity_pending(data);
122}
123static inline void
124irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
125{
126 cpumask_copy(desc->pending_mask, mask);
127}
128static inline void
129irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
130{
131 cpumask_copy(mask, desc->pending_mask);
132}
133#else
134static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; }
135static inline bool irq_move_pending(struct irq_data *data) { return false; }
136static inline void
137irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
138static inline void
139irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
140#endif
141
142int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
143{
144 struct irq_chip *chip = irq_data_get_irq_chip(data);
145 struct irq_desc *desc = irq_data_to_desc(data);
146 int ret = 0;
147
148 if (!chip || !chip->irq_set_affinity)
149 return -EINVAL;
150
151 if (irq_can_move_pcntxt(data)) {
152 ret = chip->irq_set_affinity(data, mask, false);
153 switch (ret) {
154 case IRQ_SET_MASK_OK:
155 cpumask_copy(data->affinity, mask);
156 case IRQ_SET_MASK_OK_NOCOPY:
157 irq_set_thread_affinity(desc);
158 ret = 0;
159 }
160 } else {
161 irqd_set_move_pending(data);
162 irq_copy_pending(desc, mask);
163 }
164
165 if (desc->affinity_notify) {
166 kref_get(&desc->affinity_notify->kref);
167 schedule_work(&desc->affinity_notify->work);
168 }
169 irqd_set(data, IRQD_AFFINITY_SET);
170
171 return ret;
172}
173
103/** 174/**
104 * irq_set_affinity - Set the irq affinity of a given irq 175 * irq_set_affinity - Set the irq affinity of a given irq
105 * @irq: Interrupt to set affinity 176 * @irq: Interrupt to set affinity
106 * @cpumask: cpumask 177 * @mask: cpumask
107 * 178 *
108 */ 179 */
109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) 180int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
110{ 181{
111 struct irq_desc *desc = irq_to_desc(irq); 182 struct irq_desc *desc = irq_to_desc(irq);
112 struct irq_chip *chip = desc->irq_data.chip;
113 unsigned long flags; 183 unsigned long flags;
184 int ret;
114 185
115 if (!chip->irq_set_affinity) 186 if (!desc)
116 return -EINVAL; 187 return -EINVAL;
117 188
118 raw_spin_lock_irqsave(&desc->lock, flags); 189 raw_spin_lock_irqsave(&desc->lock, flags);
119 190 ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask);
120#ifdef CONFIG_GENERIC_PENDING_IRQ
121 if (desc->status & IRQ_MOVE_PCNTXT) {
122 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
123 cpumask_copy(desc->irq_data.affinity, cpumask);
124 irq_set_thread_affinity(desc);
125 }
126 }
127 else {
128 desc->status |= IRQ_MOVE_PENDING;
129 cpumask_copy(desc->pending_mask, cpumask);
130 }
131#else
132 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
133 cpumask_copy(desc->irq_data.affinity, cpumask);
134 irq_set_thread_affinity(desc);
135 }
136#endif
137 desc->status |= IRQ_AFFINITY_SET;
138 raw_spin_unlock_irqrestore(&desc->lock, flags); 191 raw_spin_unlock_irqrestore(&desc->lock, flags);
139 return 0; 192 return ret;
140} 193}
141 194
142int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) 195int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
143{ 196{
197 unsigned long flags;
198 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
199
200 if (!desc)
201 return -EINVAL;
202 desc->affinity_hint = m;
203 irq_put_desc_unlock(desc, flags);
204 return 0;
205}
206EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
207
208static void irq_affinity_notify(struct work_struct *work)
209{
210 struct irq_affinity_notify *notify =
211 container_of(work, struct irq_affinity_notify, work);
212 struct irq_desc *desc = irq_to_desc(notify->irq);
213 cpumask_var_t cpumask;
214 unsigned long flags;
215
216 if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
217 goto out;
218
219 raw_spin_lock_irqsave(&desc->lock, flags);
220 if (irq_move_pending(&desc->irq_data))
221 irq_get_pending(cpumask, desc);
222 else
223 cpumask_copy(cpumask, desc->irq_data.affinity);
224 raw_spin_unlock_irqrestore(&desc->lock, flags);
225
226 notify->notify(notify, cpumask);
227
228 free_cpumask_var(cpumask);
229out:
230 kref_put(&notify->kref, notify->release);
231}
232
233/**
234 * irq_set_affinity_notifier - control notification of IRQ affinity changes
235 * @irq: Interrupt for which to enable/disable notification
236 * @notify: Context for notification, or %NULL to disable
237 * notification. Function pointers must be initialised;
238 * the other fields will be initialised by this function.
239 *
240 * Must be called in process context. Notification may only be enabled
241 * after the IRQ is allocated and must be disabled before the IRQ is
242 * freed using free_irq().
243 */
244int
245irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
246{
144 struct irq_desc *desc = irq_to_desc(irq); 247 struct irq_desc *desc = irq_to_desc(irq);
248 struct irq_affinity_notify *old_notify;
145 unsigned long flags; 249 unsigned long flags;
146 250
251 /* The release function is promised process context */
252 might_sleep();
253
147 if (!desc) 254 if (!desc)
148 return -EINVAL; 255 return -EINVAL;
149 256
257 /* Complete initialisation of *notify */
258 if (notify) {
259 notify->irq = irq;
260 kref_init(&notify->kref);
261 INIT_WORK(&notify->work, irq_affinity_notify);
262 }
263
150 raw_spin_lock_irqsave(&desc->lock, flags); 264 raw_spin_lock_irqsave(&desc->lock, flags);
151 desc->affinity_hint = m; 265 old_notify = desc->affinity_notify;
266 desc->affinity_notify = notify;
152 raw_spin_unlock_irqrestore(&desc->lock, flags); 267 raw_spin_unlock_irqrestore(&desc->lock, flags);
153 268
269 if (old_notify)
270 kref_put(&old_notify->kref, old_notify->release);
271
154 return 0; 272 return 0;
155} 273}
156EXPORT_SYMBOL_GPL(irq_set_affinity_hint); 274EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
157 275
158#ifndef CONFIG_AUTO_IRQ_AFFINITY 276#ifndef CONFIG_AUTO_IRQ_AFFINITY
159/* 277/*
160 * Generic version of the affinity autoselector. 278 * Generic version of the affinity autoselector.
161 */ 279 */
162static int setup_affinity(unsigned int irq, struct irq_desc *desc) 280static int
281setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
163{ 282{
283 struct irq_chip *chip = irq_desc_get_chip(desc);
284 struct cpumask *set = irq_default_affinity;
285 int ret;
286
287 /* Excludes PER_CPU and NO_BALANCE interrupts */
164 if (!irq_can_set_affinity(irq)) 288 if (!irq_can_set_affinity(irq))
165 return 0; 289 return 0;
166 290
@@ -168,22 +292,27 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
 168 * Preserve a userspace affinity setup, but make sure that 292
169 * one of the targets is online. 293 * one of the targets is online.
170 */ 294 */
171 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 295 if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
172 if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) 296 if (cpumask_intersects(desc->irq_data.affinity,
173 < nr_cpu_ids) 297 cpu_online_mask))
174 goto set_affinity; 298 set = desc->irq_data.affinity;
175 else 299 else
176 desc->status &= ~IRQ_AFFINITY_SET; 300 irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
177 } 301 }
178 302
179 cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); 303 cpumask_and(mask, cpu_online_mask, set);
180set_affinity: 304 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
181 desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); 305 switch (ret) {
182 306 case IRQ_SET_MASK_OK:
307 cpumask_copy(desc->irq_data.affinity, mask);
308 case IRQ_SET_MASK_OK_NOCOPY:
309 irq_set_thread_affinity(desc);
310 }
183 return 0; 311 return 0;
184} 312}
185#else 313#else
186static inline int setup_affinity(unsigned int irq, struct irq_desc *d) 314static inline int
315setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask)
187{ 316{
188 return irq_select_affinity(irq); 317 return irq_select_affinity(irq);
189} 318}
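
A consumer of the new irq_set_affinity_notifier() registers a struct irq_affinity_notify with .notify and .release filled in and lets the core own the irq/kref/work fields. The driver names below are invented, and the callback prototypes follow the interrupt.h declaration added alongside this series, so treat them as an assumption:

#include <linux/interrupt.h>

static void foo_affinity_notify(struct irq_affinity_notify *notify,
                                const cpumask_t *mask)
{
        /* e.g. rebind per-CPU buffers/queues to the new mask */
}

static void foo_affinity_release(struct kref *ref)
{
        /* last reference dropped, called from the workqueue context */
}

static struct irq_affinity_notify foo_notify = {
        .notify  = foo_affinity_notify,
        .release = foo_affinity_release,
};

static int foo_setup_irq(unsigned int irq)
{
        /* irq must already be requested; the core fills in irq/kref/work */
        return irq_set_affinity_notifier(irq, &foo_notify);
}

static void foo_teardown_irq(unsigned int irq)
{
        /* notification must be disabled before free_irq() */
        irq_set_affinity_notifier(irq, NULL);
}
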
@@ -192,23 +321,21 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
192/* 321/*
193 * Called when affinity is set via /proc/irq 322 * Called when affinity is set via /proc/irq
194 */ 323 */
195int irq_select_affinity_usr(unsigned int irq) 324int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
196{ 325{
197 struct irq_desc *desc = irq_to_desc(irq); 326 struct irq_desc *desc = irq_to_desc(irq);
198 unsigned long flags; 327 unsigned long flags;
199 int ret; 328 int ret;
200 329
201 raw_spin_lock_irqsave(&desc->lock, flags); 330 raw_spin_lock_irqsave(&desc->lock, flags);
202 ret = setup_affinity(irq, desc); 331 ret = setup_affinity(irq, desc, mask);
203 if (!ret)
204 irq_set_thread_affinity(desc);
205 raw_spin_unlock_irqrestore(&desc->lock, flags); 332 raw_spin_unlock_irqrestore(&desc->lock, flags);
206
207 return ret; 333 return ret;
208} 334}
209 335
210#else 336#else
211static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) 337static inline int
338setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
212{ 339{
213 return 0; 340 return 0;
214} 341}
@@ -219,13 +346,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
219 if (suspend) { 346 if (suspend) {
220 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) 347 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
221 return; 348 return;
222 desc->status |= IRQ_SUSPENDED; 349 desc->istate |= IRQS_SUSPENDED;
223 } 350 }
224 351
225 if (!desc->depth++) { 352 if (!desc->depth++)
226 desc->status |= IRQ_DISABLED; 353 irq_disable(desc);
227 desc->irq_data.chip->irq_disable(&desc->irq_data); 354}
228 } 355
356static int __disable_irq_nosync(unsigned int irq)
357{
358 unsigned long flags;
359 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
360
361 if (!desc)
362 return -EINVAL;
363 __disable_irq(desc, irq, false);
364 irq_put_desc_busunlock(desc, flags);
365 return 0;
229} 366}
230 367
231/** 368/**
@@ -241,17 +378,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
241 */ 378 */
242void disable_irq_nosync(unsigned int irq) 379void disable_irq_nosync(unsigned int irq)
243{ 380{
244 struct irq_desc *desc = irq_to_desc(irq); 381 __disable_irq_nosync(irq);
245 unsigned long flags;
246
247 if (!desc)
248 return;
249
250 chip_bus_lock(desc);
251 raw_spin_lock_irqsave(&desc->lock, flags);
252 __disable_irq(desc, irq, false);
253 raw_spin_unlock_irqrestore(&desc->lock, flags);
254 chip_bus_sync_unlock(desc);
255} 382}
256EXPORT_SYMBOL(disable_irq_nosync); 383EXPORT_SYMBOL(disable_irq_nosync);
257 384
@@ -269,21 +396,24 @@ EXPORT_SYMBOL(disable_irq_nosync);
269 */ 396 */
270void disable_irq(unsigned int irq) 397void disable_irq(unsigned int irq)
271{ 398{
272 struct irq_desc *desc = irq_to_desc(irq); 399 if (!__disable_irq_nosync(irq))
273
274 if (!desc)
275 return;
276
277 disable_irq_nosync(irq);
278 if (desc->action)
279 synchronize_irq(irq); 400 synchronize_irq(irq);
280} 401}
281EXPORT_SYMBOL(disable_irq); 402EXPORT_SYMBOL(disable_irq);
282 403
283void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) 404void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
284{ 405{
285 if (resume) 406 if (resume) {
286 desc->status &= ~IRQ_SUSPENDED; 407 if (!(desc->istate & IRQS_SUSPENDED)) {
408 if (!desc->action)
409 return;
410 if (!(desc->action->flags & IRQF_FORCE_RESUME))
411 return;
412 /* Pretend that it got disabled ! */
413 desc->depth++;
414 }
415 desc->istate &= ~IRQS_SUSPENDED;
416 }
287 417
288 switch (desc->depth) { 418 switch (desc->depth) {
289 case 0: 419 case 0:
@@ -291,12 +421,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
291 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); 421 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
292 break; 422 break;
293 case 1: { 423 case 1: {
294 unsigned int status = desc->status & ~IRQ_DISABLED; 424 if (desc->istate & IRQS_SUSPENDED)
295
296 if (desc->status & IRQ_SUSPENDED)
297 goto err_out; 425 goto err_out;
298 /* Prevent probing on this irq: */ 426 /* Prevent probing on this irq: */
299 desc->status = status | IRQ_NOPROBE; 427 irq_settings_set_noprobe(desc);
428 irq_enable(desc);
300 check_irq_resend(desc, irq); 429 check_irq_resend(desc, irq);
301 /* fall-through */ 430 /* fall-through */
302 } 431 }
@@ -318,21 +447,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
318 */ 447 */
319void enable_irq(unsigned int irq) 448void enable_irq(unsigned int irq)
320{ 449{
321 struct irq_desc *desc = irq_to_desc(irq);
322 unsigned long flags; 450 unsigned long flags;
451 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
323 452
324 if (!desc) 453 if (!desc)
325 return; 454 return;
455 if (WARN(!desc->irq_data.chip,
456 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
457 goto out;
326 458
327 if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable,
328 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
329 return;
330
331 chip_bus_lock(desc);
332 raw_spin_lock_irqsave(&desc->lock, flags);
333 __enable_irq(desc, irq, false); 459 __enable_irq(desc, irq, false);
334 raw_spin_unlock_irqrestore(&desc->lock, flags); 460out:
335 chip_bus_sync_unlock(desc); 461 irq_put_desc_busunlock(desc, flags);
336} 462}
337EXPORT_SYMBOL(enable_irq); 463EXPORT_SYMBOL(enable_irq);
338 464
@@ -348,7 +474,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
348} 474}
349 475
350/** 476/**
351 * set_irq_wake - control irq power management wakeup 477 * irq_set_irq_wake - control irq power management wakeup
352 * @irq: interrupt to control 478 * @irq: interrupt to control
353 * @on: enable/disable power management wakeup 479 * @on: enable/disable power management wakeup
354 * 480 *
@@ -359,23 +485,22 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
359 * Wakeup mode lets this IRQ wake the system from sleep 485 * Wakeup mode lets this IRQ wake the system from sleep
360 * states like "suspend to RAM". 486 * states like "suspend to RAM".
361 */ 487 */
362int set_irq_wake(unsigned int irq, unsigned int on) 488int irq_set_irq_wake(unsigned int irq, unsigned int on)
363{ 489{
364 struct irq_desc *desc = irq_to_desc(irq);
365 unsigned long flags; 490 unsigned long flags;
491 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
366 int ret = 0; 492 int ret = 0;
367 493
368 /* wakeup-capable irqs can be shared between drivers that 494 /* wakeup-capable irqs can be shared between drivers that
369 * don't need to have the same sleep mode behaviors. 495 * don't need to have the same sleep mode behaviors.
370 */ 496 */
371 raw_spin_lock_irqsave(&desc->lock, flags);
372 if (on) { 497 if (on) {
373 if (desc->wake_depth++ == 0) { 498 if (desc->wake_depth++ == 0) {
374 ret = set_irq_wake_real(irq, on); 499 ret = set_irq_wake_real(irq, on);
375 if (ret) 500 if (ret)
376 desc->wake_depth = 0; 501 desc->wake_depth = 0;
377 else 502 else
378 desc->status |= IRQ_WAKEUP; 503 irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
379 } 504 }
380 } else { 505 } else {
381 if (desc->wake_depth == 0) { 506 if (desc->wake_depth == 0) {
@@ -385,14 +510,13 @@ int set_irq_wake(unsigned int irq, unsigned int on)
385 if (ret) 510 if (ret)
386 desc->wake_depth = 1; 511 desc->wake_depth = 1;
387 else 512 else
388 desc->status &= ~IRQ_WAKEUP; 513 irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
389 } 514 }
390 } 515 }
391 516 irq_put_desc_busunlock(desc, flags);
392 raw_spin_unlock_irqrestore(&desc->lock, flags);
393 return ret; 517 return ret;
394} 518}
395EXPORT_SYMBOL(set_irq_wake); 519EXPORT_SYMBOL(irq_set_irq_wake);
396 520
397/* 521/*
398 * Internal function that tells the architecture code whether a 522 * Internal function that tells the architecture code whether a
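
On the driver side the renamed irq_set_irq_wake() (or its enable_irq_wake()/disable_irq_wake() wrappers) is normally called from the suspend/resume hooks, gated on device_may_wakeup(). A hypothetical platform driver fragment, with invented names:

#include <linux/device.h>
#include <linux/interrupt.h>
#include <linux/pm_wakeup.h>

struct foo_priv {
        int irq;
};

static int foo_suspend(struct device *dev)
{
        struct foo_priv *priv = dev_get_drvdata(dev);

        if (device_may_wakeup(dev))
                irq_set_irq_wake(priv->irq, 1); /* arm as wake source */
        return 0;
}

static int foo_resume(struct device *dev)
{
        struct foo_priv *priv = dev_get_drvdata(dev);

        if (device_may_wakeup(dev))
                irq_set_irq_wake(priv->irq, 0); /* balanced disable */
        return 0;
}
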
@@ -401,43 +525,27 @@ EXPORT_SYMBOL(set_irq_wake);
401 */ 525 */
402int can_request_irq(unsigned int irq, unsigned long irqflags) 526int can_request_irq(unsigned int irq, unsigned long irqflags)
403{ 527{
404 struct irq_desc *desc = irq_to_desc(irq);
405 struct irqaction *action;
406 unsigned long flags; 528 unsigned long flags;
529 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
530 int canrequest = 0;
407 531
408 if (!desc) 532 if (!desc)
409 return 0; 533 return 0;
410 534
411 if (desc->status & IRQ_NOREQUEST) 535 if (irq_settings_can_request(desc)) {
412 return 0; 536 if (desc->action)
413 537 if (irqflags & desc->action->flags & IRQF_SHARED)
 414 raw_spin_lock_irqsave(&desc->lock, flags); 538 canrequest = 1;
415 action = desc->action; 539 }
416 if (action) 540 irq_put_desc_unlock(desc, flags);
417 if (irqflags & action->flags & IRQF_SHARED) 541 return canrequest;
418 action = NULL;
419
420 raw_spin_unlock_irqrestore(&desc->lock, flags);
421
422 return !action;
423}
424
425void compat_irq_chip_set_default_handler(struct irq_desc *desc)
426{
427 /*
428 * If the architecture still has not overriden
429 * the flow handler then zap the default. This
430 * should catch incorrect flow-type setting.
431 */
432 if (desc->handle_irq == &handle_bad_irq)
433 desc->handle_irq = NULL;
434} 542}
435 543
436int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 544int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
437 unsigned long flags) 545 unsigned long flags)
438{ 546{
439 int ret;
440 struct irq_chip *chip = desc->irq_data.chip; 547 struct irq_chip *chip = desc->irq_data.chip;
548 int ret, unmask = 0;
441 549
442 if (!chip || !chip->irq_set_type) { 550 if (!chip || !chip->irq_set_type) {
443 /* 551 /*
@@ -449,23 +557,41 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
449 return 0; 557 return 0;
450 } 558 }
451 559
560 flags &= IRQ_TYPE_SENSE_MASK;
561
562 if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
563 if (!irqd_irq_masked(&desc->irq_data))
564 mask_irq(desc);
565 if (!irqd_irq_disabled(&desc->irq_data))
566 unmask = 1;
567 }
568
452 /* caller masked out all except trigger mode flags */ 569 /* caller masked out all except trigger mode flags */
453 ret = chip->irq_set_type(&desc->irq_data, flags); 570 ret = chip->irq_set_type(&desc->irq_data, flags);
454 571
455 if (ret) 572 switch (ret) {
573 case IRQ_SET_MASK_OK:
574 irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
575 irqd_set(&desc->irq_data, flags);
576
577 case IRQ_SET_MASK_OK_NOCOPY:
578 flags = irqd_get_trigger_type(&desc->irq_data);
579 irq_settings_set_trigger_mask(desc, flags);
580 irqd_clear(&desc->irq_data, IRQD_LEVEL);
581 irq_settings_clr_level(desc);
582 if (flags & IRQ_TYPE_LEVEL_MASK) {
583 irq_settings_set_level(desc);
584 irqd_set(&desc->irq_data, IRQD_LEVEL);
585 }
586
587 ret = 0;
588 break;
589 default:
456 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", 590 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
457 flags, irq, chip->irq_set_type); 591 flags, irq, chip->irq_set_type);
458 else {
459 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
460 flags |= IRQ_LEVEL;
461 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
462 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
463 desc->status |= flags;
464
465 if (chip != desc->irq_data.chip)
466 irq_chip_set_defaults(desc->irq_data.chip);
467 } 592 }
468 593 if (unmask)
594 unmask_irq(desc);
469 return ret; 595 return ret;
470} 596}
471 597
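
With IRQCHIP_SET_TYPE_MASKED the core now masks the line around the callback and, when IRQ_SET_MASK_OK is returned, copies the trigger bits into the irq settings itself, so a chip's irq_set_type() reduces to programming the hardware. A sketch with invented register details (foo_base, FOO_POLARITY and foo_irq_base are placeholders):

#include <linux/io.h>
#include <linux/irq.h>

static void __iomem *foo_base;          /* mapped elsewhere */
static unsigned int foo_irq_base;
#define FOO_POLARITY    0x08            /* assumed register offset */

static int foo_set_type(struct irq_data *d, unsigned int type)
{
        u32 bit = 1 << (d->irq - foo_irq_base);
        u32 pol = readl(foo_base + FOO_POLARITY);

        switch (type) {
        case IRQ_TYPE_EDGE_RISING:
                writel(pol | bit, foo_base + FOO_POLARITY);
                break;
        case IRQ_TYPE_EDGE_FALLING:
                writel(pol & ~bit, foo_base + FOO_POLARITY);
                break;
        default:
                return -EINVAL;
        }
        /* Core records the new trigger type and unmasks again if needed */
        return IRQ_SET_MASK_OK;
}

static struct irq_chip foo_chip = {
        .name           = "FOO",
        .irq_set_type   = foo_set_type,
        .flags          = IRQCHIP_SET_TYPE_MASKED,
};
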
@@ -509,8 +635,11 @@ static int irq_wait_for_interrupt(struct irqaction *action)
509 * handler finished. unmask if the interrupt has not been disabled and 635 * handler finished. unmask if the interrupt has not been disabled and
510 * is marked MASKED. 636 * is marked MASKED.
511 */ 637 */
512static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 638static void irq_finalize_oneshot(struct irq_desc *desc,
639 struct irqaction *action, bool force)
513{ 640{
641 if (!(desc->istate & IRQS_ONESHOT))
642 return;
514again: 643again:
515 chip_bus_lock(desc); 644 chip_bus_lock(desc);
516 raw_spin_lock_irq(&desc->lock); 645 raw_spin_lock_irq(&desc->lock);
@@ -522,26 +651,42 @@ again:
522 * The thread is faster done than the hard interrupt handler 651 * The thread is faster done than the hard interrupt handler
523 * on the other CPU. If we unmask the irq line then the 652 * on the other CPU. If we unmask the irq line then the
524 * interrupt can come in again and masks the line, leaves due 653 * interrupt can come in again and masks the line, leaves due
525 * to IRQ_INPROGRESS and the irq line is masked forever. 654 * to IRQS_INPROGRESS and the irq line is masked forever.
655 *
656 * This also serializes the state of shared oneshot handlers
 657 * versus "desc->threads_oneshot |= action->thread_mask;" in
658 * irq_wake_thread(). See the comment there which explains the
659 * serialization.
526 */ 660 */
527 if (unlikely(desc->status & IRQ_INPROGRESS)) { 661 if (unlikely(irqd_irq_inprogress(&desc->irq_data))) {
528 raw_spin_unlock_irq(&desc->lock); 662 raw_spin_unlock_irq(&desc->lock);
529 chip_bus_sync_unlock(desc); 663 chip_bus_sync_unlock(desc);
530 cpu_relax(); 664 cpu_relax();
531 goto again; 665 goto again;
532 } 666 }
533 667
534 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 668 /*
535 desc->status &= ~IRQ_MASKED; 669 * Now check again, whether the thread should run. Otherwise
536 desc->irq_data.chip->irq_unmask(&desc->irq_data); 670 * we would clear the threads_oneshot bit of this thread which
537 } 671 * was just set.
672 */
673 if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
674 goto out_unlock;
675
676 desc->threads_oneshot &= ~action->thread_mask;
677
678 if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) &&
679 irqd_irq_masked(&desc->irq_data))
680 unmask_irq(desc);
681
682out_unlock:
538 raw_spin_unlock_irq(&desc->lock); 683 raw_spin_unlock_irq(&desc->lock);
539 chip_bus_sync_unlock(desc); 684 chip_bus_sync_unlock(desc);
540} 685}
541 686
542#ifdef CONFIG_SMP 687#ifdef CONFIG_SMP
543/* 688/*
 544 * Check whether we need to change the affinity of the interrupt thread. 689
545 */ 690 */
546static void 691static void
547irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) 692irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
@@ -573,6 +718,32 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
573#endif 718#endif
574 719
575/* 720/*
 721 * Interrupts which are not explicitly requested as threaded
722 * interrupts rely on the implicit bh/preempt disable of the hard irq
723 * context. So we need to disable bh here to avoid deadlocks and other
724 * side effects.
725 */
726static void
727irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
728{
729 local_bh_disable();
730 action->thread_fn(action->irq, action->dev_id);
731 irq_finalize_oneshot(desc, action, false);
732 local_bh_enable();
733}
734
735/*
 736 * Interrupts explicitly requested as threaded interrupts want to be
 737 * preemptible - many of them need to sleep and wait for slow busses to
738 * complete.
739 */
740static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
741{
742 action->thread_fn(action->irq, action->dev_id);
743 irq_finalize_oneshot(desc, action, false);
744}
745
746/*
576 * Interrupt handler thread 747 * Interrupt handler thread
577 */ 748 */
578static int irq_thread(void *data) 749static int irq_thread(void *data)
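
From a driver's point of view nothing changes here: threaded handlers are still requested through request_threaded_irq(), and handlers that were not requested threaded only end up on the irq_forced_thread_fn() path when the kernel is booted with the new threadirqs parameter. A minimal, hypothetical example of the usual primary/threaded split:

#include <linux/interrupt.h>

static irqreturn_t foo_hardirq(int irq, void *dev_id)
{
        /* quick check/ack in hard irq context, defer the real work */
        return IRQ_WAKE_THREAD;
}

static irqreturn_t foo_thread_fn(int irq, void *dev_id)
{
        /* may sleep: talk to the device over a slow bus, etc. */
        return IRQ_HANDLED;
}

static int foo_request(unsigned int irq, void *dev)
{
        /* IRQF_ONESHOT keeps the line masked until the thread finishes */
        return request_threaded_irq(irq, foo_hardirq, foo_thread_fn,
                                    IRQF_ONESHOT, "foo", dev);
}
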
@@ -582,7 +753,14 @@ static int irq_thread(void *data)
582 }; 753 };
583 struct irqaction *action = data; 754 struct irqaction *action = data;
584 struct irq_desc *desc = irq_to_desc(action->irq); 755 struct irq_desc *desc = irq_to_desc(action->irq);
585 int wake, oneshot = desc->status & IRQ_ONESHOT; 756 void (*handler_fn)(struct irq_desc *desc, struct irqaction *action);
757 int wake;
758
 759 if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
760 &action->thread_flags))
761 handler_fn = irq_forced_thread_fn;
762 else
763 handler_fn = irq_thread_fn;
586 764
587 sched_setscheduler(current, SCHED_FIFO, &param); 765 sched_setscheduler(current, SCHED_FIFO, &param);
588 current->irqaction = action; 766 current->irqaction = action;
@@ -594,23 +772,19 @@ static int irq_thread(void *data)
594 atomic_inc(&desc->threads_active); 772 atomic_inc(&desc->threads_active);
595 773
596 raw_spin_lock_irq(&desc->lock); 774 raw_spin_lock_irq(&desc->lock);
597 if (unlikely(desc->status & IRQ_DISABLED)) { 775 if (unlikely(irqd_irq_disabled(&desc->irq_data))) {
598 /* 776 /*
599 * CHECKME: We might need a dedicated 777 * CHECKME: We might need a dedicated
600 * IRQ_THREAD_PENDING flag here, which 778 * IRQ_THREAD_PENDING flag here, which
601 * retriggers the thread in check_irq_resend() 779 * retriggers the thread in check_irq_resend()
602 * but AFAICT IRQ_PENDING should be fine as it 780 * but AFAICT IRQS_PENDING should be fine as it
603 * retriggers the interrupt itself --- tglx 781 * retriggers the interrupt itself --- tglx
604 */ 782 */
605 desc->status |= IRQ_PENDING; 783 desc->istate |= IRQS_PENDING;
606 raw_spin_unlock_irq(&desc->lock); 784 raw_spin_unlock_irq(&desc->lock);
607 } else { 785 } else {
608 raw_spin_unlock_irq(&desc->lock); 786 raw_spin_unlock_irq(&desc->lock);
609 787 handler_fn(desc, action);
610 action->thread_fn(action->irq, action->dev_id);
611
612 if (oneshot)
613 irq_finalize_oneshot(action->irq, desc);
614 } 788 }
615 789
616 wake = atomic_dec_and_test(&desc->threads_active); 790 wake = atomic_dec_and_test(&desc->threads_active);
@@ -619,6 +793,9 @@ static int irq_thread(void *data)
619 wake_up(&desc->wait_for_threads); 793 wake_up(&desc->wait_for_threads);
620 } 794 }
621 795
796 /* Prevent a stale desc->threads_oneshot */
797 irq_finalize_oneshot(desc, action, true);
798
622 /* 799 /*
623 * Clear irqaction. Otherwise exit_irq_thread() would make 800 * Clear irqaction. Otherwise exit_irq_thread() would make
624 * fuzz about an active irq thread going into nirvana. 801 * fuzz about an active irq thread going into nirvana.
@@ -633,6 +810,7 @@ static int irq_thread(void *data)
633void exit_irq_thread(void) 810void exit_irq_thread(void)
634{ 811{
635 struct task_struct *tsk = current; 812 struct task_struct *tsk = current;
813 struct irq_desc *desc;
636 814
637 if (!tsk->irqaction) 815 if (!tsk->irqaction)
638 return; 816 return;
@@ -641,6 +819,14 @@ void exit_irq_thread(void)
641 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 819 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
642 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); 820 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
643 821
822 desc = irq_to_desc(tsk->irqaction->irq);
823
824 /*
825 * Prevent a stale desc->threads_oneshot. Must be called
826 * before setting the IRQTF_DIED flag.
827 */
828 irq_finalize_oneshot(desc, tsk->irqaction, true);
829
644 /* 830 /*
645 * Set the THREAD DIED flag to prevent further wakeups of the 831 * Set the THREAD DIED flag to prevent further wakeups of the
646 * soon to be gone threaded handler. 832 * soon to be gone threaded handler.
@@ -648,6 +834,22 @@ void exit_irq_thread(void)
648 set_bit(IRQTF_DIED, &tsk->irqaction->flags); 834 set_bit(IRQTF_DIED, &tsk->irqaction->flags);
649} 835}
650 836
837static void irq_setup_forced_threading(struct irqaction *new)
838{
839 if (!force_irqthreads)
840 return;
841 if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
842 return;
843
844 new->flags |= IRQF_ONESHOT;
845
846 if (!new->thread_fn) {
847 set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
848 new->thread_fn = new->handler;
849 new->handler = irq_default_primary_handler;
850 }
851}
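
For drivers that only pass a primary handler to request_irq(), irq_setup_forced_threading() rewires the action when force_irqthreads is set via the threadirqs boot option: the primary handler becomes the thread function, a stub primary that merely wakes the thread is installed, and IRQF_ONESHOT is forced so the line stays masked until the thread has run. A standalone user-space sketch of that field shuffle on a simplified stand-in struct (not the real struct irqaction; the flag value is illustrative only):

#include <stdio.h>

#define IRQF_ONESHOT 0x2000	/* value reused only for illustration */

typedef int (*handler_t)(int irq, void *dev);

static int driver_primary(int irq, void *dev)  { return 1; }
static int default_primary(int irq, void *dev) { return 2; /* "wake thread" */ }

struct fake_action {		/* simplified stand-in for struct irqaction */
	handler_t handler;
	handler_t thread_fn;
	unsigned long flags;
};

/* mirrors what irq_setup_forced_threading() does to a primary-only action */
static void force_thread(struct fake_action *a)
{
	a->flags |= IRQF_ONESHOT;
	if (!a->thread_fn) {
		a->thread_fn = a->handler;	/* promote primary to thread */
		a->handler = default_primary;	/* stub primary wakes the thread */
	}
}

int main(void)
{
	struct fake_action a = { .handler = driver_primary };

	force_thread(&a);
	printf("thread_fn is the driver handler: %d\n", a.thread_fn == driver_primary);
	printf("handler is the stub primary:     %d\n", a.handler == default_primary);
	printf("ONESHOT forced:                  %d\n", !!(a.flags & IRQF_ONESHOT));
	return 0;
}
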
852
651/* 853/*
652 * Internal function to register an irqaction - typically used to 854 * Internal function to register an irqaction - typically used to
653 * allocate special interrupts that are part of the architecture. 855 * allocate special interrupts that are part of the architecture.
@@ -657,9 +859,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
657{ 859{
658 struct irqaction *old, **old_ptr; 860 struct irqaction *old, **old_ptr;
659 const char *old_name = NULL; 861 const char *old_name = NULL;
660 unsigned long flags; 862 unsigned long flags, thread_mask = 0;
661 int nested, shared = 0; 863 int ret, nested, shared = 0;
662 int ret; 864 cpumask_var_t mask;
663 865
664 if (!desc) 866 if (!desc)
665 return -EINVAL; 867 return -EINVAL;
@@ -683,15 +885,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
683 rand_initialize_irq(irq); 885 rand_initialize_irq(irq);
684 } 886 }
685 887
686 /* Oneshot interrupts are not allowed with shared */
687 if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
688 return -EINVAL;
689
690 /* 888 /*
691 * Check whether the interrupt nests into another interrupt 889 * Check whether the interrupt nests into another interrupt
692 * thread. 890 * thread.
693 */ 891 */
694 nested = desc->status & IRQ_NESTED_THREAD; 892 nested = irq_settings_is_nested_thread(desc);
695 if (nested) { 893 if (nested) {
696 if (!new->thread_fn) 894 if (!new->thread_fn)
697 return -EINVAL; 895 return -EINVAL;
@@ -701,6 +899,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
701 * dummy function which warns when called. 899 * dummy function which warns when called.
702 */ 900 */
703 new->handler = irq_nested_primary_handler; 901 new->handler = irq_nested_primary_handler;
902 } else {
903 if (irq_settings_can_thread(desc))
904 irq_setup_forced_threading(new);
704 } 905 }
705 906
706 /* 907 /*
@@ -724,6 +925,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
724 new->thread = t; 925 new->thread = t;
725 } 926 }
726 927
928 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
929 ret = -ENOMEM;
930 goto out_thread;
931 }
932
727 /* 933 /*
728 * The following block of code has to be executed atomically 934 * The following block of code has to be executed atomically
729 */ 935 */
@@ -735,32 +941,41 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
735 * Can't share interrupts unless both agree to and are 941 * Can't share interrupts unless both agree to and are
736 * the same type (level, edge, polarity). So both flag 942 * the same type (level, edge, polarity). So both flag
737 * fields must have IRQF_SHARED set and the bits which 943 * fields must have IRQF_SHARED set and the bits which
738 * set the trigger type must match. 944 * set the trigger type must match. Also all must
945 * agree on ONESHOT.
739 */ 946 */
740 if (!((old->flags & new->flags) & IRQF_SHARED) || 947 if (!((old->flags & new->flags) & IRQF_SHARED) ||
741 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { 948 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
949 ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
742 old_name = old->name; 950 old_name = old->name;
743 goto mismatch; 951 goto mismatch;
744 } 952 }
745 953
746#if defined(CONFIG_IRQ_PER_CPU)
747 /* All handlers must agree on per-cpuness */ 954 /* All handlers must agree on per-cpuness */
748 if ((old->flags & IRQF_PERCPU) != 955 if ((old->flags & IRQF_PERCPU) !=
749 (new->flags & IRQF_PERCPU)) 956 (new->flags & IRQF_PERCPU))
750 goto mismatch; 957 goto mismatch;
751#endif
752 958
753 /* add new interrupt at end of irq queue */ 959 /* add new interrupt at end of irq queue */
754 do { 960 do {
961 thread_mask |= old->thread_mask;
755 old_ptr = &old->next; 962 old_ptr = &old->next;
756 old = *old_ptr; 963 old = *old_ptr;
757 } while (old); 964 } while (old);
758 shared = 1; 965 shared = 1;
759 } 966 }
760 967
761 if (!shared) { 968 /*
762 irq_chip_set_defaults(desc->irq_data.chip); 969 * Setup the thread mask for this irqaction. Unlikely to have
 970 * 32 or 64 irqs sharing one line, but who knows.
971 */
972 if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) {
973 ret = -EBUSY;
974 goto out_mask;
975 }
976 new->thread_mask = 1 << ffz(thread_mask);
763 977
978 if (!shared) {
764 init_waitqueue_head(&desc->wait_for_threads); 979 init_waitqueue_head(&desc->wait_for_threads);
765 980
766 /* Setup the type (level, edge polarity) if configured: */ 981 /* Setup the type (level, edge polarity) if configured: */
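
Each action on a shared oneshot line now gets a private bit: the bits already taken by existing actions are OR-ed into thread_mask while walking the list, and the new action claims the first free one via ffz(). irq_finalize_oneshot() clears that bit in desc->threads_oneshot and only unmasks the line once every sharing thread has finished. A standalone sketch of the bit bookkeeping, using __builtin_ctzl() on the complement to stand in for the kernel's ffz():

#include <stdio.h>

/* first zero bit of 'used', i.e. what ffz(used) returns in the kernel */
static unsigned long first_free_bit(unsigned long used)
{
	return 1UL << __builtin_ctzl(~used);
}

int main(void)
{
	unsigned long taken = 0, oneshot = 0;
	unsigned long a, b, c;

	/* __setup_irq(): each sharing action claims one bit */
	a = first_free_bit(taken); taken |= a;
	b = first_free_bit(taken); taken |= b;
	c = first_free_bit(taken); taken |= c;
	printf("thread masks: %#lx %#lx %#lx\n", a, b, c);

	/* irq_wake_thread(): mark all three threads as having pending work */
	oneshot |= a | b | c;

	/* irq_finalize_oneshot(): each thread clears its own bit ... */
	oneshot &= ~a;
	oneshot &= ~b;
	printf("still busy, keep the line masked: %d\n", oneshot != 0);

	/* ... and only the last one may unmask the interrupt line */
	oneshot &= ~c;
	printf("all done, unmask now: %d\n", oneshot == 0);
	return 0;
}
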
@@ -769,42 +984,44 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
769 new->flags & IRQF_TRIGGER_MASK); 984 new->flags & IRQF_TRIGGER_MASK);
770 985
771 if (ret) 986 if (ret)
772 goto out_thread; 987 goto out_mask;
773 } else 988 }
774 compat_irq_chip_set_default_handler(desc); 989
775#if defined(CONFIG_IRQ_PER_CPU) 990 desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
776 if (new->flags & IRQF_PERCPU) 991 IRQS_ONESHOT | IRQS_WAITING);
777 desc->status |= IRQ_PER_CPU; 992 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
778#endif
779 993
780 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | 994 if (new->flags & IRQF_PERCPU) {
781 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 995 irqd_set(&desc->irq_data, IRQD_PER_CPU);
996 irq_settings_set_per_cpu(desc);
997 }
782 998
783 if (new->flags & IRQF_ONESHOT) 999 if (new->flags & IRQF_ONESHOT)
784 desc->status |= IRQ_ONESHOT; 1000 desc->istate |= IRQS_ONESHOT;
785 1001
786 if (!(desc->status & IRQ_NOAUTOEN)) { 1002 if (irq_settings_can_autoenable(desc))
787 desc->depth = 0; 1003 irq_startup(desc);
788 desc->status &= ~IRQ_DISABLED; 1004 else
789 desc->irq_data.chip->irq_startup(&desc->irq_data);
790 } else
791 /* Undo nested disables: */ 1005 /* Undo nested disables: */
792 desc->depth = 1; 1006 desc->depth = 1;
793 1007
794 /* Exclude IRQ from balancing if requested */ 1008 /* Exclude IRQ from balancing if requested */
795 if (new->flags & IRQF_NOBALANCING) 1009 if (new->flags & IRQF_NOBALANCING) {
796 desc->status |= IRQ_NO_BALANCING; 1010 irq_settings_set_no_balancing(desc);
1011 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
1012 }
797 1013
798 /* Set default affinity mask once everything is setup */ 1014 /* Set default affinity mask once everything is setup */
799 setup_affinity(irq, desc); 1015 setup_affinity(irq, desc, mask);
800 1016
801 } else if ((new->flags & IRQF_TRIGGER_MASK) 1017 } else if (new->flags & IRQF_TRIGGER_MASK) {
802 && (new->flags & IRQF_TRIGGER_MASK) 1018 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
803 != (desc->status & IRQ_TYPE_SENSE_MASK)) { 1019 unsigned int omsk = irq_settings_get_trigger_mask(desc);
804 /* hope the handler works with the actual trigger mode... */ 1020
805 pr_warning("IRQ %d uses trigger mode %d; requested %d\n", 1021 if (nmsk != omsk)
806 irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), 1022 /* hope the handler works with current trigger mode */
807 (int)(new->flags & IRQF_TRIGGER_MASK)); 1023 pr_warning("IRQ %d uses trigger mode %u; requested %u\n",
1024 irq, nmsk, omsk);
808 } 1025 }
809 1026
810 new->irq = irq; 1027 new->irq = irq;
@@ -818,8 +1035,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
818 * Check whether we disabled the irq via the spurious handler 1035 * Check whether we disabled the irq via the spurious handler
819 * before. Reenable it and give it another chance. 1036 * before. Reenable it and give it another chance.
820 */ 1037 */
821 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { 1038 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
822 desc->status &= ~IRQ_SPURIOUS_DISABLED; 1039 desc->istate &= ~IRQS_SPURIOUS_DISABLED;
823 __enable_irq(desc, irq, false); 1040 __enable_irq(desc, irq, false);
824 } 1041 }
825 1042
@@ -835,6 +1052,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
835 register_irq_proc(irq, desc); 1052 register_irq_proc(irq, desc);
836 new->dir = NULL; 1053 new->dir = NULL;
837 register_handler_proc(irq, new); 1054 register_handler_proc(irq, new);
1055 free_cpumask_var(mask);
838 1056
839 return 0; 1057 return 0;
840 1058
@@ -849,8 +1067,11 @@ mismatch:
849#endif 1067#endif
850 ret = -EBUSY; 1068 ret = -EBUSY;
851 1069
852out_thread: 1070out_mask:
853 raw_spin_unlock_irqrestore(&desc->lock, flags); 1071 raw_spin_unlock_irqrestore(&desc->lock, flags);
1072 free_cpumask_var(mask);
1073
1074out_thread:
854 if (new->thread) { 1075 if (new->thread) {
855 struct task_struct *t = new->thread; 1076 struct task_struct *t = new->thread;
856 1077
@@ -871,9 +1092,14 @@ out_thread:
871 */ 1092 */
872int setup_irq(unsigned int irq, struct irqaction *act) 1093int setup_irq(unsigned int irq, struct irqaction *act)
873{ 1094{
1095 int retval;
874 struct irq_desc *desc = irq_to_desc(irq); 1096 struct irq_desc *desc = irq_to_desc(irq);
875 1097
876 return __setup_irq(irq, desc, act); 1098 chip_bus_lock(desc);
1099 retval = __setup_irq(irq, desc, act);
1100 chip_bus_sync_unlock(desc);
1101
1102 return retval;
877} 1103}
878EXPORT_SYMBOL_GPL(setup_irq); 1104EXPORT_SYMBOL_GPL(setup_irq);
879 1105
@@ -924,13 +1150,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
924#endif 1150#endif
925 1151
926 /* If this was the last handler, shut down the IRQ line: */ 1152 /* If this was the last handler, shut down the IRQ line: */
927 if (!desc->action) { 1153 if (!desc->action)
928 desc->status |= IRQ_DISABLED; 1154 irq_shutdown(desc);
929 if (desc->irq_data.chip->irq_shutdown)
930 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
931 else
932 desc->irq_data.chip->irq_disable(&desc->irq_data);
933 }
934 1155
935#ifdef CONFIG_SMP 1156#ifdef CONFIG_SMP
936 /* make sure affinity_hint is cleaned up */ 1157 /* make sure affinity_hint is cleaned up */
@@ -1004,6 +1225,11 @@ void free_irq(unsigned int irq, void *dev_id)
1004 if (!desc) 1225 if (!desc)
1005 return; 1226 return;
1006 1227
1228#ifdef CONFIG_SMP
1229 if (WARN_ON(desc->affinity_notify))
1230 desc->affinity_notify = NULL;
1231#endif
1232
1007 chip_bus_lock(desc); 1233 chip_bus_lock(desc);
1008 kfree(__free_irq(irq, dev_id)); 1234 kfree(__free_irq(irq, dev_id));
1009 chip_bus_sync_unlock(desc); 1235 chip_bus_sync_unlock(desc);
@@ -1074,7 +1300,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1074 if (!desc) 1300 if (!desc)
1075 return -EINVAL; 1301 return -EINVAL;
1076 1302
1077 if (desc->status & IRQ_NOREQUEST) 1303 if (!irq_settings_can_request(desc))
1078 return -EINVAL; 1304 return -EINVAL;
1079 1305
1080 if (!handler) { 1306 if (!handler) {
@@ -1100,7 +1326,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1100 if (retval) 1326 if (retval)
1101 kfree(action); 1327 kfree(action);
1102 1328
1103#ifdef CONFIG_DEBUG_SHIRQ 1329#ifdef CONFIG_DEBUG_SHIRQ_FIXME
1104 if (!retval && (irqflags & IRQF_SHARED)) { 1330 if (!retval && (irqflags & IRQF_SHARED)) {
1105 /* 1331 /*
1106 * It's a shared IRQ -- the driver ought to be prepared for it 1332 * It's a shared IRQ -- the driver ought to be prepared for it
@@ -1149,7 +1375,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1149 if (!desc) 1375 if (!desc)
1150 return -EINVAL; 1376 return -EINVAL;
1151 1377
1152 if (desc->status & IRQ_NESTED_THREAD) { 1378 if (irq_settings_is_nested_thread(desc)) {
1153 ret = request_threaded_irq(irq, NULL, handler, 1379 ret = request_threaded_irq(irq, NULL, handler,
1154 flags, name, dev_id); 1380 flags, name, dev_id);
1155 return !ret ? IRQC_IS_NESTED : ret; 1381 return !ret ? IRQC_IS_NESTED : ret;
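
With the "oneshot is not allowed with shared" check removed, a line can now be shared between threaded handlers as long as every requester agrees on IRQF_ONESHOT (and on the trigger type); mismatches still end up at the mismatch label. A hedged driver-side sketch, with the names and the IRQ number purely illustrative:

#include <linux/interrupt.h>

/* hypothetical: two devices hanging off the same level-triggered line */
static irqreturn_t dev_a_thread(int irq, void *dev_id) { return IRQ_HANDLED; }
static irqreturn_t dev_b_thread(int irq, void *dev_id) { return IRQ_NONE; }

/* dev_a / dev_b must be distinct, non-NULL cookies for a shared line */
static int share_oneshot_line(int irq, void *dev_a, void *dev_b)
{
	int ret;

	/* both requests use a NULL primary and IRQF_ONESHOT, so the line
	 * stays masked until the woken thread has finished */
	ret = request_threaded_irq(irq, NULL, dev_a_thread,
				   IRQF_SHARED | IRQF_ONESHOT, "dev-a", dev_a);
	if (ret)
		return ret;

	ret = request_threaded_irq(irq, NULL, dev_b_thread,
				   IRQF_SHARED | IRQF_ONESHOT, "dev-b", dev_b);
	if (ret)
		free_irq(irq, dev_a);
	return ret;
}
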
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 441fd629ff04..47420908fba0 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,23 +4,23 @@
4 4
5#include "internals.h" 5#include "internals.h"
6 6
7void move_masked_irq(int irq) 7void irq_move_masked_irq(struct irq_data *idata)
8{ 8{
9 struct irq_desc *desc = irq_to_desc(irq); 9 struct irq_desc *desc = irq_data_to_desc(idata);
10 struct irq_chip *chip = desc->irq_data.chip; 10 struct irq_chip *chip = idata->chip;
11 11
12 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 12 if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
13 return; 13 return;
14 14
15 /* 15 /*
16 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. 16 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
17 */ 17 */
18 if (CHECK_IRQ_PER_CPU(desc->status)) { 18 if (!irqd_can_balance(&desc->irq_data)) {
19 WARN_ON(1); 19 WARN_ON(1);
20 return; 20 return;
21 } 21 }
22 22
23 desc->status &= ~IRQ_MOVE_PENDING; 23 irqd_clr_move_pending(&desc->irq_data);
24 24
25 if (unlikely(cpumask_empty(desc->pending_mask))) 25 if (unlikely(cpumask_empty(desc->pending_mask)))
26 return; 26 return;
@@ -35,7 +35,7 @@ void move_masked_irq(int irq)
35 * do the disable, re-program, enable sequence. 35 * do the disable, re-program, enable sequence.
36 * This is *not* particularly important for level triggered 36 * This is *not* particularly important for level triggered
37 * but in a edge trigger case, we might be setting rte 37 * but in a edge trigger case, we might be setting rte
38 * when an active trigger is comming in. This could 38 * when an active trigger is coming in. This could
39 * cause some ioapics to mal-function. 39 * cause some ioapics to mal-function.
40 * Being paranoid i guess! 40 * Being paranoid i guess!
41 * 41 *
@@ -53,15 +53,14 @@ void move_masked_irq(int irq)
53 cpumask_clear(desc->pending_mask); 53 cpumask_clear(desc->pending_mask);
54} 54}
55 55
56void move_native_irq(int irq) 56void irq_move_irq(struct irq_data *idata)
57{ 57{
58 struct irq_desc *desc = irq_to_desc(irq);
59 bool masked; 58 bool masked;
60 59
61 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 60 if (likely(!irqd_is_setaffinity_pending(idata)))
62 return; 61 return;
63 62
64 if (unlikely(desc->status & IRQ_DISABLED)) 63 if (unlikely(irqd_irq_disabled(idata)))
65 return; 64 return;
66 65
67 /* 66 /*
@@ -69,10 +68,10 @@ void move_native_irq(int irq)
69 * threaded interrupt with ONESHOT set, we can end up with an 68 * threaded interrupt with ONESHOT set, we can end up with an
70 * interrupt storm. 69 * interrupt storm.
71 */ 70 */
72 masked = desc->status & IRQ_MASKED; 71 masked = irqd_irq_masked(idata);
73 if (!masked) 72 if (!masked)
74 desc->irq_data.chip->irq_mask(&desc->irq_data); 73 idata->chip->irq_mask(idata);
75 move_masked_irq(irq); 74 irq_move_masked_irq(idata);
76 if (!masked) 75 if (!masked)
77 desc->irq_data.chip->irq_unmask(&desc->irq_data); 76 idata->chip->irq_unmask(idata);
78} 77}
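
The migration helpers now take a struct irq_data instead of an irq number, so callers (typically the arch's ack/eoi path, assuming CONFIG_GENERIC_PENDING_IRQ) hand in the data they already have. A minimal sketch of a chip callback using the new signature; the chip and its acknowledge routine are invented for illustration:

#include <linux/irq.h>

/* hypothetical low-level acknowledge of the interrupt controller */
static void fake_hw_ack(struct irq_data *data)
{
	/* write to a controller register here */
}

/*
 * Hypothetical .irq_ack callback: perform any pending affinity move while
 * acknowledging, via the irq_data based helper instead of the old
 * move_native_irq(irq).
 */
static void fake_irq_ack(struct irq_data *data)
{
	irq_move_irq(data);	/* no-op unless an affinity move is pending */
	fake_hw_ack(data);
}

static struct irq_chip fake_chip = {
	.name		= "fake",
	.irq_ack	= fake_irq_ack,
};
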
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 0d4005d85b03..f76fc00c9877 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -18,7 +18,7 @@
18 * During system-wide suspend or hibernation device drivers need to be prevented 18 * During system-wide suspend or hibernation device drivers need to be prevented
19 * from receiving interrupts and this function is provided for this purpose. 19 * from receiving interrupts and this function is provided for this purpose.
20 * It marks all interrupt lines in use, except for the timer ones, as disabled 20 * It marks all interrupt lines in use, except for the timer ones, as disabled
21 * and sets the IRQ_SUSPENDED flag for each of them. 21 * and sets the IRQS_SUSPENDED flag for each of them.
22 */ 22 */
23void suspend_device_irqs(void) 23void suspend_device_irqs(void)
24{ 24{
@@ -34,7 +34,7 @@ void suspend_device_irqs(void)
34 } 34 }
35 35
36 for_each_irq_desc(irq, desc) 36 for_each_irq_desc(irq, desc)
37 if (desc->status & IRQ_SUSPENDED) 37 if (desc->istate & IRQS_SUSPENDED)
38 synchronize_irq(irq); 38 synchronize_irq(irq);
39} 39}
40EXPORT_SYMBOL_GPL(suspend_device_irqs); 40EXPORT_SYMBOL_GPL(suspend_device_irqs);
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs);
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() 43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 * 44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that 45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQ_SUSPENDED flag set. 46 * have the IRQS_SUSPENDED flag set.
47 */ 47 */
48void resume_device_irqs(void) 48void resume_device_irqs(void)
49{ 49{
@@ -53,9 +53,6 @@ void resume_device_irqs(void)
53 for_each_irq_desc(irq, desc) { 53 for_each_irq_desc(irq, desc) {
54 unsigned long flags; 54 unsigned long flags;
55 55
56 if (!(desc->status & IRQ_SUSPENDED))
57 continue;
58
59 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true); 57 __enable_irq(desc, irq, true);
61 raw_spin_unlock_irqrestore(&desc->lock, flags); 58 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -71,9 +68,24 @@ int check_wakeup_irqs(void)
71 struct irq_desc *desc; 68 struct irq_desc *desc;
72 int irq; 69 int irq;
73 70
74 for_each_irq_desc(irq, desc) 71 for_each_irq_desc(irq, desc) {
75 if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) 72 if (irqd_is_wakeup_set(&desc->irq_data)) {
76 return -EBUSY; 73 if (desc->istate & IRQS_PENDING)
74 return -EBUSY;
75 continue;
76 }
77 /*
 78 * Check whether the non-wakeup interrupts need to
 79 * be masked before finally going into suspend
80 * state. That's for hardware which has no wakeup
81 * source configuration facility. The chip
82 * implementation indicates that with
83 * IRQCHIP_MASK_ON_SUSPEND.
84 */
85 if (desc->istate & IRQS_SUSPENDED &&
86 irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
87 mask_irq(desc);
88 }
77 89
78 return 0; 90 return 0;
79} 91}
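
check_wakeup_irqs() now also masks suspended, non-wakeup interrupts for controllers that cannot configure wakeup sources in hardware; such a chip opts in by setting IRQCHIP_MASK_ON_SUSPEND in its flags. A hypothetical chip declaration (names and callbacks invented for the sketch):

#include <linux/irq.h>

static void fake_mask(struct irq_data *data)   { /* mask in hardware */ }
static void fake_unmask(struct irq_data *data) { /* unmask in hardware */ }

/*
 * This controller has no per-line wakeup configuration, so ask the core
 * to mask every non-wakeup interrupt right before entering suspend.
 */
static struct irq_chip fake_pm_chip = {
	.name		= "fake-pm",
	.irq_mask	= fake_mask,
	.irq_unmask	= fake_unmask,
	.flags		= IRQCHIP_MASK_ON_SUSPEND,
};
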
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6c8a2a9f8a7b..4bd4faa6323a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -11,6 +11,7 @@
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h>
14 15
15#include "internals.h" 16#include "internals.h"
16 17
@@ -18,16 +19,19 @@ static struct proc_dir_entry *root_irq_dir;
18 19
19#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
20 21
21static int irq_affinity_proc_show(struct seq_file *m, void *v) 22static int show_irq_affinity(int type, struct seq_file *m, void *v)
22{ 23{
23 struct irq_desc *desc = irq_to_desc((long)m->private); 24 struct irq_desc *desc = irq_to_desc((long)m->private);
24 const struct cpumask *mask = desc->irq_data.affinity; 25 const struct cpumask *mask = desc->irq_data.affinity;
25 26
26#ifdef CONFIG_GENERIC_PENDING_IRQ 27#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING) 28 if (irqd_is_setaffinity_pending(&desc->irq_data))
28 mask = desc->pending_mask; 29 mask = desc->pending_mask;
29#endif 30#endif
30 seq_cpumask(m, mask); 31 if (type)
32 seq_cpumask_list(m, mask);
33 else
34 seq_cpumask(m, mask);
31 seq_putc(m, '\n'); 35 seq_putc(m, '\n');
32 return 0; 36 return 0;
33} 37}
@@ -58,21 +62,34 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
58#endif 62#endif
59 63
60int no_irq_affinity; 64int no_irq_affinity;
61static ssize_t irq_affinity_proc_write(struct file *file, 65static int irq_affinity_proc_show(struct seq_file *m, void *v)
66{
67 return show_irq_affinity(0, m, v);
68}
69
70static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
71{
72 return show_irq_affinity(1, m, v);
73}
74
75
76static ssize_t write_irq_affinity(int type, struct file *file,
62 const char __user *buffer, size_t count, loff_t *pos) 77 const char __user *buffer, size_t count, loff_t *pos)
63{ 78{
64 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; 79 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
65 cpumask_var_t new_value; 80 cpumask_var_t new_value;
66 int err; 81 int err;
67 82
68 if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || 83 if (!irq_can_set_affinity(irq) || no_irq_affinity)
69 irq_balancing_disabled(irq))
70 return -EIO; 84 return -EIO;
71 85
72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 86 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
73 return -ENOMEM; 87 return -ENOMEM;
74 88
75 err = cpumask_parse_user(buffer, count, new_value); 89 if (type)
90 err = cpumask_parselist_user(buffer, count, new_value);
91 else
92 err = cpumask_parse_user(buffer, count, new_value);
76 if (err) 93 if (err)
77 goto free_cpumask; 94 goto free_cpumask;
78 95
@@ -89,7 +106,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
89 if (!cpumask_intersects(new_value, cpu_online_mask)) { 106 if (!cpumask_intersects(new_value, cpu_online_mask)) {
90 /* Special case for empty set - allow the architecture 107 /* Special case for empty set - allow the architecture
91 code to set default SMP affinity. */ 108 code to set default SMP affinity. */
92 err = irq_select_affinity_usr(irq) ? -EINVAL : count; 109 err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count;
93 } else { 110 } else {
94 irq_set_affinity(irq, new_value); 111 irq_set_affinity(irq, new_value);
95 err = count; 112 err = count;
@@ -100,11 +117,28 @@ free_cpumask:
100 return err; 117 return err;
101} 118}
102 119
120static ssize_t irq_affinity_proc_write(struct file *file,
121 const char __user *buffer, size_t count, loff_t *pos)
122{
123 return write_irq_affinity(0, file, buffer, count, pos);
124}
125
126static ssize_t irq_affinity_list_proc_write(struct file *file,
127 const char __user *buffer, size_t count, loff_t *pos)
128{
129 return write_irq_affinity(1, file, buffer, count, pos);
130}
131
103static int irq_affinity_proc_open(struct inode *inode, struct file *file) 132static int irq_affinity_proc_open(struct inode *inode, struct file *file)
104{ 133{
105 return single_open(file, irq_affinity_proc_show, PDE(inode)->data); 134 return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
106} 135}
107 136
137static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
138{
139 return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data);
140}
141
108static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) 142static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
109{ 143{
110 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); 144 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
@@ -125,6 +159,14 @@ static const struct file_operations irq_affinity_hint_proc_fops = {
125 .release = single_release, 159 .release = single_release,
126}; 160};
127 161
162static const struct file_operations irq_affinity_list_proc_fops = {
163 .open = irq_affinity_list_proc_open,
164 .read = seq_read,
165 .llseek = seq_lseek,
166 .release = single_release,
167 .write = irq_affinity_list_proc_write,
168};
169
128static int default_affinity_show(struct seq_file *m, void *v) 170static int default_affinity_show(struct seq_file *m, void *v)
129{ 171{
130 seq_cpumask(m, irq_default_affinity); 172 seq_cpumask(m, irq_default_affinity);
@@ -289,6 +331,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
289 proc_create_data("affinity_hint", 0400, desc->dir, 331 proc_create_data("affinity_hint", 0400, desc->dir,
290 &irq_affinity_hint_proc_fops, (void *)(long)irq); 332 &irq_affinity_hint_proc_fops, (void *)(long)irq);
291 333
334 /* create /proc/irq/<irq>/smp_affinity_list */
335 proc_create_data("smp_affinity_list", 0600, desc->dir,
336 &irq_affinity_list_proc_fops, (void *)(long)irq);
337
292 proc_create_data("node", 0444, desc->dir, 338 proc_create_data("node", 0444, desc->dir,
293 &irq_node_proc_fops, (void *)(long)irq); 339 &irq_node_proc_fops, (void *)(long)irq);
294#endif 340#endif
@@ -306,6 +352,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
306#ifdef CONFIG_SMP 352#ifdef CONFIG_SMP
307 remove_proc_entry("smp_affinity", desc->dir); 353 remove_proc_entry("smp_affinity", desc->dir);
308 remove_proc_entry("affinity_hint", desc->dir); 354 remove_proc_entry("affinity_hint", desc->dir);
355 remove_proc_entry("smp_affinity_list", desc->dir);
309 remove_proc_entry("node", desc->dir); 356 remove_proc_entry("node", desc->dir);
310#endif 357#endif
311 remove_proc_entry("spurious", desc->dir); 358 remove_proc_entry("spurious", desc->dir);
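
The new smp_affinity_list file accepts and reports the affinity as a CPU list ("1,3-5") instead of a hex mask, parsed with cpumask_parselist_user(). A small user-space sketch that pins a hypothetical IRQ 42 to CPUs 2-5 and reads the setting back:

#include <stdio.h>

int main(void)
{
	const char *path = "/proc/irq/42/smp_affinity_list"; /* IRQ 42 is made up */
	char buf[256];
	FILE *f;

	/* write a CPU list instead of a hex mask */
	f = fopen(path, "w");
	if (!f || fputs("2-5\n", f) == EOF)
		perror("write smp_affinity_list");
	if (f)
		fclose(f);

	/* read the current affinity back in list form */
	f = fopen(path, "r");
	if (f && fgets(buf, sizeof(buf), f))
		printf("irq 42 affinity: %s", buf);
	if (f)
		fclose(f);
	return 0;
}
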
@@ -357,3 +404,83 @@ void init_irq_proc(void)
357 } 404 }
358} 405}
359 406
407#ifdef CONFIG_GENERIC_IRQ_SHOW
408
409int __weak arch_show_interrupts(struct seq_file *p, int prec)
410{
411 return 0;
412}
413
414#ifndef ACTUAL_NR_IRQS
415# define ACTUAL_NR_IRQS nr_irqs
416#endif
417
418int show_interrupts(struct seq_file *p, void *v)
419{
420 static int prec;
421
422 unsigned long flags, any_count = 0;
423 int i = *(loff_t *) v, j;
424 struct irqaction *action;
425 struct irq_desc *desc;
426
427 if (i > ACTUAL_NR_IRQS)
428 return 0;
429
430 if (i == ACTUAL_NR_IRQS)
431 return arch_show_interrupts(p, prec);
432
433 /* print header and calculate the width of the first column */
434 if (i == 0) {
435 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
436 j *= 10;
437
438 seq_printf(p, "%*s", prec + 8, "");
439 for_each_online_cpu(j)
440 seq_printf(p, "CPU%-8d", j);
441 seq_putc(p, '\n');
442 }
443
444 desc = irq_to_desc(i);
445 if (!desc)
446 return 0;
447
448 raw_spin_lock_irqsave(&desc->lock, flags);
449 for_each_online_cpu(j)
450 any_count |= kstat_irqs_cpu(i, j);
451 action = desc->action;
452 if (!action && !any_count)
453 goto out;
454
455 seq_printf(p, "%*d: ", prec, i);
456 for_each_online_cpu(j)
457 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
458
459 if (desc->irq_data.chip) {
460 if (desc->irq_data.chip->irq_print_chip)
461 desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
462 else if (desc->irq_data.chip->name)
463 seq_printf(p, " %8s", desc->irq_data.chip->name);
464 else
465 seq_printf(p, " %8s", "-");
466 } else {
467 seq_printf(p, " %8s", "None");
468 }
469#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
470 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
471#endif
472 if (desc->name)
473 seq_printf(p, "-%-8s", desc->name);
474
475 if (action) {
476 seq_printf(p, " %s", action->name);
477 while ((action = action->next) != NULL)
478 seq_printf(p, ", %s", action->name);
479 }
480
481 seq_putc(p, '\n');
482out:
483 raw_spin_unlock_irqrestore(&desc->lock, flags);
484 return 0;
485}
486#endif
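
show_interrupts() above is the generic /proc/interrupts implementation selected by CONFIG_GENERIC_IRQ_SHOW; an architecture only supplies its trailing summary rows by overriding the weak arch_show_interrupts(). A hypothetical override (the counter it prints is invented):

#include <linux/seq_file.h>
#include <linux/kernel_stat.h>

/* hypothetical per-arch counter, e.g. spurious vector hits */
extern unsigned int fake_spurious_count;

/*
 * Overrides the __weak stub in kernel/irq/proc.c; 'prec' is the width of
 * the irq-number column so the extra rows line up with the table above.
 */
int arch_show_interrupts(struct seq_file *p, int prec)
{
	seq_printf(p, "%*s: %10u   Spurious interrupts (hypothetical)\n",
		   prec, "SPU", fake_spurious_count);
	return 0;
}
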
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 891115a929aa..14dd5761e8c9 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -23,7 +23,7 @@
23#ifdef CONFIG_HARDIRQS_SW_RESEND 23#ifdef CONFIG_HARDIRQS_SW_RESEND
24 24
25/* Bitmap to handle software resend of interrupts: */ 25/* Bitmap to handle software resend of interrupts: */
26static DECLARE_BITMAP(irqs_resend, NR_IRQS); 26static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
27 27
28/* 28/*
29 * Run software resends of IRQ's 29 * Run software resends of IRQ's
@@ -55,20 +55,18 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
55 */ 55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq) 56void check_irq_resend(struct irq_desc *desc, unsigned int irq)
57{ 57{
58 unsigned int status = desc->status;
59
60 /*
61 * Make sure the interrupt is enabled, before resending it:
62 */
63 desc->irq_data.chip->irq_enable(&desc->irq_data);
64
65 /* 58 /*
66 * We do not resend level type interrupts. Level type 59 * We do not resend level type interrupts. Level type
67 * interrupts are resent by hardware when they are still 60 * interrupts are resent by hardware when they are still
68 * active. 61 * active.
69 */ 62 */
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 63 if (irq_settings_is_level(desc))
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 64 return;
65 if (desc->istate & IRQS_REPLAY)
66 return;
67 if (desc->istate & IRQS_PENDING) {
68 desc->istate &= ~IRQS_PENDING;
69 desc->istate |= IRQS_REPLAY;
72 70
73 if (!desc->irq_data.chip->irq_retrigger || 71 if (!desc->irq_data.chip->irq_retrigger ||
74 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { 72 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
new file mode 100644
index 000000000000..f1667833d444
--- /dev/null
+++ b/kernel/irq/settings.h
@@ -0,0 +1,142 @@
1/*
2 * Internal header to deal with irq_desc->status which will be renamed
3 * to irq_desc->settings.
4 */
5enum {
6 _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS,
7 _IRQ_PER_CPU = IRQ_PER_CPU,
8 _IRQ_LEVEL = IRQ_LEVEL,
9 _IRQ_NOPROBE = IRQ_NOPROBE,
10 _IRQ_NOREQUEST = IRQ_NOREQUEST,
11 _IRQ_NOTHREAD = IRQ_NOTHREAD,
12 _IRQ_NOAUTOEN = IRQ_NOAUTOEN,
13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
16 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
17};
18
19#define IRQ_PER_CPU GOT_YOU_MORON
20#define IRQ_NO_BALANCING GOT_YOU_MORON
21#define IRQ_LEVEL GOT_YOU_MORON
22#define IRQ_NOPROBE GOT_YOU_MORON
23#define IRQ_NOREQUEST GOT_YOU_MORON
24#define IRQ_NOTHREAD GOT_YOU_MORON
25#define IRQ_NOAUTOEN GOT_YOU_MORON
26#define IRQ_NESTED_THREAD GOT_YOU_MORON
27#undef IRQF_MODIFY_MASK
28#define IRQF_MODIFY_MASK GOT_YOU_MORON
29
30static inline void
31irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
32{
33 desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK);
34 desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
35}
36
37static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
38{
39 return desc->status_use_accessors & _IRQ_PER_CPU;
40}
41
42static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
43{
44 desc->status_use_accessors |= _IRQ_PER_CPU;
45}
46
47static inline void irq_settings_set_no_balancing(struct irq_desc *desc)
48{
49 desc->status_use_accessors |= _IRQ_NO_BALANCING;
50}
51
52static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
53{
54 return desc->status_use_accessors & _IRQ_NO_BALANCING;
55}
56
57static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc)
58{
59 return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK;
60}
61
62static inline void
63irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask)
64{
65 desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK;
66 desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK;
67}
68
69static inline bool irq_settings_is_level(struct irq_desc *desc)
70{
71 return desc->status_use_accessors & _IRQ_LEVEL;
72}
73
74static inline void irq_settings_clr_level(struct irq_desc *desc)
75{
76 desc->status_use_accessors &= ~_IRQ_LEVEL;
77}
78
79static inline void irq_settings_set_level(struct irq_desc *desc)
80{
81 desc->status_use_accessors |= _IRQ_LEVEL;
82}
83
84static inline bool irq_settings_can_request(struct irq_desc *desc)
85{
86 return !(desc->status_use_accessors & _IRQ_NOREQUEST);
87}
88
89static inline void irq_settings_clr_norequest(struct irq_desc *desc)
90{
91 desc->status_use_accessors &= ~_IRQ_NOREQUEST;
92}
93
94static inline void irq_settings_set_norequest(struct irq_desc *desc)
95{
96 desc->status_use_accessors |= _IRQ_NOREQUEST;
97}
98
99static inline bool irq_settings_can_thread(struct irq_desc *desc)
100{
101 return !(desc->status_use_accessors & _IRQ_NOTHREAD);
102}
103
104static inline void irq_settings_clr_nothread(struct irq_desc *desc)
105{
106 desc->status_use_accessors &= ~_IRQ_NOTHREAD;
107}
108
109static inline void irq_settings_set_nothread(struct irq_desc *desc)
110{
111 desc->status_use_accessors |= _IRQ_NOTHREAD;
112}
113
114static inline bool irq_settings_can_probe(struct irq_desc *desc)
115{
116 return !(desc->status_use_accessors & _IRQ_NOPROBE);
117}
118
119static inline void irq_settings_clr_noprobe(struct irq_desc *desc)
120{
121 desc->status_use_accessors &= ~_IRQ_NOPROBE;
122}
123
124static inline void irq_settings_set_noprobe(struct irq_desc *desc)
125{
126 desc->status_use_accessors |= _IRQ_NOPROBE;
127}
128
129static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
130{
131 return desc->status_use_accessors & _IRQ_MOVE_PCNTXT;
132}
133
134static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
135{
136 return !(desc->status_use_accessors & _IRQ_NOAUTOEN);
137}
138
139static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
140{
141 return desc->status_use_accessors & _IRQ_NESTED_THREAD;
142}
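
settings.h wraps every status bit behind an accessor and then poisons the raw IRQ_* names ("GOT_YOU_MORON"), so any leftover direct use of desc->status inside kernel/irq fails to compile. A hedged sketch of the conversion pattern for code inside kernel/irq (the function is invented and only illustrates the before/after style):

/* Inside kernel/irq/, after #include "internals.h" and "settings.h": */

/* old style -- no longer compiles here, IRQ_NOPROBE expands to GOT_YOU_MORON:
 *	if (desc->status & IRQ_NOPROBE)
 *		return false;
 */
static bool fake_may_autoprobe(struct irq_desc *desc)
{
	/* new style: query and modify the settings via the accessors only */
	if (!irq_settings_can_probe(desc))
		return false;
	if (irq_settings_is_level(desc))	/* leave level lines alone here */
		return false;
	irq_settings_set_noprobe(desc);		/* claim it for this (invented) probe */
	return true;
}
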
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 3089d3b9d5f3..dfbd550401b2 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -21,70 +21,93 @@ static int irqfixup __read_mostly;
21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) 21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
22static void poll_spurious_irqs(unsigned long dummy); 22static void poll_spurious_irqs(unsigned long dummy);
23static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); 23static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
24static int irq_poll_cpu;
25static atomic_t irq_poll_active;
26
27/*
28 * We wait here for a poller to finish.
29 *
30 * If the poll runs on this CPU, then we yell loudly and return
31 * false. That will leave the interrupt line disabled in the worst
32 * case, but it should never happen.
33 *
34 * We wait until the poller is done and then recheck disabled and
35 * action (about to be disabled). Only if it's still active, we return
36 * true and let the handler run.
37 */
38bool irq_wait_for_poll(struct irq_desc *desc)
39{
40 if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
41 "irq poll in progress on cpu %d for irq %d\n",
42 smp_processor_id(), desc->irq_data.irq))
43 return false;
44
45#ifdef CONFIG_SMP
46 do {
47 raw_spin_unlock(&desc->lock);
48 while (irqd_irq_inprogress(&desc->irq_data))
49 cpu_relax();
50 raw_spin_lock(&desc->lock);
51 } while (irqd_irq_inprogress(&desc->irq_data));
52 /* Might have been disabled in meantime */
53 return !irqd_irq_disabled(&desc->irq_data) && desc->action;
54#else
55 return false;
56#endif
57}
58
24 59
25/* 60/*
26 * Recovery handler for misrouted interrupts. 61 * Recovery handler for misrouted interrupts.
27 */ 62 */
28static int try_one_irq(int irq, struct irq_desc *desc) 63static int try_one_irq(int irq, struct irq_desc *desc, bool force)
29{ 64{
65 irqreturn_t ret = IRQ_NONE;
30 struct irqaction *action; 66 struct irqaction *action;
31 int ok = 0, work = 0;
32 67
33 raw_spin_lock(&desc->lock); 68 raw_spin_lock(&desc->lock);
34 /* Already running on another processor */
35 if (desc->status & IRQ_INPROGRESS) {
36 /*
37 * Already running: If it is shared get the other
38 * CPU to go looking for our mystery interrupt too
39 */
40 if (desc->action && (desc->action->flags & IRQF_SHARED))
41 desc->status |= IRQ_PENDING;
42 raw_spin_unlock(&desc->lock);
43 return ok;
44 }
45 /* Honour the normal IRQ locking */
46 desc->status |= IRQ_INPROGRESS;
47 action = desc->action;
48 raw_spin_unlock(&desc->lock);
49 69
50 while (action) { 70 /* PER_CPU and nested thread interrupts are never polled */
51 /* Only shared IRQ handlers are safe to call */ 71 if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc))
52 if (action->flags & IRQF_SHARED) { 72 goto out;
53 if (action->handler(irq, action->dev_id) ==
54 IRQ_HANDLED)
55 ok = 1;
56 }
57 action = action->next;
58 }
59 local_irq_disable();
60 /* Now clean up the flags */
61 raw_spin_lock(&desc->lock);
62 action = desc->action;
63 73
64 /* 74 /*
65 * While we were looking for a fixup someone queued a real 75 * Do not poll disabled interrupts unless the spurious
 66 * IRQ clashing with our walk: 76 * disabled poller asks explicitly.
67 */ 77 */
68 while ((desc->status & IRQ_PENDING) && action) { 78 if (irqd_irq_disabled(&desc->irq_data) && !force)
79 goto out;
80
81 /*
82 * All handlers must agree on IRQF_SHARED, so we test just the
83 * first. Check for action->next as well.
84 */
85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || !action->next)
88 goto out;
89
90 /* Already running on another processor */
91 if (irqd_irq_inprogress(&desc->irq_data)) {
69 /* 92 /*
70 * Perform real IRQ processing for the IRQ we deferred 93 * Already running: If it is shared get the other
94 * CPU to go looking for our mystery interrupt too
71 */ 95 */
72 work = 1; 96 desc->istate |= IRQS_PENDING;
73 raw_spin_unlock(&desc->lock); 97 goto out;
74 handle_IRQ_event(irq, action);
75 raw_spin_lock(&desc->lock);
76 desc->status &= ~IRQ_PENDING;
77 } 98 }
78 desc->status &= ~IRQ_INPROGRESS;
79 /*
80 * If we did actual work for the real IRQ line we must let the
81 * IRQ controller clean up too
82 */
83 if (work)
84 irq_end(irq, desc);
85 raw_spin_unlock(&desc->lock);
86 99
87 return ok; 100 /* Mark it poll in progress */
101 desc->istate |= IRQS_POLL_INPROGRESS;
102 do {
103 if (handle_irq_event(desc) == IRQ_HANDLED)
104 ret = IRQ_HANDLED;
105 action = desc->action;
106 } while ((desc->istate & IRQS_PENDING) && action);
107 desc->istate &= ~IRQS_POLL_INPROGRESS;
108out:
109 raw_spin_unlock(&desc->lock);
110 return ret == IRQ_HANDLED;
88} 111}
89 112
90static int misrouted_irq(int irq) 113static int misrouted_irq(int irq)
@@ -92,6 +115,11 @@ static int misrouted_irq(int irq)
92 struct irq_desc *desc; 115 struct irq_desc *desc;
93 int i, ok = 0; 116 int i, ok = 0;
94 117
118 if (atomic_inc_return(&irq_poll_active) == 1)
119 goto out;
120
121 irq_poll_cpu = smp_processor_id();
122
95 for_each_irq_desc(i, desc) { 123 for_each_irq_desc(i, desc) {
96 if (!i) 124 if (!i)
97 continue; 125 continue;
@@ -99,9 +127,11 @@ static int misrouted_irq(int irq)
99 if (i == irq) /* Already tried */ 127 if (i == irq) /* Already tried */
100 continue; 128 continue;
101 129
102 if (try_one_irq(i, desc)) 130 if (try_one_irq(i, desc, false))
103 ok = 1; 131 ok = 1;
104 } 132 }
133out:
134 atomic_dec(&irq_poll_active);
105 /* So the caller can adjust the irq error counts */ 135 /* So the caller can adjust the irq error counts */
106 return ok; 136 return ok;
107} 137}
@@ -111,23 +141,28 @@ static void poll_spurious_irqs(unsigned long dummy)
111 struct irq_desc *desc; 141 struct irq_desc *desc;
112 int i; 142 int i;
113 143
144 if (atomic_inc_return(&irq_poll_active) != 1)
145 goto out;
146 irq_poll_cpu = smp_processor_id();
147
114 for_each_irq_desc(i, desc) { 148 for_each_irq_desc(i, desc) {
115 unsigned int status; 149 unsigned int state;
116 150
117 if (!i) 151 if (!i)
118 continue; 152 continue;
119 153
120 /* Racy but it doesn't matter */ 154 /* Racy but it doesn't matter */
121 status = desc->status; 155 state = desc->istate;
122 barrier(); 156 barrier();
123 if (!(status & IRQ_SPURIOUS_DISABLED)) 157 if (!(state & IRQS_SPURIOUS_DISABLED))
124 continue; 158 continue;
125 159
126 local_irq_disable(); 160 local_irq_disable();
127 try_one_irq(i, desc); 161 try_one_irq(i, desc, true);
128 local_irq_enable(); 162 local_irq_enable();
129 } 163 }
130 164out:
165 atomic_dec(&irq_poll_active);
131 mod_timer(&poll_spurious_irq_timer, 166 mod_timer(&poll_spurious_irq_timer,
132 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 167 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
133} 168}
@@ -139,15 +174,13 @@ static void poll_spurious_irqs(unsigned long dummy)
139 * 174 *
140 * (The other 100-of-100,000 interrupts may have been a correctly 175 * (The other 100-of-100,000 interrupts may have been a correctly
141 * functioning device sharing an IRQ with the failing one) 176 * functioning device sharing an IRQ with the failing one)
142 *
143 * Called under desc->lock
144 */ 177 */
145
146static void 178static void
147__report_bad_irq(unsigned int irq, struct irq_desc *desc, 179__report_bad_irq(unsigned int irq, struct irq_desc *desc,
148 irqreturn_t action_ret) 180 irqreturn_t action_ret)
149{ 181{
150 struct irqaction *action; 182 struct irqaction *action;
183 unsigned long flags;
151 184
152 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { 185 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
153 printk(KERN_ERR "irq event %d: bogus return value %x\n", 186 printk(KERN_ERR "irq event %d: bogus return value %x\n",
@@ -159,6 +192,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
159 dump_stack(); 192 dump_stack();
160 printk(KERN_ERR "handlers:\n"); 193 printk(KERN_ERR "handlers:\n");
161 194
195 /*
196 * We need to take desc->lock here. note_interrupt() is called
 197 * w/o desc->lock held, but with the irq marked in progress. We might race
198 * with something else removing an action. It's ok to take
199 * desc->lock here. See synchronize_irq().
200 */
201 raw_spin_lock_irqsave(&desc->lock, flags);
162 action = desc->action; 202 action = desc->action;
163 while (action) { 203 while (action) {
164 printk(KERN_ERR "[<%p>]", action->handler); 204 printk(KERN_ERR "[<%p>]", action->handler);
@@ -167,6 +207,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
167 printk("\n"); 207 printk("\n");
168 action = action->next; 208 action = action->next;
169 } 209 }
210 raw_spin_unlock_irqrestore(&desc->lock, flags);
170} 211}
171 212
172static void 213static void
@@ -218,6 +259,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
218void note_interrupt(unsigned int irq, struct irq_desc *desc, 259void note_interrupt(unsigned int irq, struct irq_desc *desc,
219 irqreturn_t action_ret) 260 irqreturn_t action_ret)
220{ 261{
262 if (desc->istate & IRQS_POLL_INPROGRESS)
263 return;
264
221 if (unlikely(action_ret != IRQ_HANDLED)) { 265 if (unlikely(action_ret != IRQ_HANDLED)) {
222 /* 266 /*
223 * If we are seeing only the odd spurious IRQ caused by 267 * If we are seeing only the odd spurious IRQ caused by
@@ -254,9 +298,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
254 * Now kill the IRQ 298 * Now kill the IRQ
255 */ 299 */
256 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 300 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
257 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; 301 desc->istate |= IRQS_SPURIOUS_DISABLED;
258 desc->depth++; 302 desc->depth++;
259 desc->irq_data.chip->irq_disable(&desc->irq_data); 303 irq_disable(desc);
260 304
261 mod_timer(&poll_spurious_irq_timer, 305 mod_timer(&poll_spurious_irq_timer,
262 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 306 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
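
The poll path now announces itself through irq_poll_active/irq_poll_cpu, and a regular flow handler that finds the line already in progress can wait for the poller via irq_wait_for_poll() instead of silently dropping the interrupt. A simplified sketch of that entry check as a flow handler might perform it (the helper is invented; the real callers live in kernel/irq/chip.c and run with desc->lock held):

/* Inside kernel/irq/, with desc->lock held: may the handler run now? */
static bool fake_may_run_handler(struct irq_desc *desc)
{
	/* nobody else is handling or polling this line: go ahead */
	if (!irqd_irq_inprogress(&desc->irq_data))
		return true;

	/*
	 * Someone is already in the handler -- most likely the spurious
	 * poller on another CPU. Wait for it to finish and only run if the
	 * line is still enabled and still has actions attached.
	 */
	return irq_wait_for_poll(desc);
}
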
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 3b79bd938330..74d1c099fbd1 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -2,43 +2,23 @@
2 * jump label support 2 * jump label support
3 * 3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> 4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com>
5 * 6 *
6 */ 7 */
7#include <linux/jump_label.h>
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/list.h> 11#include <linux/list.h>
12#include <linux/jhash.h>
13#include <linux/slab.h> 12#include <linux/slab.h>
14#include <linux/sort.h> 13#include <linux/sort.h>
15#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/jump_label.h>
16 16
17#ifdef HAVE_JUMP_LABEL 17#ifdef HAVE_JUMP_LABEL
18 18
19#define JUMP_LABEL_HASH_BITS 6
20#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
21static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
22
 23/* mutex to protect coming/going of the jump_label table */ 19/* mutex to protect coming/going of the jump_label table */
24static DEFINE_MUTEX(jump_label_mutex); 20static DEFINE_MUTEX(jump_label_mutex);
25 21
26struct jump_label_entry {
27 struct hlist_node hlist;
28 struct jump_entry *table;
29 int nr_entries;
30 /* hang modules off here */
31 struct hlist_head modules;
32 unsigned long key;
33};
34
35struct jump_label_module_entry {
36 struct hlist_node hlist;
37 struct jump_entry *table;
38 int nr_entries;
39 struct module *mod;
40};
41
42void jump_label_lock(void) 22void jump_label_lock(void)
43{ 23{
44 mutex_lock(&jump_label_mutex); 24 mutex_lock(&jump_label_mutex);
@@ -49,6 +29,11 @@ void jump_label_unlock(void)
49 mutex_unlock(&jump_label_mutex); 29 mutex_unlock(&jump_label_mutex);
50} 30}
51 31
32bool jump_label_enabled(struct jump_label_key *key)
33{
34 return !!atomic_read(&key->enabled);
35}
36
52static int jump_label_cmp(const void *a, const void *b) 37static int jump_label_cmp(const void *a, const void *b)
53{ 38{
54 const struct jump_entry *jea = a; 39 const struct jump_entry *jea = a;
@@ -64,7 +49,7 @@ static int jump_label_cmp(const void *a, const void *b)
64} 49}
65 50
66static void 51static void
67sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) 52jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
68{ 53{
69 unsigned long size; 54 unsigned long size;
70 55
@@ -73,118 +58,25 @@ sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
73 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); 58 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
74} 59}
75 60
76static struct jump_label_entry *get_jump_label_entry(jump_label_t key) 61static void jump_label_update(struct jump_label_key *key, int enable);
77{
78 struct hlist_head *head;
79 struct hlist_node *node;
80 struct jump_label_entry *e;
81 u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
82
83 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
84 hlist_for_each_entry(e, node, head, hlist) {
85 if (key == e->key)
86 return e;
87 }
88 return NULL;
89}
90 62
91static struct jump_label_entry * 63void jump_label_inc(struct jump_label_key *key)
92add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
93{ 64{
94 struct hlist_head *head; 65 if (atomic_inc_not_zero(&key->enabled))
95 struct jump_label_entry *e; 66 return;
96 u32 hash;
97
98 e = get_jump_label_entry(key);
99 if (e)
100 return ERR_PTR(-EEXIST);
101
102 e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
103 if (!e)
104 return ERR_PTR(-ENOMEM);
105
106 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
107 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
108 e->key = key;
109 e->table = table;
110 e->nr_entries = nr_entries;
111 INIT_HLIST_HEAD(&(e->modules));
112 hlist_add_head(&e->hlist, head);
113 return e;
114}
115 67
116static int 68 jump_label_lock();
117build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) 69 if (atomic_add_return(1, &key->enabled) == 1)
118{ 70 jump_label_update(key, JUMP_LABEL_ENABLE);
119 struct jump_entry *iter, *iter_begin; 71 jump_label_unlock();
120 struct jump_label_entry *entry;
121 int count;
122
123 sort_jump_label_entries(start, stop);
124 iter = start;
125 while (iter < stop) {
126 entry = get_jump_label_entry(iter->key);
127 if (!entry) {
128 iter_begin = iter;
129 count = 0;
130 while ((iter < stop) &&
131 (iter->key == iter_begin->key)) {
132 iter++;
133 count++;
134 }
135 entry = add_jump_label_entry(iter_begin->key,
136 count, iter_begin);
137 if (IS_ERR(entry))
138 return PTR_ERR(entry);
139 } else {
140 WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
141 return -1;
142 }
143 }
144 return 0;
145} 72}
146 73
147/*** 74void jump_label_dec(struct jump_label_key *key)
148 * jump_label_update - update jump label text
149 * @key - key value associated with a a jump label
150 * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
151 *
152 * Will enable/disable the jump for jump label @key, depending on the
153 * value of @type.
154 *
155 */
156
157void jump_label_update(unsigned long key, enum jump_label_type type)
158{ 75{
159 struct jump_entry *iter; 76 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
160 struct jump_label_entry *entry; 77 return;
161 struct hlist_node *module_node;
162 struct jump_label_module_entry *e_module;
163 int count;
164 78
165 jump_label_lock(); 79 jump_label_update(key, JUMP_LABEL_DISABLE);
166 entry = get_jump_label_entry((jump_label_t)key);
167 if (entry) {
168 count = entry->nr_entries;
169 iter = entry->table;
170 while (count--) {
171 if (kernel_text_address(iter->code))
172 arch_jump_label_transform(iter, type);
173 iter++;
174 }
175 /* eanble/disable jump labels in modules */
176 hlist_for_each_entry(e_module, module_node, &(entry->modules),
177 hlist) {
178 count = e_module->nr_entries;
179 iter = e_module->table;
180 while (count--) {
181 if (iter->key &&
182 kernel_text_address(iter->code))
183 arch_jump_label_transform(iter, type);
184 iter++;
185 }
186 }
187 }
188 jump_label_unlock(); 80 jump_label_unlock();
189} 81}
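
The per-key hash table is gone; a jump label is now just a struct jump_label_key carrying a reference count, toggled with jump_label_inc()/jump_label_dec() and queried with jump_label_enabled(). A hedged usage sketch (the key and the event are invented; real users such as tracepoints wrap this in their own macros):

#include <linux/jump_label.h>
#include <linux/kernel.h>

/* hypothetical rarely-enabled debug event */
static struct jump_label_key fake_event_key;

static void fake_event_register(void)
{
	jump_label_inc(&fake_event_key);	/* first user patches the site in */
}

static void fake_event_unregister(void)
{
	jump_label_dec(&fake_event_key);	/* last user patches it back out */
}

static void fake_event_report(void)
{
	/* slow-path query; a fast path would use the static_branch() test */
	if (jump_label_enabled(&fake_event_key))
		pr_info("fake event is enabled\n");
}
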
190 82
@@ -197,77 +89,33 @@ static int addr_conflict(struct jump_entry *entry, void *start, void *end)
197 return 0; 89 return 0;
198} 90}
199 91
200#ifdef CONFIG_MODULES 92static int __jump_label_text_reserved(struct jump_entry *iter_start,
201 93 struct jump_entry *iter_stop, void *start, void *end)
202static int module_conflict(void *start, void *end)
203{ 94{
204 struct hlist_head *head;
205 struct hlist_node *node, *node_next, *module_node, *module_node_next;
206 struct jump_label_entry *e;
207 struct jump_label_module_entry *e_module;
208 struct jump_entry *iter; 95 struct jump_entry *iter;
209 int i, count;
210 int conflict = 0;
211
212 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
213 head = &jump_label_table[i];
214 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
215 hlist_for_each_entry_safe(e_module, module_node,
216 module_node_next,
217 &(e->modules), hlist) {
218 count = e_module->nr_entries;
219 iter = e_module->table;
220 while (count--) {
221 if (addr_conflict(iter, start, end)) {
222 conflict = 1;
223 goto out;
224 }
225 iter++;
226 }
227 }
228 }
229 }
230out:
231 return conflict;
232}
233
234#endif
235
236/***
237 * jump_label_text_reserved - check if addr range is reserved
238 * @start: start text addr
239 * @end: end text addr
240 *
241 * checks if the text addr located between @start and @end
242 * overlaps with any of the jump label patch addresses. Code
243 * that wants to modify kernel text should first verify that
244 * it does not overlap with any of the jump label addresses.
245 * Caller must hold jump_label_mutex.
246 *
247 * returns 1 if there is an overlap, 0 otherwise
248 */
249int jump_label_text_reserved(void *start, void *end)
250{
251 struct jump_entry *iter;
252 struct jump_entry *iter_start = __start___jump_table;
253 struct jump_entry *iter_stop = __start___jump_table;
254 int conflict = 0;
255 96
256 iter = iter_start; 97 iter = iter_start;
257 while (iter < iter_stop) { 98 while (iter < iter_stop) {
258 if (addr_conflict(iter, start, end)) { 99 if (addr_conflict(iter, start, end))
259 conflict = 1; 100 return 1;
260 goto out;
261 }
262 iter++; 101 iter++;
263 } 102 }
264 103
265 /* now check modules */ 104 return 0;
266#ifdef CONFIG_MODULES 105}
267 conflict = module_conflict(start, end); 106
268#endif 107static void __jump_label_update(struct jump_label_key *key,
269out: 108 struct jump_entry *entry, int enable)
270 return conflict; 109{
110 for (; entry->key == (jump_label_t)(unsigned long)key; entry++) {
111 /*
112 * entry->code set to 0 invalidates module init text sections
113 * kernel_text_address() verifies we are not in core kernel
114 * init code, see jump_label_invalidate_module_init().
115 */
116 if (entry->code && kernel_text_address(entry->code))
117 arch_jump_label_transform(entry, enable);
118 }
271} 119}
272 120
273/* 121/*
@@ -277,142 +125,173 @@ void __weak arch_jump_label_text_poke_early(jump_label_t addr)
277{ 125{
278} 126}
279 127
280static __init int init_jump_label(void) 128static __init int jump_label_init(void)
281{ 129{
282 int ret;
283 struct jump_entry *iter_start = __start___jump_table; 130 struct jump_entry *iter_start = __start___jump_table;
284 struct jump_entry *iter_stop = __stop___jump_table; 131 struct jump_entry *iter_stop = __stop___jump_table;
132 struct jump_label_key *key = NULL;
285 struct jump_entry *iter; 133 struct jump_entry *iter;
286 134
287 jump_label_lock(); 135 jump_label_lock();
288 ret = build_jump_label_hashtable(__start___jump_table, 136 jump_label_sort_entries(iter_start, iter_stop);
289 __stop___jump_table); 137
290 iter = iter_start; 138 for (iter = iter_start; iter < iter_stop; iter++) {
291 while (iter < iter_stop) {
292 arch_jump_label_text_poke_early(iter->code); 139 arch_jump_label_text_poke_early(iter->code);
293 iter++; 140 if (iter->key == (jump_label_t)(unsigned long)key)
141 continue;
142
143 key = (struct jump_label_key *)(unsigned long)iter->key;
144 atomic_set(&key->enabled, 0);
145 key->entries = iter;
146#ifdef CONFIG_MODULES
147 key->next = NULL;
148#endif
294 } 149 }
295 jump_label_unlock(); 150 jump_label_unlock();
296 return ret; 151
152 return 0;
297} 153}
298early_initcall(init_jump_label); 154early_initcall(jump_label_init);
299 155
300#ifdef CONFIG_MODULES 156#ifdef CONFIG_MODULES
301 157
302static struct jump_label_module_entry * 158struct jump_label_mod {
303add_jump_label_module_entry(struct jump_label_entry *entry, 159 struct jump_label_mod *next;
304 struct jump_entry *iter_begin, 160 struct jump_entry *entries;
305 int count, struct module *mod) 161 struct module *mod;
162};
163
164static int __jump_label_mod_text_reserved(void *start, void *end)
165{
166 struct module *mod;
167
168 mod = __module_text_address((unsigned long)start);
169 if (!mod)
170 return 0;
171
172 WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
173
174 return __jump_label_text_reserved(mod->jump_entries,
175 mod->jump_entries + mod->num_jump_entries,
176 start, end);
177}
178
179static void __jump_label_mod_update(struct jump_label_key *key, int enable)
180{
181 struct jump_label_mod *mod = key->next;
182
183 while (mod) {
184 __jump_label_update(key, mod->entries, enable);
185 mod = mod->next;
186 }
187}
188
189/***
190 * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
191 * @mod: module to patch
192 *
193 * Allow for run-time selection of the optimal nops. Before the module
194 * loads patch these with arch_get_jump_label_nop(), which is specified by
195 * the arch specific jump label code.
196 */
197void jump_label_apply_nops(struct module *mod)
306{ 198{
307 struct jump_label_module_entry *e; 199 struct jump_entry *iter_start = mod->jump_entries;
308 200 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
309 e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); 201 struct jump_entry *iter;
310 if (!e) 202
311 return ERR_PTR(-ENOMEM); 203 /* if the module doesn't have jump label entries, just return */
312 e->mod = mod; 204 if (iter_start == iter_stop)
313 e->nr_entries = count; 205 return;
314 e->table = iter_begin; 206
315 hlist_add_head(&e->hlist, &entry->modules); 207 for (iter = iter_start; iter < iter_stop; iter++)
316 return e; 208 arch_jump_label_text_poke_early(iter->code);
317} 209}
318 210
319static int add_jump_label_module(struct module *mod) 211static int jump_label_add_module(struct module *mod)
320{ 212{
321 struct jump_entry *iter, *iter_begin; 213 struct jump_entry *iter_start = mod->jump_entries;
322 struct jump_label_entry *entry; 214 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
323 struct jump_label_module_entry *module_entry; 215 struct jump_entry *iter;
324 int count; 216 struct jump_label_key *key = NULL;
217 struct jump_label_mod *jlm;
325 218
326 /* if the module doesn't have jump label entries, just return */ 219 /* if the module doesn't have jump label entries, just return */
327 if (!mod->num_jump_entries) 220 if (iter_start == iter_stop)
328 return 0; 221 return 0;
329 222
330 sort_jump_label_entries(mod->jump_entries, 223 jump_label_sort_entries(iter_start, iter_stop);
331 mod->jump_entries + mod->num_jump_entries); 224
332 iter = mod->jump_entries; 225 for (iter = iter_start; iter < iter_stop; iter++) {
333 while (iter < mod->jump_entries + mod->num_jump_entries) { 226 if (iter->key == (jump_label_t)(unsigned long)key)
334 entry = get_jump_label_entry(iter->key); 227 continue;
335 iter_begin = iter; 228
336 count = 0; 229 key = (struct jump_label_key *)(unsigned long)iter->key;
337 while ((iter < mod->jump_entries + mod->num_jump_entries) && 230
338 (iter->key == iter_begin->key)) { 231 if (__module_address(iter->key) == mod) {
339 iter++; 232 atomic_set(&key->enabled, 0);
340 count++; 233 key->entries = iter;
341 } 234 key->next = NULL;
342 if (!entry) { 235 continue;
343 entry = add_jump_label_entry(iter_begin->key, 0, NULL);
344 if (IS_ERR(entry))
345 return PTR_ERR(entry);
346 } 236 }
347 module_entry = add_jump_label_module_entry(entry, iter_begin, 237
348 count, mod); 238 jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL);
349 if (IS_ERR(module_entry)) 239 if (!jlm)
350 return PTR_ERR(module_entry); 240 return -ENOMEM;
241
242 jlm->mod = mod;
243 jlm->entries = iter;
244 jlm->next = key->next;
245 key->next = jlm;
246
247 if (jump_label_enabled(key))
248 __jump_label_update(key, iter, JUMP_LABEL_ENABLE);
351 } 249 }
250
352 return 0; 251 return 0;
353} 252}
354 253
355static void remove_jump_label_module(struct module *mod) 254static void jump_label_del_module(struct module *mod)
356{ 255{
357 struct hlist_head *head; 256 struct jump_entry *iter_start = mod->jump_entries;
358 struct hlist_node *node, *node_next, *module_node, *module_node_next; 257 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
359 struct jump_label_entry *e; 258 struct jump_entry *iter;
360 struct jump_label_module_entry *e_module; 259 struct jump_label_key *key = NULL;
361 int i; 260 struct jump_label_mod *jlm, **prev;
362 261
363 /* if the module doesn't have jump label entries, just return */ 262 for (iter = iter_start; iter < iter_stop; iter++) {
364 if (!mod->num_jump_entries) 263 if (iter->key == (jump_label_t)(unsigned long)key)
365 return; 264 continue;
265
266 key = (struct jump_label_key *)(unsigned long)iter->key;
267
268 if (__module_address(iter->key) == mod)
269 continue;
270
271 prev = &key->next;
272 jlm = key->next;
366 273
367 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { 274 while (jlm && jlm->mod != mod) {
368 head = &jump_label_table[i]; 275 prev = &jlm->next;
369 hlist_for_each_entry_safe(e, node, node_next, head, hlist) { 276 jlm = jlm->next;
370 hlist_for_each_entry_safe(e_module, module_node, 277 }
371 module_node_next, 278
372 &(e->modules), hlist) { 279 if (jlm) {
373 if (e_module->mod == mod) { 280 *prev = jlm->next;
374 hlist_del(&e_module->hlist); 281 kfree(jlm);
375 kfree(e_module);
376 }
377 }
378 if (hlist_empty(&e->modules) && (e->nr_entries == 0)) {
379 hlist_del(&e->hlist);
380 kfree(e);
381 }
382 } 282 }
383 } 283 }
384} 284}
385 285
386static void remove_jump_label_module_init(struct module *mod) 286static void jump_label_invalidate_module_init(struct module *mod)
387{ 287{
388 struct hlist_head *head; 288 struct jump_entry *iter_start = mod->jump_entries;
389 struct hlist_node *node, *node_next, *module_node, *module_node_next; 289 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
390 struct jump_label_entry *e;
391 struct jump_label_module_entry *e_module;
392 struct jump_entry *iter; 290 struct jump_entry *iter;
393 int i, count;
394
395 /* if the module doesn't have jump label entries, just return */
396 if (!mod->num_jump_entries)
397 return;
398 291
399 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { 292 for (iter = iter_start; iter < iter_stop; iter++) {
400 head = &jump_label_table[i]; 293 if (within_module_init(iter->code, mod))
401 hlist_for_each_entry_safe(e, node, node_next, head, hlist) { 294 iter->code = 0;
402 hlist_for_each_entry_safe(e_module, module_node,
403 module_node_next,
404 &(e->modules), hlist) {
405 if (e_module->mod != mod)
406 continue;
407 count = e_module->nr_entries;
408 iter = e_module->table;
409 while (count--) {
410 if (within_module_init(iter->code, mod))
411 iter->key = 0;
412 iter++;
413 }
414 }
415 }
416 } 295 }
417} 296}
418 297
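The kernel-doc for jump_label_text_reserved() above spells out the contract: code that patches kernel text should first check for overlap with jump label sites, with jump_label_mutex held. A minimal sketch of such a caller follows, assuming a hypothetical patcher routine; jump_label_lock()/jump_label_unlock(), the argument types and the 1/0 return convention are taken from the hunks above, while patch_text_checked() and the -EBUSY policy are illustrative.

#include <linux/jump_label.h>	/* jump_label_lock(), jump_label_text_reserved() */
#include <linux/errno.h>

/* Hedged sketch: a hypothetical text patcher refusing to touch jump label sites. */
static int patch_text_checked(void *addr, size_t len)
{
	int ret = 0;

	jump_label_lock();		/* the kernel-doc requires jump_label_mutex */
	if (jump_label_text_reserved(addr, (char *)addr + len)) {
		ret = -EBUSY;		/* range overlaps a jump label patch site */
	} else {
		/* ... perform the actual text modification here ... */
	}
	jump_label_unlock();

	return ret;
}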
@@ -426,59 +305,77 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
426 switch (val) { 305 switch (val) {
427 case MODULE_STATE_COMING: 306 case MODULE_STATE_COMING:
428 jump_label_lock(); 307 jump_label_lock();
429 ret = add_jump_label_module(mod); 308 ret = jump_label_add_module(mod);
430 if (ret) 309 if (ret)
431 remove_jump_label_module(mod); 310 jump_label_del_module(mod);
432 jump_label_unlock(); 311 jump_label_unlock();
433 break; 312 break;
434 case MODULE_STATE_GOING: 313 case MODULE_STATE_GOING:
435 jump_label_lock(); 314 jump_label_lock();
436 remove_jump_label_module(mod); 315 jump_label_del_module(mod);
437 jump_label_unlock(); 316 jump_label_unlock();
438 break; 317 break;
439 case MODULE_STATE_LIVE: 318 case MODULE_STATE_LIVE:
440 jump_label_lock(); 319 jump_label_lock();
441 remove_jump_label_module_init(mod); 320 jump_label_invalidate_module_init(mod);
442 jump_label_unlock(); 321 jump_label_unlock();
443 break; 322 break;
444 } 323 }
445 return ret;
446}
447 324
448/*** 325 return notifier_from_errno(ret);
449 * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
450 * @mod: module to patch
451 *
452 * Allow for run-time selection of the optimal nops. Before the module
453 * loads patch these with arch_get_jump_label_nop(), which is specified by
454 * the arch specific jump label code.
455 */
456void jump_label_apply_nops(struct module *mod)
457{
458 struct jump_entry *iter;
459
460 /* if the module doesn't have jump label entries, just return */
461 if (!mod->num_jump_entries)
462 return;
463
464 iter = mod->jump_entries;
465 while (iter < mod->jump_entries + mod->num_jump_entries) {
466 arch_jump_label_text_poke_early(iter->code);
467 iter++;
468 }
469} 326}
470 327
471struct notifier_block jump_label_module_nb = { 328struct notifier_block jump_label_module_nb = {
472 .notifier_call = jump_label_module_notify, 329 .notifier_call = jump_label_module_notify,
473 .priority = 0, 330 .priority = 1, /* higher than tracepoints */
474}; 331};
475 332
476static __init int init_jump_label_module(void) 333static __init int jump_label_init_module(void)
477{ 334{
478 return register_module_notifier(&jump_label_module_nb); 335 return register_module_notifier(&jump_label_module_nb);
479} 336}
480early_initcall(init_jump_label_module); 337early_initcall(jump_label_init_module);
481 338
482#endif /* CONFIG_MODULES */ 339#endif /* CONFIG_MODULES */
483 340
341/***
342 * jump_label_text_reserved - check if addr range is reserved
343 * @start: start text addr
344 * @end: end text addr
345 *
346 * checks if the text addr located between @start and @end
347 * overlaps with any of the jump label patch addresses. Code
348 * that wants to modify kernel text should first verify that
349 * it does not overlap with any of the jump label addresses.
350 * Caller must hold jump_label_mutex.
351 *
352 * returns 1 if there is an overlap, 0 otherwise
353 */
354int jump_label_text_reserved(void *start, void *end)
355{
356 int ret = __jump_label_text_reserved(__start___jump_table,
357 __stop___jump_table, start, end);
358
359 if (ret)
360 return ret;
361
362#ifdef CONFIG_MODULES
363 ret = __jump_label_mod_text_reserved(start, end);
364#endif
365 return ret;
366}
367
368static void jump_label_update(struct jump_label_key *key, int enable)
369{
370 struct jump_entry *entry = key->entries;
371
372 /* if there are no users, entry can be NULL */
373 if (entry)
374 __jump_label_update(key, entry, enable);
375
376#ifdef CONFIG_MODULES
377 __jump_label_mod_update(key, enable);
378#endif
379}
380
484#endif 381#endif
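For context on what the new jump_label_key plumbing above serves: a consumer declares one key per rarely-toggled condition, the compiler records every test site in the __jump_table section, and jump_label_init()/jump_label_add_module() wire those sorted entries to key->entries and key->next so a single toggle can patch every site. A hedged consumer-side sketch follows; only jump_label_enabled() and the jump_label_key fields appear in the hunks above, so the other interface names used here (static_branch(), jump_label_inc(), jump_label_dec()) and do_expensive_tracing() should be read as assumptions about the accompanying header.

#include <linux/jump_label.h>

static struct jump_label_key trace_foo_key;	/* zero-initialized: disabled */

static void foo_fast_path(void)
{
	/* assumed interface: compiles to a nop until the key is enabled,
	 * then arch_jump_label_transform() turns the site into a jump */
	if (static_branch(&trace_foo_key))
		do_expensive_tracing();		/* hypothetical slow path */
	/* ... hot path continues ... */
}

static void foo_tracing_set(int on)
{
	if (on)
		jump_label_inc(&trace_foo_key);	/* patches every site using this key */
	else
		jump_label_dec(&trace_foo_key);
}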
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6f6d091b5757..079f1d39a8b8 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr)
64 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || 64 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
65 arch_is_kernel_text(addr)) 65 arch_is_kernel_text(addr))
66 return 1; 66 return 1;
67 return in_gate_area_no_task(addr); 67 return in_gate_area_no_mm(addr);
68} 68}
69 69
70static inline int is_kernel(unsigned long addr) 70static inline int is_kernel(unsigned long addr)
71{ 71{
72 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) 72 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
73 return 1; 73 return 1;
74 return in_gate_area_no_task(addr); 74 return in_gate_area_no_mm(addr);
75} 75}
76 76
77static int is_ksym_addr(unsigned long addr) 77static int is_ksym_addr(unsigned long addr)
@@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
342} 342}
343 343
344/* Look up a kernel symbol and return it in a text buffer. */ 344/* Look up a kernel symbol and return it in a text buffer. */
345int sprint_symbol(char *buffer, unsigned long address) 345static int __sprint_symbol(char *buffer, unsigned long address,
346 int symbol_offset)
346{ 347{
347 char *modname; 348 char *modname;
348 const char *name; 349 const char *name;
349 unsigned long offset, size; 350 unsigned long offset, size;
350 int len; 351 int len;
351 352
353 address += symbol_offset;
352 name = kallsyms_lookup(address, &size, &offset, &modname, buffer); 354 name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
353 if (!name) 355 if (!name)
354 return sprintf(buffer, "0x%lx", address); 356 return sprintf(buffer, "0x%lx", address);
@@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address)
357 strcpy(buffer, name); 359 strcpy(buffer, name);
358 len = strlen(buffer); 360 len = strlen(buffer);
359 buffer += len; 361 buffer += len;
362 offset -= symbol_offset;
360 363
361 if (modname) 364 if (modname)
362 len += sprintf(buffer, "+%#lx/%#lx [%s]", 365 len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname);
363 offset, size, modname);
364 else 366 else
365 len += sprintf(buffer, "+%#lx/%#lx", offset, size); 367 len += sprintf(buffer, "+%#lx/%#lx", offset, size);
366 368
367 return len; 369 return len;
368} 370}
371
372/**
373 * sprint_symbol - Look up a kernel symbol and return it in a text buffer
374 * @buffer: buffer to be stored
375 * @address: address to lookup
376 *
377 * This function looks up a kernel symbol with @address and stores its name,
378 * offset, size and module name to @buffer if possible. If no symbol was found,
379 * just saves its @address as is.
380 *
381 * This function returns the number of bytes stored in @buffer.
382 */
383int sprint_symbol(char *buffer, unsigned long address)
384{
385 return __sprint_symbol(buffer, address, 0);
386}
387
369EXPORT_SYMBOL_GPL(sprint_symbol); 388EXPORT_SYMBOL_GPL(sprint_symbol);
370 389
390/**
391 * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
392 * @buffer: buffer to be stored
393 * @address: address to lookup
394 *
395 * This function is for stack backtrace and does the same thing as
396 * sprint_symbol() but with modified/decreased @address. If there is a
397 * tail-call to the function marked "noreturn", gcc optimized out code after
398 * the call so that the stack-saved return address could point outside of the
399 * caller. This function ensures that kallsyms will find the original caller
400 * by decreasing @address.
401 *
402 * This function returns the number of bytes stored in @buffer.
403 */
404int sprint_backtrace(char *buffer, unsigned long address)
405{
406 return __sprint_symbol(buffer, address, -1);
407}
408
371/* Look up a kernel symbol and print it to the kernel messages. */ 409/* Look up a kernel symbol and print it to the kernel messages. */
372void __print_symbol(const char *fmt, unsigned long address) 410void __print_symbol(const char *fmt, unsigned long address)
373{ 411{
@@ -477,13 +515,11 @@ static int s_show(struct seq_file *m, void *p)
477 */ 515 */
478 type = iter->exported ? toupper(iter->type) : 516 type = iter->exported ? toupper(iter->type) :
479 tolower(iter->type); 517 tolower(iter->type);
480 seq_printf(m, "%0*lx %c %s\t[%s]\n", 518 seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
481 (int)(2 * sizeof(void *)), 519 type, iter->name, iter->module_name);
482 iter->value, type, iter->name, iter->module_name);
483 } else 520 } else
484 seq_printf(m, "%0*lx %c %s\n", 521 seq_printf(m, "%pK %c %s\n", (void *)iter->value,
485 (int)(2 * sizeof(void *)), 522 iter->type, iter->name);
486 iter->value, iter->type, iter->name);
487 return 0; 523 return 0;
488} 524}
489 525
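The new sprint_backtrace() differs from sprint_symbol() only in the -1 offset applied before the lookup, which keeps a stack-saved return address after a noreturn tail-call attributed to the real caller. A hedged usage sketch, assuming the usual KSYM_SYMBOL_LEN buffer size from <linux/kallsyms.h> (not shown in this hunk):

#include <linux/kallsyms.h>
#include <linux/kernel.h>

static void report_caller(unsigned long ret_addr)
{
	char sym[KSYM_SYMBOL_LEN];	/* assumed kallsyms buffer size */

	/* resolves ret_addr - 1 so the symbol is the caller, not whatever
	 * happens to follow a noreturn tail-call */
	sprint_backtrace(sym, ret_addr);
	printk(KERN_DEBUG "called from %s\n", sym);
}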
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ec19b92c7ebd..8d814cbc8109 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h> 35#include <linux/kmsg_dump.h>
36#include <linux/syscore_ops.h>
36 37
37#include <asm/page.h> 38#include <asm/page.h>
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
@@ -144,7 +145,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
144 /* Initialize the list of destination pages */ 145 /* Initialize the list of destination pages */
145 INIT_LIST_HEAD(&image->dest_pages); 146 INIT_LIST_HEAD(&image->dest_pages);
146 147
147 /* Initialize the list of unuseable pages */ 148 /* Initialize the list of unusable pages */
148 INIT_LIST_HEAD(&image->unuseable_pages); 149 INIT_LIST_HEAD(&image->unuseable_pages);
149 150
150 /* Read in the segments */ 151 /* Read in the segments */
@@ -454,7 +455,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
454 /* Deal with the destination pages I have inadvertently allocated. 455 /* Deal with the destination pages I have inadvertently allocated.
455 * 456 *
456 * Ideally I would convert multi-page allocations into single 457 * Ideally I would convert multi-page allocations into single
457 * page allocations, and add everyting to image->dest_pages. 458 * page allocations, and add everything to image->dest_pages.
458 * 459 *
459 * For now it is simpler to just free the pages. 460 * For now it is simpler to just free the pages.
460 */ 461 */
@@ -602,7 +603,7 @@ static void kimage_free_extra_pages(struct kimage *image)
602 /* Walk through and free any extra destination pages I may have */ 603 /* Walk through and free any extra destination pages I may have */
603 kimage_free_page_list(&image->dest_pages); 604 kimage_free_page_list(&image->dest_pages);
604 605
605 /* Walk through and free any unuseable pages I have cached */ 606 /* Walk through and free any unusable pages I have cached */
606 kimage_free_page_list(&image->unuseable_pages); 607 kimage_free_page_list(&image->unuseable_pages);
607 608
608} 609}
@@ -1099,7 +1100,8 @@ size_t crash_get_memory_size(void)
1099 return size; 1100 return size;
1100} 1101}
1101 1102
1102static void free_reserved_phys_range(unsigned long begin, unsigned long end) 1103void __weak crash_free_reserved_phys_range(unsigned long begin,
1104 unsigned long end)
1103{ 1105{
1104 unsigned long addr; 1106 unsigned long addr;
1105 1107
@@ -1135,7 +1137,7 @@ int crash_shrink_memory(unsigned long new_size)
1135 start = roundup(start, PAGE_SIZE); 1137 start = roundup(start, PAGE_SIZE);
1136 end = roundup(start + new_size, PAGE_SIZE); 1138 end = roundup(start + new_size, PAGE_SIZE);
1137 1139
1138 free_reserved_phys_range(end, crashk_res.end); 1140 crash_free_reserved_phys_range(end, crashk_res.end);
1139 1141
1140 if ((start == end) && (crashk_res.parent != NULL)) 1142 if ((start == end) && (crashk_res.parent != NULL))
1141 release_resource(&crashk_res); 1143 release_resource(&crashk_res);
@@ -1529,8 +1531,7 @@ int kernel_kexec(void)
1529 if (error) 1531 if (error)
1530 goto Enable_cpus; 1532 goto Enable_cpus;
1531 local_irq_disable(); 1533 local_irq_disable();
1532 /* Suspend system devices */ 1534 error = syscore_suspend();
1533 error = sysdev_suspend(PMSG_FREEZE);
1534 if (error) 1535 if (error)
1535 goto Enable_irqs; 1536 goto Enable_irqs;
1536 } else 1537 } else
@@ -1545,7 +1546,7 @@ int kernel_kexec(void)
1545 1546
1546#ifdef CONFIG_KEXEC_JUMP 1547#ifdef CONFIG_KEXEC_JUMP
1547 if (kexec_image->preserve_context) { 1548 if (kexec_image->preserve_context) {
1548 sysdev_resume(); 1549 syscore_resume();
1549 Enable_irqs: 1550 Enable_irqs:
1550 local_irq_enable(); 1551 local_irq_enable();
1551 Enable_cpus: 1552 Enable_cpus:
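crash_free_reserved_phys_range() is now a __weak symbol so architectures can substitute their own way of returning shrunk crashkernel memory to the system. Only the new signature (and the addr local) is visible in this hunk; the sketch below is a reconstruction of roughly what the generic path does with standard page helpers, and should be treated as illustrative rather than a quote of the patch.

#include <linux/mm.h>
#include <asm/page.h>

void __weak crash_free_reserved_phys_range(unsigned long begin, unsigned long end)
{
	unsigned long addr;

	for (addr = begin; addr < end; addr += PAGE_SIZE) {
		struct page *page = pfn_to_page(addr >> PAGE_SHIFT);

		ClearPageReserved(page);		/* page was reserved for the crash kernel */
		init_page_count(page);
		free_page((unsigned long)__va(addr));	/* hand it back to the page allocator */
		totalram_pages++;
	}
}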
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9cd0591c96a2..ad6a81c58b44 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,6 +25,7 @@
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/completion.h> 27#include <linux/completion.h>
28#include <linux/cred.h>
28#include <linux/file.h> 29#include <linux/file.h>
29#include <linux/fdtable.h> 30#include <linux/fdtable.h>
30#include <linux/workqueue.h> 31#include <linux/workqueue.h>
@@ -43,6 +44,13 @@ extern int max_threads;
43 44
44static struct workqueue_struct *khelper_wq; 45static struct workqueue_struct *khelper_wq;
45 46
47#define CAP_BSET (void *)1
48#define CAP_PI (void *)2
49
50static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
51static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
52static DEFINE_SPINLOCK(umh_sysctl_lock);
53
46#ifdef CONFIG_MODULES 54#ifdef CONFIG_MODULES
47 55
48/* 56/*
@@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module);
132static int ____call_usermodehelper(void *data) 140static int ____call_usermodehelper(void *data)
133{ 141{
134 struct subprocess_info *sub_info = data; 142 struct subprocess_info *sub_info = data;
143 struct cred *new;
135 int retval; 144 int retval;
136 145
137 spin_lock_irq(&current->sighand->siglock); 146 spin_lock_irq(&current->sighand->siglock);
@@ -153,6 +162,19 @@ static int ____call_usermodehelper(void *data)
153 goto fail; 162 goto fail;
154 } 163 }
155 164
165 retval = -ENOMEM;
166 new = prepare_kernel_cred(current);
167 if (!new)
168 goto fail;
169
170 spin_lock(&umh_sysctl_lock);
171 new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
172 new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
173 new->cap_inheritable);
174 spin_unlock(&umh_sysctl_lock);
175
176 commit_creds(new);
177
156 retval = kernel_execve(sub_info->path, 178 retval = kernel_execve(sub_info->path,
157 (const char *const *)sub_info->argv, 179 (const char *const *)sub_info->argv,
158 (const char *const *)sub_info->envp); 180 (const char *const *)sub_info->envp);
@@ -245,7 +267,6 @@ static void __call_usermodehelper(struct work_struct *work)
245 } 267 }
246} 268}
247 269
248#ifdef CONFIG_PM_SLEEP
249/* 270/*
250 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 271 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
251 * (used for preventing user land processes from being created after the user 272 * (used for preventing user land processes from being created after the user
@@ -301,6 +322,15 @@ void usermodehelper_enable(void)
301 usermodehelper_disabled = 0; 322 usermodehelper_disabled = 0;
302} 323}
303 324
325/**
326 * usermodehelper_is_disabled - check if new helpers are allowed to be started
327 */
328bool usermodehelper_is_disabled(void)
329{
330 return usermodehelper_disabled;
331}
332EXPORT_SYMBOL_GPL(usermodehelper_is_disabled);
333
304static void helper_lock(void) 334static void helper_lock(void)
305{ 335{
306 atomic_inc(&running_helpers); 336 atomic_inc(&running_helpers);
@@ -312,12 +342,6 @@ static void helper_unlock(void)
312 if (atomic_dec_and_test(&running_helpers)) 342 if (atomic_dec_and_test(&running_helpers))
313 wake_up(&running_helpers_waitq); 343 wake_up(&running_helpers_waitq);
314} 344}
315#else /* CONFIG_PM_SLEEP */
316#define usermodehelper_disabled 0
317
318static inline void helper_lock(void) {}
319static inline void helper_unlock(void) {}
320#endif /* CONFIG_PM_SLEEP */
321 345
322/** 346/**
323 * call_usermodehelper_setup - prepare to call a usermode helper 347 * call_usermodehelper_setup - prepare to call a usermode helper
@@ -418,6 +442,84 @@ unlock:
418} 442}
419EXPORT_SYMBOL(call_usermodehelper_exec); 443EXPORT_SYMBOL(call_usermodehelper_exec);
420 444
445static int proc_cap_handler(struct ctl_table *table, int write,
446 void __user *buffer, size_t *lenp, loff_t *ppos)
447{
448 struct ctl_table t;
449 unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
450 kernel_cap_t new_cap;
451 int err, i;
452
453 if (write && (!capable(CAP_SETPCAP) ||
454 !capable(CAP_SYS_MODULE)))
455 return -EPERM;
456
457 /*
458 * convert from the global kernel_cap_t to the ulong array to print to
459 * userspace if this is a read.
460 */
461 spin_lock(&umh_sysctl_lock);
462 for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
463 if (table->data == CAP_BSET)
464 cap_array[i] = usermodehelper_bset.cap[i];
465 else if (table->data == CAP_PI)
466 cap_array[i] = usermodehelper_inheritable.cap[i];
467 else
468 BUG();
469 }
470 spin_unlock(&umh_sysctl_lock);
471
472 t = *table;
473 t.data = &cap_array;
474
475 /*
476 * actually read or write and array of ulongs from userspace. Remember
477 * these are least significant 32 bits first
478 */
479 err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
480 if (err < 0)
481 return err;
482
483 /*
484 * convert from the sysctl array of ulongs to the kernel_cap_t
485 * internal representation
486 */
487 for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
488 new_cap.cap[i] = cap_array[i];
489
490 /*
491 * Drop everything not in the new_cap (but don't add things)
492 */
493 spin_lock(&umh_sysctl_lock);
494 if (write) {
495 if (table->data == CAP_BSET)
496 usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
497 if (table->data == CAP_PI)
498 usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
499 }
500 spin_unlock(&umh_sysctl_lock);
501
502 return 0;
503}
504
505struct ctl_table usermodehelper_table[] = {
506 {
507 .procname = "bset",
508 .data = CAP_BSET,
509 .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
510 .mode = 0600,
511 .proc_handler = proc_cap_handler,
512 },
513 {
514 .procname = "inheritable",
515 .data = CAP_PI,
516 .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
517 .mode = 0600,
518 .proc_handler = proc_cap_handler,
519 },
520 { }
521};
522
421void __init usermodehelper_init(void) 523void __init usermodehelper_init(void)
422{ 524{
423 khelper_wq = create_singlethread_workqueue("khelper"); 525 khelper_wq = create_singlethread_workqueue("khelper");
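The two new sysctl entries expose the capability bounding set and inheritable set applied to usermode helpers; proc_cap_handler() prints them via proc_doulongvec_minmax() as _KERNEL_CAPABILITY_U32S unsigned longs, least significant 32 bits first, and writes can only drop bits (cap_intersect) and require both CAP_SETPCAP and CAP_SYS_MODULE. A hedged userspace sketch of the read side follows; the /proc path assumes the table is registered as "usermodehelper" under "kernel" (the registration is not part of this hunk), and two words per set is assumed from the current capability format.

#include <stdio.h>

int main(void)
{
	unsigned long lo, hi;
	FILE *f = fopen("/proc/sys/kernel/usermodehelper/bset", "r");	/* assumed path */

	if (!f) {
		perror("usermodehelper/bset");
		return 1;
	}
	if (fscanf(f, "%lu %lu", &lo, &hi) != 2) {	/* least significant 32 bits first */
		fprintf(stderr, "unexpected format\n");
		fclose(f);
		return 1;
	}
	printf("helper cap bounding set: 0x%08lx%08lx\n", hi, lo);
	fclose(f);
	return 0;
}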
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 0b624e791805..3b053c04dd86 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -16,6 +16,7 @@
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/capability.h>
19 20
20#define KERNEL_ATTR_RO(_name) \ 21#define KERNEL_ATTR_RO(_name) \
21static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 22static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -131,6 +132,14 @@ KERNEL_ATTR_RO(vmcoreinfo);
131 132
132#endif /* CONFIG_KEXEC */ 133#endif /* CONFIG_KEXEC */
133 134
135/* whether file capabilities are enabled */
136static ssize_t fscaps_show(struct kobject *kobj,
137 struct kobj_attribute *attr, char *buf)
138{
139 return sprintf(buf, "%d\n", file_caps_enabled);
140}
141KERNEL_ATTR_RO(fscaps);
142
134/* 143/*
135 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 144 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
136 */ 145 */
@@ -158,6 +167,7 @@ struct kobject *kernel_kobj;
158EXPORT_SYMBOL_GPL(kernel_kobj); 167EXPORT_SYMBOL_GPL(kernel_kobj);
159 168
160static struct attribute * kernel_attrs[] = { 169static struct attribute * kernel_attrs[] = {
170 &fscaps_attr.attr,
161#if defined(CONFIG_HOTPLUG) 171#if defined(CONFIG_HOTPLUG)
162 &uevent_seqnum_attr.attr, 172 &uevent_seqnum_attr.attr,
163 &uevent_helper_attr.attr, 173 &uevent_helper_attr.attr,
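The fscaps attribute above follows the standard three-step pattern for a read-only /sys/kernel file: a *_show() callback, the KERNEL_ATTR_RO() wrapper defined earlier in this file, and an entry in kernel_attrs[]. A hedged sketch of the same pattern with a hypothetical attribute, purely for illustration:

/* hypothetical read-only attribute, same shape as fscaps above */
static int example_value;

static ssize_t example_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%d\n", example_value);
}
KERNEL_ATTR_RO(example);	/* defines example_attr */

/* plus one more line in kernel_attrs[]:
 *	&example_attr.attr,
 */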
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c55afba990a3..3b34d2732bce 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@ struct kthread_create_info
27 /* Information passed to kthread() from kthreadd. */ 27 /* Information passed to kthread() from kthreadd. */
28 int (*threadfn)(void *data); 28 int (*threadfn)(void *data);
29 void *data; 29 void *data;
30 int node;
30 31
31 /* Result passed back to kthread_create() from kthreadd. */ 32 /* Result passed back to kthread_create() from kthreadd. */
32 struct task_struct *result; 33 struct task_struct *result;
@@ -98,10 +99,23 @@ static int kthread(void *_create)
98 do_exit(ret); 99 do_exit(ret);
99} 100}
100 101
102/* called from do_fork() to get node information for about to be created task */
103int tsk_fork_get_node(struct task_struct *tsk)
104{
105#ifdef CONFIG_NUMA
106 if (tsk == kthreadd_task)
107 return tsk->pref_node_fork;
108#endif
109 return numa_node_id();
110}
111
101static void create_kthread(struct kthread_create_info *create) 112static void create_kthread(struct kthread_create_info *create)
102{ 113{
103 int pid; 114 int pid;
104 115
116#ifdef CONFIG_NUMA
117 current->pref_node_fork = create->node;
118#endif
105 /* We want our own signal handler (we take no signals by default). */ 119 /* We want our own signal handler (we take no signals by default). */
106 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); 120 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
107 if (pid < 0) { 121 if (pid < 0) {
@@ -111,33 +125,38 @@ static void create_kthread(struct kthread_create_info *create)
111} 125}
112 126
113/** 127/**
114 * kthread_create - create a kthread. 128 * kthread_create_on_node - create a kthread.
115 * @threadfn: the function to run until signal_pending(current). 129 * @threadfn: the function to run until signal_pending(current).
116 * @data: data ptr for @threadfn. 130 * @data: data ptr for @threadfn.
131 * @node: memory node number.
117 * @namefmt: printf-style name for the thread. 132 * @namefmt: printf-style name for the thread.
118 * 133 *
119 * Description: This helper function creates and names a kernel 134 * Description: This helper function creates and names a kernel
120 * thread. The thread will be stopped: use wake_up_process() to start 135 * thread. The thread will be stopped: use wake_up_process() to start
121 * it. See also kthread_run(). 136 * it. See also kthread_run().
122 * 137 *
138 * If thread is going to be bound on a particular cpu, give its node
139 * in @node, to get NUMA affinity for kthread stack, or else give -1.
123 * When woken, the thread will run @threadfn() with @data as its 140 * When woken, the thread will run @threadfn() with @data as its
124 * argument. @threadfn() can either call do_exit() directly if it is a 141 * argument. @threadfn() can either call do_exit() directly if it is a
125 * standalone thread for which noone will call kthread_stop(), or 142 * standalone thread for which no one will call kthread_stop(), or
126 * return when 'kthread_should_stop()' is true (which means 143 * return when 'kthread_should_stop()' is true (which means
127 * kthread_stop() has been called). The return value should be zero 144 * kthread_stop() has been called). The return value should be zero
128 * or a negative error number; it will be passed to kthread_stop(). 145 * or a negative error number; it will be passed to kthread_stop().
129 * 146 *
130 * Returns a task_struct or ERR_PTR(-ENOMEM). 147 * Returns a task_struct or ERR_PTR(-ENOMEM).
131 */ 148 */
132struct task_struct *kthread_create(int (*threadfn)(void *data), 149struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
133 void *data, 150 void *data,
134 const char namefmt[], 151 int node,
135 ...) 152 const char namefmt[],
153 ...)
136{ 154{
137 struct kthread_create_info create; 155 struct kthread_create_info create;
138 156
139 create.threadfn = threadfn; 157 create.threadfn = threadfn;
140 create.data = data; 158 create.data = data;
159 create.node = node;
141 init_completion(&create.done); 160 init_completion(&create.done);
142 161
143 spin_lock(&kthread_create_lock); 162 spin_lock(&kthread_create_lock);
@@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
164 } 183 }
165 return create.result; 184 return create.result;
166} 185}
167EXPORT_SYMBOL(kthread_create); 186EXPORT_SYMBOL(kthread_create_on_node);
168 187
169/** 188/**
170 * kthread_bind - bind a just-created kthread to a cpu. 189 * kthread_bind - bind a just-created kthread to a cpu.
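kthread_create_on_node() lets a caller that intends to bind the thread to a CPU also place the thread's stack on that CPU's memory node; unbound callers pass -1, as the documentation above says. A hedged sketch of the bound case (my_worker_fn and the thread name are illustrative):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/err.h>

static int my_worker_fn(void *data)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();			/* woken by whoever queues work (illustrative) */
	}
	return 0;
}

static struct task_struct *start_worker_on(int cpu)
{
	struct task_struct *tsk;

	/* thread will be bound to @cpu, so ask for that CPU's memory node;
	 * callers that do not bind would pass -1 instead */
	tsk = kthread_create_on_node(my_worker_fn, NULL, cpu_to_node(cpu),
				     "my_worker/%d", cpu);
	if (IS_ERR(tsk))
		return tsk;

	kthread_bind(tsk, cpu);
	wake_up_process(tsk);			/* kthread_create_*() leaves it stopped */
	return tsk;
}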
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index ee74b35e528d..376066e10413 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -153,7 +153,7 @@ static inline void store_stacktrace(struct task_struct *tsk,
153} 153}
154 154
155/** 155/**
156 * __account_scheduler_latency - record an occured latency 156 * __account_scheduler_latency - record an occurred latency
157 * @tsk - the task struct of the task hitting the latency 157 * @tsk - the task struct of the task hitting the latency
158 * @usecs - the duration of the latency in microseconds 158 * @usecs - the duration of the latency in microseconds
159 * @inter - 1 if the sleep was interruptible, 0 if uninterruptible 159 * @inter - 1 if the sleep was interruptible, 0 if uninterruptible
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0d2058da80f5..63437d065ac8 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -490,6 +490,18 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
490 usage[i] = '\0'; 490 usage[i] = '\0';
491} 491}
492 492
493static int __print_lock_name(struct lock_class *class)
494{
495 char str[KSYM_NAME_LEN];
496 const char *name;
497
498 name = class->name;
499 if (!name)
500 name = __get_key_name(class->key, str);
501
502 return printk("%s", name);
503}
504
493static void print_lock_name(struct lock_class *class) 505static void print_lock_name(struct lock_class *class)
494{ 506{
495 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; 507 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
@@ -1053,6 +1065,56 @@ print_circular_bug_entry(struct lock_list *target, int depth)
1053 return 0; 1065 return 0;
1054} 1066}
1055 1067
1068static void
1069print_circular_lock_scenario(struct held_lock *src,
1070 struct held_lock *tgt,
1071 struct lock_list *prt)
1072{
1073 struct lock_class *source = hlock_class(src);
1074 struct lock_class *target = hlock_class(tgt);
1075 struct lock_class *parent = prt->class;
1076
1077 /*
1078 * A direct locking problem where unsafe_class lock is taken
1079 * directly by safe_class lock, then all we need to show
1080 * is the deadlock scenario, as it is obvious that the
1081 * unsafe lock is taken under the safe lock.
1082 *
1083 * But if there is a chain instead, where the safe lock takes
1084 * an intermediate lock (middle_class) where this lock is
1085 * not the same as the safe lock, then the lock chain is
1086 * used to describe the problem. Otherwise we would need
1087 * to show a different CPU case for each link in the chain
1088 * from the safe_class lock to the unsafe_class lock.
1089 */
1090 if (parent != source) {
1091 printk("Chain exists of:\n ");
1092 __print_lock_name(source);
1093 printk(" --> ");
1094 __print_lock_name(parent);
1095 printk(" --> ");
1096 __print_lock_name(target);
1097 printk("\n\n");
1098 }
1099
1100 printk(" Possible unsafe locking scenario:\n\n");
1101 printk(" CPU0 CPU1\n");
1102 printk(" ---- ----\n");
1103 printk(" lock(");
1104 __print_lock_name(target);
1105 printk(");\n");
1106 printk(" lock(");
1107 __print_lock_name(parent);
1108 printk(");\n");
1109 printk(" lock(");
1110 __print_lock_name(target);
1111 printk(");\n");
1112 printk(" lock(");
1113 __print_lock_name(source);
1114 printk(");\n");
1115 printk("\n *** DEADLOCK ***\n\n");
1116}
1117
1056/* 1118/*
1057 * When a circular dependency is detected, print the 1119 * When a circular dependency is detected, print the
1058 * header first: 1120 * header first:
@@ -1096,6 +1158,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1096{ 1158{
1097 struct task_struct *curr = current; 1159 struct task_struct *curr = current;
1098 struct lock_list *parent; 1160 struct lock_list *parent;
1161 struct lock_list *first_parent;
1099 int depth; 1162 int depth;
1100 1163
1101 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1164 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
@@ -1109,6 +1172,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1109 print_circular_bug_header(target, depth, check_src, check_tgt); 1172 print_circular_bug_header(target, depth, check_src, check_tgt);
1110 1173
1111 parent = get_lock_parent(target); 1174 parent = get_lock_parent(target);
1175 first_parent = parent;
1112 1176
1113 while (parent) { 1177 while (parent) {
1114 print_circular_bug_entry(parent, --depth); 1178 print_circular_bug_entry(parent, --depth);
@@ -1116,6 +1180,9 @@ static noinline int print_circular_bug(struct lock_list *this,
1116 } 1180 }
1117 1181
1118 printk("\nother info that might help us debug this:\n\n"); 1182 printk("\nother info that might help us debug this:\n\n");
1183 print_circular_lock_scenario(check_src, check_tgt,
1184 first_parent);
1185
1119 lockdep_print_held_locks(curr); 1186 lockdep_print_held_locks(curr);
1120 1187
1121 printk("\nstack backtrace:\n"); 1188 printk("\nstack backtrace:\n");
@@ -1314,7 +1381,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1314 printk("\n"); 1381 printk("\n");
1315 1382
1316 if (depth == 0 && (entry != root)) { 1383 if (depth == 0 && (entry != root)) {
1317 printk("lockdep:%s bad BFS generated tree\n", __func__); 1384 printk("lockdep:%s bad path found in chain graph\n", __func__);
1318 break; 1385 break;
1319 } 1386 }
1320 1387
@@ -1325,6 +1392,62 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1325 return; 1392 return;
1326} 1393}
1327 1394
1395static void
1396print_irq_lock_scenario(struct lock_list *safe_entry,
1397 struct lock_list *unsafe_entry,
1398 struct lock_class *prev_class,
1399 struct lock_class *next_class)
1400{
1401 struct lock_class *safe_class = safe_entry->class;
1402 struct lock_class *unsafe_class = unsafe_entry->class;
1403 struct lock_class *middle_class = prev_class;
1404
1405 if (middle_class == safe_class)
1406 middle_class = next_class;
1407
1408 /*
1409 * A direct locking problem where unsafe_class lock is taken
1410 * directly by safe_class lock, then all we need to show
1411 * is the deadlock scenario, as it is obvious that the
1412 * unsafe lock is taken under the safe lock.
1413 *
1414 * But if there is a chain instead, where the safe lock takes
1415 * an intermediate lock (middle_class) where this lock is
1416 * not the same as the safe lock, then the lock chain is
1417 * used to describe the problem. Otherwise we would need
1418 * to show a different CPU case for each link in the chain
1419 * from the safe_class lock to the unsafe_class lock.
1420 */
1421 if (middle_class != unsafe_class) {
1422 printk("Chain exists of:\n ");
1423 __print_lock_name(safe_class);
1424 printk(" --> ");
1425 __print_lock_name(middle_class);
1426 printk(" --> ");
1427 __print_lock_name(unsafe_class);
1428 printk("\n\n");
1429 }
1430
1431 printk(" Possible interrupt unsafe locking scenario:\n\n");
1432 printk(" CPU0 CPU1\n");
1433 printk(" ---- ----\n");
1434 printk(" lock(");
1435 __print_lock_name(unsafe_class);
1436 printk(");\n");
1437 printk(" local_irq_disable();\n");
1438 printk(" lock(");
1439 __print_lock_name(safe_class);
1440 printk(");\n");
1441 printk(" lock(");
1442 __print_lock_name(middle_class);
1443 printk(");\n");
1444 printk(" <Interrupt>\n");
1445 printk(" lock(");
1446 __print_lock_name(safe_class);
1447 printk(");\n");
1448 printk("\n *** DEADLOCK ***\n\n");
1449}
1450
1328static int 1451static int
1329print_bad_irq_dependency(struct task_struct *curr, 1452print_bad_irq_dependency(struct task_struct *curr,
1330 struct lock_list *prev_root, 1453 struct lock_list *prev_root,
@@ -1376,6 +1499,9 @@ print_bad_irq_dependency(struct task_struct *curr,
1376 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); 1499 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
1377 1500
1378 printk("\nother info that might help us debug this:\n\n"); 1501 printk("\nother info that might help us debug this:\n\n");
1502 print_irq_lock_scenario(backwards_entry, forwards_entry,
1503 hlock_class(prev), hlock_class(next));
1504
1379 lockdep_print_held_locks(curr); 1505 lockdep_print_held_locks(curr);
1380 1506
1381 printk("\nthe dependencies between %s-irq-safe lock", irqclass); 1507 printk("\nthe dependencies between %s-irq-safe lock", irqclass);
@@ -1539,6 +1665,26 @@ static inline void inc_chains(void)
1539 1665
1540#endif 1666#endif
1541 1667
1668static void
1669print_deadlock_scenario(struct held_lock *nxt,
1670 struct held_lock *prv)
1671{
1672 struct lock_class *next = hlock_class(nxt);
1673 struct lock_class *prev = hlock_class(prv);
1674
1675 printk(" Possible unsafe locking scenario:\n\n");
1676 printk(" CPU0\n");
1677 printk(" ----\n");
1678 printk(" lock(");
1679 __print_lock_name(prev);
1680 printk(");\n");
1681 printk(" lock(");
1682 __print_lock_name(next);
1683 printk(");\n");
1684 printk("\n *** DEADLOCK ***\n\n");
1685 printk(" May be due to missing lock nesting notation\n\n");
1686}
1687
1542static int 1688static int
1543print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, 1689print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1544 struct held_lock *next) 1690 struct held_lock *next)
@@ -1557,6 +1703,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1557 print_lock(prev); 1703 print_lock(prev);
1558 1704
1559 printk("\nother info that might help us debug this:\n"); 1705 printk("\nother info that might help us debug this:\n");
1706 print_deadlock_scenario(next, prev);
1560 lockdep_print_held_locks(curr); 1707 lockdep_print_held_locks(curr);
1561 1708
1562 printk("\nstack backtrace:\n"); 1709 printk("\nstack backtrace:\n");
@@ -1826,7 +1973,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1826 struct list_head *hash_head = chainhashentry(chain_key); 1973 struct list_head *hash_head = chainhashentry(chain_key);
1827 struct lock_chain *chain; 1974 struct lock_chain *chain;
1828 struct held_lock *hlock_curr, *hlock_next; 1975 struct held_lock *hlock_curr, *hlock_next;
1829 int i, j, n, cn; 1976 int i, j;
1830 1977
1831 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 1978 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1832 return 0; 1979 return 0;
@@ -1886,15 +2033,9 @@ cache_hit:
1886 } 2033 }
1887 i++; 2034 i++;
1888 chain->depth = curr->lockdep_depth + 1 - i; 2035 chain->depth = curr->lockdep_depth + 1 - i;
1889 cn = nr_chain_hlocks; 2036 if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1890 while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) { 2037 chain->base = nr_chain_hlocks;
1891 n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth); 2038 nr_chain_hlocks += chain->depth;
1892 if (n == cn)
1893 break;
1894 cn = n;
1895 }
1896 if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1897 chain->base = cn;
1898 for (j = 0; j < chain->depth - 1; j++, i++) { 2039 for (j = 0; j < chain->depth - 1; j++, i++) {
1899 int lock_id = curr->held_locks[i].class_idx - 1; 2040 int lock_id = curr->held_locks[i].class_idx - 1;
1900 chain_hlocks[chain->base + j] = lock_id; 2041 chain_hlocks[chain->base + j] = lock_id;
@@ -2011,6 +2152,24 @@ static void check_chain_key(struct task_struct *curr)
2011#endif 2152#endif
2012} 2153}
2013 2154
2155static void
2156print_usage_bug_scenario(struct held_lock *lock)
2157{
2158 struct lock_class *class = hlock_class(lock);
2159
2160 printk(" Possible unsafe locking scenario:\n\n");
2161 printk(" CPU0\n");
2162 printk(" ----\n");
2163 printk(" lock(");
2164 __print_lock_name(class);
2165 printk(");\n");
2166 printk(" <Interrupt>\n");
2167 printk(" lock(");
2168 __print_lock_name(class);
2169 printk(");\n");
2170 printk("\n *** DEADLOCK ***\n\n");
2171}
2172
2014static int 2173static int
2015print_usage_bug(struct task_struct *curr, struct held_lock *this, 2174print_usage_bug(struct task_struct *curr, struct held_lock *this,
2016 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) 2175 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
@@ -2039,6 +2198,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2039 2198
2040 print_irqtrace_events(curr); 2199 print_irqtrace_events(curr);
2041 printk("\nother info that might help us debug this:\n"); 2200 printk("\nother info that might help us debug this:\n");
2201 print_usage_bug_scenario(this);
2202
2042 lockdep_print_held_locks(curr); 2203 lockdep_print_held_locks(curr);
2043 2204
2044 printk("\nstack backtrace:\n"); 2205 printk("\nstack backtrace:\n");
@@ -2073,6 +2234,10 @@ print_irq_inversion_bug(struct task_struct *curr,
2073 struct held_lock *this, int forwards, 2234 struct held_lock *this, int forwards,
2074 const char *irqclass) 2235 const char *irqclass)
2075{ 2236{
2237 struct lock_list *entry = other;
2238 struct lock_list *middle = NULL;
2239 int depth;
2240
2076 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2241 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2077 return 0; 2242 return 0;
2078 2243
@@ -2091,6 +2256,25 @@ print_irq_inversion_bug(struct task_struct *curr,
2091 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 2256 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
2092 2257
2093 printk("\nother info that might help us debug this:\n"); 2258 printk("\nother info that might help us debug this:\n");
2259
2260 /* Find a middle lock (if one exists) */
2261 depth = get_lock_depth(other);
2262 do {
2263 if (depth == 0 && (entry != root)) {
2264 printk("lockdep:%s bad path found in chain graph\n", __func__);
2265 break;
2266 }
2267 middle = entry;
2268 entry = get_lock_parent(entry);
2269 depth--;
2270 } while (entry && entry != root && (depth >= 0));
2271 if (forwards)
2272 print_irq_lock_scenario(root, other,
2273 middle ? middle->class : root->class, other->class);
2274 else
2275 print_irq_lock_scenario(other, root,
2276 middle ? middle->class : other->class, root->class);
2277
2094 lockdep_print_held_locks(curr); 2278 lockdep_print_held_locks(curr);
2095 2279
2096 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); 2280 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
@@ -2309,7 +2493,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2309 if (unlikely(curr->hardirqs_enabled)) { 2493 if (unlikely(curr->hardirqs_enabled)) {
2310 /* 2494 /*
2311 * Neither irq nor preemption are disabled here 2495 * Neither irq nor preemption are disabled here
2312 * so this is racy by nature but loosing one hit 2496 * so this is racy by nature but losing one hit
2313 * in a stat is not a big deal. 2497 * in a stat is not a big deal.
2314 */ 2498 */
2315 __debug_atomic_inc(redundant_hardirqs_on); 2499 __debug_atomic_inc(redundant_hardirqs_on);
@@ -2620,7 +2804,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2620 if (!graph_lock()) 2804 if (!graph_lock())
2621 return 0; 2805 return 0;
2622 /* 2806 /*
2623 * Make sure we didnt race: 2807 * Make sure we didn't race:
2624 */ 2808 */
2625 if (unlikely(hlock_class(this)->usage_mask & new_mask)) { 2809 if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
2626 graph_unlock(); 2810 graph_unlock();
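The print_*_scenario() helpers added above render lockdep reports as a two-CPU interleaving rather than only a dependency chain. For reference, a hedged sketch of the classic AB-BA ordering that produces the new "Possible unsafe locking scenario" output; names are illustrative, and with CONFIG_PROVE_LOCKING the report fires once both orderings have been observed, no actual hang required:

#include <linux/mutex.h>

static DEFINE_MUTEX(lock_a);
static DEFINE_MUTEX(lock_b);

static void path_one(void)		/* takes A then B */
{
	mutex_lock(&lock_a);
	mutex_lock(&lock_b);
	mutex_unlock(&lock_b);
	mutex_unlock(&lock_a);
}

static void path_two(void)		/* takes B then A: the inverse ordering */
{
	mutex_lock(&lock_b);
	mutex_lock(&lock_a);		/* lockdep flags the circular dependency here */
	mutex_unlock(&lock_a);
	mutex_unlock(&lock_b);
}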
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 1969d2fc4b36..71edd2f60c02 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
225 nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, 225 nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
226 nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, 226 nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
227 nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, 227 nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
228 sum_forward_deps = 0, factor = 0; 228 sum_forward_deps = 0;
229 229
230 list_for_each_entry(class, &all_lock_classes, lock_entry) { 230 list_for_each_entry(class, &all_lock_classes, lock_entry) {
231 231
@@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
283 nr_hardirq_unsafe * nr_hardirq_safe + 283 nr_hardirq_unsafe * nr_hardirq_safe +
284 nr_list_entries); 284 nr_list_entries);
285 285
286 /*
287 * Estimated factor between direct and indirect
288 * dependencies:
289 */
290 if (nr_list_entries)
291 factor = sum_forward_deps / nr_list_entries;
292
293#ifdef CONFIG_PROVE_LOCKING 286#ifdef CONFIG_PROVE_LOCKING
294 seq_printf(m, " dependency chains: %11lu [max: %lu]\n", 287 seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
295 nr_lock_chains, MAX_LOCKDEP_CHAINS); 288 nr_lock_chains, MAX_LOCKDEP_CHAINS);
diff --git a/kernel/module.c b/kernel/module.c
index efa290ea94bf..795bdc7f5c3f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -57,6 +57,7 @@
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h> 58#include <linux/jump_label.h>
59#include <linux/pfn.h> 59#include <linux/pfn.h>
60#include <linux/bsearch.h>
60 61
61#define CREATE_TRACE_POINTS 62#define CREATE_TRACE_POINTS
62#include <trace/events/module.h> 63#include <trace/events/module.h>
@@ -240,23 +241,24 @@ static bool each_symbol_in_section(const struct symsearch *arr,
240 struct module *owner, 241 struct module *owner,
241 bool (*fn)(const struct symsearch *syms, 242 bool (*fn)(const struct symsearch *syms,
242 struct module *owner, 243 struct module *owner,
243 unsigned int symnum, void *data), 244 void *data),
244 void *data) 245 void *data)
245{ 246{
246 unsigned int i, j; 247 unsigned int j;
247 248
248 for (j = 0; j < arrsize; j++) { 249 for (j = 0; j < arrsize; j++) {
249 for (i = 0; i < arr[j].stop - arr[j].start; i++) 250 if (fn(&arr[j], owner, data))
250 if (fn(&arr[j], owner, i, data)) 251 return true;
251 return true;
252 } 252 }
253 253
254 return false; 254 return false;
255} 255}
256 256
257/* Returns true as soon as fn returns true, otherwise false. */ 257/* Returns true as soon as fn returns true, otherwise false. */
258bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, 258bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
259 unsigned int symnum, void *data), void *data) 259 struct module *owner,
260 void *data),
261 void *data)
260{ 262{
261 struct module *mod; 263 struct module *mod;
262 static const struct symsearch arr[] = { 264 static const struct symsearch arr[] = {
@@ -309,7 +311,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
309 } 311 }
310 return false; 312 return false;
311} 313}
312EXPORT_SYMBOL_GPL(each_symbol); 314EXPORT_SYMBOL_GPL(each_symbol_section);
313 315
314struct find_symbol_arg { 316struct find_symbol_arg {
315 /* Input */ 317 /* Input */
@@ -323,15 +325,12 @@ struct find_symbol_arg {
323 const struct kernel_symbol *sym; 325 const struct kernel_symbol *sym;
324}; 326};
325 327
326static bool find_symbol_in_section(const struct symsearch *syms, 328static bool check_symbol(const struct symsearch *syms,
327 struct module *owner, 329 struct module *owner,
328 unsigned int symnum, void *data) 330 unsigned int symnum, void *data)
329{ 331{
330 struct find_symbol_arg *fsa = data; 332 struct find_symbol_arg *fsa = data;
331 333
332 if (strcmp(syms->start[symnum].name, fsa->name) != 0)
333 return false;
334
335 if (!fsa->gplok) { 334 if (!fsa->gplok) {
336 if (syms->licence == GPL_ONLY) 335 if (syms->licence == GPL_ONLY)
337 return false; 336 return false;
@@ -365,6 +364,30 @@ static bool find_symbol_in_section(const struct symsearch *syms,
365 return true; 364 return true;
366} 365}
367 366
367static int cmp_name(const void *va, const void *vb)
368{
369 const char *a;
370 const struct kernel_symbol *b;
371 a = va; b = vb;
372 return strcmp(a, b->name);
373}
374
375static bool find_symbol_in_section(const struct symsearch *syms,
376 struct module *owner,
377 void *data)
378{
379 struct find_symbol_arg *fsa = data;
380 struct kernel_symbol *sym;
381
382 sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
383 sizeof(struct kernel_symbol), cmp_name);
384
385 if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data))
386 return true;
387
388 return false;
389}
390
368/* Find a symbol and return it, along with, (optional) crc and 391/* Find a symbol and return it, along with, (optional) crc and
369 * (optional) module which owns it. Needs preempt disabled or module_mutex. */ 392 * (optional) module which owns it. Needs preempt disabled or module_mutex. */
370const struct kernel_symbol *find_symbol(const char *name, 393const struct kernel_symbol *find_symbol(const char *name,
@@ -379,7 +402,7 @@ const struct kernel_symbol *find_symbol(const char *name,
379 fsa.gplok = gplok; 402 fsa.gplok = gplok;
380 fsa.warn = warn; 403 fsa.warn = warn;
381 404
382 if (each_symbol(find_symbol_in_section, &fsa)) { 405 if (each_symbol_section(find_symbol_in_section, &fsa)) {
383 if (owner) 406 if (owner)
384 *owner = fsa.owner; 407 *owner = fsa.owner;
385 if (crc) 408 if (crc)
@@ -809,7 +832,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
809 wait_for_zero_refcount(mod); 832 wait_for_zero_refcount(mod);
810 833
811 mutex_unlock(&module_mutex); 834 mutex_unlock(&module_mutex);
812 /* Final destruction now noone is using it. */ 835 /* Final destruction now no one is using it. */
813 if (mod->exit != NULL) 836 if (mod->exit != NULL)
814 mod->exit(); 837 mod->exit();
815 blocking_notifier_call_chain(&module_notify_list, 838 blocking_notifier_call_chain(&module_notify_list,
@@ -1168,7 +1191,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
1168{ 1191{
1169 struct module_sect_attr *sattr = 1192 struct module_sect_attr *sattr =
1170 container_of(mattr, struct module_sect_attr, mattr); 1193 container_of(mattr, struct module_sect_attr, mattr);
1171 return sprintf(buf, "0x%lx\n", sattr->address); 1194 return sprintf(buf, "0x%pK\n", (void *)sattr->address);
1172} 1195}
1173 1196
1174static void free_sect_attrs(struct module_sect_attrs *sect_attrs) 1197static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -1607,27 +1630,28 @@ static void set_section_ro_nx(void *base,
1607 } 1630 }
1608} 1631}
1609 1632
1610/* Setting memory back to RW+NX before releasing it */ 1633static void unset_module_core_ro_nx(struct module *mod)
1611void unset_section_ro_nx(struct module *mod, void *module_region)
1612{ 1634{
1613 unsigned long total_pages; 1635 set_page_attributes(mod->module_core + mod->core_text_size,
1614 1636 mod->module_core + mod->core_size,
1615 if (mod->module_core == module_region) { 1637 set_memory_x);
1616 /* Set core as NX+RW */ 1638 set_page_attributes(mod->module_core,
1617 total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); 1639 mod->module_core + mod->core_ro_size,
1618 set_memory_nx((unsigned long)mod->module_core, total_pages); 1640 set_memory_rw);
1619 set_memory_rw((unsigned long)mod->module_core, total_pages); 1641}
1620 1642
1621 } else if (mod->module_init == module_region) { 1643static void unset_module_init_ro_nx(struct module *mod)
1622 /* Set init as NX+RW */ 1644{
1623 total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); 1645 set_page_attributes(mod->module_init + mod->init_text_size,
1624 set_memory_nx((unsigned long)mod->module_init, total_pages); 1646 mod->module_init + mod->init_size,
1625 set_memory_rw((unsigned long)mod->module_init, total_pages); 1647 set_memory_x);
1626 } 1648 set_page_attributes(mod->module_init,
1649 mod->module_init + mod->init_ro_size,
1650 set_memory_rw);
1627} 1651}
1628 1652
1629/* Iterate through all modules and set each module's text as RW */ 1653/* Iterate through all modules and set each module's text as RW */
1630void set_all_modules_text_rw() 1654void set_all_modules_text_rw(void)
1631{ 1655{
1632 struct module *mod; 1656 struct module *mod;
1633 1657
@@ -1648,7 +1672,7 @@ void set_all_modules_text_rw()
1648} 1672}
1649 1673
1650/* Iterate through all modules and set each module's text as RO */ 1674/* Iterate through all modules and set each module's text as RO */
1651void set_all_modules_text_ro() 1675void set_all_modules_text_ro(void)
1652{ 1676{
1653 struct module *mod; 1677 struct module *mod;
1654 1678
@@ -1669,7 +1693,8 @@ void set_all_modules_text_ro()
1669} 1693}
1670#else 1694#else
1671static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } 1695static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
1672static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } 1696static void unset_module_core_ro_nx(struct module *mod) { }
1697static void unset_module_init_ro_nx(struct module *mod) { }
1673#endif 1698#endif
1674 1699
1675/* Free a module, remove from lists, etc. */ 1700/* Free a module, remove from lists, etc. */
@@ -1696,7 +1721,7 @@ static void free_module(struct module *mod)
1696 destroy_params(mod->kp, mod->num_kp); 1721 destroy_params(mod->kp, mod->num_kp);
1697 1722
1698 /* This may be NULL, but that's OK */ 1723 /* This may be NULL, but that's OK */
1699 unset_section_ro_nx(mod, mod->module_init); 1724 unset_module_init_ro_nx(mod);
1700 module_free(mod, mod->module_init); 1725 module_free(mod, mod->module_init);
1701 kfree(mod->args); 1726 kfree(mod->args);
1702 percpu_modfree(mod); 1727 percpu_modfree(mod);
@@ -1705,7 +1730,7 @@ static void free_module(struct module *mod)
1705 lockdep_free_key_range(mod->module_core, mod->core_size); 1730 lockdep_free_key_range(mod->module_core, mod->core_size);
1706 1731
1707 /* Finally, free the core (containing the module structure) */ 1732 /* Finally, free the core (containing the module structure) */
1708 unset_section_ro_nx(mod, mod->module_core); 1733 unset_module_core_ro_nx(mod);
1709 module_free(mod, mod->module_core); 1734 module_free(mod, mod->module_core);
1710 1735
1711#ifdef CONFIG_MPU 1736#ifdef CONFIG_MPU
@@ -2030,11 +2055,8 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
2030 const struct kernel_symbol *start, 2055 const struct kernel_symbol *start,
2031 const struct kernel_symbol *stop) 2056 const struct kernel_symbol *stop)
2032{ 2057{
2033 const struct kernel_symbol *ks = start; 2058 return bsearch(name, start, stop - start,
2034 for (; ks < stop; ks++) 2059 sizeof(struct kernel_symbol), cmp_name);
2035 if (strcmp(ks->name, name) == 0)
2036 return ks;
2037 return NULL;
2038} 2060}
2039 2061
2040static int is_exported(const char *name, unsigned long value, 2062static int is_exported(const char *name, unsigned long value,
@@ -2777,7 +2799,7 @@ static struct module *load_module(void __user *umod,
2777 mod->state = MODULE_STATE_COMING; 2799 mod->state = MODULE_STATE_COMING;
2778 2800
2779 /* Now sew it into the lists so we can get lockdep and oops 2801 /* Now sew it into the lists so we can get lockdep and oops
2780 * info during argument parsing. Noone should access us, since 2802 * info during argument parsing. No one should access us, since
2781 * strong_try_module_get() will fail. 2803 * strong_try_module_get() will fail.
2782 * lockdep/oops can run asynchronous, so use the RCU list insertion 2804 * lockdep/oops can run asynchronous, so use the RCU list insertion
2783 * function to insert in a way safe to concurrent readers. 2805 * function to insert in a way safe to concurrent readers.
@@ -2790,7 +2812,7 @@ static struct module *load_module(void __user *umod,
2790 } 2812 }
2791 2813
2792 /* This has to be done once we're sure module name is unique. */ 2814 /* This has to be done once we're sure module name is unique. */
2793 if (!mod->taints) 2815 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
2794 dynamic_debug_setup(info.debug, info.num_debug); 2816 dynamic_debug_setup(info.debug, info.num_debug);
2795 2817
2796 /* Find duplicate symbols */ 2818 /* Find duplicate symbols */
@@ -2827,7 +2849,7 @@ static struct module *load_module(void __user *umod,
2827 module_bug_cleanup(mod); 2849 module_bug_cleanup(mod);
2828 2850
2829 ddebug: 2851 ddebug:
2830 if (!mod->taints) 2852 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
2831 dynamic_debug_remove(info.debug); 2853 dynamic_debug_remove(info.debug);
2832 unlock: 2854 unlock:
2833 mutex_unlock(&module_mutex); 2855 mutex_unlock(&module_mutex);
@@ -2931,10 +2953,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2931 mod->symtab = mod->core_symtab; 2953 mod->symtab = mod->core_symtab;
2932 mod->strtab = mod->core_strtab; 2954 mod->strtab = mod->core_strtab;
2933#endif 2955#endif
2934 unset_section_ro_nx(mod, mod->module_init); 2956 unset_module_init_ro_nx(mod);
2935 module_free(mod, mod->module_init); 2957 module_free(mod, mod->module_init);
2936 mod->module_init = NULL; 2958 mod->module_init = NULL;
2937 mod->init_size = 0; 2959 mod->init_size = 0;
2960 mod->init_ro_size = 0;
2938 mod->init_text_size = 0; 2961 mod->init_text_size = 0;
2939 mutex_unlock(&module_mutex); 2962 mutex_unlock(&module_mutex);
2940 2963
@@ -2971,7 +2994,7 @@ static const char *get_ksymbol(struct module *mod,
2971 else 2994 else
2972 nextval = (unsigned long)mod->module_core+mod->core_text_size; 2995 nextval = (unsigned long)mod->module_core+mod->core_text_size;
2973 2996
2974 /* Scan for closest preceeding symbol, and next symbol. (ELF 2997 /* Scan for closest preceding symbol, and next symbol. (ELF
2975 starts real symbols at 1). */ 2998 starts real symbols at 1). */
2976 for (i = 1; i < mod->num_symtab; i++) { 2999 for (i = 1; i < mod->num_symtab; i++) {
2977 if (mod->symtab[i].st_shndx == SHN_UNDEF) 3000 if (mod->symtab[i].st_shndx == SHN_UNDEF)
@@ -3224,7 +3247,7 @@ static int m_show(struct seq_file *m, void *p)
3224 mod->state == MODULE_STATE_COMING ? "Loading": 3247 mod->state == MODULE_STATE_COMING ? "Loading":
3225 "Live"); 3248 "Live");
3226 /* Used by oprofile and other similar tools. */ 3249 /* Used by oprofile and other similar tools. */
3227 seq_printf(m, " 0x%p", mod->module_core); 3250 seq_printf(m, " 0x%pK", mod->module_core);
3228 3251
3229 /* Taints info */ 3252 /* Taints info */
3230 if (mod->taints) 3253 if (mod->taints)
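The lookup_symbol() hunk above replaces a linear strcmp() walk over the exported-symbol table with bsearch(), which assumes the tables are kept sorted by name. A minimal userspace sketch of that pattern follows; the struct layout, comparator and symbol names are invented stand-ins for the kernel's kernel_symbol arrays and cmp_name(), not the real definitions.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct sym {                         /* stand-in for struct kernel_symbol */
	const char *name;
	unsigned long value;
};

/* bsearch() hands the key as the first argument, the element as the second */
static int cmp_name(const void *name, const void *sym)
{
	return strcmp(name, ((const struct sym *)sym)->name);
}

static const struct sym *lookup(const char *name,
				const struct sym *start,
				const struct sym *stop)
{
	return bsearch(name, start, stop - start, sizeof(*start), cmp_name);
}

int main(void)
{
	/* must already be sorted by name for bsearch() to be valid */
	static const struct sym tab[] = {
		{ "alpha", 0x1000 }, { "bravo", 0x2000 }, { "delta", 0x3000 },
	};
	const struct sym *s = lookup("bravo", tab, tab + 3);

	printf("%s -> %#lx\n", s ? s->name : "?", s ? s->value : 0UL);
	return 0;
}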
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index ec815a960b5d..73da83aff418 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock)
75 return; 75 return;
76 76
77 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 77 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
78 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 78 DEBUG_LOCKS_WARN_ON(lock->owner != current);
79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
80 mutex_clear_owner(lock); 80 mutex_clear_owner(lock);
81} 81}
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 57d527a16f9d..0799fd3e4cfa 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
29 29
30static inline void mutex_set_owner(struct mutex *lock) 30static inline void mutex_set_owner(struct mutex *lock)
31{ 31{
32 lock->owner = current_thread_info(); 32 lock->owner = current;
33} 33}
34 34
35static inline void mutex_clear_owner(struct mutex *lock) 35static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a5889fb28ecf..d607ed5dd441 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -131,14 +131,14 @@ EXPORT_SYMBOL(mutex_unlock);
131 */ 131 */
132static inline int __sched 132static inline int __sched
133__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, 133__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
134 unsigned long ip) 134 struct lockdep_map *nest_lock, unsigned long ip)
135{ 135{
136 struct task_struct *task = current; 136 struct task_struct *task = current;
137 struct mutex_waiter waiter; 137 struct mutex_waiter waiter;
138 unsigned long flags; 138 unsigned long flags;
139 139
140 preempt_disable(); 140 preempt_disable();
141 mutex_acquire(&lock->dep_map, subclass, 0, ip); 141 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
142 142
143#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 143#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
144 /* 144 /*
@@ -160,14 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
160 */ 160 */
161 161
162 for (;;) { 162 for (;;) {
163 struct thread_info *owner; 163 struct task_struct *owner;
164
165 /*
166 * If we own the BKL, then don't spin. The owner of
167 * the mutex might be waiting on us to release the BKL.
168 */
169 if (unlikely(current->lock_depth >= 0))
170 break;
171 164
172 /* 165 /*
173 * If there's an owner, wait for it to either 166 * If there's an owner, wait for it to either
@@ -245,7 +238,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
245 } 238 }
246 __set_task_state(task, state); 239 __set_task_state(task, state);
247 240
248 /* didnt get the lock, go to sleep: */ 241 /* didn't get the lock, go to sleep: */
249 spin_unlock_mutex(&lock->wait_lock, flags); 242 spin_unlock_mutex(&lock->wait_lock, flags);
250 preempt_enable_no_resched(); 243 preempt_enable_no_resched();
251 schedule(); 244 schedule();
@@ -276,16 +269,25 @@ void __sched
276mutex_lock_nested(struct mutex *lock, unsigned int subclass) 269mutex_lock_nested(struct mutex *lock, unsigned int subclass)
277{ 270{
278 might_sleep(); 271 might_sleep();
279 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_); 272 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
280} 273}
281 274
282EXPORT_SYMBOL_GPL(mutex_lock_nested); 275EXPORT_SYMBOL_GPL(mutex_lock_nested);
283 276
277void __sched
278_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
279{
280 might_sleep();
281 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
282}
283
284EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
285
284int __sched 286int __sched
285mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) 287mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
286{ 288{
287 might_sleep(); 289 might_sleep();
288 return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); 290 return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
289} 291}
290EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); 292EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
291 293
@@ -294,7 +296,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
294{ 296{
295 might_sleep(); 297 might_sleep();
296 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 298 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
297 subclass, _RET_IP_); 299 subclass, NULL, _RET_IP_);
298} 300}
299 301
300EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 302EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -400,7 +402,7 @@ __mutex_lock_slowpath(atomic_t *lock_count)
400{ 402{
401 struct mutex *lock = container_of(lock_count, struct mutex, count); 403 struct mutex *lock = container_of(lock_count, struct mutex, count);
402 404
403 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); 405 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
404} 406}
405 407
406static noinline int __sched 408static noinline int __sched
@@ -408,7 +410,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count)
408{ 410{
409 struct mutex *lock = container_of(lock_count, struct mutex, count); 411 struct mutex *lock = container_of(lock_count, struct mutex, count);
410 412
411 return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); 413 return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
412} 414}
413 415
414static noinline int __sched 416static noinline int __sched
@@ -416,7 +418,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count)
416{ 418{
417 struct mutex *lock = container_of(lock_count, struct mutex, count); 419 struct mutex *lock = container_of(lock_count, struct mutex, count);
418 420
419 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_); 421 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
420} 422}
421#endif 423#endif
422 424
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 67578ca48f94..4115fbf83b12 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -19,7 +19,7 @@
19#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
20static inline void mutex_set_owner(struct mutex *lock) 20static inline void mutex_set_owner(struct mutex *lock)
21{ 21{
22 lock->owner = current_thread_info(); 22 lock->owner = current;
23} 23}
24 24
25static inline void mutex_clear_owner(struct mutex *lock) 25static inline void mutex_clear_owner(struct mutex *lock)
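The mutex hunks above switch the owner bookkeeping from a struct thread_info pointer to the owning task itself, and the debug unlock path now warns when someone other than the recorded owner unlocks. A rough pthreads analogue of that owner tracking, as a sketch only; the names and the assert() are illustrative, not kernel API.

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

struct dbg_mutex {
	pthread_mutex_t lock;
	pthread_t owner;
	int owned;
};

static void dbg_mutex_lock(struct dbg_mutex *m)
{
	pthread_mutex_lock(&m->lock);
	m->owner = pthread_self();	/* mutex_set_owner(): record the locker */
	m->owned = 1;
}

static void dbg_mutex_unlock(struct dbg_mutex *m)
{
	/* debug check: only the recorded owner may unlock */
	assert(m->owned && pthread_equal(m->owner, pthread_self()));
	m->owned = 0;			/* mutex_clear_owner() */
	pthread_mutex_unlock(&m->lock);
}

int main(void)
{
	struct dbg_mutex m = { .lock = PTHREAD_MUTEX_INITIALIZER };

	dbg_mutex_lock(&m);
	printf("held by this thread\n");
	dbg_mutex_unlock(&m);
	return 0;
}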
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
deleted file mode 100644
index 2c98ad94ba0e..000000000000
--- a/kernel/ns_cgroup.c
+++ /dev/null
@@ -1,118 +0,0 @@
1/*
2 * ns_cgroup.c - namespace cgroup subsystem
3 *
4 * Copyright 2006, 2007 IBM Corp
5 */
6
7#include <linux/module.h>
8#include <linux/cgroup.h>
9#include <linux/fs.h>
10#include <linux/proc_fs.h>
11#include <linux/slab.h>
12#include <linux/nsproxy.h>
13
14struct ns_cgroup {
15 struct cgroup_subsys_state css;
16};
17
18struct cgroup_subsys ns_subsys;
19
20static inline struct ns_cgroup *cgroup_to_ns(
21 struct cgroup *cgroup)
22{
23 return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),
24 struct ns_cgroup, css);
25}
26
27int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
28{
29 char name[PROC_NUMBUF];
30
31 snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
32 return cgroup_clone(task, &ns_subsys, name);
33}
34
35/*
36 * Rules:
37 * 1. you can only enter a cgroup which is a descendant of your current
38 * cgroup
39 * 2. you can only place another process into a cgroup if
40 * a. you have CAP_SYS_ADMIN
41 * b. your cgroup is an ancestor of task's destination cgroup
42 * (hence either you are in the same cgroup as task, or in an
43 * ancestor cgroup thereof)
44 */
45static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
46 struct task_struct *task, bool threadgroup)
47{
48 if (current != task) {
49 if (!capable(CAP_SYS_ADMIN))
50 return -EPERM;
51
52 if (!cgroup_is_descendant(new_cgroup, current))
53 return -EPERM;
54 }
55
56 if (!cgroup_is_descendant(new_cgroup, task))
57 return -EPERM;
58
59 if (threadgroup) {
60 struct task_struct *c;
61 rcu_read_lock();
62 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
63 if (!cgroup_is_descendant(new_cgroup, c)) {
64 rcu_read_unlock();
65 return -EPERM;
66 }
67 }
68 rcu_read_unlock();
69 }
70
71 return 0;
72}
73
74/*
75 * Rules: you can only create a cgroup if
76 * 1. you are capable(CAP_SYS_ADMIN)
77 * 2. the target cgroup is a descendant of your own cgroup
78 */
79static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
80 struct cgroup *cgroup)
81{
82 struct ns_cgroup *ns_cgroup;
83
84 if (!capable(CAP_SYS_ADMIN))
85 return ERR_PTR(-EPERM);
86 if (!cgroup_is_descendant(cgroup, current))
87 return ERR_PTR(-EPERM);
88 if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
89 printk("ns_cgroup can't be created with parent "
90 "'clone_children' set.\n");
91 return ERR_PTR(-EINVAL);
92 }
93
94 printk_once("ns_cgroup deprecated: consider using the "
95 "'clone_children' flag without the ns_cgroup.\n");
96
97 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
98 if (!ns_cgroup)
99 return ERR_PTR(-ENOMEM);
100 return &ns_cgroup->css;
101}
102
103static void ns_destroy(struct cgroup_subsys *ss,
104 struct cgroup *cgroup)
105{
106 struct ns_cgroup *ns_cgroup;
107
108 ns_cgroup = cgroup_to_ns(cgroup);
109 kfree(ns_cgroup);
110}
111
112struct cgroup_subsys ns_subsys = {
113 .name = "ns",
114 .can_attach = ns_can_attach,
115 .create = ns_create,
116 .destroy = ns_destroy,
117 .subsys_id = ns_subsys_id,
118};
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f74e6c00e26d..d6a00f3de15d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,6 +22,9 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h> 23#include <net/net_namespace.h>
24#include <linux/ipc_namespace.h> 24#include <linux/ipc_namespace.h>
25#include <linux/proc_fs.h>
26#include <linux/file.h>
27#include <linux/syscalls.h>
25 28
26static struct kmem_cache *nsproxy_cachep; 29static struct kmem_cache *nsproxy_cachep;
27 30
@@ -69,13 +72,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
69 goto out_ns; 72 goto out_ns;
70 } 73 }
71 74
72 new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); 75 new_nsp->uts_ns = copy_utsname(flags, tsk);
73 if (IS_ERR(new_nsp->uts_ns)) { 76 if (IS_ERR(new_nsp->uts_ns)) {
74 err = PTR_ERR(new_nsp->uts_ns); 77 err = PTR_ERR(new_nsp->uts_ns);
75 goto out_uts; 78 goto out_uts;
76 } 79 }
77 80
78 new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); 81 new_nsp->ipc_ns = copy_ipcs(flags, tsk);
79 if (IS_ERR(new_nsp->ipc_ns)) { 82 if (IS_ERR(new_nsp->ipc_ns)) {
80 err = PTR_ERR(new_nsp->ipc_ns); 83 err = PTR_ERR(new_nsp->ipc_ns);
81 goto out_ipc; 84 goto out_ipc;
@@ -198,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
198 goto out; 201 goto out;
199 } 202 }
200 203
201 err = ns_cgroup_clone(current, task_pid(current));
202 if (err)
203 put_nsproxy(*new_nsp);
204
205out: 204out:
206 return err; 205 return err;
207} 206}
@@ -233,6 +232,45 @@ void exit_task_namespaces(struct task_struct *p)
233 switch_task_namespaces(p, NULL); 232 switch_task_namespaces(p, NULL);
234} 233}
235 234
235SYSCALL_DEFINE2(setns, int, fd, int, nstype)
236{
237 const struct proc_ns_operations *ops;
238 struct task_struct *tsk = current;
239 struct nsproxy *new_nsproxy;
240 struct proc_inode *ei;
241 struct file *file;
242 int err;
243
244 if (!capable(CAP_SYS_ADMIN))
245 return -EPERM;
246
247 file = proc_ns_fget(fd);
248 if (IS_ERR(file))
249 return PTR_ERR(file);
250
251 err = -EINVAL;
252 ei = PROC_I(file->f_dentry->d_inode);
253 ops = ei->ns_ops;
254 if (nstype && (ops->type != nstype))
255 goto out;
256
257 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
258 if (IS_ERR(new_nsproxy)) {
259 err = PTR_ERR(new_nsproxy);
260 goto out;
261 }
262
263 err = ops->install(new_nsproxy, ei->ns);
264 if (err) {
265 free_nsproxy(new_nsproxy);
266 goto out;
267 }
268 switch_task_namespaces(tsk, new_nsproxy);
269out:
270 fput(file);
271 return err;
272}
273
236static int __init nsproxy_cache_init(void) 274static int __init nsproxy_cache_init(void)
237{ 275{
238 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); 276 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
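The new setns() syscall above lets a CAP_SYS_ADMIN task attach itself to an existing namespace through a /proc/<pid>/ns/* file descriptor. A userspace sketch of a caller: the target path is only a placeholder, and since older C libraries have no setns() wrapper the raw syscall() spelling is used, assuming SYS_setns/__NR_setns is defined by the installed headers.

#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/proc/1/ns/uts";
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* nstype 0 means "whatever namespace type this fd refers to" */
	if (syscall(SYS_setns, fd, 0) < 0) {
		perror("setns");
		close(fd);
		return 1;
	}
	close(fd);
	printf("joined namespace %s\n", path);
	return 0;
}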
diff --git a/kernel/padata.c b/kernel/padata.c
index 751019415d23..b91941df5e63 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -262,7 +262,7 @@ static void padata_reorder(struct parallel_data *pd)
262 /* 262 /*
263 * This cpu has to do the parallel processing of the next 263 * This cpu has to do the parallel processing of the next
264 * object. It's waiting in the cpu's parallelization queue, 264 * object. It's waiting in the cpu's parallelization queue,
265 * so exit imediately. 265 * so exit immediately.
266 */ 266 */
267 if (PTR_ERR(padata) == -ENODATA) { 267 if (PTR_ERR(padata) == -ENODATA) {
268 del_timer(&pd->timer); 268 del_timer(&pd->timer);
@@ -284,7 +284,7 @@ static void padata_reorder(struct parallel_data *pd)
284 /* 284 /*
285 * The next object that needs serialization might have arrived to 285 * The next object that needs serialization might have arrived to
286 * the reorder queues in the meantime, we will be called again 286 * the reorder queues in the meantime, we will be called again
287 * from the timer function if noone else cares for it. 287 * from the timer function if no one else cares for it.
288 */ 288 */
289 if (atomic_read(&pd->reorder_objects) 289 if (atomic_read(&pd->reorder_objects)
290 && !(pinst->flags & PADATA_RESET)) 290 && !(pinst->flags & PADATA_RESET))
@@ -515,7 +515,7 @@ static void __padata_stop(struct padata_instance *pinst)
515 put_online_cpus(); 515 put_online_cpus();
516} 516}
517 517
518/* Replace the internal control stucture with a new one. */ 518/* Replace the internal control structure with a new one. */
519static void padata_replace(struct padata_instance *pinst, 519static void padata_replace(struct padata_instance *pinst,
520 struct parallel_data *pd_new) 520 struct parallel_data *pd_new)
521{ 521{
@@ -768,7 +768,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
768} 768}
769 769
770 /** 770 /**
771 * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) 771 * padata_remove_cpu - remove a cpu from the one or both(serial and parallel)
772 * padata cpumasks. 772 * padata cpumasks.
773 * 773 *
774 * @pinst: padata instance 774 * @pinst: padata instance
diff --git a/kernel/panic.c b/kernel/panic.c
index 991bb87a1704..69231670eb95 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
433 433
434core_param(panic, panic_timeout, int, 0644); 434core_param(panic, panic_timeout, int, 0644);
435core_param(pause_on_oops, pause_on_oops, int, 0644); 435core_param(pause_on_oops, pause_on_oops, int, 0644);
436
437static int __init oops_setup(char *s)
438{
439 if (!s)
440 return -EINVAL;
441 if (!strcmp(s, "panic"))
442 panic_on_oops = 1;
443 return 0;
444}
445early_param("oops", oops_setup);
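The panic.c addition wires up an "oops=panic" boot option: when the handler sees the value "panic" it sets panic_on_oops, so any oops escalates to a full panic. A trivial userspace sketch of what that handler does; the command-line splitting the kernel performs before calling early_param handlers is omitted here.

#include <stdio.h>
#include <string.h>

static int panic_on_oops;

static int oops_setup(const char *s)
{
	if (!s)
		return -1;		/* -EINVAL in the kernel */
	if (!strcmp(s, "panic"))
		panic_on_oops = 1;
	return 0;
}

int main(void)
{
	oops_setup("panic");		/* as if "oops=panic" was on the command line */
	printf("panic_on_oops = %d\n", panic_on_oops);
	return 0;
}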
diff --git a/kernel/params.c b/kernel/params.c
index 0da1411222b9..ed72e1330862 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -95,7 +95,7 @@ static int parse_one(char *param,
95 /* Find parameter */ 95 /* Find parameter */
96 for (i = 0; i < num_params; i++) { 96 for (i = 0; i < num_params; i++) {
97 if (parameq(param, params[i].name)) { 97 if (parameq(param, params[i].name)) {
98 /* Noone handled NULL, so do it here. */ 98 /* No one handled NULL, so do it here. */
99 if (!val && params[i].ops->set != param_set_bool) 99 if (!val && params[i].ops->set != param_set_bool)
100 return -EINVAL; 100 return -EINVAL;
101 DEBUGP("They are equal! Calling %p\n", 101 DEBUGP("They are equal! Calling %p\n",
@@ -297,21 +297,15 @@ EXPORT_SYMBOL(param_ops_charp);
297int param_set_bool(const char *val, const struct kernel_param *kp) 297int param_set_bool(const char *val, const struct kernel_param *kp)
298{ 298{
299 bool v; 299 bool v;
300 int ret;
300 301
301 /* No equals means "set"... */ 302 /* No equals means "set"... */
302 if (!val) val = "1"; 303 if (!val) val = "1";
303 304
304 /* One of =[yYnN01] */ 305 /* One of =[yYnN01] */
305 switch (val[0]) { 306 ret = strtobool(val, &v);
306 case 'y': case 'Y': case '1': 307 if (ret)
307 v = true; 308 return ret;
308 break;
309 case 'n': case 'N': case '0':
310 v = false;
311 break;
312 default:
313 return -EINVAL;
314 }
315 309
316 if (kp->flags & KPARAM_ISBOOL) 310 if (kp->flags & KPARAM_ISBOOL)
317 *(bool *)kp->arg = v; 311 *(bool *)kp->arg = v;
@@ -821,15 +815,18 @@ ssize_t __modver_version_show(struct module_attribute *mattr,
821 return sprintf(buf, "%s\n", vattr->version); 815 return sprintf(buf, "%s\n", vattr->version);
822} 816}
823 817
824extern struct module_version_attribute __start___modver[], __stop___modver[]; 818extern const struct module_version_attribute *__start___modver[];
819extern const struct module_version_attribute *__stop___modver[];
825 820
826static void __init version_sysfs_builtin(void) 821static void __init version_sysfs_builtin(void)
827{ 822{
828 const struct module_version_attribute *vattr; 823 const struct module_version_attribute **p;
829 struct module_kobject *mk; 824 struct module_kobject *mk;
830 int err; 825 int err;
831 826
832 for (vattr = __start___modver; vattr < __stop___modver; vattr++) { 827 for (p = __start___modver; p < __stop___modver; p++) {
828 const struct module_version_attribute *vattr = *p;
829
833 mk = locate_module_kobject(vattr->module_name); 830 mk = locate_module_kobject(vattr->module_name);
834 if (mk) { 831 if (mk) {
835 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); 832 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
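param_set_bool() above drops its open-coded y/Y/1 versus n/N/0 switch in favour of the shared strtobool() helper. A userspace re-creation of the accepted inputs, meant only to show the semantics the hunk relies on; the function name here is deliberately not the kernel's.

#include <stdbool.h>
#include <stdio.h>

static int strtobool_like(const char *s, bool *res)
{
	switch (s[0]) {
	case 'y': case 'Y': case '1':
		*res = true;
		return 0;
	case 'n': case 'N': case '0':
		*res = false;
		return 0;
	default:
		return -1;	/* -EINVAL in the kernel helper */
	}
}

int main(void)
{
	static const char *inputs[] = { "y", "N", "1", "maybe" };
	bool v;

	for (int i = 0; i < 4; i++)
		printf("%-5s -> %s\n", inputs[i],
		       strtobool_like(inputs[i], &v) ? "invalid" :
		       (v ? "true" : "false"));
	return 0;
}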
diff --git a/kernel/pid.c b/kernel/pid.c
index 39b65b69584f..57a8346a270e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
217 return -1; 217 return -1;
218} 218}
219 219
220int next_pidmap(struct pid_namespace *pid_ns, int last) 220int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
221{ 221{
222 int offset; 222 int offset;
223 struct pidmap *map, *end; 223 struct pidmap *map, *end;
224 224
225 if (last >= PID_MAX_LIMIT)
226 return -1;
227
225 offset = (last + 1) & BITS_PER_PAGE_MASK; 228 offset = (last + 1) & BITS_PER_PAGE_MASK;
226 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; 229 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
227 end = &pid_ns->pidmap[PIDMAP_ENTRIES]; 230 end = &pid_ns->pidmap[PIDMAP_ENTRIES];
@@ -435,6 +438,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
435 rcu_read_unlock(); 438 rcu_read_unlock();
436 return pid; 439 return pid;
437} 440}
441EXPORT_SYMBOL_GPL(get_task_pid);
438 442
439struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) 443struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
440{ 444{
@@ -446,6 +450,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
446 rcu_read_unlock(); 450 rcu_read_unlock();
447 return result; 451 return result;
448} 452}
453EXPORT_SYMBOL_GPL(get_pid_task);
449 454
450struct pid *find_get_pid(pid_t nr) 455struct pid *find_get_pid(pid_t nr)
451{ 456{
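The next_pidmap() change above takes the cursor as unsigned and rejects values at or beyond PID_MAX_LIMIT before using them to index the pidmap array; without the guard, a huge cursor coming in from userspace (for example via readdir on /proc) computed an index far outside the array. A compressed sketch of the indexing plus the guard, with made-up constants and the bit scanning reduced to a whole-page walk.

#include <stdio.h>

#define BITS_PER_PAGE	(4096 * 8)
#define PID_MAX_LIMIT	32768
#define PIDMAP_ENTRIES	((PID_MAX_LIMIT + BITS_PER_PAGE - 1) / BITS_PER_PAGE)

static int pidmap[PIDMAP_ENTRIES];

static int next_pidmap(unsigned int last)
{
	unsigned int idx;

	if (last >= PID_MAX_LIMIT)	/* the guard this hunk adds */
		return -1;
	idx = (last + 1) / BITS_PER_PAGE;
	/* without the guard, a huge "last" put idx far past PIDMAP_ENTRIES */
	while (idx < PIDMAP_ENTRIES) {
		if (pidmap[idx])
			return (int)idx;
		idx++;
	}
	return -1;
}

int main(void)
{
	pidmap[0] = 1;				/* pretend the first page is in use */
	printf("%d\n", next_pidmap(0));		/* valid cursor: scans the map */
	printf("%d\n", next_pidmap(0x7fffffffu));	/* rejected by the guard */
	return 0;
}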
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94e1f0b..e9c9adc84ca6 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -14,6 +14,7 @@
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h>
17 18
18#define BITS_PER_PAGE (PAGE_SIZE*8) 19#define BITS_PER_PAGE (PAGE_SIZE*8)
19 20
@@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
72{ 73{
73 struct pid_namespace *ns; 74 struct pid_namespace *ns;
74 unsigned int level = parent_pid_ns->level + 1; 75 unsigned int level = parent_pid_ns->level + 1;
75 int i; 76 int i, err = -ENOMEM;
76 77
77 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 78 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
78 if (ns == NULL) 79 if (ns == NULL)
@@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
96 for (i = 1; i < PIDMAP_ENTRIES; i++) 97 for (i = 1; i < PIDMAP_ENTRIES; i++)
97 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 98 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
98 99
100 err = pid_ns_prepare_proc(ns);
101 if (err)
102 goto out_put_parent_pid_ns;
103
99 return ns; 104 return ns;
100 105
106out_put_parent_pid_ns:
107 put_pid_ns(parent_pid_ns);
101out_free_map: 108out_free_map:
102 kfree(ns->pidmap[0].page); 109 kfree(ns->pidmap[0].page);
103out_free: 110out_free:
104 kmem_cache_free(pid_ns_cachep, ns); 111 kmem_cache_free(pid_ns_cachep, ns);
105out: 112out:
106 return ERR_PTR(-ENOMEM); 113 return ERR_PTR(err);
107} 114}
108 115
109static void destroy_pid_namespace(struct pid_namespace *ns) 116static void destroy_pid_namespace(struct pid_namespace *ns)
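create_pid_namespace() above gains a real error code: the new pid_ns_prepare_proc() step can fail for reasons other than out-of-memory, so "err" is threaded through the goto-unwind labels instead of always returning -ENOMEM. A generic userspace sketch of that cleanup idiom, with an invented stub standing in for the late step and malloc() standing in for the namespace allocation.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int prepare_proc_stub(void)	/* stand-in for pid_ns_prepare_proc() */
{
	return -EIO;			/* pretend the late step fails */
}

static void *create_object(int *errp)
{
	int err = -ENOMEM;
	char *obj = malloc(64);

	if (!obj)
		goto out;

	err = prepare_proc_stub();	/* later failures reuse the unwind labels */
	if (err)
		goto out_free;

	return obj;

out_free:
	free(obj);
out:
	*errp = err;
	return NULL;
}

int main(void)
{
	int err = 0;
	void *o = create_object(&err);

	printf("%s (err=%d)\n", o ? "created" : "failed", o ? 0 : err);
	free(o);
	return 0;
}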
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index aeaa7f846821..beb184689af9 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -103,11 +103,14 @@ static struct pm_qos_object *pm_qos_array[] = {
103 103
104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
105 size_t count, loff_t *f_pos); 105 size_t count, loff_t *f_pos);
106static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
107 size_t count, loff_t *f_pos);
106static int pm_qos_power_open(struct inode *inode, struct file *filp); 108static int pm_qos_power_open(struct inode *inode, struct file *filp);
107static int pm_qos_power_release(struct inode *inode, struct file *filp); 109static int pm_qos_power_release(struct inode *inode, struct file *filp);
108 110
109static const struct file_operations pm_qos_power_fops = { 111static const struct file_operations pm_qos_power_fops = {
110 .write = pm_qos_power_write, 112 .write = pm_qos_power_write,
113 .read = pm_qos_power_read,
111 .open = pm_qos_power_open, 114 .open = pm_qos_power_open,
112 .release = pm_qos_power_release, 115 .release = pm_qos_power_release,
113 .llseek = noop_llseek, 116 .llseek = noop_llseek,
@@ -376,6 +379,27 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
376} 379}
377 380
378 381
382static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
383 size_t count, loff_t *f_pos)
384{
385 s32 value;
386 unsigned long flags;
387 struct pm_qos_object *o;
388 struct pm_qos_request_list *pm_qos_req = filp->private_data;
389
390 if (!pm_qos_req)
391 return -EINVAL;
392 if (!pm_qos_request_active(pm_qos_req))
393 return -EINVAL;
394
395 o = pm_qos_array[pm_qos_req->pm_qos_class];
396 spin_lock_irqsave(&pm_qos_lock, flags);
397 value = pm_qos_get_value(o);
398 spin_unlock_irqrestore(&pm_qos_lock, flags);
399
400 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
401}
402
379static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 403static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
380 size_t count, loff_t *f_pos) 404 size_t count, loff_t *f_pos)
381{ 405{
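The new pm_qos_power_read() above hands the current aggregate QoS target back to the reader as a raw s32, served through simple_read_from_buffer(). A userspace sketch of the offset/length clamping that helper performs; the real kernel version uses loff_t and copy_to_user(), and the function below is renamed to make clear it is only an illustration.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

/* clamp (ppos, count) against the source buffer, copy, advance ppos */
static ssize_t read_from_buffer(void *to, size_t count, long *ppos,
				const void *from, size_t available)
{
	long pos = *ppos;
	size_t n;

	if (pos < 0)
		return -1;			/* -EINVAL in the kernel */
	if ((size_t)pos >= available || !count)
		return 0;
	n = available - (size_t)pos;
	if (n > count)
		n = count;
	memcpy(to, (const char *)from + pos, n);
	*ppos = pos + (long)n;
	return (ssize_t)n;
}

int main(void)
{
	int32_t value = 250;		/* pretend this is pm_qos_get_value() */
	int32_t out = 0;
	long pos = 0;
	ssize_t n = read_from_buffer(&out, sizeof(out), &pos,
				     &value, sizeof(value));

	printf("read %zd bytes, value=%d, pos=%ld\n", n, out, pos);
	return 0;
}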
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 05bb7173850e..58f405b581e7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p)
176 return p->utime; 176 return p->utime;
177} 177}
178 178
179int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 179static int
180posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
180{ 181{
181 int error = check_clock(which_clock); 182 int error = check_clock(which_clock);
182 if (!error) { 183 if (!error) {
@@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
194 return error; 195 return error;
195} 196}
196 197
197int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) 198static int
199posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
198{ 200{
199 /* 201 /*
200 * You can never reset a CPU clock, but we check for other errors 202 * You can never reset a CPU clock, but we check for other errors
@@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
317} 319}
318 320
319 321
320int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 322static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
321{ 323{
322 const pid_t pid = CPUCLOCK_PID(which_clock); 324 const pid_t pid = CPUCLOCK_PID(which_clock);
323 int error = -EINVAL; 325 int error = -EINVAL;
@@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
379 * This is called from sys_timer_create() and do_cpu_nanosleep() with the 381 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
380 * new timer already all-zeros initialized. 382 * new timer already all-zeros initialized.
381 */ 383 */
382int posix_cpu_timer_create(struct k_itimer *new_timer) 384static int posix_cpu_timer_create(struct k_itimer *new_timer)
383{ 385{
384 int ret = 0; 386 int ret = 0;
385 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); 387 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
@@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
425 * If we return TIMER_RETRY, it's necessary to release the timer's lock 427 * If we return TIMER_RETRY, it's necessary to release the timer's lock
426 * and try again. (This happens when the timer is in the middle of firing.) 428 * and try again. (This happens when the timer is in the middle of firing.)
427 */ 429 */
428int posix_cpu_timer_del(struct k_itimer *timer) 430static int posix_cpu_timer_del(struct k_itimer *timer)
429{ 431{
430 struct task_struct *p = timer->it.cpu.task; 432 struct task_struct *p = timer->it.cpu.task;
431 int ret = 0; 433 int ret = 0;
@@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
665 * If we return TIMER_RETRY, it's necessary to release the timer's lock 667 * If we return TIMER_RETRY, it's necessary to release the timer's lock
666 * and try again. (This happens when the timer is in the middle of firing.) 668 * and try again. (This happens when the timer is in the middle of firing.)
667 */ 669 */
668int posix_cpu_timer_set(struct k_itimer *timer, int flags, 670static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
669 struct itimerspec *new, struct itimerspec *old) 671 struct itimerspec *new, struct itimerspec *old)
670{ 672{
671 struct task_struct *p = timer->it.cpu.task; 673 struct task_struct *p = timer->it.cpu.task;
672 union cpu_time_count old_expires, new_expires, old_incr, val; 674 union cpu_time_count old_expires, new_expires, old_incr, val;
@@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
820 return ret; 822 return ret;
821} 823}
822 824
823void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) 825static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
824{ 826{
825 union cpu_time_count now; 827 union cpu_time_count now;
826 struct task_struct *p = timer->it.cpu.task; 828 struct task_struct *p = timer->it.cpu.task;
@@ -1345,7 +1347,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1345 1347
1346 /* 1348 /*
1347 * Now that all the timers on our list have the firing flag, 1349 * Now that all the timers on our list have the firing flag,
1348 * noone will touch their list entries but us. We'll take 1350 * no one will touch their list entries but us. We'll take
1349 * each timer's lock before clearing its firing flag, so no 1351 * each timer's lock before clearing its firing flag, so no
1350 * timer call will interfere. 1352 * timer call will interfere.
1351 */ 1353 */
@@ -1481,11 +1483,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1481 return error; 1483 return error;
1482} 1484}
1483 1485
1484int posix_cpu_nsleep(const clockid_t which_clock, int flags, 1486static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1485 struct timespec *rqtp, struct timespec __user *rmtp) 1487
1488static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1489 struct timespec *rqtp, struct timespec __user *rmtp)
1486{ 1490{
1487 struct restart_block *restart_block = 1491 struct restart_block *restart_block =
1488 &current_thread_info()->restart_block; 1492 &current_thread_info()->restart_block;
1489 struct itimerspec it; 1493 struct itimerspec it;
1490 int error; 1494 int error;
1491 1495
@@ -1501,56 +1505,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1501 1505
1502 if (error == -ERESTART_RESTARTBLOCK) { 1506 if (error == -ERESTART_RESTARTBLOCK) {
1503 1507
1504 if (flags & TIMER_ABSTIME) 1508 if (flags & TIMER_ABSTIME)
1505 return -ERESTARTNOHAND; 1509 return -ERESTARTNOHAND;
1506 /* 1510 /*
1507 * Report back to the user the time still remaining. 1511 * Report back to the user the time still remaining.
1508 */ 1512 */
1509 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1513 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1510 return -EFAULT; 1514 return -EFAULT;
1511 1515
1512 restart_block->fn = posix_cpu_nsleep_restart; 1516 restart_block->fn = posix_cpu_nsleep_restart;
1513 restart_block->arg0 = which_clock; 1517 restart_block->nanosleep.clockid = which_clock;
1514 restart_block->arg1 = (unsigned long) rmtp; 1518 restart_block->nanosleep.rmtp = rmtp;
1515 restart_block->arg2 = rqtp->tv_sec; 1519 restart_block->nanosleep.expires = timespec_to_ns(rqtp);
1516 restart_block->arg3 = rqtp->tv_nsec;
1517 } 1520 }
1518 return error; 1521 return error;
1519} 1522}
1520 1523
1521long posix_cpu_nsleep_restart(struct restart_block *restart_block) 1524static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1522{ 1525{
1523 clockid_t which_clock = restart_block->arg0; 1526 clockid_t which_clock = restart_block->nanosleep.clockid;
1524 struct timespec __user *rmtp;
1525 struct timespec t; 1527 struct timespec t;
1526 struct itimerspec it; 1528 struct itimerspec it;
1527 int error; 1529 int error;
1528 1530
1529 rmtp = (struct timespec __user *) restart_block->arg1; 1531 t = ns_to_timespec(restart_block->nanosleep.expires);
1530 t.tv_sec = restart_block->arg2;
1531 t.tv_nsec = restart_block->arg3;
1532 1532
1533 restart_block->fn = do_no_restart_syscall;
1534 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); 1533 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1535 1534
1536 if (error == -ERESTART_RESTARTBLOCK) { 1535 if (error == -ERESTART_RESTARTBLOCK) {
1536 struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
1537 /* 1537 /*
1538 * Report back to the user the time still remaining. 1538 * Report back to the user the time still remaining.
1539 */ 1539 */
1540 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1540 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1541 return -EFAULT; 1541 return -EFAULT;
1542 1542
1543 restart_block->fn = posix_cpu_nsleep_restart; 1543 restart_block->nanosleep.expires = timespec_to_ns(&t);
1544 restart_block->arg0 = which_clock;
1545 restart_block->arg1 = (unsigned long) rmtp;
1546 restart_block->arg2 = t.tv_sec;
1547 restart_block->arg3 = t.tv_nsec;
1548 } 1544 }
1549 return error; 1545 return error;
1550 1546
1551} 1547}
1552 1548
1553
1554#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) 1549#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
1555#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) 1550#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1556 1551
@@ -1594,38 +1589,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
1594 timer->it_clock = THREAD_CLOCK; 1589 timer->it_clock = THREAD_CLOCK;
1595 return posix_cpu_timer_create(timer); 1590 return posix_cpu_timer_create(timer);
1596} 1591}
1597static int thread_cpu_nsleep(const clockid_t which_clock, int flags, 1592
1598 struct timespec *rqtp, struct timespec __user *rmtp) 1593struct k_clock clock_posix_cpu = {
1599{ 1594 .clock_getres = posix_cpu_clock_getres,
1600 return -EINVAL; 1595 .clock_set = posix_cpu_clock_set,
1601} 1596 .clock_get = posix_cpu_clock_get,
1602static long thread_cpu_nsleep_restart(struct restart_block *restart_block) 1597 .timer_create = posix_cpu_timer_create,
1603{ 1598 .nsleep = posix_cpu_nsleep,
1604 return -EINVAL; 1599 .nsleep_restart = posix_cpu_nsleep_restart,
1605} 1600 .timer_set = posix_cpu_timer_set,
1601 .timer_del = posix_cpu_timer_del,
1602 .timer_get = posix_cpu_timer_get,
1603};
1606 1604
1607static __init int init_posix_cpu_timers(void) 1605static __init int init_posix_cpu_timers(void)
1608{ 1606{
1609 struct k_clock process = { 1607 struct k_clock process = {
1610 .clock_getres = process_cpu_clock_getres, 1608 .clock_getres = process_cpu_clock_getres,
1611 .clock_get = process_cpu_clock_get, 1609 .clock_get = process_cpu_clock_get,
1612 .clock_set = do_posix_clock_nosettime, 1610 .timer_create = process_cpu_timer_create,
1613 .timer_create = process_cpu_timer_create, 1611 .nsleep = process_cpu_nsleep,
1614 .nsleep = process_cpu_nsleep, 1612 .nsleep_restart = process_cpu_nsleep_restart,
1615 .nsleep_restart = process_cpu_nsleep_restart,
1616 }; 1613 };
1617 struct k_clock thread = { 1614 struct k_clock thread = {
1618 .clock_getres = thread_cpu_clock_getres, 1615 .clock_getres = thread_cpu_clock_getres,
1619 .clock_get = thread_cpu_clock_get, 1616 .clock_get = thread_cpu_clock_get,
1620 .clock_set = do_posix_clock_nosettime, 1617 .timer_create = thread_cpu_timer_create,
1621 .timer_create = thread_cpu_timer_create,
1622 .nsleep = thread_cpu_nsleep,
1623 .nsleep_restart = thread_cpu_nsleep_restart,
1624 }; 1618 };
1625 struct timespec ts; 1619 struct timespec ts;
1626 1620
1627 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); 1621 posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1628 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); 1622 posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1629 1623
1630 cputime_to_timespec(cputime_one_jiffy, &ts); 1624 cputime_to_timespec(cputime_one_jiffy, &ts);
1631 onecputick = ts.tv_nsec; 1625 onecputick = ts.tv_nsec;
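The nsleep paths above stop smuggling the clock id, the user rmtp pointer and the remaining time through the restart block's four untyped arg slots and use the typed nanosleep members (clockid, rmtp, expires in nanoseconds) instead, while the per-clock entry points are gathered into the static clock_posix_cpu ops structure. A small userspace illustration of the typed-fields side of that change; the struct below is invented for the sketch, not the kernel's restart_block.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct restart_block {				/* illustrative, not the kernel's */
	long (*fn)(struct restart_block *);
	union {
		unsigned long arg[4];		/* the old, untyped way */
		struct {			/* the new, self-describing way */
			int clockid;
			struct timespec *rmtp;
			uint64_t expires;	/* nanoseconds */
		} nanosleep;
	};					/* C11 anonymous union */
};

static uint64_t timespec_to_ns(const struct timespec *ts)
{
	return (uint64_t)ts->tv_sec * 1000000000ull + (uint64_t)ts->tv_nsec;
}

static long restart_stub(struct restart_block *rb)
{
	printf("restart: clock %d, expires at %llu ns\n",
	       rb->nanosleep.clockid,
	       (unsigned long long)rb->nanosleep.expires);
	return 0;
}

int main(void)
{
	struct timespec rq = { .tv_sec = 1, .tv_nsec = 500 };
	struct restart_block rb = { .fn = restart_stub };

	rb.nanosleep.clockid = CLOCK_MONOTONIC;
	rb.nanosleep.rmtp = NULL;
	rb.nanosleep.expires = timespec_to_ns(&rq);
	return (int)rb.fn(&rb);
}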
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 93bd2eb2bc53..4556182527f3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -41,6 +41,7 @@
41#include <linux/init.h> 41#include <linux/init.h>
42#include <linux/compiler.h> 42#include <linux/compiler.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/posix-clock.h>
44#include <linux/posix-timers.h> 45#include <linux/posix-timers.h>
45#include <linux/syscalls.h> 46#include <linux/syscalls.h>
46#include <linux/wait.h> 47#include <linux/wait.h>
@@ -81,6 +82,14 @@ static DEFINE_SPINLOCK(idr_lock);
81#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" 82#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
82#endif 83#endif
83 84
85/*
86 * parisc wants ENOTSUP instead of EOPNOTSUPP
87 */
88#ifndef ENOTSUP
89# define ENANOSLEEP_NOTSUP EOPNOTSUPP
90#else
91# define ENANOSLEEP_NOTSUP ENOTSUP
92#endif
84 93
85/* 94/*
86 * The timer ID is turned into a timer address by idr_find(). 95 * The timer ID is turned into a timer address by idr_find().
@@ -94,11 +103,7 @@ static DEFINE_SPINLOCK(idr_lock);
94/* 103/*
95 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us 104 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
96 * to implement others. This structure defines the various 105 * to implement others. This structure defines the various
97 * clocks and allows the possibility of adding others. We 106 * clocks.
98 * provide an interface to add clocks to the table and expect
99 * the "arch" code to add at least one clock that is high
100 * resolution. Here we define the standard CLOCK_REALTIME as a
101 * 1/HZ resolution clock.
102 * 107 *
103 * RESOLUTION: Clock resolution is used to round up timer and interval 108 * RESOLUTION: Clock resolution is used to round up timer and interval
104 * times, NOT to report clock times, which are reported with as 109 * times, NOT to report clock times, which are reported with as
@@ -108,20 +113,13 @@ static DEFINE_SPINLOCK(idr_lock);
108 * necessary code is written. The standard says we should say 113 * necessary code is written. The standard says we should say
109 * something about this issue in the documentation... 114 * something about this issue in the documentation...
110 * 115 *
111 * FUNCTIONS: The CLOCKs structure defines possible functions to handle 116 * FUNCTIONS: The CLOCKs structure defines possible functions to
112 * various clock functions. For clocks that use the standard 117 * handle various clock functions.
113 * system timer code these entries should be NULL. This will
114 * allow dispatch without the overhead of indirect function
115 * calls. CLOCKS that depend on other sources (e.g. WWV or GPS)
116 * must supply functions here, even if the function just returns
117 * ENOSYS. The standard POSIX timer management code assumes the
118 * following: 1.) The k_itimer struct (sched.h) is used for the
119 * timer. 2.) The list, it_lock, it_clock, it_id and it_pid
120 * fields are not modified by timer code.
121 * 118 *
122 * At this time all functions EXCEPT clock_nanosleep can be 119 * The standard POSIX timer management code assumes the
123 * redirected by the CLOCKS structure. Clock_nanosleep is in 120 * following: 1.) The k_itimer struct (sched.h) is used for
124 * there, but the code ignores it. 121 * the timer. 2.) The list, it_lock, it_clock, it_id and
122 * it_pid fields are not modified by timer code.
125 * 123 *
126 * Permissions: It is assumed that the clock_settime() function defined 124 * Permissions: It is assumed that the clock_settime() function defined
127 * for each clock will take care of permission checks. Some 125 * for each clock will take care of permission checks. Some
@@ -138,6 +136,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
138 */ 136 */
139static int common_nsleep(const clockid_t, int flags, struct timespec *t, 137static int common_nsleep(const clockid_t, int flags, struct timespec *t,
140 struct timespec __user *rmtp); 138 struct timespec __user *rmtp);
139static int common_timer_create(struct k_itimer *new_timer);
141static void common_timer_get(struct k_itimer *, struct itimerspec *); 140static void common_timer_get(struct k_itimer *, struct itimerspec *);
142static int common_timer_set(struct k_itimer *, int, 141static int common_timer_set(struct k_itimer *, int,
143 struct itimerspec *, struct itimerspec *); 142 struct itimerspec *, struct itimerspec *);
@@ -158,76 +157,24 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
158 spin_unlock_irqrestore(&timr->it_lock, flags); 157 spin_unlock_irqrestore(&timr->it_lock, flags);
159} 158}
160 159
161/* 160/* Get clock_realtime */
162 * Call the k_clock hook function if non-null, or the default function. 161static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
163 */
164#define CLOCK_DISPATCH(clock, call, arglist) \
165 ((clock) < 0 ? posix_cpu_##call arglist : \
166 (posix_clocks[clock].call != NULL \
167 ? (*posix_clocks[clock].call) arglist : common_##call arglist))
168
169/*
170 * Default clock hook functions when the struct k_clock passed
171 * to register_posix_clock leaves a function pointer null.
172 *
173 * The function common_CALL is the default implementation for
174 * the function pointer CALL in struct k_clock.
175 */
176
177static inline int common_clock_getres(const clockid_t which_clock,
178 struct timespec *tp)
179{
180 tp->tv_sec = 0;
181 tp->tv_nsec = posix_clocks[which_clock].res;
182 return 0;
183}
184
185/*
186 * Get real time for posix timers
187 */
188static int common_clock_get(clockid_t which_clock, struct timespec *tp)
189{ 162{
190 ktime_get_real_ts(tp); 163 ktime_get_real_ts(tp);
191 return 0; 164 return 0;
192} 165}
193 166
194static inline int common_clock_set(const clockid_t which_clock, 167/* Set clock_realtime */
195 struct timespec *tp) 168static int posix_clock_realtime_set(const clockid_t which_clock,
169 const struct timespec *tp)
196{ 170{
197 return do_sys_settimeofday(tp, NULL); 171 return do_sys_settimeofday(tp, NULL);
198} 172}
199 173
200static int common_timer_create(struct k_itimer *new_timer) 174static int posix_clock_realtime_adj(const clockid_t which_clock,
201{ 175 struct timex *t)
202 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
203 return 0;
204}
205
206static int no_timer_create(struct k_itimer *new_timer)
207{
208 return -EOPNOTSUPP;
209}
210
211static int no_nsleep(const clockid_t which_clock, int flags,
212 struct timespec *tsave, struct timespec __user *rmtp)
213{
214 return -EOPNOTSUPP;
215}
216
217/*
218 * Return nonzero if we know a priori this clockid_t value is bogus.
219 */
220static inline int invalid_clockid(const clockid_t which_clock)
221{ 176{
222 if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ 177 return do_adjtimex(t);
223 return 0;
224 if ((unsigned) which_clock >= MAX_CLOCKS)
225 return 1;
226 if (posix_clocks[which_clock].clock_getres != NULL)
227 return 0;
228 if (posix_clocks[which_clock].res != 0)
229 return 0;
230 return 1;
231} 178}
232 179
233/* 180/*
@@ -240,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
240} 187}
241 188
242/* 189/*
243 * Get monotonic time for posix timers 190 * Get monotonic-raw time for posix timers
244 */ 191 */
245static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) 192static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
246{ 193{
@@ -267,46 +214,70 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp
267 *tp = ktime_to_timespec(KTIME_LOW_RES); 214 *tp = ktime_to_timespec(KTIME_LOW_RES);
268 return 0; 215 return 0;
269} 216}
217
218static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
219{
220 get_monotonic_boottime(tp);
221 return 0;
222}
223
224
270/* 225/*
271 * Initialize everything, well, just everything in Posix clocks/timers ;) 226 * Initialize everything, well, just everything in Posix clocks/timers ;)
272 */ 227 */
273static __init int init_posix_timers(void) 228static __init int init_posix_timers(void)
274{ 229{
275 struct k_clock clock_realtime = { 230 struct k_clock clock_realtime = {
276 .clock_getres = hrtimer_get_res, 231 .clock_getres = hrtimer_get_res,
232 .clock_get = posix_clock_realtime_get,
233 .clock_set = posix_clock_realtime_set,
234 .clock_adj = posix_clock_realtime_adj,
235 .nsleep = common_nsleep,
236 .nsleep_restart = hrtimer_nanosleep_restart,
237 .timer_create = common_timer_create,
238 .timer_set = common_timer_set,
239 .timer_get = common_timer_get,
240 .timer_del = common_timer_del,
277 }; 241 };
278 struct k_clock clock_monotonic = { 242 struct k_clock clock_monotonic = {
279 .clock_getres = hrtimer_get_res, 243 .clock_getres = hrtimer_get_res,
280 .clock_get = posix_ktime_get_ts, 244 .clock_get = posix_ktime_get_ts,
281 .clock_set = do_posix_clock_nosettime, 245 .nsleep = common_nsleep,
246 .nsleep_restart = hrtimer_nanosleep_restart,
247 .timer_create = common_timer_create,
248 .timer_set = common_timer_set,
249 .timer_get = common_timer_get,
250 .timer_del = common_timer_del,
282 }; 251 };
283 struct k_clock clock_monotonic_raw = { 252 struct k_clock clock_monotonic_raw = {
284 .clock_getres = hrtimer_get_res, 253 .clock_getres = hrtimer_get_res,
285 .clock_get = posix_get_monotonic_raw, 254 .clock_get = posix_get_monotonic_raw,
286 .clock_set = do_posix_clock_nosettime,
287 .timer_create = no_timer_create,
288 .nsleep = no_nsleep,
289 }; 255 };
290 struct k_clock clock_realtime_coarse = { 256 struct k_clock clock_realtime_coarse = {
291 .clock_getres = posix_get_coarse_res, 257 .clock_getres = posix_get_coarse_res,
292 .clock_get = posix_get_realtime_coarse, 258 .clock_get = posix_get_realtime_coarse,
293 .clock_set = do_posix_clock_nosettime,
294 .timer_create = no_timer_create,
295 .nsleep = no_nsleep,
296 }; 259 };
297 struct k_clock clock_monotonic_coarse = { 260 struct k_clock clock_monotonic_coarse = {
298 .clock_getres = posix_get_coarse_res, 261 .clock_getres = posix_get_coarse_res,
299 .clock_get = posix_get_monotonic_coarse, 262 .clock_get = posix_get_monotonic_coarse,
300 .clock_set = do_posix_clock_nosettime, 263 };
301 .timer_create = no_timer_create, 264 struct k_clock clock_boottime = {
302 .nsleep = no_nsleep, 265 .clock_getres = hrtimer_get_res,
266 .clock_get = posix_get_boottime,
267 .nsleep = common_nsleep,
268 .nsleep_restart = hrtimer_nanosleep_restart,
269 .timer_create = common_timer_create,
270 .timer_set = common_timer_set,
271 .timer_get = common_timer_get,
272 .timer_del = common_timer_del,
303 }; 273 };
304 274
305 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 275 posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
306 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 276 posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
307 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); 277 posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
308 register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); 278 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
309 register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); 279 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
280 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
310 281
311 posix_timers_cache = kmem_cache_create("posix_timers_cache", 282 posix_timers_cache = kmem_cache_create("posix_timers_cache",
312 sizeof (struct k_itimer), 0, SLAB_PANIC, 283 sizeof (struct k_itimer), 0, SLAB_PANIC,
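The init_posix_timers() rework above drops the no_timer_create()/no_nsleep() stubs and the do_posix_clock_nosettime fillers: a clock now simply leaves callbacks it does not support NULL, and the generic syscall paths check the pointer before dispatching. A userspace sketch of that ops-table convention; the clock ids, names and error values here are invented, and the presence check uses clock_get where the kernel keys off clock_getres.

#include <errno.h>
#include <stdio.h>

struct clock_ops {				/* stand-in for struct k_clock */
	int (*clock_get)(int id, long *now);
	int (*clock_set)(int id, long t);	/* NULL => clock can't be set */
};

static int real_get(int id, long *now) { (void)id; *now = 99999; return 0; }
static int real_set(int id, long t)    { (void)id; (void)t;      return 0; }
static int mono_get(int id, long *now) { (void)id; *now = 12345; return 0; }

static struct clock_ops clocks[] = {
	[0] = { .clock_get = real_get, .clock_set = real_set },  /* "realtime"  */
	[1] = { .clock_get = mono_get },                         /* "monotonic" */
};

#define NCLOCKS ((int)(sizeof(clocks) / sizeof(clocks[0])))

static int clock_settime_like(int id, long t)
{
	if (id < 0 || id >= NCLOCKS || !clocks[id].clock_get)
		return -EINVAL;			/* slot not registered at all */
	if (!clocks[id].clock_set)
		return -EINVAL;			/* registered, but not settable */
	return clocks[id].clock_set(id, t);
}

int main(void)
{
	printf("set clock 0: %d\n", clock_settime_like(0, 42));
	printf("set clock 1: %d\n", clock_settime_like(1, 42));
	return 0;
}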
@@ -342,7 +313,7 @@ static void schedule_next_timer(struct k_itimer *timr)
342 * restarted (i.e. we have flagged this in the sys_private entry of the 313 * restarted (i.e. we have flagged this in the sys_private entry of the
343 * info block). 314 * info block).
344 * 315 *
345 * To protect aginst the timer going away while the interrupt is queued, 316 * To protect against the timer going away while the interrupt is queued,
346 * we require that the it_requeue_pending flag be set. 317 * we require that the it_requeue_pending flag be set.
347 */ 318 */
348void do_schedule_next_timer(struct siginfo *info) 319void do_schedule_next_timer(struct siginfo *info)
@@ -482,17 +453,29 @@ static struct pid *good_sigevent(sigevent_t * event)
482 return task_pid(rtn); 453 return task_pid(rtn);
483} 454}
484 455
485void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) 456void posix_timers_register_clock(const clockid_t clock_id,
457 struct k_clock *new_clock)
486{ 458{
487 if ((unsigned) clock_id >= MAX_CLOCKS) { 459 if ((unsigned) clock_id >= MAX_CLOCKS) {
488 printk("POSIX clock register failed for clock_id %d\n", 460 printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
461 clock_id);
462 return;
463 }
464
465 if (!new_clock->clock_get) {
466 printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
467 clock_id);
468 return;
469 }
470 if (!new_clock->clock_getres) {
471 printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
489 clock_id); 472 clock_id);
490 return; 473 return;
491 } 474 }
492 475
493 posix_clocks[clock_id] = *new_clock; 476 posix_clocks[clock_id] = *new_clock;
494} 477}
495EXPORT_SYMBOL_GPL(register_posix_clock); 478EXPORT_SYMBOL_GPL(posix_timers_register_clock);
496 479
497static struct k_itimer * alloc_posix_timer(void) 480static struct k_itimer * alloc_posix_timer(void)
498{ 481{
@@ -508,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void)
508 return tmr; 491 return tmr;
509} 492}
510 493
494static void k_itimer_rcu_free(struct rcu_head *head)
495{
496 struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
497
498 kmem_cache_free(posix_timers_cache, tmr);
499}
500
511#define IT_ID_SET 1 501#define IT_ID_SET 1
512#define IT_ID_NOT_SET 0 502#define IT_ID_NOT_SET 0
513static void release_posix_timer(struct k_itimer *tmr, int it_id_set) 503static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
@@ -520,7 +510,24 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
520 } 510 }
521 put_pid(tmr->it_pid); 511 put_pid(tmr->it_pid);
522 sigqueue_free(tmr->sigq); 512 sigqueue_free(tmr->sigq);
523 kmem_cache_free(posix_timers_cache, tmr); 513 call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
514}
515
516static struct k_clock *clockid_to_kclock(const clockid_t id)
517{
518 if (id < 0)
519 return (id & CLOCKFD_MASK) == CLOCKFD ?
520 &clock_posix_dynamic : &clock_posix_cpu;
521
522 if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
523 return NULL;
524 return &posix_clocks[id];
525}
526
527static int common_timer_create(struct k_itimer *new_timer)
528{
529 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
530 return 0;
524} 531}
525 532
526/* Create a POSIX.1b interval timer. */ 533/* Create a POSIX.1b interval timer. */
@@ -529,13 +536,16 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
529 struct sigevent __user *, timer_event_spec, 536 struct sigevent __user *, timer_event_spec,
530 timer_t __user *, created_timer_id) 537 timer_t __user *, created_timer_id)
531{ 538{
539 struct k_clock *kc = clockid_to_kclock(which_clock);
532 struct k_itimer *new_timer; 540 struct k_itimer *new_timer;
533 int error, new_timer_id; 541 int error, new_timer_id;
534 sigevent_t event; 542 sigevent_t event;
535 int it_id_set = IT_ID_NOT_SET; 543 int it_id_set = IT_ID_NOT_SET;
536 544
537 if (invalid_clockid(which_clock)) 545 if (!kc)
538 return -EINVAL; 546 return -EINVAL;
547 if (!kc->timer_create)
548 return -EOPNOTSUPP;
539 549
540 new_timer = alloc_posix_timer(); 550 new_timer = alloc_posix_timer();
541 if (unlikely(!new_timer)) 551 if (unlikely(!new_timer))
@@ -597,7 +607,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
597 goto out; 607 goto out;
598 } 608 }
599 609
600 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); 610 error = kc->timer_create(new_timer);
601 if (error) 611 if (error)
602 goto out; 612 goto out;
603 613
@@ -607,7 +617,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
607 spin_unlock_irq(&current->sighand->siglock); 617 spin_unlock_irq(&current->sighand->siglock);
608 618
609 return 0; 619 return 0;
610 /* 620 /*
611 * In the case of the timer belonging to another task, after 621 * In the case of the timer belonging to another task, after
612 * the task is unlocked, the timer is owned by the other task 622 * the task is unlocked, the timer is owned by the other task
613 * and may cease to exist at any time. Don't use or modify 623 * and may cease to exist at any time. Don't use or modify
@@ -628,22 +638,18 @@ out:
628static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) 638static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
629{ 639{
630 struct k_itimer *timr; 640 struct k_itimer *timr;
631 /* 641
632 * Watch out here. We do a irqsave on the idr_lock and pass the 642 rcu_read_lock();
633 * flags part over to the timer lock. Must not let interrupts in
634 * while we are moving the lock.
635 */
636 spin_lock_irqsave(&idr_lock, *flags);
637 timr = idr_find(&posix_timers_id, (int)timer_id); 643 timr = idr_find(&posix_timers_id, (int)timer_id);
638 if (timr) { 644 if (timr) {
639 spin_lock(&timr->it_lock); 645 spin_lock_irqsave(&timr->it_lock, *flags);
640 if (timr->it_signal == current->signal) { 646 if (timr->it_signal == current->signal) {
641 spin_unlock(&idr_lock); 647 rcu_read_unlock();
642 return timr; 648 return timr;
643 } 649 }
644 spin_unlock(&timr->it_lock); 650 spin_unlock_irqrestore(&timr->it_lock, *flags);
645 } 651 }
646 spin_unlock_irqrestore(&idr_lock, *flags); 652 rcu_read_unlock();
647 653
648 return NULL; 654 return NULL;
649} 655}
@@ -709,22 +715,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
709SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, 715SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
710 struct itimerspec __user *, setting) 716 struct itimerspec __user *, setting)
711{ 717{
712 struct k_itimer *timr;
713 struct itimerspec cur_setting; 718 struct itimerspec cur_setting;
719 struct k_itimer *timr;
720 struct k_clock *kc;
714 unsigned long flags; 721 unsigned long flags;
722 int ret = 0;
715 723
716 timr = lock_timer(timer_id, &flags); 724 timr = lock_timer(timer_id, &flags);
717 if (!timr) 725 if (!timr)
718 return -EINVAL; 726 return -EINVAL;
719 727
720 CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); 728 kc = clockid_to_kclock(timr->it_clock);
729 if (WARN_ON_ONCE(!kc || !kc->timer_get))
730 ret = -EINVAL;
731 else
732 kc->timer_get(timr, &cur_setting);
721 733
722 unlock_timer(timr, flags); 734 unlock_timer(timr, flags);
723 735
724 if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) 736 if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
725 return -EFAULT; 737 return -EFAULT;
726 738
727 return 0; 739 return ret;
728} 740}
729 741
730/* 742/*
@@ -813,6 +825,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
813 int error = 0; 825 int error = 0;
814 unsigned long flag; 826 unsigned long flag;
815 struct itimerspec *rtn = old_setting ? &old_spec : NULL; 827 struct itimerspec *rtn = old_setting ? &old_spec : NULL;
828 struct k_clock *kc;
816 829
817 if (!new_setting) 830 if (!new_setting)
818 return -EINVAL; 831 return -EINVAL;
@@ -828,8 +841,11 @@ retry:
828 if (!timr) 841 if (!timr)
829 return -EINVAL; 842 return -EINVAL;
830 843
831 error = CLOCK_DISPATCH(timr->it_clock, timer_set, 844 kc = clockid_to_kclock(timr->it_clock);
832 (timr, flags, &new_spec, rtn)); 845 if (WARN_ON_ONCE(!kc || !kc->timer_set))
846 error = -EINVAL;
847 else
848 error = kc->timer_set(timr, flags, &new_spec, rtn);
833 849
834 unlock_timer(timr, flag); 850 unlock_timer(timr, flag);
835 if (error == TIMER_RETRY) { 851 if (error == TIMER_RETRY) {
@@ -844,7 +860,7 @@ retry:
844 return error; 860 return error;
845} 861}
846 862
847static inline int common_timer_del(struct k_itimer *timer) 863static int common_timer_del(struct k_itimer *timer)
848{ 864{
849 timer->it.real.interval.tv64 = 0; 865 timer->it.real.interval.tv64 = 0;
850 866
@@ -855,7 +871,11 @@ static inline int common_timer_del(struct k_itimer *timer)
855 871
856static inline int timer_delete_hook(struct k_itimer *timer) 872static inline int timer_delete_hook(struct k_itimer *timer)
857{ 873{
858 return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); 874 struct k_clock *kc = clockid_to_kclock(timer->it_clock);
875
876 if (WARN_ON_ONCE(!kc || !kc->timer_del))
877 return -EINVAL;
878 return kc->timer_del(timer);
859} 879}
860 880
861/* Delete a POSIX.1b interval timer. */ 881/* Delete a POSIX.1b interval timer. */
@@ -927,69 +947,76 @@ void exit_itimers(struct signal_struct *sig)
927 } 947 }
928} 948}
929 949
930/* Not available / possible... functions */
931int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
932{
933 return -EINVAL;
934}
935EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
936
937int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
938 struct timespec *t, struct timespec __user *r)
939{
940#ifndef ENOTSUP
941 return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */
942#else /* parisc does define it separately. */
943 return -ENOTSUP;
944#endif
945}
946EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
947
948SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, 950SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
949 const struct timespec __user *, tp) 951 const struct timespec __user *, tp)
950{ 952{
953 struct k_clock *kc = clockid_to_kclock(which_clock);
951 struct timespec new_tp; 954 struct timespec new_tp;
952 955
953 if (invalid_clockid(which_clock)) 956 if (!kc || !kc->clock_set)
954 return -EINVAL; 957 return -EINVAL;
958
955 if (copy_from_user(&new_tp, tp, sizeof (*tp))) 959 if (copy_from_user(&new_tp, tp, sizeof (*tp)))
956 return -EFAULT; 960 return -EFAULT;
957 961
958 return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); 962 return kc->clock_set(which_clock, &new_tp);
959} 963}
960 964
961SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 965SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
962 struct timespec __user *,tp) 966 struct timespec __user *,tp)
963{ 967{
968 struct k_clock *kc = clockid_to_kclock(which_clock);
964 struct timespec kernel_tp; 969 struct timespec kernel_tp;
965 int error; 970 int error;
966 971
967 if (invalid_clockid(which_clock)) 972 if (!kc)
968 return -EINVAL; 973 return -EINVAL;
969 error = CLOCK_DISPATCH(which_clock, clock_get, 974
970 (which_clock, &kernel_tp)); 975 error = kc->clock_get(which_clock, &kernel_tp);
976
971 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) 977 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
972 error = -EFAULT; 978 error = -EFAULT;
973 979
974 return error; 980 return error;
981}
982
983SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
984 struct timex __user *, utx)
985{
986 struct k_clock *kc = clockid_to_kclock(which_clock);
987 struct timex ktx;
988 int err;
989
990 if (!kc)
991 return -EINVAL;
992 if (!kc->clock_adj)
993 return -EOPNOTSUPP;
975 994
995 if (copy_from_user(&ktx, utx, sizeof(ktx)))
996 return -EFAULT;
997
998 err = kc->clock_adj(which_clock, &ktx);
999
1000 if (!err && copy_to_user(utx, &ktx, sizeof(ktx)))
1001 return -EFAULT;
1002
1003 return err;
976} 1004}
977 1005
978SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, 1006SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
979 struct timespec __user *, tp) 1007 struct timespec __user *, tp)
980{ 1008{
1009 struct k_clock *kc = clockid_to_kclock(which_clock);
981 struct timespec rtn_tp; 1010 struct timespec rtn_tp;
982 int error; 1011 int error;
983 1012
984 if (invalid_clockid(which_clock)) 1013 if (!kc)
985 return -EINVAL; 1014 return -EINVAL;
986 1015
987 error = CLOCK_DISPATCH(which_clock, clock_getres, 1016 error = kc->clock_getres(which_clock, &rtn_tp);
988 (which_clock, &rtn_tp));
989 1017
990 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { 1018 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
991 error = -EFAULT; 1019 error = -EFAULT;
992 }
993 1020
994 return error; 1021 return error;
995} 1022}
@@ -1009,10 +1036,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1009 const struct timespec __user *, rqtp, 1036 const struct timespec __user *, rqtp,
1010 struct timespec __user *, rmtp) 1037 struct timespec __user *, rmtp)
1011{ 1038{
1039 struct k_clock *kc = clockid_to_kclock(which_clock);
1012 struct timespec t; 1040 struct timespec t;
1013 1041
1014 if (invalid_clockid(which_clock)) 1042 if (!kc)
1015 return -EINVAL; 1043 return -EINVAL;
1044 if (!kc->nsleep)
1045 return -ENANOSLEEP_NOTSUP;
1016 1046
1017 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 1047 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
1018 return -EFAULT; 1048 return -EFAULT;
@@ -1020,27 +1050,20 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1020 if (!timespec_valid(&t)) 1050 if (!timespec_valid(&t))
1021 return -EINVAL; 1051 return -EINVAL;
1022 1052
1023 return CLOCK_DISPATCH(which_clock, nsleep, 1053 return kc->nsleep(which_clock, flags, &t, rmtp);
1024 (which_clock, flags, &t, rmtp));
1025}
1026
1027/*
1028 * nanosleep_restart for monotonic and realtime clocks
1029 */
1030static int common_nsleep_restart(struct restart_block *restart_block)
1031{
1032 return hrtimer_nanosleep_restart(restart_block);
1033} 1054}
1034 1055
1035/* 1056/*
1036 * This will restart clock_nanosleep. This is required only by 1057 * This will restart clock_nanosleep. This is required only by
1037 * compat_clock_nanosleep_restart for now. 1058 * compat_clock_nanosleep_restart for now.
1038 */ 1059 */
1039long 1060long clock_nanosleep_restart(struct restart_block *restart_block)
1040clock_nanosleep_restart(struct restart_block *restart_block)
1041{ 1061{
1042 clockid_t which_clock = restart_block->arg0; 1062 clockid_t which_clock = restart_block->nanosleep.clockid;
1063 struct k_clock *kc = clockid_to_kclock(which_clock);
1064
1065 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
1066 return -EINVAL;
1043 1067
1044 return CLOCK_DISPATCH(which_clock, nsleep_restart, 1068 return kc->nsleep_restart(restart_block);
1045 (restart_block));
1046} 1069}
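
The posix-timers rework above replaces the CLOCK_DISPATCH() macro with clockid_to_kclock(), which returns a per-clock table of function pointers: a NULL table entry means the clock id is invalid (-EINVAL), while a NULL method means the clock exists but does not support the operation (-EOPNOTSUPP or -ENANOSLEEP_NOTSUP). A minimal standalone sketch of that dispatch pattern; the struct, table and function names below are illustrative, not the kernel's:

#include <stdio.h>
#include <stddef.h>
#include <errno.h>

struct clock_ops {                          /* trimmed-down stand-in for struct k_clock */
        int (*clock_get)(int id, long *now);
        int (*clock_set)(int id, long t);   /* NULL means "not supported" */
};

static int realtime_get(int id, long *now)  { (void)id; *now = 1000; return 0; }
static int realtime_set(int id, long t)     { (void)id; (void)t;     return 0; }
static int monotonic_get(int id, long *now) { (void)id; *now = 2000; return 0; }

static struct clock_ops clocks[] = {        /* index == clock id, like posix_clocks[] */
        { .clock_get = realtime_get,  .clock_set = realtime_set },
        { .clock_get = monotonic_get, .clock_set = NULL },
};
#define NCLOCKS ((int)(sizeof(clocks) / sizeof(clocks[0])))

static struct clock_ops *clockid_to_ops(int id)
{
        if (id < 0 || id >= NCLOCKS || !clocks[id].clock_get)
                return NULL;                /* unknown clock: caller returns -EINVAL */
        return &clocks[id];
}

static int do_clock_settime(int id, long t)
{
        struct clock_ops *kc = clockid_to_ops(id);

        if (!kc)
                return -EINVAL;
        if (!kc->clock_set)
                return -EOPNOTSUPP;         /* clock exists but cannot be set */
        return kc->clock_set(id, t);
}

int main(void)
{
        printf("set clock 0: %d\n", do_clock_settime(0, 42));   /* 0 */
        printf("set clock 1: %d\n", do_clock_settime(1, 42));   /* -EOPNOTSUPP */
        printf("set clock 9: %d\n", do_clock_settime(9, 42));   /* -EINVAL */
        return 0;
}

This is the shape timer_create, clock_settime and clock_adjtime rely on after the patch: the lookup distinguishes "no such clock" from "clock lacks this operation".
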
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 265729966ece..87f4d24b55b0 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,125 +1,12 @@
1config PM
2 bool "Power Management support"
3 depends on !IA64_HP_SIM
4 ---help---
5 "Power Management" means that parts of your computer are shut
6 off or put into a power conserving "sleep" mode if they are not
7 being used. There are two competing standards for doing this: APM
8 and ACPI. If you want to use either one, say Y here and then also
9 to the requisite support below.
10
11 Power Management is most important for battery powered laptop
12 computers; if you have a laptop, check out the Linux Laptop home
13 page on the WWW at <http://www.linux-on-laptops.com/> or
14 Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
15 and the Battery Powered Linux mini-HOWTO, available from
16 <http://www.tldp.org/docs.html#howto>.
17
18 Note that, even if you say N here, Linux on the x86 architecture
19 will issue the hlt instruction if nothing is to be done, thereby
20 sending the processor to sleep and saving power.
21
22config PM_DEBUG
23 bool "Power Management Debug Support"
24 depends on PM
25 ---help---
26 This option enables various debugging support in the Power Management
27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support.
29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
39config PM_VERBOSE
40 bool "Verbose Power Management debugging"
41 depends on PM_DEBUG
42 default n
43 ---help---
44 This option enables verbose messages from the Power Management code.
45
46config CAN_PM_TRACE
47 def_bool y
48 depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL
49
50config PM_TRACE
51 bool
52 help
53 This enables code to save the last PM event point across
54 reboot. The architecture needs to support this, x86 for
55 example does by saving things in the RTC, see below.
56
57 The architecture specific code must provide the extern
58 functions from <linux/resume-trace.h> as well as the
59 <asm/resume-trace.h> header with a TRACE_RESUME() macro.
60
61 The way the information is presented is architecture-
62 dependent, x86 will print the information during a
63 late_initcall.
64
65config PM_TRACE_RTC
66 bool "Suspend/resume event tracing"
67 depends on CAN_PM_TRACE
68 depends on X86
69 select PM_TRACE
70 default n
71 ---help---
72 This enables some cheesy code to save the last PM event point in the
73 RTC across reboots, so that you can debug a machine that just hangs
74 during suspend (or more commonly, during resume).
75
76 To use this debugging feature you should attempt to suspend the
77 machine, reboot it and then run
78
79 dmesg -s 1000000 | grep 'hash matches'
80
81 CAUTION: this option will cause your machine's real-time clock to be
82 set to an invalid time after a resume.
83
84config PM_SLEEP_SMP
85 bool
86 depends on SMP
87 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
88 depends on PM_SLEEP
89 select HOTPLUG
90 select HOTPLUG_CPU
91 default y
92
93config PM_SLEEP
94 bool
95 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
96 default y
97
98config PM_SLEEP_ADVANCED_DEBUG
99 bool
100 depends on PM_ADVANCED_DEBUG
101 default n
102
103config SUSPEND 1config SUSPEND
104 bool "Suspend to RAM and standby" 2 bool "Suspend to RAM and standby"
105 depends on PM && ARCH_SUSPEND_POSSIBLE 3 depends on ARCH_SUSPEND_POSSIBLE
106 default y 4 default y
107 ---help--- 5 ---help---
108 Allow the system to enter sleep states in which main memory is 6 Allow the system to enter sleep states in which main memory is
109 powered and thus its contents are preserved, such as the 7 powered and thus its contents are preserved, such as the
110 suspend-to-RAM state (e.g. the ACPI S3 state). 8 suspend-to-RAM state (e.g. the ACPI S3 state).
111 9
112config PM_TEST_SUSPEND
113 bool "Test suspend/resume and wakealarm during bootup"
114 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
115 ---help---
116 This option will let you suspend your machine during bootup, and
117 make it wake up a few seconds later using an RTC wakeup alarm.
118 Enable this with a kernel parameter like "test_suspend=mem".
119
120 You probably want to have your system's RTC driver statically
121 linked, ensuring that it's available when this test runs.
122
123config SUSPEND_FREEZER 10config SUSPEND_FREEZER
124 bool "Enable freezer for suspend to RAM/standby" \ 11 bool "Enable freezer for suspend to RAM/standby" \
125 if ARCH_WANTS_FREEZER_CONTROL || BROKEN 12 if ARCH_WANTS_FREEZER_CONTROL || BROKEN
@@ -131,9 +18,13 @@ config SUSPEND_FREEZER
131 18
132 Turning OFF this setting is NOT recommended! If in doubt, say Y. 19 Turning OFF this setting is NOT recommended! If in doubt, say Y.
133 20
21config HIBERNATE_CALLBACKS
22 bool
23
134config HIBERNATION 24config HIBERNATION
135 bool "Hibernation (aka 'suspend to disk')" 25 bool "Hibernation (aka 'suspend to disk')"
136 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 26 depends on SWAP && ARCH_HIBERNATION_POSSIBLE
27 select HIBERNATE_CALLBACKS
137 select LZO_COMPRESS 28 select LZO_COMPRESS
138 select LZO_DECOMPRESS 29 select LZO_DECOMPRESS
139 ---help--- 30 ---help---
@@ -196,6 +87,100 @@ config PM_STD_PARTITION
196 suspended image to. It will simply pick the first available swap 87 suspended image to. It will simply pick the first available swap
197 device. 88 device.
198 89
90config PM_SLEEP
91 def_bool y
92 depends on SUSPEND || HIBERNATE_CALLBACKS
93
94config PM_SLEEP_SMP
95 def_bool y
96 depends on SMP
97 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
98 depends on PM_SLEEP
99 select HOTPLUG
100 select HOTPLUG_CPU
101
102config PM_RUNTIME
103 bool "Run-time PM core functionality"
104 depends on !IA64_HP_SIM
105 ---help---
106 Enable functionality allowing I/O devices to be put into energy-saving
107 (low power) states at run time (or autosuspended) after a specified
108 period of inactivity and woken up in response to a hardware-generated
109 wake-up event or a driver's request.
110
111 Hardware support is generally required for this functionality to work
112 and the bus type drivers of the buses the devices are on are
113 responsible for the actual handling of the autosuspend requests and
114 wake-up events.
115
116config PM
117 def_bool y
118 depends on PM_SLEEP || PM_RUNTIME
119
120config PM_DEBUG
121 bool "Power Management Debug Support"
122 depends on PM
123 ---help---
124 This option enables various debugging support in the Power Management
125 code. This is helpful when debugging and reporting PM bugs, like
126 suspend support.
127
128config PM_ADVANCED_DEBUG
129 bool "Extra PM attributes in sysfs for low-level debugging/testing"
130 depends on PM_DEBUG
131 ---help---
132 Add extra sysfs attributes allowing one to access some Power Management
133 fields of device objects from user space. If you are not a kernel
134 developer interested in debugging/testing Power Management, say "no".
135
136config PM_TEST_SUSPEND
137 bool "Test suspend/resume and wakealarm during bootup"
138 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
139 ---help---
140 This option will let you suspend your machine during bootup, and
141 make it wake up a few seconds later using an RTC wakeup alarm.
142 Enable this with a kernel parameter like "test_suspend=mem".
143
144 You probably want to have your system's RTC driver statically
145 linked, ensuring that it's available when this test runs.
146
147config CAN_PM_TRACE
148 def_bool y
149 depends on PM_DEBUG && PM_SLEEP
150
151config PM_TRACE
152 bool
153 help
154 This enables code to save the last PM event point across
155 reboot. The architecture needs to support this, x86 for
156 example does by saving things in the RTC, see below.
157
158 The architecture specific code must provide the extern
159 functions from <linux/resume-trace.h> as well as the
160 <asm/resume-trace.h> header with a TRACE_RESUME() macro.
161
162 The way the information is presented is architecture-
163 dependent, x86 will print the information during a
164 late_initcall.
165
166config PM_TRACE_RTC
167 bool "Suspend/resume event tracing"
168 depends on CAN_PM_TRACE
169 depends on X86
170 select PM_TRACE
171 ---help---
172 This enables some cheesy code to save the last PM event point in the
173 RTC across reboots, so that you can debug a machine that just hangs
174 during suspend (or more commonly, during resume).
175
176 To use this debugging feature you should attempt to suspend the
177 machine, reboot it and then run
178
179 dmesg -s 1000000 | grep 'hash matches'
180
181 CAUTION: this option will cause your machine's real-time clock to be
182 set to an invalid time after a resume.
183
199config APM_EMULATION 184config APM_EMULATION
200 tristate "Advanced Power Management Emulation" 185 tristate "Advanced Power Management Emulation"
201 depends on PM && SYS_SUPPORTS_APM_EMULATION 186 depends on PM && SYS_SUPPORTS_APM_EMULATION
@@ -222,31 +207,11 @@ config APM_EMULATION
222 anything, try disabling/enabling this option (or disabling/enabling 207 anything, try disabling/enabling this option (or disabling/enabling
223 APM in your BIOS). 208 APM in your BIOS).
224 209
225config PM_RUNTIME
226 bool "Run-time PM core functionality"
227 depends on PM
228 ---help---
229 Enable functionality allowing I/O devices to be put into energy-saving
230 (low power) states at run time (or autosuspended) after a specified
231 period of inactivity and woken up in response to a hardware-generated
232 wake-up event or a driver's request.
233
234 Hardware support is generally required for this functionality to work
235 and the bus type drivers of the buses the devices are on are
236 responsible for the actual handling of the autosuspend requests and
237 wake-up events.
238
239config PM_OPS
240 bool
241 depends on PM_SLEEP || PM_RUNTIME
242 default y
243
244config ARCH_HAS_OPP 210config ARCH_HAS_OPP
245 bool 211 bool
246 212
247config PM_OPP 213config PM_OPP
248 bool "Operating Performance Point (OPP) Layer library" 214 bool "Operating Performance Point (OPP) Layer library"
249 depends on PM
250 depends on ARCH_HAS_OPP 215 depends on ARCH_HAS_OPP
251 ---help--- 216 ---help---
252 SOCs have a standard set of tuples consisting of frequency and 217 SOCs have a standard set of tuples consisting of frequency and
@@ -258,3 +223,7 @@ config PM_OPP
258 representing individual voltage domains and provides SOC 223 representing individual voltage domains and provides SOC
259 implementations a ready to use framework to manage OPPs. 224 implementations a ready to use framework to manage OPPs.
260 For more information, read <file:Documentation/power/opp.txt> 225 For more information, read <file:Documentation/power/opp.txt>
226
227config PM_RUNTIME_CLK
228 def_bool y
229 depends on PM_RUNTIME && HAVE_CLK
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c350e18b53e3..c5ebc6a90643 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,4 +1,5 @@
1ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
2 3
3obj-$(CONFIG_PM) += main.o 4obj-$(CONFIG_PM) += main.o
4obj-$(CONFIG_PM_SLEEP) += console.o 5obj-$(CONFIG_PM_SLEEP) += console.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 83bbc7c02df9..d09dd10c5a5e 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
28static int submit(int rw, struct block_device *bdev, sector_t sector, 28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain) 29 struct page *page, struct bio **bio_chain)
30{ 30{
31 const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; 31 const int bio_rw = rw | REQ_SYNC;
32 struct bio *bio; 32 struct bio *bio;
33 33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1832bd264219..f9bec56d8825 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -23,6 +23,7 @@
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <linux/syscore_ops.h>
26#include <scsi/scsi_scan.h> 27#include <scsi/scsi_scan.h>
27#include <asm/suspend.h> 28#include <asm/suspend.h>
28 29
@@ -271,7 +272,7 @@ static int create_image(int platform_mode)
271 272
272 local_irq_disable(); 273 local_irq_disable();
273 274
274 error = sysdev_suspend(PMSG_FREEZE); 275 error = syscore_suspend();
275 if (error) { 276 if (error) {
276 printk(KERN_ERR "PM: Some system devices failed to power down, " 277 printk(KERN_ERR "PM: Some system devices failed to power down, "
277 "aborting hibernation\n"); 278 "aborting hibernation\n");
@@ -295,7 +296,7 @@ static int create_image(int platform_mode)
295 } 296 }
296 297
297 Power_up: 298 Power_up:
298 sysdev_resume(); 299 syscore_resume();
299 /* NOTE: dpm_resume_noirq() is just a resume() for devices 300 /* NOTE: dpm_resume_noirq() is just a resume() for devices
300 * that suspended with irqs off ... no overall powerup. 301 * that suspended with irqs off ... no overall powerup.
301 */ 302 */
@@ -326,20 +327,25 @@ static int create_image(int platform_mode)
326 327
327int hibernation_snapshot(int platform_mode) 328int hibernation_snapshot(int platform_mode)
328{ 329{
330 pm_message_t msg = PMSG_RECOVER;
329 int error; 331 int error;
330 332
331 error = platform_begin(platform_mode); 333 error = platform_begin(platform_mode);
332 if (error) 334 if (error)
333 goto Close; 335 goto Close;
334 336
337 error = dpm_prepare(PMSG_FREEZE);
338 if (error)
339 goto Complete_devices;
340
335 /* Preallocate image memory before shutting down devices. */ 341 /* Preallocate image memory before shutting down devices. */
336 error = hibernate_preallocate_memory(); 342 error = hibernate_preallocate_memory();
337 if (error) 343 if (error)
338 goto Close; 344 goto Complete_devices;
339 345
340 suspend_console(); 346 suspend_console();
341 pm_restrict_gfp_mask(); 347 pm_restrict_gfp_mask();
342 error = dpm_suspend_start(PMSG_FREEZE); 348 error = dpm_suspend(PMSG_FREEZE);
343 if (error) 349 if (error)
344 goto Recover_platform; 350 goto Recover_platform;
345 351
@@ -357,13 +363,17 @@ int hibernation_snapshot(int platform_mode)
357 if (error || !in_suspend) 363 if (error || !in_suspend)
358 swsusp_free(); 364 swsusp_free();
359 365
360 dpm_resume_end(in_suspend ? 366 msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE;
361 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 367 dpm_resume(msg);
362 368
363 if (error || !in_suspend) 369 if (error || !in_suspend)
364 pm_restore_gfp_mask(); 370 pm_restore_gfp_mask();
365 371
366 resume_console(); 372 resume_console();
373
374 Complete_devices:
375 dpm_complete(msg);
376
367 Close: 377 Close:
368 platform_end(platform_mode); 378 platform_end(platform_mode);
369 return error; 379 return error;
@@ -402,7 +412,7 @@ static int resume_target_kernel(bool platform_mode)
402 412
403 local_irq_disable(); 413 local_irq_disable();
404 414
405 error = sysdev_suspend(PMSG_QUIESCE); 415 error = syscore_suspend();
406 if (error) 416 if (error)
407 goto Enable_irqs; 417 goto Enable_irqs;
408 418
@@ -429,7 +439,7 @@ static int resume_target_kernel(bool platform_mode)
429 restore_processor_state(); 439 restore_processor_state();
430 touch_softlockup_watchdog(); 440 touch_softlockup_watchdog();
431 441
432 sysdev_resume(); 442 syscore_resume();
433 443
434 Enable_irqs: 444 Enable_irqs:
435 local_irq_enable(); 445 local_irq_enable();
@@ -515,7 +525,7 @@ int hibernation_platform_enter(void)
515 goto Platform_finish; 525 goto Platform_finish;
516 526
517 local_irq_disable(); 527 local_irq_disable();
518 sysdev_suspend(PMSG_HIBERNATE); 528 syscore_suspend();
519 if (pm_wakeup_pending()) { 529 if (pm_wakeup_pending()) {
520 error = -EAGAIN; 530 error = -EAGAIN;
521 goto Power_up; 531 goto Power_up;
@@ -526,7 +536,7 @@ int hibernation_platform_enter(void)
526 while (1); 536 while (1);
527 537
528 Power_up: 538 Power_up:
529 sysdev_resume(); 539 syscore_resume();
530 local_irq_enable(); 540 local_irq_enable();
531 enable_nonboot_cpus(); 541 enable_nonboot_cpus();
532 542
@@ -967,10 +977,33 @@ static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *att
967 977
968power_attr(image_size); 978power_attr(image_size);
969 979
980static ssize_t reserved_size_show(struct kobject *kobj,
981 struct kobj_attribute *attr, char *buf)
982{
983 return sprintf(buf, "%lu\n", reserved_size);
984}
985
986static ssize_t reserved_size_store(struct kobject *kobj,
987 struct kobj_attribute *attr,
988 const char *buf, size_t n)
989{
990 unsigned long size;
991
992 if (sscanf(buf, "%lu", &size) == 1) {
993 reserved_size = size;
994 return n;
995 }
996
997 return -EINVAL;
998}
999
1000power_attr(reserved_size);
1001
970static struct attribute * g[] = { 1002static struct attribute * g[] = {
971 &disk_attr.attr, 1003 &disk_attr.attr,
972 &resume_attr.attr, 1004 &resume_attr.attr,
973 &image_size_attr.attr, 1005 &image_size_attr.attr,
1006 &reserved_size_attr.attr,
974 NULL, 1007 NULL,
975}; 1008};
976 1009
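
The reworked hibernation_snapshot() above pairs dpm_prepare(PMSG_FREEZE) with dpm_complete(msg) through the new Complete_devices label, so every exit taken after the prepare step still completes the devices before platform_end() runs. A small sketch of that goto-unwind shape, with placeholder steps standing in for the platform_* and dpm_* calls:

#include <stdio.h>

static int step_begin(void)     { puts("begin");    return 0; }
static int step_prepare(void)   { puts("prepare");  return 0; }
static int step_snapshot(void)  { puts("snapshot"); return -1; }  /* simulate a failure */
static void step_complete(void) { puts("complete"); }
static void step_end(void)      { puts("end"); }

static int do_snapshot(void)
{
        int error;

        error = step_begin();
        if (error)
                goto Close;             /* nothing to unwind yet */

        error = step_prepare();
        if (error)
                goto Complete;          /* a partially prepared state is still unwound */

        error = step_snapshot();        /* may fail; falls through to the unwind */

 Complete:
        step_complete();                /* always paired with step_prepare() */
 Close:
        step_end();                     /* always paired with step_begin() */
        return error;
}

int main(void)
{
        return do_snapshot() ? 1 : 0;
}
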
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7b5db6a8561e..2981af4ce7cb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -17,9 +17,6 @@
17 17
18DEFINE_MUTEX(pm_mutex); 18DEFINE_MUTEX(pm_mutex);
19 19
20unsigned int pm_flags;
21EXPORT_SYMBOL(pm_flags);
22
23#ifdef CONFIG_PM_SLEEP 20#ifdef CONFIG_PM_SLEEP
24 21
25/* Routines for PM-transition notifications */ 22/* Routines for PM-transition notifications */
@@ -227,7 +224,7 @@ power_attr(state);
227 * writing to 'state'. It first should read from 'wakeup_count' and store 224 * writing to 'state'. It first should read from 'wakeup_count' and store
228 * the read value. Then, after carrying out its own preparations for the system 225 * the read value. Then, after carrying out its own preparations for the system
229 * transition to a sleep state, it should write the stored value to 226 * transition to a sleep state, it should write the stored value to
230 * 'wakeup_count'. If that fails, at least one wakeup event has occured since 227 * 'wakeup_count'. If that fails, at least one wakeup event has occurred since
231 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it 228 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
232 * is allowed to write to 'state', but the transition will be aborted if there 229 * is allowed to write to 'state', but the transition will be aborted if there
233 * are any wakeup events detected after 'wakeup_count' was written to. 230 * are any wakeup events detected after 'wakeup_count' was written to.
@@ -326,7 +323,7 @@ EXPORT_SYMBOL_GPL(pm_wq);
326 323
327static int __init pm_start_workqueue(void) 324static int __init pm_start_workqueue(void)
328{ 325{
329 pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0); 326 pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0);
330 327
331 return pm_wq ? 0 : -ENOMEM; 328 return pm_wq ? 0 : -ENOMEM;
332} 329}
@@ -340,6 +337,7 @@ static int __init pm_init(void)
340 if (error) 337 if (error)
341 return error; 338 return error;
342 hibernate_image_size_init(); 339 hibernate_image_size_init();
340 hibernate_reserved_size_init();
343 power_kobj = kobject_create_and_add("power", NULL); 341 power_kobj = kobject_create_and_add("power", NULL);
344 if (!power_kobj) 342 if (!power_kobj)
345 return -ENOMEM; 343 return -ENOMEM;
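
The comment kept in the 'state'/'wakeup_count' hunk above describes the suspend handshake: userspace reads /sys/power/wakeup_count, stores the value, writes it back once its own preparations are done, and writes to /sys/power/state only if that write-back succeeded. A rough userspace sketch of the sequence, with error handling abbreviated; the failed write-back is the signal that a wakeup event arrived in between:

#include <stdio.h>

int main(void)
{
        char count[64];
        FILE *f;

        f = fopen("/sys/power/wakeup_count", "r");
        if (!f || !fgets(count, sizeof(count), f)) {
                perror("read wakeup_count");
                return 1;
        }
        fclose(f);

        /* ... carry out userspace pre-suspend work here ... */

        f = fopen("/sys/power/wakeup_count", "w");
        if (!f || fputs(count, f) == EOF || fclose(f) != 0) {
                fprintf(stderr, "wakeup event since read, aborting suspend\n");
                return 1;
        }

        f = fopen("/sys/power/state", "w");     /* e.g. "mem" for suspend-to-RAM */
        if (!f || fputs("mem\n", f) == EOF || fclose(f) != 0) {
                perror("write state");
                return 1;
        }
        return 0;
}
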
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 03634be55f62..9a00a0a26280 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -15,6 +15,7 @@ struct swsusp_info {
15 15
16#ifdef CONFIG_HIBERNATION 16#ifdef CONFIG_HIBERNATION
17/* kernel/power/snapshot.c */ 17/* kernel/power/snapshot.c */
18extern void __init hibernate_reserved_size_init(void);
18extern void __init hibernate_image_size_init(void); 19extern void __init hibernate_image_size_init(void);
19 20
20#ifdef CONFIG_ARCH_HIBERNATION_HEADER 21#ifdef CONFIG_ARCH_HIBERNATION_HEADER
@@ -55,6 +56,7 @@ extern int hibernation_platform_enter(void);
55 56
56#else /* !CONFIG_HIBERNATION */ 57#else /* !CONFIG_HIBERNATION */
57 58
59static inline void hibernate_reserved_size_init(void) {}
58static inline void hibernate_image_size_init(void) {} 60static inline void hibernate_image_size_init(void) {}
59#endif /* !CONFIG_HIBERNATION */ 61#endif /* !CONFIG_HIBERNATION */
60 62
@@ -72,6 +74,8 @@ static struct kobj_attribute _name##_attr = { \
72 74
73/* Preferred image size in bytes (default 500 MB) */ 75/* Preferred image size in bytes (default 500 MB) */
74extern unsigned long image_size; 76extern unsigned long image_size;
77/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
78extern unsigned long reserved_size;
75extern int in_suspend; 79extern int in_suspend;
76extern dev_t swsusp_resume_device; 80extern dev_t swsusp_resume_device;
77extern sector_t swsusp_resume_block; 81extern sector_t swsusp_resume_block;
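
power.h pairs the new hibernate_reserved_size_init() declaration with an empty static inline stub for !CONFIG_HIBERNATION builds, so pm_init() can call it unconditionally. A tiny sketch of that stub pattern, using a made-up CONFIG_FEATURE_FOO switch:

#include <stdio.h>

#define CONFIG_FEATURE_FOO 1            /* comment out to build the stubbed variant */

#ifdef CONFIG_FEATURE_FOO
void feature_foo_init(void);            /* real implementation provided below */
#else
static inline void feature_foo_init(void) {}    /* no-op stub keeps callers #ifdef-free */
#endif

#ifdef CONFIG_FEATURE_FOO
void feature_foo_init(void)
{
        puts("feature foo initialised");
}
#endif

int main(void)
{
        feature_foo_init();             /* call site needs no conditional compilation */
        return 0;
}
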
diff --git a/kernel/power/process.c b/kernel/power/process.c
index d6d2a10320e0..0cf3a27a6c9d 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,7 +22,7 @@
22 */ 22 */
23#define TIMEOUT (20 * HZ) 23#define TIMEOUT (20 * HZ)
24 24
25static inline int freezeable(struct task_struct * p) 25static inline int freezable(struct task_struct * p)
26{ 26{
27 if ((p == current) || 27 if ((p == current) ||
28 (p->flags & PF_NOFREEZE) || 28 (p->flags & PF_NOFREEZE) ||
@@ -53,7 +53,7 @@ static int try_to_freeze_tasks(bool sig_only)
53 todo = 0; 53 todo = 0;
54 read_lock(&tasklist_lock); 54 read_lock(&tasklist_lock);
55 do_each_thread(g, p) { 55 do_each_thread(g, p) {
56 if (frozen(p) || !freezeable(p)) 56 if (frozen(p) || !freezable(p))
57 continue; 57 continue;
58 58
59 if (!freeze_task(p, sig_only)) 59 if (!freeze_task(p, sig_only))
@@ -167,7 +167,7 @@ static void thaw_tasks(bool nosig_only)
167 167
168 read_lock(&tasklist_lock); 168 read_lock(&tasklist_lock);
169 do_each_thread(g, p) { 169 do_each_thread(g, p) {
170 if (!freezeable(p)) 170 if (!freezable(p))
171 continue; 171 continue;
172 172
173 if (nosig_only && should_send_signal(p)) 173 if (nosig_only && should_send_signal(p))
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0dac75ea4456..ace55889f702 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -41,6 +41,18 @@ static void swsusp_set_page_forbidden(struct page *);
41static void swsusp_unset_page_forbidden(struct page *); 41static void swsusp_unset_page_forbidden(struct page *);
42 42
43/* 43/*
44 * Number of bytes to reserve for memory allocations made by device drivers
45 * from their ->freeze() and ->freeze_noirq() callbacks so that they don't
46 * cause image creation to fail (tunable via /sys/power/reserved_size).
47 */
48unsigned long reserved_size;
49
50void __init hibernate_reserved_size_init(void)
51{
52 reserved_size = SPARE_PAGES * PAGE_SIZE;
53}
54
55/*
44 * Preferred image size in bytes (tunable via /sys/power/image_size). 56 * Preferred image size in bytes (tunable via /sys/power/image_size).
45 * When it is set to N, swsusp will do its best to ensure the image 57 * When it is set to N, swsusp will do its best to ensure the image
46 * size will not exceed N bytes, but if that is impossible, it will 58 * size will not exceed N bytes, but if that is impossible, it will
@@ -1263,11 +1275,13 @@ static unsigned long minimum_image_size(unsigned long saveable)
1263 * frame in use. We also need a number of page frames to be free during 1275 * frame in use. We also need a number of page frames to be free during
1264 * hibernation for allocations made while saving the image and for device 1276 * hibernation for allocations made while saving the image and for device
1265 * drivers, in case they need to allocate memory from their hibernation 1277 * drivers, in case they need to allocate memory from their hibernation
1266 * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES, 1278 * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough
1267 * respectively, both of which are rough estimates). To make this happen, we 1279 * estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through
1268 * compute the total number of available page frames and allocate at least 1280 * /sys/power/reserved_size, respectively). To make this happen, we compute the
1281 * total number of available page frames and allocate at least
1269 * 1282 *
1270 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES 1283 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
1284 * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
1271 * 1285 *
1272 * of them, which corresponds to the maximum size of a hibernation image. 1286 * of them, which corresponds to the maximum size of a hibernation image.
1273 * 1287 *
@@ -1322,7 +1336,8 @@ int hibernate_preallocate_memory(void)
1322 count -= totalreserve_pages; 1336 count -= totalreserve_pages;
1323 1337
1324 /* Compute the maximum number of saveable pages to leave in memory. */ 1338 /* Compute the maximum number of saveable pages to leave in memory. */
1325 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; 1339 max_size = (count - (size + PAGES_FOR_IO)) / 2
1340 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
1326 /* Compute the desired number of image pages specified by image_size. */ 1341 /* Compute the desired number of image pages specified by image_size. */
1327 size = DIV_ROUND_UP(image_size, PAGE_SIZE); 1342 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1328 if (size > max_size) 1343 if (size > max_size)
@@ -1519,11 +1534,8 @@ static int
1519swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 1534swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1520 unsigned int nr_pages, unsigned int nr_highmem) 1535 unsigned int nr_pages, unsigned int nr_highmem)
1521{ 1536{
1522 int error = 0;
1523
1524 if (nr_highmem > 0) { 1537 if (nr_highmem > 0) {
1525 error = get_highmem_buffer(PG_ANY); 1538 if (get_highmem_buffer(PG_ANY))
1526 if (error)
1527 goto err_out; 1539 goto err_out;
1528 if (nr_highmem > alloc_highmem) { 1540 if (nr_highmem > alloc_highmem) {
1529 nr_highmem -= alloc_highmem; 1541 nr_highmem -= alloc_highmem;
@@ -1546,7 +1558,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1546 1558
1547 err_out: 1559 err_out:
1548 swsusp_free(); 1560 swsusp_free();
1549 return error; 1561 return -ENOMEM;
1550} 1562}
1551 1563
1552asmlinkage int swsusp_save(void) 1564asmlinkage int swsusp_save(void)
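
With this change the preallocation bound becomes max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE), i.e. the tunable reserved_size replaces the fixed 2 * SPARE_PAGES term. A quick numeric sketch; the page counts and constants below are illustrative values, not the kernel's:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

#define PAGE_SIZE       4096UL
#define PAGES_FOR_IO    1024UL
#define SPARE_PAGES     ((1024UL * 1024UL) / PAGE_SIZE)         /* 1 MB worth of pages */

int main(void)
{
        unsigned long count = 500000;   /* saveable + free page frames */
        unsigned long size = 2000;      /* metadata pages */
        unsigned long reserved_size = SPARE_PAGES * PAGE_SIZE;  /* the new default */
        unsigned long max_size;

        max_size = (count - (size + PAGES_FOR_IO)) / 2
                   - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);

        /* With these numbers: (500000 - 3024) / 2 - 2 * 256 = 248488 - 512 = 247976 */
        printf("max image size: %lu pages (%lu MB)\n",
               max_size, max_size * PAGE_SIZE >> 20);
        return 0;
}

Raising reserved_size through /sys/power/reserved_size therefore shrinks the largest image the kernel will try to create, leaving more room for driver allocations.
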
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index de6f86bfa303..1c41ba215419 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,7 @@
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/suspend.h> 24#include <linux/suspend.h>
25#include <linux/syscore_ops.h>
25#include <trace/events/power.h> 26#include <trace/events/power.h>
26 27
27#include "power.h" 28#include "power.h"
@@ -162,13 +163,13 @@ static int suspend_enter(suspend_state_t state)
162 arch_suspend_disable_irqs(); 163 arch_suspend_disable_irqs();
163 BUG_ON(!irqs_disabled()); 164 BUG_ON(!irqs_disabled());
164 165
165 error = sysdev_suspend(PMSG_SUSPEND); 166 error = syscore_suspend();
166 if (!error) { 167 if (!error) {
167 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { 168 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
168 error = suspend_ops->enter(state); 169 error = suspend_ops->enter(state);
169 events_check_enabled = false; 170 events_check_enabled = false;
170 } 171 }
171 sysdev_resume(); 172 syscore_resume();
172 } 173 }
173 174
174 arch_suspend_enable_irqs(); 175 arch_suspend_enable_irqs();
@@ -209,7 +210,6 @@ int suspend_devices_and_enter(suspend_state_t state)
209 goto Close; 210 goto Close;
210 } 211 }
211 suspend_console(); 212 suspend_console();
212 pm_restrict_gfp_mask();
213 suspend_test_start(); 213 suspend_test_start();
214 error = dpm_suspend_start(PMSG_SUSPEND); 214 error = dpm_suspend_start(PMSG_SUSPEND);
215 if (error) { 215 if (error) {
@@ -220,13 +220,12 @@ int suspend_devices_and_enter(suspend_state_t state)
220 if (suspend_test(TEST_DEVICES)) 220 if (suspend_test(TEST_DEVICES))
221 goto Recover_platform; 221 goto Recover_platform;
222 222
223 suspend_enter(state); 223 error = suspend_enter(state);
224 224
225 Resume_devices: 225 Resume_devices:
226 suspend_test_start(); 226 suspend_test_start();
227 dpm_resume_end(PMSG_RESUME); 227 dpm_resume_end(PMSG_RESUME);
228 suspend_test_finish("resume devices"); 228 suspend_test_finish("resume devices");
229 pm_restore_gfp_mask();
230 resume_console(); 229 resume_console();
231 Close: 230 Close:
232 if (suspend_ops->end) 231 if (suspend_ops->end)
@@ -287,7 +286,9 @@ int enter_state(suspend_state_t state)
287 goto Finish; 286 goto Finish;
288 287
289 pr_debug("PM: Entering %s sleep\n", pm_states[state]); 288 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
289 pm_restrict_gfp_mask();
290 error = suspend_devices_and_enter(state); 290 error = suspend_devices_and_enter(state);
291 pm_restore_gfp_mask();
291 292
292 Finish: 293 Finish:
293 pr_debug("PM: Finishing wakeup.\n"); 294 pr_debug("PM: Finishing wakeup.\n");
diff --git a/kernel/power/user.c b/kernel/power/user.c
index c36c3b9e8a84..7d02d33be699 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -135,8 +135,10 @@ static int snapshot_release(struct inode *inode, struct file *filp)
135 free_basic_memory_bitmaps(); 135 free_basic_memory_bitmaps();
136 data = filp->private_data; 136 data = filp->private_data;
137 free_all_swap_pages(data->swap); 137 free_all_swap_pages(data->swap);
138 if (data->frozen) 138 if (data->frozen) {
139 pm_restore_gfp_mask();
139 thaw_processes(); 140 thaw_processes();
141 }
140 pm_notifier_call_chain(data->mode == O_RDONLY ? 142 pm_notifier_call_chain(data->mode == O_RDONLY ?
141 PM_POST_HIBERNATION : PM_POST_RESTORE); 143 PM_POST_HIBERNATION : PM_POST_RESTORE);
142 atomic_inc(&snapshot_device_available); 144 atomic_inc(&snapshot_device_available);
@@ -379,6 +381,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
379 * PM_HIBERNATION_PREPARE 381 * PM_HIBERNATION_PREPARE
380 */ 382 */
381 error = suspend_devices_and_enter(PM_SUSPEND_MEM); 383 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
384 data->ready = 0;
382 break; 385 break;
383 386
384 case SNAPSHOT_PLATFORM_SUPPORT: 387 case SNAPSHOT_PLATFORM_SUPPORT:
diff --git a/kernel/printk.c b/kernel/printk.c
index 2ddbdc73aade..35185392173f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -31,6 +31,7 @@
31#include <linux/smp.h> 31#include <linux/smp.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/memblock.h>
34#include <linux/syscalls.h> 35#include <linux/syscalls.h>
35#include <linux/kexec.h> 36#include <linux/kexec.h>
36#include <linux/kdb.h> 37#include <linux/kdb.h>
@@ -53,7 +54,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
53#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 54#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
54 55
55/* printk's without a loglevel use this.. */ 56/* printk's without a loglevel use this.. */
56#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ 57#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
57 58
58/* We show everything that is MORE important than this.. */ 59/* We show everything that is MORE important than this.. */
59#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ 60#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
@@ -113,6 +114,11 @@ static unsigned con_start; /* Index into log_buf: next char to be sent to consol
113static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ 114static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
114 115
115/* 116/*
117 * If exclusive_console is non-NULL then only this console is to be printed to.
118 */
119static struct console *exclusive_console;
120
121/*
116 * Array of consoles built from command line options (console=) 122 * Array of consoles built from command line options (console=)
117 */ 123 */
118struct console_cmdline 124struct console_cmdline
@@ -162,46 +168,74 @@ void log_buf_kexec_setup(void)
162} 168}
163#endif 169#endif
164 170
171/* requested log_buf_len from kernel cmdline */
172static unsigned long __initdata new_log_buf_len;
173
174/* save requested log_buf_len since it's too early to process it */
165static int __init log_buf_len_setup(char *str) 175static int __init log_buf_len_setup(char *str)
166{ 176{
167 unsigned size = memparse(str, &str); 177 unsigned size = memparse(str, &str);
168 unsigned long flags;
169 178
170 if (size) 179 if (size)
171 size = roundup_pow_of_two(size); 180 size = roundup_pow_of_two(size);
172 if (size > log_buf_len) { 181 if (size > log_buf_len)
173 unsigned start, dest_idx, offset; 182 new_log_buf_len = size;
174 char *new_log_buf;
175 183
176 new_log_buf = alloc_bootmem(size); 184 return 0;
177 if (!new_log_buf) { 185}
178 printk(KERN_WARNING "log_buf_len: allocation failed\n"); 186early_param("log_buf_len", log_buf_len_setup);
179 goto out;
180 }
181 187
182 spin_lock_irqsave(&logbuf_lock, flags); 188void __init setup_log_buf(int early)
183 log_buf_len = size; 189{
184 log_buf = new_log_buf; 190 unsigned long flags;
185 191 unsigned start, dest_idx, offset;
186 offset = start = min(con_start, log_start); 192 char *new_log_buf;
187 dest_idx = 0; 193 int free;
188 while (start != log_end) { 194
189 log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; 195 if (!new_log_buf_len)
190 start++; 196 return;
191 dest_idx++; 197
192 } 198 if (early) {
193 log_start -= offset; 199 unsigned long mem;
194 con_start -= offset;
195 log_end -= offset;
196 spin_unlock_irqrestore(&logbuf_lock, flags);
197 200
198 printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); 201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
202 if (mem == MEMBLOCK_ERROR)
203 return;
204 new_log_buf = __va(mem);
205 } else {
206 new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);
199 } 207 }
200out:
201 return 1;
202}
203 208
204__setup("log_buf_len=", log_buf_len_setup); 209 if (unlikely(!new_log_buf)) {
210 pr_err("log_buf_len: %ld bytes not available\n",
211 new_log_buf_len);
212 return;
213 }
214
215 spin_lock_irqsave(&logbuf_lock, flags);
216 log_buf_len = new_log_buf_len;
217 log_buf = new_log_buf;
218 new_log_buf_len = 0;
219 free = __LOG_BUF_LEN - log_end;
220
221 offset = start = min(con_start, log_start);
222 dest_idx = 0;
223 while (start != log_end) {
224 unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
225
226 log_buf[dest_idx] = __log_buf[log_idx_mask];
227 start++;
228 dest_idx++;
229 }
230 log_start -= offset;
231 con_start -= offset;
232 log_end -= offset;
233 spin_unlock_irqrestore(&logbuf_lock, flags);
234
235 pr_info("log_buf_len: %d\n", log_buf_len);
236 pr_info("early log buf free: %d(%d%%)\n",
237 free, (free * 100) / __LOG_BUF_LEN);
238}
205 239
206#ifdef CONFIG_BOOT_PRINTK_DELAY 240#ifdef CONFIG_BOOT_PRINTK_DELAY
207 241
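
setup_log_buf() above copies the live region of the static __log_buf (from min(con_start, log_start) up to log_end, masking each index with the old buffer length) into the newly allocated buffer and then rebases all three indices by the same offset. A condensed userspace sketch of that copy-and-rebase step on plain arrays:

#include <stdio.h>
#include <string.h>

#define OLD_LEN 16                      /* power of two, like __LOG_BUF_LEN */
#define NEW_LEN 64

static char old_buf[OLD_LEN];
static char new_buf[NEW_LEN];

int main(void)
{
        unsigned log_start = 4, con_start = 6, log_end = 10;
        unsigned start, dest_idx, offset;

        memcpy(old_buf, "0123456789", 10);      /* pretend 10 characters were logged */

        offset = start = (con_start < log_start) ? con_start : log_start;
        dest_idx = 0;
        while (start != log_end) {
                new_buf[dest_idx] = old_buf[start & (OLD_LEN - 1)];
                start++;
                dest_idx++;
        }
        log_start -= offset;            /* rebase all indices by the same amount */
        con_start -= offset;
        log_end -= offset;

        printf("copied \"%.*s\", log_start=%u con_start=%u log_end=%u\n",
               (int)log_end, new_buf, log_start, con_start, log_end);
        return 0;
}
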
@@ -262,25 +296,47 @@ int dmesg_restrict = 1;
262int dmesg_restrict; 296int dmesg_restrict;
263#endif 297#endif
264 298
299static int syslog_action_restricted(int type)
300{
301 if (dmesg_restrict)
302 return 1;
303 /* Unless restricted, we allow "read all" and "get buffer size" for everybody */
304 return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
305}
306
307static int check_syslog_permissions(int type, bool from_file)
308{
309 /*
310 * If this is from /proc/kmsg and we've already opened it, then we've
311 * already done the capabilities checks at open time.
312 */
313 if (from_file && type != SYSLOG_ACTION_OPEN)
314 return 0;
315
316 if (syslog_action_restricted(type)) {
317 if (capable(CAP_SYSLOG))
318 return 0;
319 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
320 if (capable(CAP_SYS_ADMIN)) {
321 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
322 "but no CAP_SYSLOG (deprecated).\n");
323 return 0;
324 }
325 return -EPERM;
326 }
327 return 0;
328}
329
265int do_syslog(int type, char __user *buf, int len, bool from_file) 330int do_syslog(int type, char __user *buf, int len, bool from_file)
266{ 331{
267 unsigned i, j, limit, count; 332 unsigned i, j, limit, count;
268 int do_clear = 0; 333 int do_clear = 0;
269 char c; 334 char c;
270 int error = 0; 335 int error;
271 336
272 /* 337 error = check_syslog_permissions(type, from_file);
273 * If this is from /proc/kmsg we only do the capabilities checks 338 if (error)
274 * at open time. 339 goto out;
275 */
276 if (type == SYSLOG_ACTION_OPEN || !from_file) {
277 if (dmesg_restrict && !capable(CAP_SYSLOG))
278 goto warn; /* switch to return -EPERM after 2.6.39 */
279 if ((type != SYSLOG_ACTION_READ_ALL &&
280 type != SYSLOG_ACTION_SIZE_BUFFER) &&
281 !capable(CAP_SYSLOG))
282 goto warn; /* switch to return -EPERM after 2.6.39 */
283 }
284 340
285 error = security_syslog(type); 341 error = security_syslog(type);
286 if (error) 342 if (error)
@@ -423,12 +479,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
423 } 479 }
424out: 480out:
425 return error; 481 return error;
426warn:
427 /* remove after 2.6.39 */
428 if (capable(CAP_SYS_ADMIN))
429 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
430 "but no CAP_SYSLOG (deprecated and denied).\n");
431 return -EPERM;
432} 482}
433 483
434SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 484SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
@@ -460,6 +510,8 @@ static void __call_console_drivers(unsigned start, unsigned end)
460 struct console *con; 510 struct console *con;
461 511
462 for_each_console(con) { 512 for_each_console(con) {
513 if (exclusive_console && con != exclusive_console)
514 continue;
463 if ((con->flags & CON_ENABLED) && con->write && 515 if ((con->flags & CON_ENABLED) && con->write &&
464 (cpu_online(smp_processor_id()) || 516 (cpu_online(smp_processor_id()) ||
465 (con->flags & CON_ANYTIME))) 517 (con->flags & CON_ANYTIME)))
@@ -499,6 +551,71 @@ static void _call_console_drivers(unsigned start,
499} 551}
500 552
501/* 553/*
554 * Parse the syslog header <[0-9]*>. The decimal value is a 32-bit quantity; the
555 * lower 3 bits are the log level, the rest is the log facility. In case
556 * userspace passes usual userspace syslog messages to /dev/kmsg or
557 * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
558 * to extract the correct log level for in-kernel processing, and not mangle
559 * the original value.
560 *
561 * If a prefix is found, the length of the prefix is returned. If 'level' is
562 * passed, it will be filled in with the log level without a possible facility
563 * value. If 'special' is passed, the special printk prefix chars are accepted
564 * and returned. If no valid header is found, 0 is returned and the passed
565 * variables are not touched.
566 */
567static size_t log_prefix(const char *p, unsigned int *level, char *special)
568{
569 unsigned int lev = 0;
570 char sp = '\0';
571 size_t len;
572
573 if (p[0] != '<' || !p[1])
574 return 0;
575 if (p[2] == '>') {
576 /* usual single digit level number or special char */
577 switch (p[1]) {
578 case '0' ... '7':
579 lev = p[1] - '0';
580 break;
581 case 'c': /* KERN_CONT */
582 case 'd': /* KERN_DEFAULT */
583 sp = p[1];
584 break;
585 default:
586 return 0;
587 }
588 len = 3;
589 } else {
590 /* multi digit including the level and facility number */
591 char *endp = NULL;
592
593  if (p[1] < '0' || p[1] > '9')
594 return 0;
595
596 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
597 if (endp == NULL || endp[0] != '>')
598 return 0;
599 len = (endp + 1) - p;
600 }
601
602 /* do not accept special char if not asked for */
603 if (sp && !special)
604 return 0;
605
606 if (special) {
607 *special = sp;
608 /* return special char, do not touch level */
609 if (sp)
610 return len;
611 }
612
613 if (level)
614 *level = lev;
615 return len;
616}
617
618/*
502 * Call the console drivers, asking them to write out 619 * Call the console drivers, asking them to write out
503 * log_buf[start] to log_buf[end - 1]. 620 * log_buf[start] to log_buf[end - 1].
504 * The console_lock must be held. 621 * The console_lock must be held.
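
log_prefix() above accepts both the single-character kernel prefix ("<4>", "<c>", "<d>") and the multi-digit syslog form in which facility and level are encoded in one decimal value (e.g. "<13>" for user.notice), keeping only the low 3 bits as the level. A standalone sketch of the multi-digit parsing path, using strtoul in place of simple_strtoul and leaving out the special-character handling:

#include <stdio.h>
#include <stdlib.h>

/* Returns the prefix length, or 0 if no "<...>" header was found.
 * On success *level receives the low 3 bits of the decimal value. */
static size_t parse_log_prefix(const char *p, unsigned int *level)
{
        char *endp = NULL;
        unsigned long val;

        if (p[0] != '<' || !p[1])
                return 0;

        val = strtoul(&p[1], &endp, 10);
        if (endp == &p[1] || *endp != '>')
                return 0;               /* no digits, or missing closing '>' */

        *level = (unsigned int)(val & 7);       /* facility bits are discarded */
        return (size_t)(endp + 1 - p);
}

int main(void)
{
        const char *msgs[] = { "<4>plain warning", "<13>user.notice text", "no prefix" };
        unsigned int level;
        int i;

        for (i = 0; i < 3; i++) {
                size_t len = parse_log_prefix(msgs[i], &level);
                if (len)
                        printf("level %u, body \"%s\"\n", level, msgs[i] + len);
                else
                        printf("no header: \"%s\"\n", msgs[i]);
        }
        return 0;
}
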
@@ -513,13 +630,9 @@ static void call_console_drivers(unsigned start, unsigned end)
513 cur_index = start; 630 cur_index = start;
514 start_print = start; 631 start_print = start;
515 while (cur_index != end) { 632 while (cur_index != end) {
516 if (msg_level < 0 && ((end - cur_index) > 2) && 633 if (msg_level < 0 && ((end - cur_index) > 2)) {
517 LOG_BUF(cur_index + 0) == '<' && 634 /* strip log prefix */
518 LOG_BUF(cur_index + 1) >= '0' && 635 cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
519 LOG_BUF(cur_index + 1) <= '7' &&
520 LOG_BUF(cur_index + 2) == '>') {
521 msg_level = LOG_BUF(cur_index + 1) - '0';
522 cur_index += 3;
523 start_print = cur_index; 636 start_print = cur_index;
524 } 637 }
525 while (cur_index != end) { 638 while (cur_index != end) {
@@ -717,6 +830,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
717 unsigned long flags; 830 unsigned long flags;
718 int this_cpu; 831 int this_cpu;
719 char *p; 832 char *p;
833 size_t plen;
834 char special;
720 835
721 boot_delay_msec(); 836 boot_delay_msec();
722 printk_delay(); 837 printk_delay();
@@ -757,45 +872,52 @@ asmlinkage int vprintk(const char *fmt, va_list args)
757 printed_len += vscnprintf(printk_buf + printed_len, 872 printed_len += vscnprintf(printk_buf + printed_len,
758 sizeof(printk_buf) - printed_len, fmt, args); 873 sizeof(printk_buf) - printed_len, fmt, args);
759 874
760
761 p = printk_buf; 875 p = printk_buf;
762 876
763 /* Do we have a loglevel in the string? */ 877 /* Read log level and handle special printk prefix */
764 if (p[0] == '<') { 878 plen = log_prefix(p, &current_log_level, &special);
765 unsigned char c = p[1]; 879 if (plen) {
766 if (c && p[2] == '>') { 880 p += plen;
767 switch (c) { 881
768 case '0' ... '7': /* loglevel */ 882 switch (special) {
769 current_log_level = c - '0'; 883 case 'c': /* Strip <c> KERN_CONT, continue line */
770 /* Fallthrough - make sure we're on a new line */ 884 plen = 0;
771 case 'd': /* KERN_DEFAULT */ 885 break;
772 if (!new_text_line) { 886 case 'd': /* Strip <d> KERN_DEFAULT, start new line */
773 emit_log_char('\n'); 887 plen = 0;
774 new_text_line = 1; 888 default:
775 } 889 if (!new_text_line) {
776 /* Fallthrough - skip the loglevel */ 890 emit_log_char('\n');
777 case 'c': /* KERN_CONT */ 891 new_text_line = 1;
778 p += 3;
779 break;
780 } 892 }
781 } 893 }
782 } 894 }
783 895
784 /* 896 /*
785 * Copy the output into log_buf. If the caller didn't provide 897 * Copy the output into log_buf. If the caller didn't provide
786 * appropriate log level tags, we insert them here 898 * the appropriate log prefix, we insert them here
787 */ 899 */
788 for ( ; *p; p++) { 900 for (; *p; p++) {
789 if (new_text_line) { 901 if (new_text_line) {
790 /* Always output the token */
791 emit_log_char('<');
792 emit_log_char(current_log_level + '0');
793 emit_log_char('>');
794 printed_len += 3;
795 new_text_line = 0; 902 new_text_line = 0;
796 903
904 if (plen) {
905 /* Copy original log prefix */
906 int i;
907
908 for (i = 0; i < plen; i++)
909 emit_log_char(printk_buf[i]);
910 printed_len += plen;
911 } else {
912 /* Add log prefix */
913 emit_log_char('<');
914 emit_log_char(current_log_level + '0');
915 emit_log_char('>');
916 printed_len += 3;
917 }
918
797 if (printk_time) { 919 if (printk_time) {
798 /* Follow the token with the time */ 920 /* Add the current time stamp */
799 char tbuf[50], *tp; 921 char tbuf[50], *tp;
800 unsigned tlen; 922 unsigned tlen;
801 unsigned long long t; 923 unsigned long long t;
@@ -1144,6 +1266,11 @@ void console_unlock(void)
1144 local_irq_restore(flags); 1266 local_irq_restore(flags);
1145 } 1267 }
1146 console_locked = 0; 1268 console_locked = 0;
1269
1270 /* Release the exclusive_console once it is used */
1271 if (unlikely(exclusive_console))
1272 exclusive_console = NULL;
1273
1147 up(&console_sem); 1274 up(&console_sem);
1148 spin_unlock_irqrestore(&logbuf_lock, flags); 1275 spin_unlock_irqrestore(&logbuf_lock, flags);
1149 if (wake_klogd) 1276 if (wake_klogd)
@@ -1230,6 +1357,18 @@ void console_start(struct console *console)
1230} 1357}
1231EXPORT_SYMBOL(console_start); 1358EXPORT_SYMBOL(console_start);
1232 1359
1360static int __read_mostly keep_bootcon;
1361
1362static int __init keep_bootcon_setup(char *str)
1363{
1364 keep_bootcon = 1;
1365 printk(KERN_INFO "debug: skip boot console de-registration.\n");
1366
1367 return 0;
1368}
1369
1370early_param("keep_bootcon", keep_bootcon_setup);
1371
1233/* 1372/*
1234 * The console driver calls this routine during kernel initialization 1373 * The console driver calls this routine during kernel initialization
1235 * to register the console printing procedure with printk() and to 1374 * to register the console printing procedure with printk() and to
@@ -1366,6 +1505,12 @@ void register_console(struct console *newcon)
1366 spin_lock_irqsave(&logbuf_lock, flags); 1505 spin_lock_irqsave(&logbuf_lock, flags);
1367 con_start = log_start; 1506 con_start = log_start;
1368 spin_unlock_irqrestore(&logbuf_lock, flags); 1507 spin_unlock_irqrestore(&logbuf_lock, flags);
1508 /*
1509 * We're about to replay the log buffer. Only do this to the
1510 * just-registered console to avoid excessive message spam to
1511 * the already-registered consoles.
1512 */
1513 exclusive_console = newcon;
1369 } 1514 }
1370 console_unlock(); 1515 console_unlock();
1371 console_sysfs_notify(); 1516 console_sysfs_notify();
@@ -1377,7 +1522,9 @@ void register_console(struct console *newcon)
1377 * users know there might be something in the kernel's log buffer that 1522 * users know there might be something in the kernel's log buffer that
1378 * went to the bootconsole (that they do not see on the real console) 1523 * went to the bootconsole (that they do not see on the real console)
1379 */ 1524 */
1380 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { 1525 if (bcon &&
1526 ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
1527 !keep_bootcon) {
1381 /* we need to iterate through twice, to make sure we print 1528 /* we need to iterate through twice, to make sure we print
1382 * everything out, before we unregister the console(s) 1529 * everything out, before we unregister the console(s)
1383 */ 1530 */
diff --git a/kernel/profile.c b/kernel/profile.c
index 66f841b7fbd3..961b389fe52f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -126,11 +126,9 @@ int __ref profile_init(void)
126 if (prof_buffer) 126 if (prof_buffer)
127 return 0; 127 return 0;
128 128
129 prof_buffer = vmalloc(buffer_bytes); 129 prof_buffer = vzalloc(buffer_bytes);
130 if (prof_buffer) { 130 if (prof_buffer)
131 memset(prof_buffer, 0, buffer_bytes);
132 return 0; 131 return 0;
133 }
134 132
135 free_cpumask_var(prof_cpu_mask); 133 free_cpumask_var(prof_cpu_mask);
136 return -ENOMEM; 134 return -ENOMEM;
@@ -305,14 +303,12 @@ static void profile_discard_flip_buffers(void)
305 mutex_unlock(&profile_flip_mutex); 303 mutex_unlock(&profile_flip_mutex);
306} 304}
307 305
308void profile_hits(int type, void *__pc, unsigned int nr_hits) 306static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
309{ 307{
310 unsigned long primary, secondary, flags, pc = (unsigned long)__pc; 308 unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
311 int i, j, cpu; 309 int i, j, cpu;
312 struct profile_hit *hits; 310 struct profile_hit *hits;
313 311
314 if (prof_on != type || !prof_buffer)
315 return;
316 pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); 312 pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
317 i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; 313 i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
318 secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; 314 secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
@@ -419,16 +415,20 @@ out_free:
419#define profile_discard_flip_buffers() do { } while (0) 415#define profile_discard_flip_buffers() do { } while (0)
420#define profile_cpu_callback NULL 416#define profile_cpu_callback NULL
421 417
422void profile_hits(int type, void *__pc, unsigned int nr_hits) 418static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
423{ 419{
424 unsigned long pc; 420 unsigned long pc;
425
426 if (prof_on != type || !prof_buffer)
427 return;
428 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; 421 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
429 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); 422 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
430} 423}
431#endif /* !CONFIG_SMP */ 424#endif /* !CONFIG_SMP */
425
426void profile_hits(int type, void *__pc, unsigned int nr_hits)
427{
428 if (prof_on != type || !prof_buffer)
429 return;
430 do_profile_hits(type, __pc, nr_hits);
431}
432EXPORT_SYMBOL_GPL(profile_hits); 432EXPORT_SYMBOL_GPL(profile_hits);
433 433
434void profile_tick(int type) 434void profile_tick(int type)
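
The profile.c change hoists the shared "prof_on != type || !prof_buffer" guard out of the SMP and UP variants into the single exported profile_hits() wrapper, leaving each do_profile_hits() with only its per-configuration fast path, and it replaces the open-coded vmalloc()+memset() with vzalloc(). The guard-hoisting shape, with illustrative names rather than the kernel's:

/*
 * Hoisting a duplicated guard into one wrapper; record_hit(),
 * do_record_hit() and hit_buffer are illustrative names only.
 */
static int prof_type_on;                        /* active profiling type */
static unsigned long *hit_buffer;               /* NULL until allocated */

static void do_record_hit(unsigned long slot, unsigned int nr_hits)
{
        hit_buffer[slot] += nr_hits;            /* fast path: guard already ran */
}

void record_hit(int type, unsigned long slot, unsigned int nr_hits)
{
        if (prof_type_on != type || !hit_buffer)        /* the one shared guard */
                return;
        do_record_hit(slot, nr_hits);
}
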
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 99bbaa3e5b0d..2df115790cd9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
22#include <linux/syscalls.h> 22#include <linux/syscalls.h>
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/regset.h> 24#include <linux/regset.h>
25#include <linux/hw_breakpoint.h>
25 26
26 27
27/* 28/*
@@ -37,35 +38,33 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
37 child->parent = new_parent; 38 child->parent = new_parent;
38} 39}
39 40
40/* 41/**
41 * Turn a tracing stop into a normal stop now, since with no tracer there 42 * __ptrace_unlink - unlink ptracee and restore its execution state
42 * would be no way to wake it up with SIGCONT or SIGKILL. If there was a 43 * @child: ptracee to be unlinked
43 * signal sent that would resume the child, but didn't because it was in
44 * TASK_TRACED, resume it now.
45 * Requires that irqs be disabled.
46 */
47static void ptrace_untrace(struct task_struct *child)
48{
49 spin_lock(&child->sighand->siglock);
50 if (task_is_traced(child)) {
51 /*
52 * If the group stop is completed or in progress,
53 * this thread was already counted as stopped.
54 */
55 if (child->signal->flags & SIGNAL_STOP_STOPPED ||
56 child->signal->group_stop_count)
57 __set_task_state(child, TASK_STOPPED);
58 else
59 signal_wake_up(child, 1);
60 }
61 spin_unlock(&child->sighand->siglock);
62}
63
64/*
65 * unptrace a task: move it back to its original parent and
66 * remove it from the ptrace list.
67 * 44 *
68 * Must be called with the tasklist lock write-held. 45 * Remove @child from the ptrace list, move it back to the original parent,
46 * and restore the execution state so that it conforms to the group stop
47 * state.
48 *
49 * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer
50 * exiting. For PTRACE_DETACH, unless the ptracee has been killed between
51 * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED.
52 * If the ptracer is exiting, the ptracee can be in any state.
53 *
54 * After detach, the ptracee should be in a state which conforms to the
55 * group stop. If the group is stopped or in the process of stopping, the
56 * ptracee should be put into TASK_STOPPED; otherwise, it should be woken
57 * up from TASK_TRACED.
58 *
59 * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED,
60 * it goes through TRACED -> RUNNING -> STOPPED transition which is similar
61 * to but in the opposite direction of what happens while attaching to a
62 * stopped task. However, in this direction, the intermediate RUNNING
63 * state is not hidden even from the current ptracer and if it immediately
64 * re-attaches and performs a WNOHANG wait(2), it may fail.
65 *
66 * CONTEXT:
67 * write_lock_irq(tasklist_lock)
69 */ 68 */
70void __ptrace_unlink(struct task_struct *child) 69void __ptrace_unlink(struct task_struct *child)
71{ 70{
@@ -75,8 +74,27 @@ void __ptrace_unlink(struct task_struct *child)
75 child->parent = child->real_parent; 74 child->parent = child->real_parent;
76 list_del_init(&child->ptrace_entry); 75 list_del_init(&child->ptrace_entry);
77 76
78 if (task_is_traced(child)) 77 spin_lock(&child->sighand->siglock);
79 ptrace_untrace(child); 78
79 /*
80 * Reinstate GROUP_STOP_PENDING if group stop is in effect and
81 * @child isn't dead.
82 */
83 if (!(child->flags & PF_EXITING) &&
84 (child->signal->flags & SIGNAL_STOP_STOPPED ||
85 child->signal->group_stop_count))
86 child->group_stop |= GROUP_STOP_PENDING;
87
88 /*
89 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
90 * @child in the butt. Note that @resume should be used iff @child
91 * is in TASK_TRACED; otherwise, we might unduly disrupt
92 * TASK_KILLABLE sleeps.
93 */
94 if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child))
95 signal_wake_up(child, task_is_traced(child));
96
97 spin_unlock(&child->sighand->siglock);
80} 98}
81 99
82/* 100/*
@@ -95,16 +113,14 @@ int ptrace_check_attach(struct task_struct *child, int kill)
95 */ 113 */
96 read_lock(&tasklist_lock); 114 read_lock(&tasklist_lock);
97 if ((child->ptrace & PT_PTRACED) && child->parent == current) { 115 if ((child->ptrace & PT_PTRACED) && child->parent == current) {
98 ret = 0;
99 /* 116 /*
100 * child->sighand can't be NULL, release_task() 117 * child->sighand can't be NULL, release_task()
101 * does ptrace_unlink() before __exit_signal(). 118 * does ptrace_unlink() before __exit_signal().
102 */ 119 */
103 spin_lock_irq(&child->sighand->siglock); 120 spin_lock_irq(&child->sighand->siglock);
104 if (task_is_stopped(child)) 121 WARN_ON_ONCE(task_is_stopped(child));
105 child->state = TASK_TRACED; 122 if (task_is_traced(child) || kill)
106 else if (!task_is_traced(child) && !kill) 123 ret = 0;
107 ret = -ESRCH;
108 spin_unlock_irq(&child->sighand->siglock); 124 spin_unlock_irq(&child->sighand->siglock);
109 } 125 }
110 read_unlock(&tasklist_lock); 126 read_unlock(&tasklist_lock);
@@ -134,21 +150,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
134 return 0; 150 return 0;
135 rcu_read_lock(); 151 rcu_read_lock();
136 tcred = __task_cred(task); 152 tcred = __task_cred(task);
137 if ((cred->uid != tcred->euid || 153 if (cred->user->user_ns == tcred->user->user_ns &&
138 cred->uid != tcred->suid || 154 (cred->uid == tcred->euid &&
139 cred->uid != tcred->uid || 155 cred->uid == tcred->suid &&
140 cred->gid != tcred->egid || 156 cred->uid == tcred->uid &&
141 cred->gid != tcred->sgid || 157 cred->gid == tcred->egid &&
142 cred->gid != tcred->gid) && 158 cred->gid == tcred->sgid &&
143 !capable(CAP_SYS_PTRACE)) { 159 cred->gid == tcred->gid))
144 rcu_read_unlock(); 160 goto ok;
145 return -EPERM; 161 if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
146 } 162 goto ok;
163 rcu_read_unlock();
164 return -EPERM;
165ok:
147 rcu_read_unlock(); 166 rcu_read_unlock();
148 smp_rmb(); 167 smp_rmb();
149 if (task->mm) 168 if (task->mm)
150 dumpable = get_dumpable(task->mm); 169 dumpable = get_dumpable(task->mm);
151 if (!dumpable && !capable(CAP_SYS_PTRACE)) 170 if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
152 return -EPERM; 171 return -EPERM;
153 172
154 return security_ptrace_access_check(task, mode); 173 return security_ptrace_access_check(task, mode);
@@ -163,8 +182,9 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
163 return !err; 182 return !err;
164} 183}
165 184
166int ptrace_attach(struct task_struct *task) 185static int ptrace_attach(struct task_struct *task)
167{ 186{
187 bool wait_trap = false;
168 int retval; 188 int retval;
169 189
170 audit_ptrace(task); 190 audit_ptrace(task);
@@ -198,18 +218,48 @@ int ptrace_attach(struct task_struct *task)
198 goto unlock_tasklist; 218 goto unlock_tasklist;
199 219
200 task->ptrace = PT_PTRACED; 220 task->ptrace = PT_PTRACED;
201 if (capable(CAP_SYS_PTRACE)) 221 if (task_ns_capable(task, CAP_SYS_PTRACE))
202 task->ptrace |= PT_PTRACE_CAP; 222 task->ptrace |= PT_PTRACE_CAP;
203 223
204 __ptrace_link(task, current); 224 __ptrace_link(task, current);
205 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); 225 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
206 226
227 spin_lock(&task->sighand->siglock);
228
229 /*
230 * If the task is already STOPPED, set GROUP_STOP_PENDING and
231 * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
232 * will be cleared if the child completes the transition or any
233 * event which clears the group stop states happens. We'll wait
234 * for the transition to complete before returning from this
235 * function.
236 *
237 * This hides STOPPED -> RUNNING -> TRACED transition from the
238 * attaching thread but a different thread in the same group can
239 * still observe the transient RUNNING state. IOW, if another
240 * thread's WNOHANG wait(2) on the stopped tracee races against
241 * ATTACH, the wait(2) may fail due to the transient RUNNING.
242 *
243 * The following task_is_stopped() test is safe as both transitions
244 * in and out of STOPPED are protected by siglock.
245 */
246 if (task_is_stopped(task)) {
247 task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING;
248 signal_wake_up(task, 1);
249 wait_trap = true;
250 }
251
252 spin_unlock(&task->sighand->siglock);
253
207 retval = 0; 254 retval = 0;
208unlock_tasklist: 255unlock_tasklist:
209 write_unlock_irq(&tasklist_lock); 256 write_unlock_irq(&tasklist_lock);
210unlock_creds: 257unlock_creds:
211 mutex_unlock(&task->signal->cred_guard_mutex); 258 mutex_unlock(&task->signal->cred_guard_mutex);
212out: 259out:
260 if (wait_trap)
261 wait_event(current->signal->wait_chldexit,
262 !(task->group_stop & GROUP_STOP_TRAPPING));
213 return retval; 263 return retval;
214} 264}
215 265
@@ -219,7 +269,7 @@ out:
219 * Performs checks and sets PT_PTRACED. 269 * Performs checks and sets PT_PTRACED.
220 * Should be used by all ptrace implementations for PTRACE_TRACEME. 270 * Should be used by all ptrace implementations for PTRACE_TRACEME.
221 */ 271 */
222int ptrace_traceme(void) 272static int ptrace_traceme(void)
223{ 273{
224 int ret = -EPERM; 274 int ret = -EPERM;
225 275
@@ -293,7 +343,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
293 return false; 343 return false;
294} 344}
295 345
296int ptrace_detach(struct task_struct *child, unsigned int data) 346static int ptrace_detach(struct task_struct *child, unsigned int data)
297{ 347{
298 bool dead = false; 348 bool dead = false;
299 349
@@ -312,8 +362,6 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
312 if (child->ptrace) { 362 if (child->ptrace) {
313 child->exit_code = data; 363 child->exit_code = data;
314 dead = __ptrace_detach(current, child); 364 dead = __ptrace_detach(current, child);
315 if (!child->exit_state)
316 wake_up_process(child);
317 } 365 }
318 write_unlock_irq(&tasklist_lock); 366 write_unlock_irq(&tasklist_lock);
319 367
@@ -514,7 +562,7 @@ static int ptrace_resume(struct task_struct *child, long request,
514 } 562 }
515 563
516 child->exit_code = data; 564 child->exit_code = data;
517 wake_up_process(child); 565 wake_up_state(child, __TASK_TRACED);
518 566
519 return 0; 567 return 0;
520} 568}
@@ -876,3 +924,19 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
876 return ret; 924 return ret;
877} 925}
878#endif /* CONFIG_COMPAT */ 926#endif /* CONFIG_COMPAT */
927
928#ifdef CONFIG_HAVE_HW_BREAKPOINT
929int ptrace_get_breakpoints(struct task_struct *tsk)
930{
931 if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt))
932 return 0;
933
934 return -1;
935}
936
937void ptrace_put_breakpoints(struct task_struct *tsk)
938{
939 if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt))
940 flush_ptrace_hw_breakpoint(tsk);
941}
942#endif /* CONFIG_HAVE_HW_BREAKPOINT */
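
The new ptrace_get_breakpoints()/ptrace_put_breakpoints() pair is the usual get/put refcount idiom: atomic_inc_not_zero() refuses a reference once the count has already reached zero, and atomic_dec_and_test() lets exactly one caller run the teardown (flush_ptrace_hw_breakpoint() here). A generic sketch of that idiom, with illustrative names:

/*
 * Get/put refcount sketch; struct obj and obj_release() are illustrative,
 * not kernel symbols.
 */
#include <linux/atomic.h>               /* <asm/atomic.h> on older trees */

struct obj {
        atomic_t refcnt;                /* initial reference held by the owner */
};

static void obj_release(struct obj *o); /* final cleanup, runs exactly once */

static int obj_get(struct obj *o)
{
        /* Refuse new references once the count has already hit zero. */
        return atomic_inc_not_zero(&o->refcnt) ? 0 : -1;
}

static void obj_put(struct obj *o)
{
        if (atomic_dec_and_test(&o->refcnt))
                obj_release(o);
}
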
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a23a57a976d1..7784bd216b6a 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -142,10 +142,17 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
142 * Ensure that queued callbacks are all executed. 142 * Ensure that queued callbacks are all executed.
143 * If we detect that we are nested in a RCU read-side critical 143 * If we detect that we are nested in a RCU read-side critical
144 * section, we should simply fail, otherwise we would deadlock. 144 * section, we should simply fail, otherwise we would deadlock.
145 * In !PREEMPT configurations, there is no way to tell if we are
146 * in a RCU read-side critical section or not, so we never
147 * attempt any fixup and just print a warning.
145 */ 148 */
149#ifndef CONFIG_PREEMPT
150 WARN_ON_ONCE(1);
151 return 0;
152#endif
146 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 153 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
147 irqs_disabled()) { 154 irqs_disabled()) {
148 WARN_ON(1); 155 WARN_ON_ONCE(1);
149 return 0; 156 return 0;
150 } 157 }
151 rcu_barrier(); 158 rcu_barrier();
@@ -184,10 +191,17 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
184 * Ensure that queued callbacks are all executed. 191 * Ensure that queued callbacks are all executed.
185 * If we detect that we are nested in a RCU read-side critical 192 * If we detect that we are nested in a RCU read-side critical
186 * section, we should simply fail, otherwise we would deadlock. 193 * section, we should simply fail, otherwise we would deadlock.
194 * In !PREEMPT configurations, there is no way to tell if we are
195 * in a RCU read-side critical section or not, so we never
196 * attempt any fixup and just print a warning.
187 */ 197 */
198#ifndef CONFIG_PREEMPT
199 WARN_ON_ONCE(1);
200 return 0;
201#endif
188 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 202 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
189 irqs_disabled()) { 203 irqs_disabled()) {
190 WARN_ON(1); 204 WARN_ON_ONCE(1);
191 return 0; 205 return 0;
192 } 206 }
193 rcu_barrier(); 207 rcu_barrier();
@@ -214,14 +228,17 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
214 * Ensure that queued callbacks are all executed. 228 * Ensure that queued callbacks are all executed.
215 * If we detect that we are nested in a RCU read-side critical 229 * If we detect that we are nested in a RCU read-side critical
216 * section, we should simply fail, otherwise we would deadlock. 230 * section, we should simply fail, otherwise we would deadlock.
231 * In !PREEMPT configurations, there is no way to tell if we are
232 * in a RCU read-side critical section or not, so we never
233 * attempt any fixup and just print a warning.
217 */ 234 */
218#ifndef CONFIG_PREEMPT 235#ifndef CONFIG_PREEMPT
219 WARN_ON(1); 236 WARN_ON_ONCE(1);
220 return 0; 237 return 0;
221#else 238#endif
222 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 239 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
223 irqs_disabled()) { 240 irqs_disabled()) {
224 WARN_ON(1); 241 WARN_ON_ONCE(1);
225 return 0; 242 return 0;
226 } 243 }
227 rcu_barrier(); 244 rcu_barrier();
@@ -229,7 +246,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
229 rcu_barrier_bh(); 246 rcu_barrier_bh();
230 debug_object_free(head, &rcuhead_debug_descr); 247 debug_object_free(head, &rcuhead_debug_descr);
231 return 1; 248 return 1;
232#endif
233 default: 249 default:
234 return 0; 250 return 0;
235 } 251 }
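
After this change all three rcuhead fixup handlers share one shape: in !PREEMPT builds a read-side critical section is undetectable, so the handler only warns and declines the fixup; otherwise it checks that the context can block and then drains callbacks with rcu_barrier(). Schematically, with do_blocking_fixup() as an illustrative stand-in:

/*
 * Common shape of the fixup handlers above; do_blocking_fixup() stands in
 * for the rcu_barrier()/debug_object_*() work and is not a kernel symbol.
 */
static void do_blocking_fixup(void);

static int fixup(void)
{
#ifndef CONFIG_PREEMPT
        WARN_ON_ONCE(1);        /* read-side nesting undetectable: never fix up */
        return 0;
#endif
        if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
            irqs_disabled()) {
                WARN_ON_ONCE(1);        /* unsafe context: decline the fixup */
                return 0;
        }
        do_blocking_fixup();
        return 1;                       /* report the fixup as done */
}
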
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 0c343b9a46d5..7bbac7d0f5ab 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -35,15 +35,16 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/time.h> 36#include <linux/time.h>
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38#include <linux/prefetch.h>
38 39
39/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ 40/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
40static struct task_struct *rcu_kthread_task; 41static struct task_struct *rcu_kthread_task;
41static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); 42static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
42static unsigned long have_rcu_kthread_work; 43static unsigned long have_rcu_kthread_work;
43static void invoke_rcu_kthread(void);
44 44
45/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 46struct rcu_ctrlblk;
47static void invoke_rcu_kthread(void);
47static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); 48static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static int rcu_kthread(void *arg); 49static int rcu_kthread(void *arg);
49static void __call_rcu(struct rcu_head *head, 50static void __call_rcu(struct rcu_head *head,
@@ -79,36 +80,45 @@ void rcu_exit_nohz(void)
79#endif /* #ifdef CONFIG_NO_HZ */ 80#endif /* #ifdef CONFIG_NO_HZ */
80 81
81/* 82/*
82 * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). 83 * Helper function for rcu_sched_qs() and rcu_bh_qs().
83 * Also disable irqs to avoid confusion due to interrupt handlers 84 * Also irqs are disabled to avoid confusion due to interrupt handlers
84 * invoking call_rcu(). 85 * invoking call_rcu().
85 */ 86 */
86static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 87static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
87{ 88{
88 unsigned long flags;
89
90 local_irq_save(flags);
91 if (rcp->rcucblist != NULL && 89 if (rcp->rcucblist != NULL &&
92 rcp->donetail != rcp->curtail) { 90 rcp->donetail != rcp->curtail) {
93 rcp->donetail = rcp->curtail; 91 rcp->donetail = rcp->curtail;
94 local_irq_restore(flags);
95 return 1; 92 return 1;
96 } 93 }
97 local_irq_restore(flags);
98 94
99 return 0; 95 return 0;
100} 96}
101 97
102/* 98/*
99 * Wake up rcu_kthread() to process callbacks now eligible for invocation
100 * or to boost readers.
101 */
102static void invoke_rcu_kthread(void)
103{
104 have_rcu_kthread_work = 1;
105 wake_up(&rcu_kthread_wq);
106}
107
108/*
103 * Record an rcu quiescent state. And an rcu_bh quiescent state while we 109 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
104 * are at it, given that any rcu quiescent state is also an rcu_bh 110 * are at it, given that any rcu quiescent state is also an rcu_bh
105 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 111 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
106 */ 112 */
107void rcu_sched_qs(int cpu) 113void rcu_sched_qs(int cpu)
108{ 114{
115 unsigned long flags;
116
117 local_irq_save(flags);
109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 118 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
110 rcu_qsctr_help(&rcu_bh_ctrlblk)) 119 rcu_qsctr_help(&rcu_bh_ctrlblk))
111 invoke_rcu_kthread(); 120 invoke_rcu_kthread();
121 local_irq_restore(flags);
112} 122}
113 123
114/* 124/*
@@ -116,8 +126,12 @@ void rcu_sched_qs(int cpu)
116 */ 126 */
117void rcu_bh_qs(int cpu) 127void rcu_bh_qs(int cpu)
118{ 128{
129 unsigned long flags;
130
131 local_irq_save(flags);
119 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 132 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
120 invoke_rcu_kthread(); 133 invoke_rcu_kthread();
134 local_irq_restore(flags);
121} 135}
122 136
123/* 137/*
@@ -167,7 +181,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
167 prefetch(next); 181 prefetch(next);
168 debug_rcu_head_unqueue(list); 182 debug_rcu_head_unqueue(list);
169 local_bh_disable(); 183 local_bh_disable();
170 list->func(list); 184 __rcu_reclaim(list);
171 local_bh_enable(); 185 local_bh_enable();
172 list = next; 186 list = next;
173 RCU_TRACE(cb_count++); 187 RCU_TRACE(cb_count++);
@@ -208,20 +222,6 @@ static int rcu_kthread(void *arg)
208} 222}
209 223
210/* 224/*
211 * Wake up rcu_kthread() to process callbacks now eligible for invocation
212 * or to boost readers.
213 */
214static void invoke_rcu_kthread(void)
215{
216 unsigned long flags;
217
218 local_irq_save(flags);
219 have_rcu_kthread_work = 1;
220 wake_up(&rcu_kthread_wq);
221 local_irq_restore(flags);
222}
223
224/*
225 * Wait for a grace period to elapse. But it is illegal to invoke 225 * Wait for a grace period to elapse. But it is illegal to invoke
226 * synchronize_sched() from within an RCU read-side critical section. 226 * synchronize_sched() from within an RCU read-side critical section.
227 * Therefore, any legal call to synchronize_sched() is a quiescent 227 * Therefore, any legal call to synchronize_sched() is a quiescent
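
The rcutiny.c change moves interrupt disabling out of rcu_qsctr_help() and into rcu_sched_qs()/rcu_bh_qs(), so one local_irq_save()/local_irq_restore() pair now covers both helper calls, and invoke_rcu_kthread() likewise relies on its callers for irq protection. A rough sketch of that hoisting, with illustrative names (ctrlblk, qs_help, wake_worker):

/*
 * Hoisting irq protection from a helper into its caller; all names here
 * are illustrative, not the kernel's.
 */
#include <linux/irqflags.h>

struct ctrlblk {
        void *donetail;
        void *curtail;
};

static struct ctrlblk sched_cb, bh_cb;
static void wake_worker(void);          /* stand-in for invoke_rcu_kthread() */

static int qs_help(struct ctrlblk *cb)
{
        /* Caller guarantees irqs are off; no local_irq_save() here. */
        if (cb->donetail != cb->curtail) {
                cb->donetail = cb->curtail;
                return 1;
        }
        return 0;
}

void qs_record(void)
{
        unsigned long flags;

        local_irq_save(flags);          /* one irq-off section ... */
        if (qs_help(&sched_cb) + qs_help(&bh_cb))  /* "+" defeats short-circuit */
                wake_worker();
        local_irq_restore(flags);
}
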
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 015abaea962a..f259c676195f 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -100,23 +100,28 @@ struct rcu_preempt_ctrlblk {
100 u8 completed; /* Last grace period completed. */ 100 u8 completed; /* Last grace period completed. */
101 /* If all three are equal, RCU is idle. */ 101 /* If all three are equal, RCU is idle. */
102#ifdef CONFIG_RCU_BOOST 102#ifdef CONFIG_RCU_BOOST
103 s8 boosted_this_gp; /* Has boosting already happened? */
104 unsigned long boost_time; /* When to start boosting (jiffies) */ 103 unsigned long boost_time; /* When to start boosting (jiffies) */
105#endif /* #ifdef CONFIG_RCU_BOOST */ 104#endif /* #ifdef CONFIG_RCU_BOOST */
106#ifdef CONFIG_RCU_TRACE 105#ifdef CONFIG_RCU_TRACE
107 unsigned long n_grace_periods; 106 unsigned long n_grace_periods;
108#ifdef CONFIG_RCU_BOOST 107#ifdef CONFIG_RCU_BOOST
109 unsigned long n_tasks_boosted; 108 unsigned long n_tasks_boosted;
109 /* Total number of tasks boosted. */
110 unsigned long n_exp_boosts; 110 unsigned long n_exp_boosts;
111 /* Number of tasks boosted for expedited GP. */
111 unsigned long n_normal_boosts; 112 unsigned long n_normal_boosts;
112 unsigned long n_normal_balk_blkd_tasks; 113 /* Number of tasks boosted for normal GP. */
113 unsigned long n_normal_balk_gp_tasks; 114 unsigned long n_balk_blkd_tasks;
114 unsigned long n_normal_balk_boost_tasks; 115 /* Refused to boost: no blocked tasks. */
115 unsigned long n_normal_balk_boosted; 116 unsigned long n_balk_exp_gp_tasks;
116 unsigned long n_normal_balk_notyet; 117 /* Refused to boost: nothing blocking GP. */
117 unsigned long n_normal_balk_nos; 118 unsigned long n_balk_boost_tasks;
118 unsigned long n_exp_balk_blkd_tasks; 119 /* Refused to boost: already boosting. */
119 unsigned long n_exp_balk_nos; 120 unsigned long n_balk_notyet;
121 /* Refused to boost: not yet time. */
122 unsigned long n_balk_nos;
123 /* Refused to boost: not sure why, though. */
124 /* This can happen due to race conditions. */
120#endif /* #ifdef CONFIG_RCU_BOOST */ 125#endif /* #ifdef CONFIG_RCU_BOOST */
121#endif /* #ifdef CONFIG_RCU_TRACE */ 126#endif /* #ifdef CONFIG_RCU_TRACE */
122}; 127};
@@ -201,7 +206,6 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t)
201 206
202#ifdef CONFIG_RCU_BOOST 207#ifdef CONFIG_RCU_BOOST
203static void rcu_initiate_boost_trace(void); 208static void rcu_initiate_boost_trace(void);
204static void rcu_initiate_exp_boost_trace(void);
205#endif /* #ifdef CONFIG_RCU_BOOST */ 209#endif /* #ifdef CONFIG_RCU_BOOST */
206 210
207/* 211/*
@@ -219,41 +223,21 @@ static void show_tiny_preempt_stats(struct seq_file *m)
219 "N."[!rcu_preempt_ctrlblk.gp_tasks], 223 "N."[!rcu_preempt_ctrlblk.gp_tasks],
220 "E."[!rcu_preempt_ctrlblk.exp_tasks]); 224 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
221#ifdef CONFIG_RCU_BOOST 225#ifdef CONFIG_RCU_BOOST
222 seq_printf(m, " ttb=%c btg=", 226 seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
223 "B."[!rcu_preempt_ctrlblk.boost_tasks]); 227 " ",
224 switch (rcu_preempt_ctrlblk.boosted_this_gp) { 228 "B."[!rcu_preempt_ctrlblk.boost_tasks],
225 case -1:
226 seq_puts(m, "exp");
227 break;
228 case 0:
229 seq_puts(m, "no");
230 break;
231 case 1:
232 seq_puts(m, "begun");
233 break;
234 case 2:
235 seq_puts(m, "done");
236 break;
237 default:
238 seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
239 }
240 seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
241 rcu_preempt_ctrlblk.n_tasks_boosted, 229 rcu_preempt_ctrlblk.n_tasks_boosted,
242 rcu_preempt_ctrlblk.n_exp_boosts, 230 rcu_preempt_ctrlblk.n_exp_boosts,
243 rcu_preempt_ctrlblk.n_normal_boosts, 231 rcu_preempt_ctrlblk.n_normal_boosts,
244 (int)(jiffies & 0xffff), 232 (int)(jiffies & 0xffff),
245 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); 233 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
246 seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", 234 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
247 "normal balk", 235 " balk",
248 rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, 236 rcu_preempt_ctrlblk.n_balk_blkd_tasks,
249 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, 237 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
250 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, 238 rcu_preempt_ctrlblk.n_balk_boost_tasks,
251 rcu_preempt_ctrlblk.n_normal_balk_boosted, 239 rcu_preempt_ctrlblk.n_balk_notyet,
252 rcu_preempt_ctrlblk.n_normal_balk_notyet, 240 rcu_preempt_ctrlblk.n_balk_nos);
253 rcu_preempt_ctrlblk.n_normal_balk_nos);
254 seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
255 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
256 rcu_preempt_ctrlblk.n_exp_balk_nos);
257#endif /* #ifdef CONFIG_RCU_BOOST */ 241#endif /* #ifdef CONFIG_RCU_BOOST */
258} 242}
259 243
@@ -271,25 +255,59 @@ static int rcu_boost(void)
271{ 255{
272 unsigned long flags; 256 unsigned long flags;
273 struct rt_mutex mtx; 257 struct rt_mutex mtx;
274 struct list_head *np;
275 struct task_struct *t; 258 struct task_struct *t;
259 struct list_head *tb;
276 260
277 if (rcu_preempt_ctrlblk.boost_tasks == NULL) 261 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
262 rcu_preempt_ctrlblk.exp_tasks == NULL)
278 return 0; /* Nothing to boost. */ 263 return 0; /* Nothing to boost. */
264
279 raw_local_irq_save(flags); 265 raw_local_irq_save(flags);
280 rcu_preempt_ctrlblk.boosted_this_gp++; 266
281 t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, 267 /*
282 rcu_node_entry); 268 * Recheck with irqs disabled: all tasks in need of boosting
283 np = rcu_next_node_entry(t); 269 * might exit their RCU read-side critical sections on their own
270 * if we are preempted just before disabling irqs.
271 */
272 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
273 rcu_preempt_ctrlblk.exp_tasks == NULL) {
274 raw_local_irq_restore(flags);
275 return 0;
276 }
277
278 /*
279 * Preferentially boost tasks blocking expedited grace periods.
280 * This cannot starve the normal grace periods because a second
281 * expedited grace period must boost all blocked tasks, including
282 * those blocking the pre-existing normal grace period.
283 */
284 if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
285 tb = rcu_preempt_ctrlblk.exp_tasks;
286 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
287 } else {
288 tb = rcu_preempt_ctrlblk.boost_tasks;
289 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
290 }
291 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
292
293 /*
294 * We boost task t by manufacturing an rt_mutex that appears to
295 * be held by task t. We leave a pointer to that rt_mutex where
296 * task t can find it, and task t will release the mutex when it
297 * exits its outermost RCU read-side critical section. Then
298 * simply acquiring this artificial rt_mutex will boost task
299 * t's priority. (Thanks to tglx for suggesting this approach!)
300 */
301 t = container_of(tb, struct task_struct, rcu_node_entry);
284 rt_mutex_init_proxy_locked(&mtx, t); 302 rt_mutex_init_proxy_locked(&mtx, t);
285 t->rcu_boost_mutex = &mtx; 303 t->rcu_boost_mutex = &mtx;
286 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; 304 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
287 raw_local_irq_restore(flags); 305 raw_local_irq_restore(flags);
288 rt_mutex_lock(&mtx); 306 rt_mutex_lock(&mtx);
289 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); 307 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
290 rcu_preempt_ctrlblk.boosted_this_gp++; 308
291 rt_mutex_unlock(&mtx); 309 return rcu_preempt_ctrlblk.boost_tasks != NULL ||
292 return rcu_preempt_ctrlblk.boost_tasks != NULL; 310 rcu_preempt_ctrlblk.exp_tasks != NULL;
293} 311}
294 312
295/* 313/*
@@ -304,42 +322,25 @@ static int rcu_boost(void)
304 */ 322 */
305static int rcu_initiate_boost(void) 323static int rcu_initiate_boost(void)
306{ 324{
307 if (!rcu_preempt_blocked_readers_cgp()) { 325 if (!rcu_preempt_blocked_readers_cgp() &&
308 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); 326 rcu_preempt_ctrlblk.exp_tasks == NULL) {
327 RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
309 return 0; 328 return 0;
310 } 329 }
311 if (rcu_preempt_ctrlblk.gp_tasks != NULL && 330 if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
312 rcu_preempt_ctrlblk.boost_tasks == NULL && 331 (rcu_preempt_ctrlblk.gp_tasks != NULL &&
313 rcu_preempt_ctrlblk.boosted_this_gp == 0 && 332 rcu_preempt_ctrlblk.boost_tasks == NULL &&
314 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { 333 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
315 rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; 334 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
335 rcu_preempt_ctrlblk.boost_tasks =
336 rcu_preempt_ctrlblk.gp_tasks;
316 invoke_rcu_kthread(); 337 invoke_rcu_kthread();
317 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
318 } else 338 } else
319 RCU_TRACE(rcu_initiate_boost_trace()); 339 RCU_TRACE(rcu_initiate_boost_trace());
320 return 1; 340 return 1;
321} 341}
322 342
323/* 343#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
324 * Initiate boosting for an expedited grace period.
325 */
326static void rcu_initiate_expedited_boost(void)
327{
328 unsigned long flags;
329
330 raw_local_irq_save(flags);
331 if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
332 rcu_preempt_ctrlblk.boost_tasks =
333 rcu_preempt_ctrlblk.blkd_tasks.next;
334 rcu_preempt_ctrlblk.boosted_this_gp = -1;
335 invoke_rcu_kthread();
336 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
337 } else
338 RCU_TRACE(rcu_initiate_exp_boost_trace());
339 raw_local_irq_restore(flags);
340}
341
342#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
343 344
344/* 345/*
345 * Do priority-boost accounting for the start of a new grace period. 346 * Do priority-boost accounting for the start of a new grace period.
@@ -347,8 +348,6 @@ static void rcu_initiate_expedited_boost(void)
347static void rcu_preempt_boost_start_gp(void) 348static void rcu_preempt_boost_start_gp(void)
348{ 349{
349 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; 350 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
350 if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
351 rcu_preempt_ctrlblk.boosted_this_gp = 0;
352} 351}
353 352
354#else /* #ifdef CONFIG_RCU_BOOST */ 353#else /* #ifdef CONFIG_RCU_BOOST */
@@ -372,13 +371,6 @@ static int rcu_initiate_boost(void)
372} 371}
373 372
374/* 373/*
375 * If there is no RCU priority boosting, we don't initiate expedited boosting.
376 */
377static void rcu_initiate_expedited_boost(void)
378{
379}
380
381/*
382 * If there is no RCU priority boosting, nothing to do at grace-period start. 374 * If there is no RCU priority boosting, nothing to do at grace-period start.
383 */ 375 */
384static void rcu_preempt_boost_start_gp(void) 376static void rcu_preempt_boost_start_gp(void)
@@ -418,7 +410,7 @@ static void rcu_preempt_cpu_qs(void)
418 if (!rcu_preempt_gp_in_progress()) 410 if (!rcu_preempt_gp_in_progress())
419 return; 411 return;
420 /* 412 /*
421 * Check up on boosting. If there are no readers blocking the 413 * Check up on boosting. If there are readers blocking the
422 * current grace period, leave. 414 * current grace period, leave.
423 */ 415 */
424 if (rcu_initiate_boost()) 416 if (rcu_initiate_boost())
@@ -578,7 +570,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
578 empty = !rcu_preempt_blocked_readers_cgp(); 570 empty = !rcu_preempt_blocked_readers_cgp();
579 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; 571 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
580 np = rcu_next_node_entry(t); 572 np = rcu_next_node_entry(t);
581 list_del(&t->rcu_node_entry); 573 list_del_init(&t->rcu_node_entry);
582 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) 574 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
583 rcu_preempt_ctrlblk.gp_tasks = np; 575 rcu_preempt_ctrlblk.gp_tasks = np;
584 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) 576 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
@@ -587,7 +579,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
587 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) 579 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
588 rcu_preempt_ctrlblk.boost_tasks = np; 580 rcu_preempt_ctrlblk.boost_tasks = np;
589#endif /* #ifdef CONFIG_RCU_BOOST */ 581#endif /* #ifdef CONFIG_RCU_BOOST */
590 INIT_LIST_HEAD(&t->rcu_node_entry);
591 582
592 /* 583 /*
593 * If this was the last task on the current list, and if 584 * If this was the last task on the current list, and if
@@ -812,13 +803,16 @@ void synchronize_rcu_expedited(void)
812 rpcp->exp_tasks = rpcp->blkd_tasks.next; 803 rpcp->exp_tasks = rpcp->blkd_tasks.next;
813 if (rpcp->exp_tasks == &rpcp->blkd_tasks) 804 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
814 rpcp->exp_tasks = NULL; 805 rpcp->exp_tasks = NULL;
815 local_irq_restore(flags);
816 806
817 /* Wait for tail of ->blkd_tasks list to drain. */ 807 /* Wait for tail of ->blkd_tasks list to drain. */
818 if (rcu_preempted_readers_exp()) 808 if (!rcu_preempted_readers_exp())
819 rcu_initiate_expedited_boost(); 809 local_irq_restore(flags);
810 else {
811 rcu_initiate_boost();
812 local_irq_restore(flags);
820 wait_event(sync_rcu_preempt_exp_wq, 813 wait_event(sync_rcu_preempt_exp_wq,
821 !rcu_preempted_readers_exp()); 814 !rcu_preempted_readers_exp());
815 }
822 816
823 /* Clean up and exit. */ 817 /* Clean up and exit. */
824 barrier(); /* ensure expedited GP seen before counter increment. */ 818 barrier(); /* ensure expedited GP seen before counter increment. */
@@ -852,7 +846,7 @@ void exit_rcu(void)
852 if (t->rcu_read_lock_nesting == 0) 846 if (t->rcu_read_lock_nesting == 0)
853 return; 847 return;
854 t->rcu_read_lock_nesting = 1; 848 t->rcu_read_lock_nesting = 1;
855 rcu_read_unlock(); 849 __rcu_read_unlock();
856} 850}
857 851
858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 852#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
@@ -931,24 +925,17 @@ void __init rcu_scheduler_starting(void)
931 925
932static void rcu_initiate_boost_trace(void) 926static void rcu_initiate_boost_trace(void)
933{ 927{
934 if (rcu_preempt_ctrlblk.gp_tasks == NULL) 928 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
935 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; 929 rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
930 else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
931 rcu_preempt_ctrlblk.exp_tasks == NULL)
932 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
936 else if (rcu_preempt_ctrlblk.boost_tasks != NULL) 933 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
937 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; 934 rcu_preempt_ctrlblk.n_balk_boost_tasks++;
938 else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
939 rcu_preempt_ctrlblk.n_normal_balk_boosted++;
940 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) 935 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
941 rcu_preempt_ctrlblk.n_normal_balk_notyet++; 936 rcu_preempt_ctrlblk.n_balk_notyet++;
942 else
943 rcu_preempt_ctrlblk.n_normal_balk_nos++;
944}
945
946static void rcu_initiate_exp_boost_trace(void)
947{
948 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
949 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
950 else 937 else
951 rcu_preempt_ctrlblk.n_exp_balk_nos++; 938 rcu_preempt_ctrlblk.n_balk_nos++;
952} 939}
953 940
954#endif /* #ifdef CONFIG_RCU_BOOST */ 941#endif /* #ifdef CONFIG_RCU_BOOST */
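
The reworked rcu_boost() above leans on rt_mutex proxy locking: it initializes an rt_mutex as though the chosen blocked reader already held it, leaves the pointer where that task will find it, and then simply acquires the mutex, so priority inheritance keeps the reader boosted until it releases the lock on leaving its outermost RCU read-side critical section. A booster-side sketch of that sequence, condensed from the hunk (boost_reader() is an illustrative wrapper and the field and flag names are taken from the hunk, not re-verified against other trees):

/*
 * Booster-side sketch of the rt_mutex proxy trick in rcu_boost() above;
 * boost_reader() is illustrative, not a kernel function.
 */
static void boost_reader(struct task_struct *t, unsigned long flags)
{
        struct rt_mutex mtx;

        rt_mutex_init_proxy_locked(&mtx, t);    /* pretend t already holds mtx */
        t->rcu_boost_mutex = &mtx;              /* t drops it in rcu_read_unlock() */
        t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
        raw_local_irq_restore(flags);           /* rt_mutex_lock() may sleep */
        rt_mutex_lock(&mtx);    /* priority inheritance boosts t until it unlocks */
        rt_mutex_unlock(&mtx);  /* t has finished; keep lockdep happy */
}
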
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 89613f97ff26..2e138db03382 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,7 +47,6 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50#include <linux/sched.h>
51 50
52MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
@@ -132,7 +131,7 @@ struct rcu_torture {
132 131
133static LIST_HEAD(rcu_torture_freelist); 132static LIST_HEAD(rcu_torture_freelist);
134static struct rcu_torture __rcu *rcu_torture_current; 133static struct rcu_torture __rcu *rcu_torture_current;
135static long rcu_torture_current_version; 134static unsigned long rcu_torture_current_version;
136static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 135static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
137static DEFINE_SPINLOCK(rcu_torture_lock); 136static DEFINE_SPINLOCK(rcu_torture_lock);
138static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 137static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -147,8 +146,6 @@ static atomic_t n_rcu_torture_mberror;
147static atomic_t n_rcu_torture_error; 146static atomic_t n_rcu_torture_error;
148static long n_rcu_torture_boost_ktrerror; 147static long n_rcu_torture_boost_ktrerror;
149static long n_rcu_torture_boost_rterror; 148static long n_rcu_torture_boost_rterror;
150static long n_rcu_torture_boost_allocerror;
151static long n_rcu_torture_boost_afferror;
152static long n_rcu_torture_boost_failure; 149static long n_rcu_torture_boost_failure;
153static long n_rcu_torture_boosts; 150static long n_rcu_torture_boosts;
154static long n_rcu_torture_timers; 151static long n_rcu_torture_timers;
@@ -164,11 +161,11 @@ static int stutter_pause_test;
164#endif 161#endif
165int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
166 163
167#ifdef CONFIG_RCU_BOOST 164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
168#define rcu_can_boost() 1 165#define rcu_can_boost() 1
169#else /* #ifdef CONFIG_RCU_BOOST */ 166#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
170#define rcu_can_boost() 0 167#define rcu_can_boost() 0
171#endif /* #else #ifdef CONFIG_RCU_BOOST */ 168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
172 169
173static unsigned long boost_starttime; /* jiffies of next boost test start. */ 170static unsigned long boost_starttime; /* jiffies of next boost test start. */
174DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
@@ -752,6 +749,7 @@ static int rcu_torture_boost(void *arg)
752 n_rcu_torture_boost_rterror++; 749 n_rcu_torture_boost_rterror++;
753 } 750 }
754 751
752 init_rcu_head_on_stack(&rbi.rcu);
755 /* Each pass through the following loop does one boost-test cycle. */ 753 /* Each pass through the following loop does one boost-test cycle. */
756 do { 754 do {
757 /* Wait for the next test interval. */ 755 /* Wait for the next test interval. */
@@ -811,6 +809,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost");
811 809
812 /* Clean up and exit. */ 810 /* Clean up and exit. */
813 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu);
814 rcutorture_shutdown_absorb("rcu_torture_boost"); 813 rcutorture_shutdown_absorb("rcu_torture_boost");
815 while (!kthread_should_stop() || rbi.inflight) 814 while (!kthread_should_stop() || rbi.inflight)
816 schedule_timeout_uninterruptible(1); 815 schedule_timeout_uninterruptible(1);
@@ -887,7 +886,7 @@ rcu_torture_writer(void *arg)
887 old_rp->rtort_pipe_count++; 886 old_rp->rtort_pipe_count++;
888 cur_ops->deferred_free(old_rp); 887 cur_ops->deferred_free(old_rp);
889 } 888 }
890 rcu_torture_current_version++; 889 rcutorture_record_progress(++rcu_torture_current_version);
891 oldbatch = cur_ops->completed(); 890 oldbatch = cur_ops->completed();
892 rcu_stutter_wait("rcu_torture_writer"); 891 rcu_stutter_wait("rcu_torture_writer");
893 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
@@ -1067,8 +1066,8 @@ rcu_torture_printk(char *page)
1067 } 1066 }
1068 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1067 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
1069 cnt += sprintf(&page[cnt], 1068 cnt += sprintf(&page[cnt],
1070 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 1069 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1071 "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " 1070 "rtmbe: %d rtbke: %ld rtbre: %ld "
1072 "rtbf: %ld rtb: %ld nt: %ld", 1071 "rtbf: %ld rtb: %ld nt: %ld",
1073 rcu_torture_current, 1072 rcu_torture_current,
1074 rcu_torture_current_version, 1073 rcu_torture_current_version,
@@ -1079,16 +1078,12 @@ rcu_torture_printk(char *page)
1079 atomic_read(&n_rcu_torture_mberror), 1078 atomic_read(&n_rcu_torture_mberror),
1080 n_rcu_torture_boost_ktrerror, 1079 n_rcu_torture_boost_ktrerror,
1081 n_rcu_torture_boost_rterror, 1080 n_rcu_torture_boost_rterror,
1082 n_rcu_torture_boost_allocerror,
1083 n_rcu_torture_boost_afferror,
1084 n_rcu_torture_boost_failure, 1081 n_rcu_torture_boost_failure,
1085 n_rcu_torture_boosts, 1082 n_rcu_torture_boosts,
1086 n_rcu_torture_timers); 1083 n_rcu_torture_timers);
1087 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1084 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1088 n_rcu_torture_boost_ktrerror != 0 || 1085 n_rcu_torture_boost_ktrerror != 0 ||
1089 n_rcu_torture_boost_rterror != 0 || 1086 n_rcu_torture_boost_rterror != 0 ||
1090 n_rcu_torture_boost_allocerror != 0 ||
1091 n_rcu_torture_boost_afferror != 0 ||
1092 n_rcu_torture_boost_failure != 0) 1087 n_rcu_torture_boost_failure != 0)
1093 cnt += sprintf(&page[cnt], " !!!"); 1088 cnt += sprintf(&page[cnt], " !!!");
1094 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1089 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
@@ -1332,6 +1327,7 @@ rcu_torture_cleanup(void)
1332 int i; 1327 int i;
1333 1328
1334 mutex_lock(&fullstop_mutex); 1329 mutex_lock(&fullstop_mutex);
1330 rcutorture_record_test_transition();
1335 if (fullstop == FULLSTOP_SHUTDOWN) { 1331 if (fullstop == FULLSTOP_SHUTDOWN) {
1336 printk(KERN_WARNING /* but going down anyway, so... */ 1332 printk(KERN_WARNING /* but going down anyway, so... */
1337 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 1333 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
@@ -1487,8 +1483,6 @@ rcu_torture_init(void)
1487 atomic_set(&n_rcu_torture_error, 0); 1483 atomic_set(&n_rcu_torture_error, 0);
1488 n_rcu_torture_boost_ktrerror = 0; 1484 n_rcu_torture_boost_ktrerror = 0;
1489 n_rcu_torture_boost_rterror = 0; 1485 n_rcu_torture_boost_rterror = 0;
1490 n_rcu_torture_boost_allocerror = 0;
1491 n_rcu_torture_boost_afferror = 0;
1492 n_rcu_torture_boost_failure = 0; 1486 n_rcu_torture_boost_failure = 0;
1493 n_rcu_torture_boosts = 0; 1487 n_rcu_torture_boosts = 0;
1494 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1488 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -1625,6 +1619,7 @@ rcu_torture_init(void)
1625 } 1619 }
1626 } 1620 }
1627 register_reboot_notifier(&rcutorture_shutdown_nb); 1621 register_reboot_notifier(&rcutorture_shutdown_nb);
1622 rcutorture_record_test_transition();
1628 mutex_unlock(&fullstop_mutex); 1623 mutex_unlock(&fullstop_mutex);
1629 return 0; 1624 return 0;
1630 1625
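
rcu_torture_boost() now wraps its on-stack rcu_head in init_rcu_head_on_stack()/destroy_rcu_head_on_stack(), which keeps CONFIG_DEBUG_OBJECTS_RCU_HEAD from flagging the stack object as uninitialized or leaked. The usual shape of that pairing, with illustrative function names:

/*
 * On-stack rcu_head with debug-objects annotations; my_cb() and
 * wait_for_cb() are illustrative only.
 */
#include <linux/rcupdate.h>

static void my_cb(struct rcu_head *rhp)
{
        /* Runs after a grace period; rhp points into a live stack frame. */
}

void wait_for_cb(void)
{
        struct rcu_head rh;

        init_rcu_head_on_stack(&rh);    /* tell debug-objects it is on-stack */
        call_rcu(&rh, my_cb);
        rcu_barrier();                  /* don't pop the frame while rh is queued */
        destroy_rcu_head_on_stack(&rh);
}
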
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index dd4aea806f8e..f07d2f03181a 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -47,6 +47,9 @@
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h> 49#include <linux/kernel_stat.h>
50#include <linux/wait.h>
51#include <linux/kthread.h>
52#include <linux/prefetch.h>
50 53
51#include "rcutree.h" 54#include "rcutree.h"
52 55
@@ -79,10 +82,41 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
79struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
80DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
81 84
85static struct rcu_state *rcu_state;
86
82int rcu_scheduler_active __read_mostly; 87int rcu_scheduler_active __read_mostly;
83EXPORT_SYMBOL_GPL(rcu_scheduler_active); 88EXPORT_SYMBOL_GPL(rcu_scheduler_active);
84 89
85/* 90/*
91 * Control variables for per-CPU and per-rcu_node kthreads. These
92 * handle all flavors of RCU.
93 */
94static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
95DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
96DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
97DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
98static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq);
99DEFINE_PER_CPU(char, rcu_cpu_has_work);
100static char rcu_kthreads_spawnable;
101
102static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
103static void invoke_rcu_cpu_kthread(void);
104
105#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
106
107/*
108 * Track the rcutorture test sequence number and the update version
109 * number within a given test. The rcutorture_testseq is incremented
110 * on every rcutorture module load and unload, so has an odd value
111 * when a test is running. The rcutorture_vernum is set to zero
112 * when rcutorture starts and is incremented on each rcutorture update.
113 * These variables enable correlating rcutorture output with the
114 * RCU tracing information.
115 */
116unsigned long rcutorture_testseq;
117unsigned long rcutorture_vernum;
118
119/*
86 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 120 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
87 * permit this function to be invoked without holding the root rcu_node 121 * permit this function to be invoked without holding the root rcu_node
88 * structure's ->lock, but of course results can be subject to change. 122 * structure's ->lock, but of course results can be subject to change.
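
The kthread control variables introduced here use DEFINE_PER_CPU() so every CPU gets its own task pointer, status word, wait queue and work flag, reachable later with per_cpu(); the offline path further down in this diff stops the per-CPU kthread through exactly that lookup. A minimal sketch of the pattern, with illustrative names:

/*
 * Per-CPU kthread bookkeeping sketch; my_worker_task and stop_my_worker()
 * are illustrative names.
 */
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static DEFINE_PER_CPU(struct task_struct *, my_worker_task);

static void stop_my_worker(int cpu)
{
        struct task_struct *t = per_cpu(my_worker_task, cpu);

        if (t != NULL) {
                per_cpu(my_worker_task, cpu) = NULL;
                kthread_stop(t);        /* blocks until the kthread exits */
        }
}
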
@@ -124,6 +158,7 @@ void rcu_note_context_switch(int cpu)
124 rcu_sched_qs(cpu); 158 rcu_sched_qs(cpu);
125 rcu_preempt_note_context_switch(cpu); 159 rcu_preempt_note_context_switch(cpu);
126} 160}
161EXPORT_SYMBOL_GPL(rcu_note_context_switch);
127 162
128#ifdef CONFIG_NO_HZ 163#ifdef CONFIG_NO_HZ
129DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 164DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
@@ -140,10 +175,8 @@ module_param(blimit, int, 0);
140module_param(qhimark, int, 0); 175module_param(qhimark, int, 0);
141module_param(qlowmark, int, 0); 176module_param(qlowmark, int, 0);
142 177
143#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 178int rcu_cpu_stall_suppress __read_mostly;
144int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
145module_param(rcu_cpu_stall_suppress, int, 0644); 179module_param(rcu_cpu_stall_suppress, int, 0644);
146#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
147 180
148static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 181static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
149static int rcu_pending(int cpu); 182static int rcu_pending(int cpu);
@@ -176,6 +209,31 @@ void rcu_bh_force_quiescent_state(void)
176EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 209EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
177 210
178/* 211/*
212 * Record the number of times rcutorture tests have been initiated and
213 * terminated. This information allows the debugfs tracing stats to be
214 * correlated to the rcutorture messages, even when the rcutorture module
215 * is being repeatedly loaded and unloaded. In other words, we cannot
216 * store this state in rcutorture itself.
217 */
218void rcutorture_record_test_transition(void)
219{
220 rcutorture_testseq++;
221 rcutorture_vernum = 0;
222}
223EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
224
225/*
226 * Record the number of writer passes through the current rcutorture test.
227 * This is also used to correlate debugfs tracing stats with the rcutorture
228 * messages.
229 */
230void rcutorture_record_progress(unsigned long vernum)
231{
232 rcutorture_vernum++;
233}
234EXPORT_SYMBOL_GPL(rcutorture_record_progress);
235
236/*
179 * Force a quiescent state for RCU-sched. 237 * Force a quiescent state for RCU-sched.
180 */ 238 */
181void rcu_sched_force_quiescent_state(void) 239void rcu_sched_force_quiescent_state(void)
@@ -234,8 +292,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
234 return 1; 292 return 1;
235 } 293 }
236 294
237 /* If preemptable RCU, no point in sending reschedule IPI. */ 295 /* If preemptible RCU, no point in sending reschedule IPI. */
238 if (rdp->preemptable) 296 if (rdp->preemptible)
239 return 0; 297 return 0;
240 298
241 /* The CPU is online, so send it a reschedule IPI. */ 299 /* The CPU is online, so send it a reschedule IPI. */
@@ -450,8 +508,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
450 508
451#endif /* #else #ifdef CONFIG_NO_HZ */ 509#endif /* #else #ifdef CONFIG_NO_HZ */
452 510
453#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
454
455int rcu_cpu_stall_suppress __read_mostly; 511int rcu_cpu_stall_suppress __read_mostly;
456 512
457static void record_gp_stall_check_time(struct rcu_state *rsp) 513static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -537,21 +593,24 @@ static void print_cpu_stall(struct rcu_state *rsp)
537 593
538static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 594static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
539{ 595{
540 long delta; 596 unsigned long j;
597 unsigned long js;
541 struct rcu_node *rnp; 598 struct rcu_node *rnp;
542 599
543 if (rcu_cpu_stall_suppress) 600 if (rcu_cpu_stall_suppress)
544 return; 601 return;
545 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); 602 j = ACCESS_ONCE(jiffies);
603 js = ACCESS_ONCE(rsp->jiffies_stall);
546 rnp = rdp->mynode; 604 rnp = rdp->mynode;
547 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { 605 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
548 606
549 /* We haven't checked in, so go dump stack. */ 607 /* We haven't checked in, so go dump stack. */
550 print_cpu_stall(rsp); 608 print_cpu_stall(rsp);
551 609
552 } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { 610 } else if (rcu_gp_in_progress(rsp) &&
611 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
553 612
554 /* They had two time units to dump stack, so complain. */ 613 /* They had a few time units to dump stack, so complain. */
555 print_other_cpu_stall(rsp); 614 print_other_cpu_stall(rsp);
556 } 615 }
557} 616}
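
check_cpu_stall() now snapshots jiffies and the stall deadline and compares them with ULONG_CMP_GE() instead of a signed delta, so the test stays correct when the jiffies counter wraps. The underlying trick is the usual unsigned-difference comparison; roughly (COUNTER_GE and stall_due() are illustrative names):

/*
 * Wraparound-safe "a is at or after b" for unsigned long counters such as
 * jiffies; same idea as ULONG_CMP_GE() used above.
 */
#include <linux/kernel.h>       /* ULONG_MAX */

#define COUNTER_GE(a, b)        (ULONG_MAX / 2 >= (a) - (b))

static int stall_due(unsigned long now, unsigned long deadline)
{
        /* True once now has reached deadline, even across a wrap. */
        return COUNTER_GE(now, deadline);
}
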
@@ -587,26 +646,6 @@ static void __init check_cpu_stall_init(void)
587 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); 646 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
588} 647}
589 648
590#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
591
592static void record_gp_stall_check_time(struct rcu_state *rsp)
593{
594}
595
596static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
597{
598}
599
600void rcu_cpu_stall_reset(void)
601{
602}
603
604static void __init check_cpu_stall_init(void)
605{
606}
607
608#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
609
610/* 649/*
611 * Update CPU-local rcu_data state to record the newly noticed grace period. 650 * Update CPU-local rcu_data state to record the newly noticed grace period.
612 * This is used both when we started the grace period and when we notice 651 * This is used both when we started the grace period and when we notice
@@ -809,6 +848,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
809 rnp->completed = rsp->completed; 848 rnp->completed = rsp->completed;
810 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 849 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
811 rcu_start_gp_per_cpu(rsp, rnp, rdp); 850 rcu_start_gp_per_cpu(rsp, rnp, rdp);
851 rcu_preempt_boost_start_gp(rnp);
812 raw_spin_unlock_irqrestore(&rnp->lock, flags); 852 raw_spin_unlock_irqrestore(&rnp->lock, flags);
813 return; 853 return;
814 } 854 }
@@ -844,6 +884,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
844 rnp->completed = rsp->completed; 884 rnp->completed = rsp->completed;
845 if (rnp == rdp->mynode) 885 if (rnp == rdp->mynode)
846 rcu_start_gp_per_cpu(rsp, rnp, rdp); 886 rcu_start_gp_per_cpu(rsp, rnp, rdp);
887 rcu_preempt_boost_start_gp(rnp);
847 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 888 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
848 } 889 }
849 890
@@ -864,7 +905,12 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
864static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 905static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
865 __releases(rcu_get_root(rsp)->lock) 906 __releases(rcu_get_root(rsp)->lock)
866{ 907{
908 unsigned long gp_duration;
909
867 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 910 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
911 gp_duration = jiffies - rsp->gp_start;
912 if (gp_duration > rsp->gp_max)
913 rsp->gp_max = gp_duration;
868 rsp->completed = rsp->gpnum; 914 rsp->completed = rsp->gpnum;
869 rsp->signaled = RCU_GP_IDLE; 915 rsp->signaled = RCU_GP_IDLE;
870 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 916 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
@@ -894,7 +940,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
894 return; 940 return;
895 } 941 }
896 rnp->qsmask &= ~mask; 942 rnp->qsmask &= ~mask;
897 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 943 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
898 944
899 /* Other bits still set at this level, so done. */ 945 /* Other bits still set at this level, so done. */
900 raw_spin_unlock_irqrestore(&rnp->lock, flags); 946 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1037,6 +1083,8 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1037/* 1083/*
1038 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 1084 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
1039 * and move all callbacks from the outgoing CPU to the current one. 1085 * and move all callbacks from the outgoing CPU to the current one.
1086 * There can only be one CPU hotplug operation at a time, so no other
1087 * CPU can be attempting to update rcu_cpu_kthread_task.
1040 */ 1088 */
1041static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 1089static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1042{ 1090{
@@ -1045,6 +1093,14 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1045 int need_report = 0; 1093 int need_report = 0;
1046 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1094 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1047 struct rcu_node *rnp; 1095 struct rcu_node *rnp;
1096 struct task_struct *t;
1097
1098 /* Stop the CPU's kthread. */
1099 t = per_cpu(rcu_cpu_kthread_task, cpu);
1100 if (t != NULL) {
1101 per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
1102 kthread_stop(t);
1103 }
1048 1104
1049 /* Exclude any attempts to start a new grace period. */ 1105 /* Exclude any attempts to start a new grace period. */
1050 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1106 raw_spin_lock_irqsave(&rsp->onofflock, flags);
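The new block above tears down the outgoing CPU's kthread before the node bitmasks are touched: fetch the task pointer, clear the per-CPU slot, then kthread_stop(). A rough pthread analogue of that ordering is sketched below; the worker structure, its stop flag, and the slot array are all invented for the example, with pthread_join() playing the role of kthread_stop().

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

#define NR_SLOTS 4

struct worker {
    pthread_t thread;
    atomic_int stop;      /* plays the role of kthread_should_stop() */
    int present;          /* non-zero once the slot holds a live worker */
};

static struct worker workers[NR_SLOTS];

static void *worker_fn(void *arg)
{
    struct worker *w = arg;

    while (!atomic_load(&w->stop))
        sched_yield();    /* the real kthread sleeps waiting for work */
    return NULL;
}

/* Mirror the offline path: clear the slot first, then stop and reap. */
static void offline_worker(int slot)
{
    struct worker *w = &workers[slot];

    if (!w->present)
        return;
    w->present = 0;
    atomic_store(&w->stop, 1);
    pthread_join(w->thread, NULL);    /* like kthread_stop(), waits for exit */
}

int main(void)
{
    workers[0].present = 1;
    pthread_create(&workers[0].thread, NULL, worker_fn, &workers[0]);
    offline_worker(0);
    printf("worker 0 stopped\n");
    return 0;
}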
@@ -1082,6 +1138,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1082 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1138 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1083 if (need_report & RCU_OFL_TASKS_EXP_GP) 1139 if (need_report & RCU_OFL_TASKS_EXP_GP)
1084 rcu_report_exp_rnp(rsp, rnp); 1140 rcu_report_exp_rnp(rsp, rnp);
1141 rcu_node_kthread_setaffinity(rnp, -1);
1085} 1142}
1086 1143
1087/* 1144/*
@@ -1143,7 +1200,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1143 next = list->next; 1200 next = list->next;
1144 prefetch(next); 1201 prefetch(next);
1145 debug_rcu_head_unqueue(list); 1202 debug_rcu_head_unqueue(list);
1146 list->func(list); 1203 __rcu_reclaim(list);
1147 list = next; 1204 list = next;
1148 if (++count >= rdp->blimit) 1205 if (++count >= rdp->blimit)
1149 break; 1206 break;
@@ -1179,7 +1236,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1179 1236
1180 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1237 /* Re-raise the RCU softirq if there are callbacks remaining. */
1181 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1238 if (cpu_has_callbacks_ready_to_invoke(rdp))
1182 raise_softirq(RCU_SOFTIRQ); 1239 invoke_rcu_cpu_kthread();
1183} 1240}
1184 1241
1185/* 1242/*
@@ -1225,7 +1282,7 @@ void rcu_check_callbacks(int cpu, int user)
1225 } 1282 }
1226 rcu_preempt_check_callbacks(cpu); 1283 rcu_preempt_check_callbacks(cpu);
1227 if (rcu_pending(cpu)) 1284 if (rcu_pending(cpu))
1228 raise_softirq(RCU_SOFTIRQ); 1285 invoke_rcu_cpu_kthread();
1229} 1286}
1230 1287
1231#ifdef CONFIG_SMP 1288#ifdef CONFIG_SMP
@@ -1233,6 +1290,8 @@ void rcu_check_callbacks(int cpu, int user)
1233/* 1290/*
1234 * Scan the leaf rcu_node structures, processing dyntick state for any that 1291 * Scan the leaf rcu_node structures, processing dyntick state for any that
1235 * have not yet encountered a quiescent state, using the function specified. 1292 * have not yet encountered a quiescent state, using the function specified.
1293 * Also initiate boosting for any threads blocked on the root rcu_node.
1294 *
1236 * The caller must have suppressed start of new grace periods. 1295 * The caller must have suppressed start of new grace periods.
1237 */ 1296 */
1238static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) 1297static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
@@ -1251,7 +1310,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1251 return; 1310 return;
1252 } 1311 }
1253 if (rnp->qsmask == 0) { 1312 if (rnp->qsmask == 0) {
1254 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1313 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
1255 continue; 1314 continue;
1256 } 1315 }
1257 cpu = rnp->grplo; 1316 cpu = rnp->grplo;
@@ -1269,6 +1328,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1269 } 1328 }
1270 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1329 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1271 } 1330 }
1331 rnp = rcu_get_root(rsp);
1332 if (rnp->qsmask == 0) {
1333 raw_spin_lock_irqsave(&rnp->lock, flags);
1334 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1335 }
1272} 1336}
1273 1337
1274/* 1338/*
@@ -1389,7 +1453,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1389/* 1453/*
1390 * Do softirq processing for the current CPU. 1454 * Do softirq processing for the current CPU.
1391 */ 1455 */
1392static void rcu_process_callbacks(struct softirq_action *unused) 1456static void rcu_process_callbacks(void)
1393{ 1457{
1394 /* 1458 /*
1395 * Memory references from any prior RCU read-side critical sections 1459 * Memory references from any prior RCU read-side critical sections
@@ -1414,6 +1478,347 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1414 rcu_needs_cpu_flush(); 1478 rcu_needs_cpu_flush();
1415} 1479}
1416 1480
1481/*
1482 * Wake up the current CPU's kthread. This replaces raise_softirq()
1483 * in earlier versions of RCU. Note that because we are running on
1484 * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
1485 * cannot disappear out from under us.
1486 */
1487static void invoke_rcu_cpu_kthread(void)
1488{
1489 unsigned long flags;
1490
1491 local_irq_save(flags);
1492 __this_cpu_write(rcu_cpu_has_work, 1);
1493 if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
1494 local_irq_restore(flags);
1495 return;
1496 }
1497 wake_up(&__get_cpu_var(rcu_cpu_wq));
1498 local_irq_restore(flags);
1499}
1500
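invoke_rcu_cpu_kthread() above is the replacement for raise_softirq(): flag work as pending while interrupts are off, and only wake the worker if it has actually been created. The compilable fragment below maps that onto a mutex and condition variable; every name in it is made up for illustration, and the mutex stands in for the interrupt disabling.

#include <pthread.h>
#include <stdbool.h>

struct cpu_work {
    pthread_mutex_t lock;   /* stands in for local_irq_save()/restore() */
    pthread_cond_t wq;      /* stands in for rcu_cpu_wq */
    bool has_work;          /* stands in for rcu_cpu_has_work */
    bool started;           /* true once the worker thread exists */
};

/* Mark work pending; wake the worker only if it has been spawned. */
void invoke_worker(struct cpu_work *w)
{
    pthread_mutex_lock(&w->lock);
    w->has_work = true;
    if (w->started)
        pthread_cond_signal(&w->wq);
    pthread_mutex_unlock(&w->lock);
}

If the worker does not exist yet, the flag simply stays set, which is exactly why the kernel version can return early when the per-CPU task pointer is still NULL.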
1501/*
1502 * Wake up the specified per-rcu_node-structure kthread.
1503 * Because the per-rcu_node kthreads are immortal, we don't need
1504 * to do anything to keep them alive.
1505 */
1506static void invoke_rcu_node_kthread(struct rcu_node *rnp)
1507{
1508 struct task_struct *t;
1509
1510 t = rnp->node_kthread_task;
1511 if (t != NULL)
1512 wake_up_process(t);
1513}
1514
1515/*
1516 * Set the specified CPU's kthread to run RT or not, as specified by
1517 * the to_rt argument. The CPU-hotplug locks are held, so the task
1518 * is not going away.
1519 */
1520static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1521{
1522 int policy;
1523 struct sched_param sp;
1524 struct task_struct *t;
1525
1526 t = per_cpu(rcu_cpu_kthread_task, cpu);
1527 if (t == NULL)
1528 return;
1529 if (to_rt) {
1530 policy = SCHED_FIFO;
1531 sp.sched_priority = RCU_KTHREAD_PRIO;
1532 } else {
1533 policy = SCHED_NORMAL;
1534 sp.sched_priority = 0;
1535 }
1536 sched_setscheduler_nocheck(t, policy, &sp);
1537}
1538
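rcu_cpu_kthread_setrt() flips an already-running kthread between SCHED_FIFO at RCU_KTHREAD_PRIO and ordinary SCHED_NORMAL, depending on whether its CPU is fully online. The same toggle can be expressed in userspace with pthread_setschedparam(); the priority value 1 below is an arbitrary stand-in for RCU_KTHREAD_PRIO, and promoting to SCHED_FIFO will fail without the appropriate privileges.

#include <pthread.h>
#include <sched.h>

/* Toggle an existing thread between real-time FIFO and normal scheduling. */
int worker_setrt(pthread_t t, int to_rt)
{
    struct sched_param sp;
    int policy;

    if (to_rt) {
        policy = SCHED_FIFO;
        sp.sched_priority = 1;      /* stand-in for RCU_KTHREAD_PRIO */
    } else {
        policy = SCHED_OTHER;       /* POSIX spelling of SCHED_NORMAL */
        sp.sched_priority = 0;
    }
    return pthread_setschedparam(t, policy, &sp);
}

int main(void)
{
    worker_setrt(pthread_self(), 0);    /* demotion: always allowed */
    worker_setrt(pthread_self(), 1);    /* promotion: needs RT privileges */
    return 0;
}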
1539/*
1540 * Timer handler to initiate the waking up of per-CPU kthreads that
1541 * have yielded the CPU due to excess numbers of RCU callbacks.
1542 * We wake up the per-rcu_node kthread, which in turn will wake up
1543 * the booster kthread.
1544 */
1545static void rcu_cpu_kthread_timer(unsigned long arg)
1546{
1547 unsigned long flags;
1548 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1549 struct rcu_node *rnp = rdp->mynode;
1550
1551 raw_spin_lock_irqsave(&rnp->lock, flags);
1552 rnp->wakemask |= rdp->grpmask;
1553 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1554 invoke_rcu_node_kthread(rnp);
1555}
1556
1557/*
1558 * Drop to non-real-time priority and yield, but only after posting a
1559 * timer that will cause us to regain our real-time priority if we
1560 * remain preempted. Either way, we restore our real-time priority
1561 * before returning.
1562 */
1563static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1564{
1565 struct sched_param sp;
1566 struct timer_list yield_timer;
1567
1568 setup_timer_on_stack(&yield_timer, f, arg);
1569 mod_timer(&yield_timer, jiffies + 2);
1570 sp.sched_priority = 0;
1571 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1572 set_user_nice(current, 19);
1573 schedule();
1574 sp.sched_priority = RCU_KTHREAD_PRIO;
1575 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1576 del_timer(&yield_timer);
1577}
1578
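rcu_yield() temporarily demotes the kthread so other work can run, arming a short timer first so the thread regains SCHED_FIFO even if it stays preempted, and restoring the RT priority on return either way. The fragment below shows only the demote/yield/restore skeleton in userspace terms; the safety-net timer is deliberately omitted, and priority 1 again stands in for RCU_KTHREAD_PRIO.

#include <pthread.h>
#include <sched.h>

/* Drop to normal priority, give others a turn, then take RT priority back. */
void worker_yield_briefly(void)
{
    struct sched_param sp;

    sp.sched_priority = 0;
    pthread_setschedparam(pthread_self(), SCHED_OTHER, &sp);
    sched_yield();                      /* analogous to schedule() */
    sp.sched_priority = 1;
    pthread_setschedparam(pthread_self(), SCHED_FIFO, &sp);
}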
1579/*
1580 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
1581 * This can happen while the corresponding CPU is either coming online
1582 * or going offline. We cannot wait until the CPU is fully online
1583 * before starting the kthread, because the various notifier functions
1584 * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
1585 * the corresponding CPU is online.
1586 *
1587 * Return 1 if the kthread needs to stop, 0 otherwise.
1588 *
1589 * Caller must disable bh. This function can momentarily enable it.
1590 */
1591static int rcu_cpu_kthread_should_stop(int cpu)
1592{
1593 while (cpu_is_offline(cpu) ||
1594 !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1595 smp_processor_id() != cpu) {
1596 if (kthread_should_stop())
1597 return 1;
1598 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1599 per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
1600 local_bh_enable();
1601 schedule_timeout_uninterruptible(1);
1602 if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1603 set_cpus_allowed_ptr(current, cpumask_of(cpu));
1604 local_bh_disable();
1605 }
1606 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1607 return 0;
1608}
1609
1610/*
1611 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1612 * earlier RCU softirq.
1613 */
1614static int rcu_cpu_kthread(void *arg)
1615{
1616 int cpu = (int)(long)arg;
1617 unsigned long flags;
1618 int spincnt = 0;
1619 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1620 wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu);
1621 char work;
1622 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1623
1624 for (;;) {
1625 *statusp = RCU_KTHREAD_WAITING;
1626 wait_event_interruptible(*wqp,
1627 *workp != 0 || kthread_should_stop());
1628 local_bh_disable();
1629 if (rcu_cpu_kthread_should_stop(cpu)) {
1630 local_bh_enable();
1631 break;
1632 }
1633 *statusp = RCU_KTHREAD_RUNNING;
1634 per_cpu(rcu_cpu_kthread_loops, cpu)++;
1635 local_irq_save(flags);
1636 work = *workp;
1637 *workp = 0;
1638 local_irq_restore(flags);
1639 if (work)
1640 rcu_process_callbacks();
1641 local_bh_enable();
1642 if (*workp != 0)
1643 spincnt++;
1644 else
1645 spincnt = 0;
1646 if (spincnt > 10) {
1647 *statusp = RCU_KTHREAD_YIELDING;
1648 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1649 spincnt = 0;
1650 }
1651 }
1652 *statusp = RCU_KTHREAD_STOPPED;
1653 return 0;
1654}
1655
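rcu_cpu_kthread() is the heart of the change: it sleeps until work is flagged or a stop is requested, runs the callback processing, and if it keeps finding new work for more than ten consecutive passes it yields so it cannot starve the CPU at RT priority. A condensed pthread version of that loop is sketched below; the structure, the condition-variable handshake, and process_callbacks() are placeholders, and the CPU binding, bh handling, and tracing states are left out.

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>

struct cpu_worker {
    pthread_mutex_t lock;
    pthread_cond_t wq;
    bool has_work;
    bool stop;
};

static void process_callbacks(void)
{
    /* Placeholder for rcu_process_callbacks(). */
}

void *cpu_worker_fn(void *arg)
{
    struct cpu_worker *w = arg;
    int spincnt = 0;
    bool work;

    for (;;) {
        pthread_mutex_lock(&w->lock);
        while (!w->has_work && !w->stop)          /* RCU_KTHREAD_WAITING */
            pthread_cond_wait(&w->wq, &w->lock);
        if (w->stop) {
            pthread_mutex_unlock(&w->lock);
            break;                                /* RCU_KTHREAD_STOPPED */
        }
        work = w->has_work;
        w->has_work = false;      /* like clearing *workp with irqs off */
        pthread_mutex_unlock(&w->lock);

        if (work)
            process_callbacks();                  /* RCU_KTHREAD_RUNNING */

        /* More work already queued?  Count it as a spin (unlocked read,
         * which is fine for a sketch). */
        spincnt = w->has_work ? spincnt + 1 : 0;
        if (spincnt > 10) {
            sched_yield();        /* stands in for rcu_yield() */
            spincnt = 0;
        }
    }
    return NULL;
}

The spin counter is what keeps a callback flood from monopolizing the CPU, which is exactly the situation the yield-and-rearm path above exists to handle.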
1656/*
1657 * Spawn a per-CPU kthread, setting up affinity and priority.
1658 * Because the CPU hotplug lock is held, no other CPU will be attempting
1659 * to manipulate rcu_cpu_kthread_task. There might be another CPU
1660 * attempting to access it during boot, but the locking in kthread_bind()
1661 * will enforce sufficient ordering.
1662 */
1663static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1664{
1665 struct sched_param sp;
1666 struct task_struct *t;
1667
1668 if (!rcu_kthreads_spawnable ||
1669 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1670 return 0;
1671 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
1672 if (IS_ERR(t))
1673 return PTR_ERR(t);
1674 kthread_bind(t, cpu);
1675 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1676 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1677 per_cpu(rcu_cpu_kthread_task, cpu) = t;
1678 wake_up_process(t);
1679 sp.sched_priority = RCU_KTHREAD_PRIO;
1680 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1681 return 0;
1682}
1683
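rcu_spawn_one_cpu_kthread() creates the worker, binds it to its CPU before letting it run, and only then grants SCHED_FIFO priority. A userspace sketch of the same create-bind-promote sequence follows; cpu_worker_fn is assumed to exist (for example, the loop sketch above), and both the affinity and RT calls need suitable privileges to succeed.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

/* Assumed to be defined elsewhere (see the worker-loop sketch above). */
extern void *cpu_worker_fn(void *arg);

int spawn_cpu_worker(int cpu, void *state, pthread_t *out)
{
    cpu_set_t cpus;
    struct sched_param sp;
    int ret;

    ret = pthread_create(out, NULL, cpu_worker_fn, state);
    if (ret != 0)
        return ret;

    /* Bind to the worker's CPU, mirroring kthread_bind(). */
    CPU_ZERO(&cpus);
    CPU_SET(cpu, &cpus);
    pthread_setaffinity_np(*out, sizeof(cpus), &cpus);

    /* Then raise it to RT priority, as sched_setscheduler_nocheck() does. */
    sp.sched_priority = 1;          /* stand-in for RCU_KTHREAD_PRIO */
    return pthread_setschedparam(*out, SCHED_FIFO, &sp);
}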
1684/*
1685 * Per-rcu_node kthread, which is in charge of waking up the per-CPU
1686 * kthreads when needed. We ignore requests to wake up kthreads
1687 * for offline CPUs, which is OK because force_quiescent_state()
1688 * takes care of this case.
1689 */
1690static int rcu_node_kthread(void *arg)
1691{
1692 int cpu;
1693 unsigned long flags;
1694 unsigned long mask;
1695 struct rcu_node *rnp = (struct rcu_node *)arg;
1696 struct sched_param sp;
1697 struct task_struct *t;
1698
1699 for (;;) {
1700 rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1701 wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0);
1702 rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1703 raw_spin_lock_irqsave(&rnp->lock, flags);
1704 mask = rnp->wakemask;
1705 rnp->wakemask = 0;
1706 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1707 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1708 if ((mask & 0x1) == 0)
1709 continue;
1710 preempt_disable();
1711 t = per_cpu(rcu_cpu_kthread_task, cpu);
1712 if (!cpu_online(cpu) || t == NULL) {
1713 preempt_enable();
1714 continue;
1715 }
1716 per_cpu(rcu_cpu_has_work, cpu) = 1;
1717 sp.sched_priority = RCU_KTHREAD_PRIO;
1718 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1719 preempt_enable();
1720 }
1721 }
1722 /* NOTREACHED */
1723 rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
1724 return 0;
1725}
1726
1727/*
1728 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1729 * served by the rcu_node in question. The CPU hotplug lock is still
1730 * held, so the value of rnp->qsmaskinit will be stable.
1731 *
1732 * We don't include outgoingcpu in the affinity set; use -1 if there is
1733 * no outgoing CPU. If there are no CPUs left in the affinity set,
1734 * this function allows the kthread to execute on any CPU.
1735 */
1736static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1737{
1738 cpumask_var_t cm;
1739 int cpu;
1740 unsigned long mask = rnp->qsmaskinit;
1741
1742 if (rnp->node_kthread_task == NULL)
1743 return;
1744 if (!alloc_cpumask_var(&cm, GFP_KERNEL))
1745 return;
1746 cpumask_clear(cm);
1747 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1748 if ((mask & 0x1) && cpu != outgoingcpu)
1749 cpumask_set_cpu(cpu, cm);
1750 if (cpumask_weight(cm) == 0) {
1751 cpumask_setall(cm);
1752 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1753 cpumask_clear_cpu(cpu, cm);
1754 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1755 }
1756 set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
1757 rcu_boost_kthread_setaffinity(rnp, cm);
1758 free_cpumask_var(cm);
1759}
1760
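rcu_node_kthread_setaffinity() rebuilds the node kthread's CPU mask from ->qsmaskinit whenever membership changes, drops the outgoing CPU, and, if the mask would end up empty, falls back to "anything except this node's CPUs". The sketch below reproduces that mask construction with cpu_set_t; the grplo/grphi range and the bitmask are invented inputs, and the fallback mirrors the setall-then-clear-range logic above purely for illustration.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

/*
 * Build the affinity mask for a node-level worker from a bitmask of its
 * CPUs, excluding the CPU that is going away (or -1 for none).
 */
int node_worker_setaffinity(pthread_t worker, unsigned long cpu_bits,
                            int grplo, int grphi, int outgoingcpu)
{
    cpu_set_t cm;
    unsigned long mask = cpu_bits;
    int cpu;

    CPU_ZERO(&cm);
    for (cpu = grplo; cpu <= grphi; cpu++, mask >>= 1)
        if ((mask & 0x1) && cpu != outgoingcpu)
            CPU_SET(cpu, &cm);

    if (CPU_COUNT(&cm) == 0) {
        /* Nothing left: allow any CPU outside this node's range. */
        for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
            CPU_SET(cpu, &cm);
        for (cpu = grplo; cpu <= grphi; cpu++)
            CPU_CLR(cpu, &cm);
    }
    return pthread_setaffinity_np(worker, sizeof(cm), &cm);
}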
1761/*
1762 * Spawn a per-rcu_node kthread, setting priority and affinity.
1763 * Called during boot before online/offline can happen, or, if
1764 * during runtime, with the main CPU-hotplug locks held. So only
1765 * one of these can be executing at a time.
1766 */
1767static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1768 struct rcu_node *rnp)
1769{
1770 unsigned long flags;
1771 int rnp_index = rnp - &rsp->node[0];
1772 struct sched_param sp;
1773 struct task_struct *t;
1774
1775 if (!rcu_kthreads_spawnable ||
1776 rnp->qsmaskinit == 0)
1777 return 0;
1778 if (rnp->node_kthread_task == NULL) {
1779 t = kthread_create(rcu_node_kthread, (void *)rnp,
1780 "rcun%d", rnp_index);
1781 if (IS_ERR(t))
1782 return PTR_ERR(t);
1783 raw_spin_lock_irqsave(&rnp->lock, flags);
1784 rnp->node_kthread_task = t;
1785 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1786 wake_up_process(t);
1787 sp.sched_priority = 99;
1788 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1789 }
1790 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1791}
1792
1793/*
1794 * Spawn all kthreads -- called as soon as the scheduler is running.
1795 */
1796static int __init rcu_spawn_kthreads(void)
1797{
1798 int cpu;
1799 struct rcu_node *rnp;
1800
1801 rcu_kthreads_spawnable = 1;
1802 for_each_possible_cpu(cpu) {
1803 init_waitqueue_head(&per_cpu(rcu_cpu_wq, cpu));
1804 per_cpu(rcu_cpu_has_work, cpu) = 0;
1805 if (cpu_online(cpu))
1806 (void)rcu_spawn_one_cpu_kthread(cpu);
1807 }
1808 rnp = rcu_get_root(rcu_state);
1809 init_waitqueue_head(&rnp->node_wq);
1810 rcu_init_boost_waitqueue(rnp);
1811 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1812 if (NUM_RCU_NODES > 1)
1813 rcu_for_each_leaf_node(rcu_state, rnp) {
1814 init_waitqueue_head(&rnp->node_wq);
1815 rcu_init_boost_waitqueue(rnp);
1816 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1817 }
1818 return 0;
1819}
1820early_initcall(rcu_spawn_kthreads);
1821
1417static void 1822static void
1418__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1823__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1419 struct rcu_state *rsp) 1824 struct rcu_state *rsp)
@@ -1439,6 +1844,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1439 /* Add the callback to our list. */ 1844 /* Add the callback to our list. */
1440 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1845 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1441 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1846 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1847 rdp->qlen++;
1848
1849 /* If interrupts were disabled, don't dive into RCU core. */
1850 if (irqs_disabled_flags(flags)) {
1851 local_irq_restore(flags);
1852 return;
1853 }
1442 1854
1443 /* 1855 /*
1444 * Force the grace period if too many callbacks or too long waiting. 1856 * Force the grace period if too many callbacks or too long waiting.
@@ -1447,7 +1859,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1447 * invoking force_quiescent_state() if the newly enqueued callback 1859 * invoking force_quiescent_state() if the newly enqueued callback
1448 * is the only one waiting for a grace period to complete. 1860 * is the only one waiting for a grace period to complete.
1449 */ 1861 */
1450 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1862 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1451 1863
1452 /* Are we ignoring a completed grace period? */ 1864 /* Are we ignoring a completed grace period? */
1453 rcu_process_gp_end(rsp, rdp); 1865 rcu_process_gp_end(rsp, rdp);
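The __call_rcu() changes above keep the callback count in ->qlen in step with the tail-pointer enqueue, return early when called with interrupts disabled, and use the already-incremented count to decide when to force progress. A stripped-down sketch of that enqueue-and-threshold pattern follows, with made-up types and a fixed constant in place of the qhimark module parameter.

#include <stddef.h>

struct callback {
    struct callback *next;
    void (*func)(struct callback *cb);
};

struct cb_queue {
    struct callback *head;
    struct callback **tail;         /* like rdp->nxttail[RCU_NEXT_TAIL] */
    unsigned long qlen;
    unsigned long qlen_last_check;  /* like rdp->qlen_last_fqs_check */
};

#define QHIMARK 10000               /* stand-in for the qhimark parameter */

void cb_queue_init(struct cb_queue *q)
{
    q->head = NULL;
    q->tail = &q->head;
    q->qlen = 0;
    q->qlen_last_check = 0;
}

static void force_progress(struct cb_queue *q)
{
    /* Placeholder for the force_quiescent_state() path. */
    q->qlen_last_check = q->qlen;
}

/* Tail-pointer enqueue plus the "too many callbacks queued?" check. */
void enqueue_callback(struct cb_queue *q, struct callback *cb,
                      void (*func)(struct callback *))
{
    cb->func = func;
    cb->next = NULL;
    *q->tail = cb;                  /* append at the tail ... */
    q->tail = &cb->next;            /* ... and advance the tail pointer */
    q->qlen++;

    if (q->qlen > q->qlen_last_check + QHIMARK)
        force_progress(q);
}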
@@ -1583,7 +1995,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1583 * or RCU-bh, force a local reschedule. 1995 * or RCU-bh, force a local reschedule.
1584 */ 1996 */
1585 rdp->n_rp_qs_pending++; 1997 rdp->n_rp_qs_pending++;
1586 if (!rdp->preemptable && 1998 if (!rdp->preemptible &&
1587 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, 1999 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1588 jiffies)) 2000 jiffies))
1589 set_need_resched(); 2001 set_need_resched();
@@ -1760,7 +2172,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1760 * that this CPU cannot possibly have any RCU callbacks in flight yet. 2172 * that this CPU cannot possibly have any RCU callbacks in flight yet.
1761 */ 2173 */
1762static void __cpuinit 2174static void __cpuinit
1763rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) 2175rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1764{ 2176{
1765 unsigned long flags; 2177 unsigned long flags;
1766 unsigned long mask; 2178 unsigned long mask;
@@ -1772,7 +2184,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1772 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 2184 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1773 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 2185 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1774 rdp->beenonline = 1; /* We have now been online. */ 2186 rdp->beenonline = 1; /* We have now been online. */
1775 rdp->preemptable = preemptable; 2187 rdp->preemptible = preemptible;
1776 rdp->qlen_last_fqs_check = 0; 2188 rdp->qlen_last_fqs_check = 0;
1777 rdp->n_force_qs_snap = rsp->n_force_qs; 2189 rdp->n_force_qs_snap = rsp->n_force_qs;
1778 rdp->blimit = blimit; 2190 rdp->blimit = blimit;
@@ -1813,6 +2225,19 @@ static void __cpuinit rcu_online_cpu(int cpu)
1813 rcu_preempt_init_percpu_data(cpu); 2225 rcu_preempt_init_percpu_data(cpu);
1814} 2226}
1815 2227
2228static void __cpuinit rcu_online_kthreads(int cpu)
2229{
2230 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2231 struct rcu_node *rnp = rdp->mynode;
2232
2233 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
2234 if (rcu_kthreads_spawnable) {
2235 (void)rcu_spawn_one_cpu_kthread(cpu);
2236 if (rnp->node_kthread_task == NULL)
2237 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
2238 }
2239}
2240
1816/* 2241/*
1817 * Handle CPU online/offline notification events. 2242 * Handle CPU online/offline notification events.
1818 */ 2243 */
@@ -1820,11 +2245,23 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1820 unsigned long action, void *hcpu) 2245 unsigned long action, void *hcpu)
1821{ 2246{
1822 long cpu = (long)hcpu; 2247 long cpu = (long)hcpu;
2248 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2249 struct rcu_node *rnp = rdp->mynode;
1823 2250
1824 switch (action) { 2251 switch (action) {
1825 case CPU_UP_PREPARE: 2252 case CPU_UP_PREPARE:
1826 case CPU_UP_PREPARE_FROZEN: 2253 case CPU_UP_PREPARE_FROZEN:
1827 rcu_online_cpu(cpu); 2254 rcu_online_cpu(cpu);
2255 rcu_online_kthreads(cpu);
2256 break;
2257 case CPU_ONLINE:
2258 case CPU_DOWN_FAILED:
2259 rcu_node_kthread_setaffinity(rnp, -1);
2260 rcu_cpu_kthread_setrt(cpu, 1);
2261 break;
2262 case CPU_DOWN_PREPARE:
2263 rcu_node_kthread_setaffinity(rnp, cpu);
2264 rcu_cpu_kthread_setrt(cpu, 0);
1828 break; 2265 break;
1829 case CPU_DYING: 2266 case CPU_DYING:
1830 case CPU_DYING_FROZEN: 2267 case CPU_DYING_FROZEN:
@@ -1943,10 +2380,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
1943 j / rsp->levelspread[i - 1]; 2380 j / rsp->levelspread[i - 1];
1944 } 2381 }
1945 rnp->level = i; 2382 rnp->level = i;
1946 INIT_LIST_HEAD(&rnp->blocked_tasks[0]); 2383 INIT_LIST_HEAD(&rnp->blkd_tasks);
1947 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1948 INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
1949 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1950 } 2384 }
1951 } 2385 }
1952 2386
@@ -1968,7 +2402,6 @@ void __init rcu_init(void)
1968 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 2402 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
1969 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 2403 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
1970 __rcu_init_preempt(); 2404 __rcu_init_preempt();
1971 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1972 2405
1973 /* 2406 /*
1974 * We don't need protection against CPU-hotplug here because 2407 * We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index e8f057e44e3e..257664815d5d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -91,6 +91,14 @@ struct rcu_dynticks {
91 /* remains even for nmi from irq handler. */ 91 /* remains even for nmi from irq handler. */
92}; 92};
93 93
94/* RCU's kthread states for tracing. */
95#define RCU_KTHREAD_STOPPED 0
96#define RCU_KTHREAD_RUNNING 1
97#define RCU_KTHREAD_WAITING 2
98#define RCU_KTHREAD_OFFCPU 3
99#define RCU_KTHREAD_YIELDING 4
100#define RCU_KTHREAD_MAX 4
101
94/* 102/*
95 * Definition for node within the RCU grace-period-detection hierarchy. 103 * Definition for node within the RCU grace-period-detection hierarchy.
96 */ 104 */
@@ -109,10 +117,11 @@ struct rcu_node {
109 /* an rcu_data structure, otherwise, each */ 117 /* an rcu_data structure, otherwise, each */
110 /* bit corresponds to a child rcu_node */ 118 /* bit corresponds to a child rcu_node */
111 /* structure. */ 119 /* structure. */
112 unsigned long expmask; /* Groups that have ->blocked_tasks[] */ 120 unsigned long expmask; /* Groups that have ->blkd_tasks */
113 /* elements that need to drain to allow the */ 121 /* elements that need to drain to allow the */
114 /* current expedited grace period to */ 122 /* current expedited grace period to */
115 /* complete (only for TREE_PREEMPT_RCU). */ 123 /* complete (only for TREE_PREEMPT_RCU). */
124 unsigned long wakemask; /* CPUs whose kthread needs to be awakened. */
116 unsigned long qsmaskinit; 125 unsigned long qsmaskinit;
117 /* Per-GP initial value for qsmask & expmask. */ 126 /* Per-GP initial value for qsmask & expmask. */
118 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 127 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -122,11 +131,68 @@ struct rcu_node {
122 u8 grpnum; /* CPU/group number for next level up. */ 131 u8 grpnum; /* CPU/group number for next level up. */
123 u8 level; /* root is at level 0. */ 132 u8 level; /* root is at level 0. */
124 struct rcu_node *parent; 133 struct rcu_node *parent;
125 struct list_head blocked_tasks[4]; 134 struct list_head blkd_tasks;
126 /* Tasks blocked in RCU read-side critsect. */ 135 /* Tasks blocked in RCU read-side critical */
127 /* Grace period number (->gpnum) x blocked */ 136 /* section. Tasks are placed at the head */
128 /* by tasks on the (x & 0x1) element of the */ 137 /* of this list and age towards the tail. */
129 /* blocked_tasks[] array. */ 138 struct list_head *gp_tasks;
139 /* Pointer to the first task blocking the */
140 /* current grace period, or NULL if there */
141 /* is no such task. */
142 struct list_head *exp_tasks;
143 /* Pointer to the first task blocking the */
144 /* current expedited grace period, or NULL */
145 /* if there is no such task. If there */
146 /* is no current expedited grace period, */
147 /* then there cannot be any such task. */
148#ifdef CONFIG_RCU_BOOST
149 struct list_head *boost_tasks;
150 /* Pointer to first task that needs to be */
151 /* priority boosted, or NULL if no priority */
152 /* boosting is needed for this rcu_node */
153 /* structure. If there are no tasks */
154 /* queued on this rcu_node structure that */
155 /* are blocking the current grace period, */
156 /* there can be no such task. */
157 unsigned long boost_time;
158 /* When to start boosting (jiffies). */
159 struct task_struct *boost_kthread_task;
160 /* kthread that takes care of priority */
161 /* boosting for this rcu_node structure. */
162 wait_queue_head_t boost_wq;
163 /* Wait queue on which to park the boost */
164 /* kthread. */
165 unsigned int boost_kthread_status;
166 /* State of boost_kthread_task for tracing. */
167 unsigned long n_tasks_boosted;
168 /* Total number of tasks boosted. */
169 unsigned long n_exp_boosts;
170 /* Number of tasks boosted for expedited GP. */
171 unsigned long n_normal_boosts;
172 /* Number of tasks boosted for normal GP. */
173 unsigned long n_balk_blkd_tasks;
174 /* Refused to boost: no blocked tasks. */
175 unsigned long n_balk_exp_gp_tasks;
176 /* Refused to boost: nothing blocking GP. */
177 unsigned long n_balk_boost_tasks;
178 /* Refused to boost: already boosting. */
179 unsigned long n_balk_notblocked;
180 /* Refused to boost: RCU RS CS still running. */
181 unsigned long n_balk_notyet;
182 /* Refused to boost: not yet time. */
183 unsigned long n_balk_nos;
184 /* Refused to boost: not sure why, though. */
185 /* This can happen due to race conditions. */
186#endif /* #ifdef CONFIG_RCU_BOOST */
187 struct task_struct *node_kthread_task;
188 /* kthread that takes care of this rcu_node */
189 /* structure, for example, awakening the */
190 /* per-CPU kthreads as needed. */
191 wait_queue_head_t node_wq;
192 /* Wait queue on which to park the per-node */
193 /* kthread. */
194 unsigned int node_kthread_status;
195 /* State of node_kthread_task for tracing. */
130} ____cacheline_internodealigned_in_smp; 196} ____cacheline_internodealigned_in_smp;
131 197
132/* 198/*
@@ -175,7 +241,7 @@ struct rcu_data {
175 bool passed_quiesc; /* User-mode/idle loop etc. */ 241 bool passed_quiesc; /* User-mode/idle loop etc. */
176 bool qs_pending; /* Core waits for quiesc state. */ 242 bool qs_pending; /* Core waits for quiesc state. */
177 bool beenonline; /* CPU online at least once. */ 243 bool beenonline; /* CPU online at least once. */
178 bool preemptable; /* Preemptable RCU? */ 244 bool preemptible; /* Preemptible RCU? */
179 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 245 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
180 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 246 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
181 247
@@ -254,7 +320,6 @@ struct rcu_data {
254#endif /* #else #ifdef CONFIG_NO_HZ */ 320#endif /* #else #ifdef CONFIG_NO_HZ */
255 321
256#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 322#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
257#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
258 323
259#ifdef CONFIG_PROVE_RCU 324#ifdef CONFIG_PROVE_RCU
260#define RCU_STALL_DELAY_DELTA (5 * HZ) 325#define RCU_STALL_DELAY_DELTA (5 * HZ)
@@ -272,13 +337,6 @@ struct rcu_data {
272 /* scheduling clock irq */ 337 /* scheduling clock irq */
273 /* before ratting on them. */ 338 /* before ratting on them. */
274 339
275#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
276#define RCU_CPU_STALL_SUPPRESS_INIT 0
277#else
278#define RCU_CPU_STALL_SUPPRESS_INIT 1
279#endif
280
281#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
282 340
283/* 341/*
284 * RCU global state, including node hierarchy. This hierarchy is 342 * RCU global state, including node hierarchy. This hierarchy is
@@ -325,12 +383,12 @@ struct rcu_state {
325 /* due to lock unavailable. */ 383 /* due to lock unavailable. */
326 unsigned long n_force_qs_ngp; /* Number of calls leaving */ 384 unsigned long n_force_qs_ngp; /* Number of calls leaving */
327 /* due to no GP active. */ 385 /* due to no GP active. */
328#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
329 unsigned long gp_start; /* Time at which GP started, */ 386 unsigned long gp_start; /* Time at which GP started, */
330 /* but in jiffies. */ 387 /* but in jiffies. */
331 unsigned long jiffies_stall; /* Time at which to check */ 388 unsigned long jiffies_stall; /* Time at which to check */
332 /* for CPU stalls. */ 389 /* for CPU stalls. */
333#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 390 unsigned long gp_max; /* Maximum GP duration in */
391 /* jiffies. */
334 char *name; /* Name of structure. */ 392 char *name; /* Name of structure. */
335}; 393};
336 394
@@ -361,16 +419,14 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
361static void rcu_bootup_announce(void); 419static void rcu_bootup_announce(void);
362long rcu_batches_completed(void); 420long rcu_batches_completed(void);
363static void rcu_preempt_note_context_switch(int cpu); 421static void rcu_preempt_note_context_switch(int cpu);
364static int rcu_preempted_readers(struct rcu_node *rnp); 422static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
365#ifdef CONFIG_HOTPLUG_CPU 423#ifdef CONFIG_HOTPLUG_CPU
366static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 424static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
367 unsigned long flags); 425 unsigned long flags);
368#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 426#endif /* #ifdef CONFIG_HOTPLUG_CPU */
369#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
370static void rcu_print_detail_task_stall(struct rcu_state *rsp); 427static void rcu_print_detail_task_stall(struct rcu_state *rsp);
371static void rcu_print_task_stall(struct rcu_node *rnp); 428static void rcu_print_task_stall(struct rcu_node *rnp);
372static void rcu_preempt_stall_reset(void); 429static void rcu_preempt_stall_reset(void);
373#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
374static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 430static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
375#ifdef CONFIG_HOTPLUG_CPU 431#ifdef CONFIG_HOTPLUG_CPU
376static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 432static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
@@ -390,5 +446,13 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
390static void rcu_preempt_send_cbs_to_online(void); 446static void rcu_preempt_send_cbs_to_online(void);
391static void __init __rcu_init_preempt(void); 447static void __init __rcu_init_preempt(void);
392static void rcu_needs_cpu_flush(void); 448static void rcu_needs_cpu_flush(void);
449static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp);
450static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
451static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
452 cpumask_var_t cm);
453static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
454static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
455 struct rcu_node *rnp,
456 int rnp_index);
393 457
394#endif /* #ifndef RCU_TREE_NONCORE */ 458#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index a3638710dc67..3f6559a5f5cd 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -54,10 +54,6 @@ static void __init rcu_bootup_announce_oddness(void)
54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
56#endif 56#endif
57#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
58 printk(KERN_INFO
59 "\tRCU-based detection of stalled CPUs is disabled.\n");
60#endif
61#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 57#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
62 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 58 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
63#endif 59#endif
@@ -70,6 +66,7 @@ static void __init rcu_bootup_announce_oddness(void)
70 66
71struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
72DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
69static struct rcu_state *rcu_state = &rcu_preempt_state;
73 70
74static int rcu_preempted_readers_exp(struct rcu_node *rnp); 71static int rcu_preempted_readers_exp(struct rcu_node *rnp);
75 72
@@ -78,7 +75,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
78 */ 75 */
79static void __init rcu_bootup_announce(void) 76static void __init rcu_bootup_announce(void)
80{ 77{
81 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); 78 printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n");
82 rcu_bootup_announce_oddness(); 79 rcu_bootup_announce_oddness();
83} 80}
84 81
@@ -111,7 +108,7 @@ void rcu_force_quiescent_state(void)
111EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 108EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
112 109
113/* 110/*
114 * Record a preemptable-RCU quiescent state for the specified CPU. Note 111 * Record a preemptible-RCU quiescent state for the specified CPU. Note
115 * that this just means that the task currently running on the CPU is 112 * that this just means that the task currently running on the CPU is
116 * not in a quiescent state. There might be any number of tasks blocked 113 * not in a quiescent state. There might be any number of tasks blocked
117 * while in an RCU read-side critical section. 114 * while in an RCU read-side critical section.
@@ -134,12 +131,12 @@ static void rcu_preempt_qs(int cpu)
134 * We have entered the scheduler, and the current task might soon be 131 * We have entered the scheduler, and the current task might soon be
135 * context-switched away from. If this task is in an RCU read-side 132 * context-switched away from. If this task is in an RCU read-side
136 * critical section, we will no longer be able to rely on the CPU to 133 * critical section, we will no longer be able to rely on the CPU to
137 * record that fact, so we enqueue the task on the appropriate entry 134 * record that fact, so we enqueue the task on the blkd_tasks list.
138 * of the blocked_tasks[] array. The task will dequeue itself when 135 * The task will dequeue itself when it exits the outermost enclosing
139 * it exits the outermost enclosing RCU read-side critical section. 136 * RCU read-side critical section. Therefore, the current grace period
140 * Therefore, the current grace period cannot be permitted to complete 137 * cannot be permitted to complete until the blkd_tasks list entries
141 * until the blocked_tasks[] entry indexed by the low-order bit of 138 * predating the current grace period drain, in other words, until
142 * rnp->gpnum empties. 139 * rnp->gp_tasks becomes NULL.
143 * 140 *
144 * Caller must disable preemption. 141 * Caller must disable preemption.
145 */ 142 */
@@ -147,7 +144,6 @@ static void rcu_preempt_note_context_switch(int cpu)
147{ 144{
148 struct task_struct *t = current; 145 struct task_struct *t = current;
149 unsigned long flags; 146 unsigned long flags;
150 int phase;
151 struct rcu_data *rdp; 147 struct rcu_data *rdp;
152 struct rcu_node *rnp; 148 struct rcu_node *rnp;
153 149
@@ -169,15 +165,30 @@ static void rcu_preempt_note_context_switch(int cpu)
169 * (i.e., this CPU has not yet passed through a quiescent 165 * (i.e., this CPU has not yet passed through a quiescent
170 * state for the current grace period), then as long 166 * state for the current grace period), then as long
171 * as that task remains queued, the current grace period 167 * as that task remains queued, the current grace period
172 * cannot end. 168 * cannot end. Note that there is some uncertainty as
169 * to exactly when the current grace period started.
170 * We take a conservative approach, which can result
171 * in unnecessarily waiting on tasks that started very
172 * slightly after the current grace period began. C'est
173 * la vie!!!
173 * 174 *
174 * But first, note that the current CPU must still be 175 * But first, note that the current CPU must still be
175 * on line! 176 * on line!
176 */ 177 */
177 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); 178 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
178 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 179 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
179 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 180 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
180 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 181 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
182 rnp->gp_tasks = &t->rcu_node_entry;
183#ifdef CONFIG_RCU_BOOST
184 if (rnp->boost_tasks != NULL)
185 rnp->boost_tasks = rnp->gp_tasks;
186#endif /* #ifdef CONFIG_RCU_BOOST */
187 } else {
188 list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
189 if (rnp->qsmask & rdp->grpmask)
190 rnp->gp_tasks = &t->rcu_node_entry;
191 }
181 raw_spin_unlock_irqrestore(&rnp->lock, flags); 192 raw_spin_unlock_irqrestore(&rnp->lock, flags);
182 } 193 }
183 194
@@ -196,7 +207,7 @@ static void rcu_preempt_note_context_switch(int cpu)
196} 207}
197 208
198/* 209/*
199 * Tree-preemptable RCU implementation for rcu_read_lock(). 210 * Tree-preemptible RCU implementation for rcu_read_lock().
200 * Just increment ->rcu_read_lock_nesting, shared state will be updated 211 * Just increment ->rcu_read_lock_nesting, shared state will be updated
201 * if we block. 212 * if we block.
202 */ 213 */
@@ -212,12 +223,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
212 * for the specified rcu_node structure. If the caller needs a reliable 223 * for the specified rcu_node structure. If the caller needs a reliable
213 * answer, it must hold the rcu_node's ->lock. 224 * answer, it must hold the rcu_node's ->lock.
214 */ 225 */
215static int rcu_preempted_readers(struct rcu_node *rnp) 226static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
216{ 227{
217 int phase = rnp->gpnum & 0x1; 228 return rnp->gp_tasks != NULL;
218
219 return !list_empty(&rnp->blocked_tasks[phase]) ||
220 !list_empty(&rnp->blocked_tasks[phase + 2]);
221} 229}
222 230
223/* 231/*
@@ -233,7 +241,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
233 unsigned long mask; 241 unsigned long mask;
234 struct rcu_node *rnp_p; 242 struct rcu_node *rnp_p;
235 243
236 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 244 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
237 raw_spin_unlock_irqrestore(&rnp->lock, flags); 245 raw_spin_unlock_irqrestore(&rnp->lock, flags);
238 return; /* Still need more quiescent states! */ 246 return; /* Still need more quiescent states! */
239 } 247 }
@@ -257,6 +265,21 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
257} 265}
258 266
259/* 267/*
268 * Advance a ->blkd_tasks-list pointer to the next entry, returning
269 * NULL instead if at the end of the list.
270 */
271static struct list_head *rcu_next_node_entry(struct task_struct *t,
272 struct rcu_node *rnp)
273{
274 struct list_head *np;
275
276 np = t->rcu_node_entry.next;
277 if (np == &rnp->blkd_tasks)
278 np = NULL;
279 return np;
280}
281
282/*
260 * Handle special cases during rcu_read_unlock(), such as needing to 283 * Handle special cases during rcu_read_unlock(), such as needing to
261 * notify RCU core processing or task having blocked during the RCU 284 * notify RCU core processing or task having blocked during the RCU
262 * read-side critical section. 285 * read-side critical section.
@@ -266,6 +289,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
266 int empty; 289 int empty;
267 int empty_exp; 290 int empty_exp;
268 unsigned long flags; 291 unsigned long flags;
292 struct list_head *np;
269 struct rcu_node *rnp; 293 struct rcu_node *rnp;
270 int special; 294 int special;
271 295
@@ -306,10 +330,19 @@ static void rcu_read_unlock_special(struct task_struct *t)
306 break; 330 break;
307 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 331 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
308 } 332 }
309 empty = !rcu_preempted_readers(rnp); 333 empty = !rcu_preempt_blocked_readers_cgp(rnp);
310 empty_exp = !rcu_preempted_readers_exp(rnp); 334 empty_exp = !rcu_preempted_readers_exp(rnp);
311 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 335 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
336 np = rcu_next_node_entry(t, rnp);
312 list_del_init(&t->rcu_node_entry); 337 list_del_init(&t->rcu_node_entry);
338 if (&t->rcu_node_entry == rnp->gp_tasks)
339 rnp->gp_tasks = np;
340 if (&t->rcu_node_entry == rnp->exp_tasks)
341 rnp->exp_tasks = np;
342#ifdef CONFIG_RCU_BOOST
343 if (&t->rcu_node_entry == rnp->boost_tasks)
344 rnp->boost_tasks = np;
345#endif /* #ifdef CONFIG_RCU_BOOST */
313 t->rcu_blocked_node = NULL; 346 t->rcu_blocked_node = NULL;
314 347
315 /* 348 /*
@@ -322,6 +355,15 @@ static void rcu_read_unlock_special(struct task_struct *t)
322 else 355 else
323 rcu_report_unblock_qs_rnp(rnp, flags); 356 rcu_report_unblock_qs_rnp(rnp, flags);
324 357
358#ifdef CONFIG_RCU_BOOST
359 /* Unboost if we were boosted. */
360 if (special & RCU_READ_UNLOCK_BOOSTED) {
361 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
362 rt_mutex_unlock(t->rcu_boost_mutex);
363 t->rcu_boost_mutex = NULL;
364 }
365#endif /* #ifdef CONFIG_RCU_BOOST */
366
325 /* 367 /*
326 * If this was the last task on the expedited lists, 368 * If this was the last task on the expedited lists,
327 * then we need to report up the rcu_node hierarchy. 369 * then we need to report up the rcu_node hierarchy.
@@ -334,7 +376,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
334} 376}
335 377
336/* 378/*
337 * Tree-preemptable RCU implementation for rcu_read_unlock(). 379 * Tree-preemptible RCU implementation for rcu_read_unlock().
338 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost 380 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
339 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then 381 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
340 * invoke rcu_read_unlock_special() to clean up after a context switch 382 * invoke rcu_read_unlock_special() to clean up after a context switch
@@ -356,8 +398,6 @@ void __rcu_read_unlock(void)
356} 398}
357EXPORT_SYMBOL_GPL(__rcu_read_unlock); 399EXPORT_SYMBOL_GPL(__rcu_read_unlock);
358 400
359#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
360
361#ifdef CONFIG_RCU_CPU_STALL_VERBOSE 401#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
362 402
363/* 403/*
@@ -367,18 +407,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock);
367static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) 407static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
368{ 408{
369 unsigned long flags; 409 unsigned long flags;
370 struct list_head *lp;
371 int phase;
372 struct task_struct *t; 410 struct task_struct *t;
373 411
374 if (rcu_preempted_readers(rnp)) { 412 if (!rcu_preempt_blocked_readers_cgp(rnp))
375 raw_spin_lock_irqsave(&rnp->lock, flags); 413 return;
376 phase = rnp->gpnum & 0x1; 414 raw_spin_lock_irqsave(&rnp->lock, flags);
377 lp = &rnp->blocked_tasks[phase]; 415 t = list_entry(rnp->gp_tasks,
378 list_for_each_entry(t, lp, rcu_node_entry) 416 struct task_struct, rcu_node_entry);
379 sched_show_task(t); 417 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
380 raw_spin_unlock_irqrestore(&rnp->lock, flags); 418 sched_show_task(t);
381 } 419 raw_spin_unlock_irqrestore(&rnp->lock, flags);
382} 420}
383 421
384/* 422/*
@@ -408,16 +446,14 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
408 */ 446 */
409static void rcu_print_task_stall(struct rcu_node *rnp) 447static void rcu_print_task_stall(struct rcu_node *rnp)
410{ 448{
411 struct list_head *lp;
412 int phase;
413 struct task_struct *t; 449 struct task_struct *t;
414 450
415 if (rcu_preempted_readers(rnp)) { 451 if (!rcu_preempt_blocked_readers_cgp(rnp))
416 phase = rnp->gpnum & 0x1; 452 return;
417 lp = &rnp->blocked_tasks[phase]; 453 t = list_entry(rnp->gp_tasks,
418 list_for_each_entry(t, lp, rcu_node_entry) 454 struct task_struct, rcu_node_entry);
419 printk(" P%d", t->pid); 455 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
420 } 456 printk(" P%d", t->pid);
421} 457}
422 458
423/* 459/*
@@ -430,18 +466,21 @@ static void rcu_preempt_stall_reset(void)
430 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; 466 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
431} 467}
432 468
433#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
434
435/* 469/*
436 * Check that the list of blocked tasks for the newly completed grace 470 * Check that the list of blocked tasks for the newly completed grace
437 * period is in fact empty. It is a serious bug to complete a grace 471 * period is in fact empty. It is a serious bug to complete a grace
438 * period that still has RCU readers blocked! This function must be 472 * period that still has RCU readers blocked! This function must be
439 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock 473 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
440 * must be held by the caller. 474 * must be held by the caller.
475 *
476 * Also, if there are blocked tasks on the list, they automatically
477 * block the newly created grace period, so set up ->gp_tasks accordingly.
441 */ 478 */
442static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 479static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
443{ 480{
444 WARN_ON_ONCE(rcu_preempted_readers(rnp)); 481 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
482 if (!list_empty(&rnp->blkd_tasks))
483 rnp->gp_tasks = rnp->blkd_tasks.next;
445 WARN_ON_ONCE(rnp->qsmask); 484 WARN_ON_ONCE(rnp->qsmask);
446} 485}
447 486
@@ -465,50 +504,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
465 struct rcu_node *rnp, 504 struct rcu_node *rnp,
466 struct rcu_data *rdp) 505 struct rcu_data *rdp)
467{ 506{
468 int i;
469 struct list_head *lp; 507 struct list_head *lp;
470 struct list_head *lp_root; 508 struct list_head *lp_root;
471 int retval = 0; 509 int retval = 0;
472 struct rcu_node *rnp_root = rcu_get_root(rsp); 510 struct rcu_node *rnp_root = rcu_get_root(rsp);
473 struct task_struct *tp; 511 struct task_struct *t;
474 512
475 if (rnp == rnp_root) { 513 if (rnp == rnp_root) {
476 WARN_ONCE(1, "Last CPU thought to be offlined?"); 514 WARN_ONCE(1, "Last CPU thought to be offlined?");
477 return 0; /* Shouldn't happen: at least one CPU online. */ 515 return 0; /* Shouldn't happen: at least one CPU online. */
478 } 516 }
479 WARN_ON_ONCE(rnp != rdp->mynode && 517
480 (!list_empty(&rnp->blocked_tasks[0]) || 518 /* If we are on an internal node, complain bitterly. */
481 !list_empty(&rnp->blocked_tasks[1]) || 519 WARN_ON_ONCE(rnp != rdp->mynode);
482 !list_empty(&rnp->blocked_tasks[2]) ||
483 !list_empty(&rnp->blocked_tasks[3])));
484 520
485 /* 521 /*
486 * Move tasks up to root rcu_node. Rely on the fact that the 522 * Move tasks up to root rcu_node. Don't try to get fancy for
487 * root rcu_node can be at most one ahead of the rest of the 523 * this corner-case operation -- just put this node's tasks
488 * rcu_nodes in terms of gp_num value. This fact allows us to 524 * at the head of the root node's list, and update the root node's
489 * move the blocked_tasks[] array directly, element by element. 525 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
526 * if non-NULL. This might result in waiting for more tasks than
527 * absolutely necessary, but this is a good performance/complexity
528 * tradeoff.
490 */ 529 */
491 if (rcu_preempted_readers(rnp)) 530 if (rcu_preempt_blocked_readers_cgp(rnp))
492 retval |= RCU_OFL_TASKS_NORM_GP; 531 retval |= RCU_OFL_TASKS_NORM_GP;
493 if (rcu_preempted_readers_exp(rnp)) 532 if (rcu_preempted_readers_exp(rnp))
494 retval |= RCU_OFL_TASKS_EXP_GP; 533 retval |= RCU_OFL_TASKS_EXP_GP;
495 for (i = 0; i < 4; i++) { 534 lp = &rnp->blkd_tasks;
496 lp = &rnp->blocked_tasks[i]; 535 lp_root = &rnp_root->blkd_tasks;
497 lp_root = &rnp_root->blocked_tasks[i]; 536 while (!list_empty(lp)) {
498 while (!list_empty(lp)) { 537 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
499 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 538 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
500 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 539 list_del(&t->rcu_node_entry);
501 list_del(&tp->rcu_node_entry); 540 t->rcu_blocked_node = rnp_root;
502 tp->rcu_blocked_node = rnp_root; 541 list_add(&t->rcu_node_entry, lp_root);
503 list_add(&tp->rcu_node_entry, lp_root); 542 if (&t->rcu_node_entry == rnp->gp_tasks)
504 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 543 rnp_root->gp_tasks = rnp->gp_tasks;
505 } 544 if (&t->rcu_node_entry == rnp->exp_tasks)
545 rnp_root->exp_tasks = rnp->exp_tasks;
546#ifdef CONFIG_RCU_BOOST
547 if (&t->rcu_node_entry == rnp->boost_tasks)
548 rnp_root->boost_tasks = rnp->boost_tasks;
549#endif /* #ifdef CONFIG_RCU_BOOST */
550 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
506 } 551 }
552
553#ifdef CONFIG_RCU_BOOST
554 /* In case root is being boosted and leaf is not. */
555 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
556 if (rnp_root->boost_tasks != NULL &&
557 rnp_root->boost_tasks != rnp_root->gp_tasks)
558 rnp_root->boost_tasks = rnp_root->gp_tasks;
559 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
560#endif /* #ifdef CONFIG_RCU_BOOST */
561
562 rnp->gp_tasks = NULL;
563 rnp->exp_tasks = NULL;
507 return retval; 564 return retval;
508} 565}
509 566
510/* 567/*
511 * Do CPU-offline processing for preemptable RCU. 568 * Do CPU-offline processing for preemptible RCU.
512 */ 569 */
513static void rcu_preempt_offline_cpu(int cpu) 570static void rcu_preempt_offline_cpu(int cpu)
514{ 571{
@@ -537,7 +594,7 @@ static void rcu_preempt_check_callbacks(int cpu)
537} 594}
538 595
539/* 596/*
540 * Process callbacks for preemptable RCU. 597 * Process callbacks for preemptible RCU.
541 */ 598 */
542static void rcu_preempt_process_callbacks(void) 599static void rcu_preempt_process_callbacks(void)
543{ 600{
@@ -546,7 +603,7 @@ static void rcu_preempt_process_callbacks(void)
546} 603}
547 604
548/* 605/*
549 * Queue a preemptable-RCU callback for invocation after a grace period. 606 * Queue a preemptible-RCU callback for invocation after a grace period.
550 */ 607 */
551void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 608void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
552{ 609{
@@ -594,8 +651,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
594 */ 651 */
595static int rcu_preempted_readers_exp(struct rcu_node *rnp) 652static int rcu_preempted_readers_exp(struct rcu_node *rnp)
596{ 653{
597 return !list_empty(&rnp->blocked_tasks[2]) || 654 return rnp->exp_tasks != NULL;
598 !list_empty(&rnp->blocked_tasks[3]);
599} 655}
600 656
601/* 657/*
@@ -655,13 +711,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
655static void 711static void
656sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 712sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
657{ 713{
658 int must_wait; 714 unsigned long flags;
715 int must_wait = 0;
659 716
660 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 717 raw_spin_lock_irqsave(&rnp->lock, flags);
661 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 718 if (list_empty(&rnp->blkd_tasks))
662 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 719 raw_spin_unlock_irqrestore(&rnp->lock, flags);
663 must_wait = rcu_preempted_readers_exp(rnp); 720 else {
664 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 721 rnp->exp_tasks = rnp->blkd_tasks.next;
722 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
723 must_wait = 1;
724 }
665 if (!must_wait) 725 if (!must_wait)
666 rcu_report_exp_rnp(rsp, rnp); 726 rcu_report_exp_rnp(rsp, rnp);
667} 727}
@@ -669,9 +729,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
669/* 729/*
670 * Wait for an rcu-preempt grace period, but expedite it. The basic idea 730 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
671 * is to invoke synchronize_sched_expedited() to push all the tasks to 731 * is to invoke synchronize_sched_expedited() to push all the tasks to
672 * the ->blocked_tasks[] lists, move all entries from the first set of 732 * the ->blkd_tasks lists and wait for this list to drain.
673 * ->blocked_tasks[] lists to the second set, and finally wait for this
674 * second set to drain.
675 */ 733 */
676void synchronize_rcu_expedited(void) 734void synchronize_rcu_expedited(void)
677{ 735{
@@ -703,7 +761,7 @@ void synchronize_rcu_expedited(void)
703 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) 761 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
704 goto unlock_mb_ret; /* Others did our work for us. */ 762 goto unlock_mb_ret; /* Others did our work for us. */
705 763
706 /* force all RCU readers onto blocked_tasks[]. */ 764 /* force all RCU readers onto ->blkd_tasks lists. */
707 synchronize_sched_expedited(); 765 synchronize_sched_expedited();
708 766
709 raw_spin_lock_irqsave(&rsp->onofflock, flags); 767 raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -715,7 +773,7 @@ void synchronize_rcu_expedited(void)
715 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 773 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
716 } 774 }
717 775
718 /* Snapshot current state of ->blocked_tasks[] lists. */ 776 /* Snapshot current state of ->blkd_tasks lists. */
719 rcu_for_each_leaf_node(rsp, rnp) 777 rcu_for_each_leaf_node(rsp, rnp)
720 sync_rcu_preempt_exp_init(rsp, rnp); 778 sync_rcu_preempt_exp_init(rsp, rnp);
721 if (NUM_RCU_NODES > 1) 779 if (NUM_RCU_NODES > 1)
@@ -723,7 +781,7 @@ void synchronize_rcu_expedited(void)
723 781
724 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 782 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
725 783
726 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 784 /* Wait for snapshotted ->blkd_tasks lists to drain. */
727 rnp = rcu_get_root(rsp); 785 rnp = rcu_get_root(rsp);
728 wait_event(sync_rcu_preempt_exp_wq, 786 wait_event(sync_rcu_preempt_exp_wq,
729 sync_rcu_preempt_exp_done(rnp)); 787 sync_rcu_preempt_exp_done(rnp));
@@ -739,7 +797,7 @@ mb_ret:
739EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 797EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
740 798
741/* 799/*
742 * Check to see if there is any immediate preemptable-RCU-related work 800 * Check to see if there is any immediate preemptible-RCU-related work
743 * to be done. 801 * to be done.
744 */ 802 */
745static int rcu_preempt_pending(int cpu) 803static int rcu_preempt_pending(int cpu)
@@ -749,7 +807,7 @@ static int rcu_preempt_pending(int cpu)
749} 807}
750 808
751/* 809/*
752 * Does preemptable RCU need the CPU to stay out of dynticks mode? 810 * Does preemptible RCU need the CPU to stay out of dynticks mode?
753 */ 811 */
754static int rcu_preempt_needs_cpu(int cpu) 812static int rcu_preempt_needs_cpu(int cpu)
755{ 813{
@@ -766,7 +824,7 @@ void rcu_barrier(void)
766EXPORT_SYMBOL_GPL(rcu_barrier); 824EXPORT_SYMBOL_GPL(rcu_barrier);
767 825
768/* 826/*
769 * Initialize preemptable RCU's per-CPU data. 827 * Initialize preemptible RCU's per-CPU data.
770 */ 828 */
771static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 829static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
772{ 830{
@@ -774,7 +832,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
774} 832}
775 833
776/* 834/*
777 * Move preemptable RCU's callbacks from dying CPU to other online CPU. 835 * Move preemptible RCU's callbacks from dying CPU to other online CPU.
778 */ 836 */
779static void rcu_preempt_send_cbs_to_online(void) 837static void rcu_preempt_send_cbs_to_online(void)
780{ 838{
@@ -782,7 +840,7 @@ static void rcu_preempt_send_cbs_to_online(void)
782} 840}
783 841
784/* 842/*
785 * Initialize preemptable RCU's state structures. 843 * Initialize preemptible RCU's state structures.
786 */ 844 */
787static void __init __rcu_init_preempt(void) 845static void __init __rcu_init_preempt(void)
788{ 846{
@@ -790,7 +848,7 @@ static void __init __rcu_init_preempt(void)
790} 848}
791 849
792/* 850/*
793 * Check for a task exiting while in a preemptable-RCU read-side 851 * Check for a task exiting while in a preemptible-RCU read-side
794 * critical section, clean up if so. No need to issue warnings, 852 * critical section, clean up if so. No need to issue warnings,
795 * as debug_check_no_locks_held() already does this if lockdep 853 * as debug_check_no_locks_held() already does this if lockdep
796 * is enabled. 854 * is enabled.
@@ -802,11 +860,13 @@ void exit_rcu(void)
802 if (t->rcu_read_lock_nesting == 0) 860 if (t->rcu_read_lock_nesting == 0)
803 return; 861 return;
804 t->rcu_read_lock_nesting = 1; 862 t->rcu_read_lock_nesting = 1;
805 rcu_read_unlock(); 863 __rcu_read_unlock();
806} 864}
807 865
808#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 866#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
809 867
868static struct rcu_state *rcu_state = &rcu_sched_state;
869
810/* 870/*
811 * Tell them what RCU they are running. 871 * Tell them what RCU they are running.
812 */ 872 */
@@ -836,7 +896,7 @@ void rcu_force_quiescent_state(void)
836EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 896EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
837 897
838/* 898/*
839 * Because preemptable RCU does not exist, we never have to check for 899 * Because preemptible RCU does not exist, we never have to check for
840 * CPUs being in quiescent states. 900 * CPUs being in quiescent states.
841 */ 901 */
842static void rcu_preempt_note_context_switch(int cpu) 902static void rcu_preempt_note_context_switch(int cpu)
@@ -844,10 +904,10 @@ static void rcu_preempt_note_context_switch(int cpu)
844} 904}
845 905
846/* 906/*
847 * Because preemptable RCU does not exist, there are never any preempted 907 * Because preemptible RCU does not exist, there are never any preempted
848 * RCU readers. 908 * RCU readers.
849 */ 909 */
850static int rcu_preempted_readers(struct rcu_node *rnp) 910static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
851{ 911{
852 return 0; 912 return 0;
853} 913}
@@ -862,10 +922,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
862 922
863#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 923#endif /* #ifdef CONFIG_HOTPLUG_CPU */
864 924
865#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
866
867/* 925/*
868 * Because preemptable RCU does not exist, we never have to check for 926 * Because preemptible RCU does not exist, we never have to check for
869 * tasks blocked within RCU read-side critical sections. 927 * tasks blocked within RCU read-side critical sections.
870 */ 928 */
871static void rcu_print_detail_task_stall(struct rcu_state *rsp) 929static void rcu_print_detail_task_stall(struct rcu_state *rsp)
@@ -873,7 +931,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
873} 931}
874 932
875/* 933/*
876 * Because preemptable RCU does not exist, we never have to check for 934 * Because preemptible RCU does not exist, we never have to check for
877 * tasks blocked within RCU read-side critical sections. 935 * tasks blocked within RCU read-side critical sections.
878 */ 936 */
879static void rcu_print_task_stall(struct rcu_node *rnp) 937static void rcu_print_task_stall(struct rcu_node *rnp)
@@ -888,10 +946,8 @@ static void rcu_preempt_stall_reset(void)
888{ 946{
889} 947}
890 948
891#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
892
893/* 949/*
894 * Because there is no preemptable RCU, there can be no readers blocked, 950 * Because there is no preemptible RCU, there can be no readers blocked,
895 * so there is no need to check for blocked tasks. So check only for 951 * so there is no need to check for blocked tasks. So check only for
896 * bogus qsmask values. 952 * bogus qsmask values.
897 */ 953 */
@@ -903,7 +959,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
903#ifdef CONFIG_HOTPLUG_CPU 959#ifdef CONFIG_HOTPLUG_CPU
904 960
905/* 961/*
906 * Because preemptable RCU does not exist, it never needs to migrate 962 * Because preemptible RCU does not exist, it never needs to migrate
907 * tasks that were blocked within RCU read-side critical sections, and 963 * tasks that were blocked within RCU read-side critical sections, and
908 * such non-existent tasks cannot possibly have been blocking the current 964 * such non-existent tasks cannot possibly have been blocking the current
909 * grace period. 965 * grace period.
@@ -916,7 +972,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
916} 972}
917 973
918/* 974/*
919 * Because preemptable RCU does not exist, it never needs CPU-offline 975 * Because preemptible RCU does not exist, it never needs CPU-offline
920 * processing. 976 * processing.
921 */ 977 */
922static void rcu_preempt_offline_cpu(int cpu) 978static void rcu_preempt_offline_cpu(int cpu)
@@ -926,7 +982,7 @@ static void rcu_preempt_offline_cpu(int cpu)
926#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 982#endif /* #ifdef CONFIG_HOTPLUG_CPU */
927 983
928/* 984/*
929 * Because preemptable RCU does not exist, it never has any callbacks 985 * Because preemptible RCU does not exist, it never has any callbacks
930 * to check. 986 * to check.
931 */ 987 */
932static void rcu_preempt_check_callbacks(int cpu) 988static void rcu_preempt_check_callbacks(int cpu)
@@ -934,7 +990,7 @@ static void rcu_preempt_check_callbacks(int cpu)
934} 990}
935 991
936/* 992/*
937 * Because preemptable RCU does not exist, it never has any callbacks 993 * Because preemptible RCU does not exist, it never has any callbacks
938 * to process. 994 * to process.
939 */ 995 */
940static void rcu_preempt_process_callbacks(void) 996static void rcu_preempt_process_callbacks(void)
@@ -943,7 +999,7 @@ static void rcu_preempt_process_callbacks(void)
943 999
944/* 1000/*
945 * Wait for an rcu-preempt grace period, but make it happen quickly. 1001 * Wait for an rcu-preempt grace period, but make it happen quickly.
946 * But because preemptable RCU does not exist, map to rcu-sched. 1002 * But because preemptible RCU does not exist, map to rcu-sched.
947 */ 1003 */
948void synchronize_rcu_expedited(void) 1004void synchronize_rcu_expedited(void)
949{ 1005{
@@ -954,7 +1010,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
954#ifdef CONFIG_HOTPLUG_CPU 1010#ifdef CONFIG_HOTPLUG_CPU
955 1011
956/* 1012/*
957 * Because preemptable RCU does not exist, there is never any need to 1013 * Because preemptible RCU does not exist, there is never any need to
958 * report on tasks preempted in RCU read-side critical sections during 1014 * report on tasks preempted in RCU read-side critical sections during
959 * expedited RCU grace periods. 1015 * expedited RCU grace periods.
960 */ 1016 */
@@ -966,7 +1022,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
966#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1022#endif /* #ifdef CONFIG_HOTPLUG_CPU */
967 1023
968/* 1024/*
969 * Because preemptable RCU does not exist, it never has any work to do. 1025 * Because preemptible RCU does not exist, it never has any work to do.
970 */ 1026 */
971static int rcu_preempt_pending(int cpu) 1027static int rcu_preempt_pending(int cpu)
972{ 1028{
@@ -974,7 +1030,7 @@ static int rcu_preempt_pending(int cpu)
974} 1030}
975 1031
976/* 1032/*
977 * Because preemptable RCU does not exist, it never needs any CPU. 1033 * Because preemptible RCU does not exist, it never needs any CPU.
978 */ 1034 */
979static int rcu_preempt_needs_cpu(int cpu) 1035static int rcu_preempt_needs_cpu(int cpu)
980{ 1036{
@@ -982,7 +1038,7 @@ static int rcu_preempt_needs_cpu(int cpu)
982} 1038}
983 1039
984/* 1040/*
985 * Because preemptable RCU does not exist, rcu_barrier() is just 1041 * Because preemptible RCU does not exist, rcu_barrier() is just
986 * another name for rcu_barrier_sched(). 1042 * another name for rcu_barrier_sched().
987 */ 1043 */
988void rcu_barrier(void) 1044void rcu_barrier(void)
@@ -992,7 +1048,7 @@ void rcu_barrier(void)
992EXPORT_SYMBOL_GPL(rcu_barrier); 1048EXPORT_SYMBOL_GPL(rcu_barrier);
993 1049
994/* 1050/*
995 * Because preemptable RCU does not exist, there is no per-CPU 1051 * Because preemptible RCU does not exist, there is no per-CPU
996 * data to initialize. 1052 * data to initialize.
997 */ 1053 */
998static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 1054static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
@@ -1000,14 +1056,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1000} 1056}
1001 1057
1002/* 1058/*
1003 * Because there is no preemptable RCU, there are no callbacks to move. 1059 * Because there is no preemptible RCU, there are no callbacks to move.
1004 */ 1060 */
1005static void rcu_preempt_send_cbs_to_online(void) 1061static void rcu_preempt_send_cbs_to_online(void)
1006{ 1062{
1007} 1063}
1008 1064
1009/* 1065/*
1010 * Because preemptable RCU does not exist, it need not be initialized. 1066 * Because preemptible RCU does not exist, it need not be initialized.
1011 */ 1067 */
1012static void __init __rcu_init_preempt(void) 1068static void __init __rcu_init_preempt(void)
1013{ 1069{
@@ -1015,6 +1071,276 @@ static void __init __rcu_init_preempt(void)
1015 1071
1016#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1072#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1017 1073
1074#ifdef CONFIG_RCU_BOOST
1075
1076#include "rtmutex_common.h"
1077
1078#ifdef CONFIG_RCU_TRACE
1079
1080static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1081{
1082 if (list_empty(&rnp->blkd_tasks))
1083 rnp->n_balk_blkd_tasks++;
1084 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1085 rnp->n_balk_exp_gp_tasks++;
1086 else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
1087 rnp->n_balk_boost_tasks++;
1088 else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
1089 rnp->n_balk_notblocked++;
1090 else if (rnp->gp_tasks != NULL &&
1091 ULONG_CMP_LT(jiffies, rnp->boost_time))
1092 rnp->n_balk_notyet++;
1093 else
1094 rnp->n_balk_nos++;
1095}
1096
1097#else /* #ifdef CONFIG_RCU_TRACE */
1098
1099static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1100{
1101}
1102
1103#endif /* #else #ifdef CONFIG_RCU_TRACE */
1104
1105/*
1106 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1107 * or ->boost_tasks, advancing the pointer to the next task in the
1108 * ->blkd_tasks list.
1109 *
1110 * Note that irqs must be enabled: boosting the task can block.
1111 * Returns 1 if there are more tasks needing to be boosted.
1112 */
1113static int rcu_boost(struct rcu_node *rnp)
1114{
1115 unsigned long flags;
1116 struct rt_mutex mtx;
1117 struct task_struct *t;
1118 struct list_head *tb;
1119
1120 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
1121 return 0; /* Nothing left to boost. */
1122
1123 raw_spin_lock_irqsave(&rnp->lock, flags);
1124
1125 /*
1126 * Recheck under the lock: all tasks in need of boosting
1127 * might exit their RCU read-side critical sections on their own.
1128 */
1129 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
1130 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1131 return 0;
1132 }
1133
1134 /*
1135 * Preferentially boost tasks blocking expedited grace periods.
1136 * This cannot starve the normal grace periods because a second
1137 * expedited grace period must boost all blocked tasks, including
1138 * those blocking the pre-existing normal grace period.
1139 */
1140 if (rnp->exp_tasks != NULL) {
1141 tb = rnp->exp_tasks;
1142 rnp->n_exp_boosts++;
1143 } else {
1144 tb = rnp->boost_tasks;
1145 rnp->n_normal_boosts++;
1146 }
1147 rnp->n_tasks_boosted++;
1148
1149 /*
1150 * We boost task t by manufacturing an rt_mutex that appears to
1151 * be held by task t. We leave a pointer to that rt_mutex where
1152 * task t can find it, and task t will release the mutex when it
1153 * exits its outermost RCU read-side critical section. Then
1154 * simply acquiring this artificial rt_mutex will boost task
1155 * t's priority. (Thanks to tglx for suggesting this approach!)
1156 *
1157 * Note that task t must acquire rnp->lock to remove itself from
1158 * the ->blkd_tasks list, which it will do from exit() if from
1159 * nowhere else. We therefore are guaranteed that task t will
1160 * stay around at least until we drop rnp->lock. Note that
1161 * rnp->lock also resolves races between our priority boosting
1162 * and task t's exiting its outermost RCU read-side critical
1163 * section.
1164 */
1165 t = container_of(tb, struct task_struct, rcu_node_entry);
1166 rt_mutex_init_proxy_locked(&mtx, t);
1167 t->rcu_boost_mutex = &mtx;
1168 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
1169 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1170 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1171 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1172
1173 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
1174}
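The long comment inside rcu_boost() describes the whole trick; condensed into a standalone sketch, the proxy-lock idiom looks like this (boost_via_proxy_lock is a hypothetical name, and in real use the target task must be handed a pointer to the mutex, as rcu_boost() does via t->rcu_boost_mutex, or the booster would block forever):

#include <linux/rtmutex.h>
#include <linux/sched.h>
#include "rtmutex_common.h"	/* for rt_mutex_init_proxy_locked(), as in the hunk above */

static void boost_via_proxy_lock(struct task_struct *t)
{
	struct rt_mutex mtx;

	rt_mutex_init_proxy_locked(&mtx, t);	/* the PI code now believes t holds mtx */
	/* ... publish &mtx somewhere t will find it and eventually unlock it ... */
	rt_mutex_lock(&mtx);		/* blocking here boosts t to our priority */
	rt_mutex_unlock(&mtx);		/* once t lets go we own it; release to keep lockdep happy */
}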
1175
1176/*
1177 * Timer handler to initiate waking up of boost kthreads that
1178 * have yielded the CPU due to excessive numbers of tasks to
1179 * boost. We wake up the per-rcu_node kthread, which in turn
1180 * will wake up the booster kthread.
1181 */
1182static void rcu_boost_kthread_timer(unsigned long arg)
1183{
1184 invoke_rcu_node_kthread((struct rcu_node *)arg);
1185}
1186
1187/*
1188 * Priority-boosting kthread. One per leaf rcu_node and one for the
1189 * root rcu_node.
1190 */
1191static int rcu_boost_kthread(void *arg)
1192{
1193 struct rcu_node *rnp = (struct rcu_node *)arg;
1194 int spincnt = 0;
1195 int more2boost;
1196
1197 for (;;) {
1198 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1199 wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks ||
1200 rnp->exp_tasks);
1201 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1202 more2boost = rcu_boost(rnp);
1203 if (more2boost)
1204 spincnt++;
1205 else
1206 spincnt = 0;
1207 if (spincnt > 10) {
1208 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1209 spincnt = 0;
1210 }
1211 }
1212 /* NOTREACHED */
1213 return 0;
1214}
1215
1216/*
1217 * Check to see if it is time to start boosting RCU readers that are
1218 * blocking the current grace period, and, if so, tell the per-rcu_node
1219 * kthread to start boosting them. If there is an expedited grace
1220 * period in progress, it is always time to boost.
1221 *
1222 * The caller must hold rnp->lock, which this function releases,
1223 * but irqs remain disabled. The ->boost_kthread_task is immortal,
1224 * so we don't need to worry about it going away.
1225 */
1226static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1227{
1228 struct task_struct *t;
1229
1230 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1231 rnp->n_balk_exp_gp_tasks++;
1232 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1233 return;
1234 }
1235 if (rnp->exp_tasks != NULL ||
1236 (rnp->gp_tasks != NULL &&
1237 rnp->boost_tasks == NULL &&
1238 rnp->qsmask == 0 &&
1239 ULONG_CMP_GE(jiffies, rnp->boost_time))) {
1240 if (rnp->exp_tasks == NULL)
1241 rnp->boost_tasks = rnp->gp_tasks;
1242 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1243 t = rnp->boost_kthread_task;
1244 if (t != NULL)
1245 wake_up_process(t);
1246 } else {
1247 rcu_initiate_boost_trace(rnp);
1248 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1249 }
1250}
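The condition in rcu_initiate_boost() compresses several cases into one if (). Restated as a sketch predicate (illustrative only; the real function must also advance ->boost_tasks, wake the kthread and drop rnp->lock as shown above):

static bool should_boost_sketch(struct rcu_node *rnp)
{
	if (rnp->exp_tasks)			/* expedited GP blocked: always time to boost */
		return true;
	return rnp->gp_tasks &&			/* readers still block the normal GP     */
	       !rnp->boost_tasks &&		/* and boosting is not already under way */
	       rnp->qsmask == 0 &&		/* and only those blocked readers remain */
	       ULONG_CMP_GE(jiffies, rnp->boost_time);	/* and the boost delay has expired */
}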
1251
1252/*
1253 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1254 * held, so no one should be messing with the existence of the boost
1255 * kthread.
1256 */
1257static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1258 cpumask_var_t cm)
1259{
1260 struct task_struct *t;
1261
1262 t = rnp->boost_kthread_task;
1263 if (t != NULL)
1264 set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
1265}
1266
1267#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1268
1269/*
1270 * Do priority-boost accounting for the start of a new grace period.
1271 */
1272static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1273{
1274 rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1275}
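RCU_BOOST_DELAY_JIFFIES converts the CONFIG_RCU_BOOST_DELAY value (a delay in milliseconds, assuming the usual Kconfig semantics for this option) into jiffies, rounding up. Worked example: with CONFIG_RCU_BOOST_DELAY=500 and HZ=250, DIV_ROUND_UP(500 * 250, 1000) = DIV_ROUND_UP(125000, 1000) = 125 jiffies, so rcu_preempt_boost_start_gp() arms ->boost_time half a second after the grace period starts and boosting is not considered before then.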
1276
1277/*
1278 * Initialize the RCU-boost waitqueue.
1279 */
1280static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
1281{
1282 init_waitqueue_head(&rnp->boost_wq);
1283}
1284
1285/*
1286 * Create an RCU-boost kthread for the specified node if one does not
1287 * already exist. We only create this kthread for preemptible RCU.
1288 * Returns zero if all is well, a negated errno otherwise.
1289 */
1290static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1291 struct rcu_node *rnp,
1292 int rnp_index)
1293{
1294 unsigned long flags;
1295 struct sched_param sp;
1296 struct task_struct *t;
1297
1298 if (&rcu_preempt_state != rsp)
1299 return 0;
1300 if (rnp->boost_kthread_task != NULL)
1301 return 0;
1302 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1303 "rcub%d", rnp_index);
1304 if (IS_ERR(t))
1305 return PTR_ERR(t);
1306 raw_spin_lock_irqsave(&rnp->lock, flags);
1307 rnp->boost_kthread_task = t;
1308 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1309 wake_up_process(t);
1310 sp.sched_priority = RCU_KTHREAD_PRIO;
1311 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1312 return 0;
1313}
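The spawn sequence above, kthread_create(), publish the task pointer under rnp->lock, wake_up_process(), then switch the new thread to SCHED_FIFO, is a reusable pattern. A minimal standalone sketch with a hypothetical worker function, thread name and priority value:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static int sketch_worker(void *arg)		/* hypothetical thread body */
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static struct task_struct *spawn_fifo_worker(int prio, int idx)
{
	struct sched_param sp = { .sched_priority = prio };
	struct task_struct *t;

	t = kthread_create(sketch_worker, NULL, "sketchd/%d", idx);
	if (IS_ERR(t))
		return t;
	wake_up_process(t);
	/* the _nocheck variant lets kernel code set an RT policy without capability checks */
	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
	return t;
}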
1314
1315#else /* #ifdef CONFIG_RCU_BOOST */
1316
1317static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1318{
1319 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1320}
1321
1322static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1323 cpumask_var_t cm)
1324{
1325}
1326
1327static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1328{
1329}
1330
1331static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
1332{
1333}
1334
1335static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1336 struct rcu_node *rnp,
1337 int rnp_index)
1338{
1339 return 0;
1340}
1341
1342#endif /* #else #ifdef CONFIG_RCU_BOOST */
1343
1018#ifndef CONFIG_SMP 1344#ifndef CONFIG_SMP
1019 1345
1020void synchronize_sched_expedited(void) 1346void synchronize_sched_expedited(void)
@@ -1187,8 +1513,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1187 * 1513 *
1188 * Because it is not legal to invoke rcu_process_callbacks() with irqs 1514 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1189 * disabled, we do one pass of force_quiescent_state(), then do a 1515 * disabled, we do one pass of force_quiescent_state(), then do a
1190 * raise_softirq() to cause rcu_process_callbacks() to be invoked later. 1516 * invoke_rcu_cpu_kthread() to cause rcu_process_callbacks() to be invoked
1191 * The per-cpu rcu_dyntick_drain variable controls the sequencing. 1517 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
1192 */ 1518 */
1193int rcu_needs_cpu(int cpu) 1519int rcu_needs_cpu(int cpu)
1194{ 1520{
@@ -1239,7 +1565,7 @@ int rcu_needs_cpu(int cpu)
1239 1565
1240 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 1566 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1241 if (c) 1567 if (c)
1242 raise_softirq(RCU_SOFTIRQ); 1568 invoke_rcu_cpu_kthread();
1243 return c; 1569 return c;
1244} 1570}
1245 1571
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index c8e97853b970..aa0fd72b4bc7 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,6 +46,18 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
50DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
52DECLARE_PER_CPU(char, rcu_cpu_has_work);
53
54static char convert_kthread_status(unsigned int kthread_status)
55{
56 if (kthread_status > RCU_KTHREAD_MAX)
57 return '?';
58 return "SRWOY"[kthread_status];
59}
60
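convert_kthread_status() relies on the fact that a string literal is an array of char, so it can be indexed directly; the ".N"[...] expressions in the seq_printf() calls below use the same idiom with a 0-or-1 selector, printing '.' when the condition is false and the letter when it is true. Roughly, each position in the qs=%c%c%c%c field lights up when the tail-pointer comparison finds the corresponding segment of the per-CPU callback list non-empty. A trivial userspace illustration of the idiom (not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned int status = 2;	/* 0..4 select 'S','R','W','O','Y' */
	int nonempty = 1;

	printf("kt=%c seg=%c\n", "SRWOY"[status], ".N"[nonempty != 0]);
	return 0;			/* prints: kt=W seg=N */
}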
49static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 61static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 62{
51 if (!rdp->beenonline) 63 if (!rdp->beenonline)
@@ -64,7 +76,21 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
64 rdp->dynticks_fqs); 76 rdp->dynticks_fqs);
65#endif /* #ifdef CONFIG_NO_HZ */ 77#endif /* #ifdef CONFIG_NO_HZ */
66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 78 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
67 seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); 79 seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c/%d ktl=%x b=%ld",
80 rdp->qlen,
81 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
82 rdp->nxttail[RCU_NEXT_TAIL]],
83 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
84 rdp->nxttail[RCU_NEXT_READY_TAIL]],
85 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
86 rdp->nxttail[RCU_WAIT_TAIL]],
87 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]],
88 per_cpu(rcu_cpu_has_work, rdp->cpu),
89 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
90 rdp->cpu)),
91 per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
92 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff,
93 rdp->blimit);
68 seq_printf(m, " ci=%lu co=%lu ca=%lu\n", 94 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
69 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 95 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
70} 96}
@@ -121,7 +147,18 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
121 rdp->dynticks_fqs); 147 rdp->dynticks_fqs);
122#endif /* #ifdef CONFIG_NO_HZ */ 148#endif /* #ifdef CONFIG_NO_HZ */
123 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 149 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
124 seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); 150 seq_printf(m, ",%ld,\"%c%c%c%c\",%d,\"%c\",%ld", rdp->qlen,
151 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
152 rdp->nxttail[RCU_NEXT_TAIL]],
153 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
154 rdp->nxttail[RCU_NEXT_READY_TAIL]],
155 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
156 rdp->nxttail[RCU_WAIT_TAIL]],
157 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]],
158 per_cpu(rcu_cpu_has_work, rdp->cpu),
159 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
160 rdp->cpu)),
161 rdp->blimit);
125 seq_printf(m, ",%lu,%lu,%lu\n", 162 seq_printf(m, ",%lu,%lu,%lu\n",
126 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 163 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
127} 164}
@@ -157,11 +194,76 @@ static const struct file_operations rcudata_csv_fops = {
157 .release = single_release, 194 .release = single_release,
158}; 195};
159 196
197#ifdef CONFIG_RCU_BOOST
198
199static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
200{
201 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu "
202 "j=%04x bt=%04x\n",
203 rnp->grplo, rnp->grphi,
204 "T."[list_empty(&rnp->blkd_tasks)],
205 "N."[!rnp->gp_tasks],
206 "E."[!rnp->exp_tasks],
207 "B."[!rnp->boost_tasks],
208 convert_kthread_status(rnp->boost_kthread_status),
209 rnp->n_tasks_boosted, rnp->n_exp_boosts,
210 rnp->n_normal_boosts,
211 (int)(jiffies & 0xffff),
212 (int)(rnp->boost_time & 0xffff));
213 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
214 " balk",
215 rnp->n_balk_blkd_tasks,
216 rnp->n_balk_exp_gp_tasks,
217 rnp->n_balk_boost_tasks,
218 rnp->n_balk_notblocked,
219 rnp->n_balk_notyet,
220 rnp->n_balk_nos);
221}
222
223static int show_rcu_node_boost(struct seq_file *m, void *unused)
224{
225 struct rcu_node *rnp;
226
227 rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
228 print_one_rcu_node_boost(m, rnp);
229 return 0;
230}
231
232static int rcu_node_boost_open(struct inode *inode, struct file *file)
233{
234 return single_open(file, show_rcu_node_boost, NULL);
235}
236
237static const struct file_operations rcu_node_boost_fops = {
238 .owner = THIS_MODULE,
239 .open = rcu_node_boost_open,
240 .read = seq_read,
241 .llseek = seq_lseek,
242 .release = single_release,
243};
244
245/*
246 * Create the rcuboost debugfs entry. Standard error return.
247 */
248static int rcu_boost_trace_create_file(struct dentry *rcudir)
249{
250 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
251 &rcu_node_boost_fops);
252}
253
254#else /* #ifdef CONFIG_RCU_BOOST */
255
256static int rcu_boost_trace_create_file(struct dentry *rcudir)
257{
258 return 0; /* There cannot be an error if we didn't create it! */
259}
260
261#endif /* #else #ifdef CONFIG_RCU_BOOST */
262
160static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 263static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
161{ 264{
162 unsigned long gpnum; 265 unsigned long gpnum;
163 int level = 0; 266 int level = 0;
164 int phase;
165 struct rcu_node *rnp; 267 struct rcu_node *rnp;
166 268
167 gpnum = rsp->gpnum; 269 gpnum = rsp->gpnum;
@@ -178,13 +280,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
178 seq_puts(m, "\n"); 280 seq_puts(m, "\n");
179 level = rnp->level; 281 level = rnp->level;
180 } 282 }
181 phase = gpnum & 0x1; 283 seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ",
182 seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
183 rnp->qsmask, rnp->qsmaskinit, 284 rnp->qsmask, rnp->qsmaskinit,
184 "T."[list_empty(&rnp->blocked_tasks[phase])], 285 ".G"[rnp->gp_tasks != NULL],
185 "E."[list_empty(&rnp->blocked_tasks[phase + 2])], 286 ".E"[rnp->exp_tasks != NULL],
186 "T."[list_empty(&rnp->blocked_tasks[!phase])], 287 ".T"[!list_empty(&rnp->blkd_tasks)],
187 "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
188 rnp->grplo, rnp->grphi, rnp->grpnum); 288 rnp->grplo, rnp->grphi, rnp->grpnum);
189 } 289 }
190 seq_puts(m, "\n"); 290 seq_puts(m, "\n");
@@ -216,16 +316,35 @@ static const struct file_operations rcuhier_fops = {
216 .release = single_release, 316 .release = single_release,
217}; 317};
218 318
319static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
320{
321 unsigned long flags;
322 unsigned long completed;
323 unsigned long gpnum;
324 unsigned long gpage;
325 unsigned long gpmax;
326 struct rcu_node *rnp = &rsp->node[0];
327
328 raw_spin_lock_irqsave(&rnp->lock, flags);
329 completed = rsp->completed;
330 gpnum = rsp->gpnum;
331 if (rsp->completed == rsp->gpnum)
332 gpage = 0;
333 else
334 gpage = jiffies - rsp->gp_start;
335 gpmax = rsp->gp_max;
336 raw_spin_unlock_irqrestore(&rnp->lock, flags);
337 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n",
338 rsp->name, completed, gpnum, gpage, gpmax);
339}
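show_one_rcugp() snapshots the fields under the root rcu_node lock so that completed, gpnum and the age are mutually consistent. Reading the output: age is how long the current grace period has been running in jiffies (0 when gpnum equals completed), and max is presumably the longest such duration tracked in rsp->gp_max. With HZ=1000, a line like "rcu_sched: completed=4000 gpnum=4001 age=37 max=292" would mean the grace period in flight has run for roughly 37 ms and the worst case seen so far is roughly 292 ms (example numbers, not taken from this patch).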
340
219static int show_rcugp(struct seq_file *m, void *unused) 341static int show_rcugp(struct seq_file *m, void *unused)
220{ 342{
221#ifdef CONFIG_TREE_PREEMPT_RCU 343#ifdef CONFIG_TREE_PREEMPT_RCU
222 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", 344 show_one_rcugp(m, &rcu_preempt_state);
223 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
224#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 345#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
225 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", 346 show_one_rcugp(m, &rcu_sched_state);
226 rcu_sched_state.completed, rcu_sched_state.gpnum); 347 show_one_rcugp(m, &rcu_bh_state);
227 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
228 rcu_bh_state.completed, rcu_bh_state.gpnum);
229 return 0; 348 return 0;
230} 349}
231 350
@@ -298,6 +417,29 @@ static const struct file_operations rcu_pending_fops = {
298 .release = single_release, 417 .release = single_release,
299}; 418};
300 419
420static int show_rcutorture(struct seq_file *m, void *unused)
421{
422 seq_printf(m, "rcutorture test sequence: %lu %s\n",
423 rcutorture_testseq >> 1,
424 (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
425 seq_printf(m, "rcutorture update version number: %lu\n",
426 rcutorture_vernum);
427 return 0;
428}
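The new rcutorture file packs two facts into rcutorture_testseq: bit 0 records whether a test is currently in flight, and the remaining bits count test transitions, so the value is shifted right by one for display. Worked example: a raw value of 5 (binary 101) prints as "rcutorture test sequence: 2 (test in progress)", while 4 prints as "rcutorture test sequence: 2" with no suffix.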
429
430static int rcutorture_open(struct inode *inode, struct file *file)
431{
432 return single_open(file, show_rcutorture, NULL);
433}
434
435static const struct file_operations rcutorture_fops = {
436 .owner = THIS_MODULE,
437 .open = rcutorture_open,
438 .read = seq_read,
439 .llseek = seq_lseek,
440 .release = single_release,
441};
442
301static struct dentry *rcudir; 443static struct dentry *rcudir;
302 444
303static int __init rcutree_trace_init(void) 445static int __init rcutree_trace_init(void)
@@ -318,6 +460,9 @@ static int __init rcutree_trace_init(void)
318 if (!retval) 460 if (!retval)
319 goto free_out; 461 goto free_out;
320 462
463 if (rcu_boost_trace_create_file(rcudir))
464 goto free_out;
465
321 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 466 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
322 if (!retval) 467 if (!retval)
323 goto free_out; 468 goto free_out;
@@ -331,6 +476,11 @@ static int __init rcutree_trace_init(void)
331 NULL, &rcu_pending_fops); 476 NULL, &rcu_pending_fops);
332 if (!retval) 477 if (!retval)
333 goto free_out; 478 goto free_out;
479
480 retval = debugfs_create_file("rcutorture", 0444, rcudir,
481 NULL, &rcutorture_fops);
482 if (!retval)
483 goto free_out;
334 return 0; 484 return 0;
335free_out: 485free_out:
336 debugfs_remove_recursive(rcudir); 486 debugfs_remove_recursive(rcudir);
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index c7eaa37a768b..34683efa2cce 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
126 pos, buf, s - buf); 126 pos, buf, s - buf);
127} 127}
128 128
129#if BITS_PER_LONG == 32
130u64 res_counter_read_u64(struct res_counter *counter, int member)
131{
132 unsigned long flags;
133 u64 ret;
134
135 spin_lock_irqsave(&counter->lock, flags);
136 ret = *res_counter_member(counter, member);
137 spin_unlock_irqrestore(&counter->lock, flags);
138
139 return ret;
140}
141#else
129u64 res_counter_read_u64(struct res_counter *counter, int member) 142u64 res_counter_read_u64(struct res_counter *counter, int member)
130{ 143{
131 return *res_counter_member(counter, member); 144 return *res_counter_member(counter, member);
132} 145}
146#endif
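The 32-bit variant takes counter->lock because a 64-bit load is not a single atomic operation on a 32-bit machine: the two halves are read separately, so a concurrent update can be observed half-applied. Worked example: if the counter is moving from 0x00000000FFFFFFFF to 0x0000000100000000 while an unlocked reader samples it, the reader can see 0x0000000000000000 (new low half, old high half) or 0x00000001FFFFFFFF (old low half, new high half), both far from either real value. On 64-bit the load is atomic, so the lockless read in the #else branch remains safe.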
133 147
134int res_counter_memparse_write_strategy(const char *buf, 148int res_counter_memparse_write_strategy(const char *buf,
135 unsigned long long *res) 149 unsigned long long *res)
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index ddabb54bb5c8..3c7cbc2c33be 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
215 put_pid(waiter->deadlock_task_pid); 215 put_pid(waiter->deadlock_task_pid);
216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); 216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
218 TRACE_WARN_ON(waiter->task);
219 memset(waiter, 0x22, sizeof(*waiter)); 218 memset(waiter, 0x22, sizeof(*waiter));
220} 219}
221 220
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 66cb89bc5ef1..5c9ccd380966 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -9,7 +9,6 @@
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/smp_lock.h>
13#include <linux/spinlock.h> 12#include <linux/spinlock.h>
14#include <linux/sysdev.h> 13#include <linux/sysdev.h>
15#include <linux/timer.h> 14#include <linux/timer.h>
@@ -27,7 +26,6 @@ struct test_thread_data {
27 int opcode; 26 int opcode;
28 int opdata; 27 int opdata;
29 int mutexes[MAX_RT_TEST_MUTEXES]; 28 int mutexes[MAX_RT_TEST_MUTEXES];
30 int bkl;
31 int event; 29 int event;
32 struct sys_device sysdev; 30 struct sys_device sysdev;
33}; 31};
@@ -46,9 +44,8 @@ enum test_opcodes {
46 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ 44 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
47 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ 45 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
48 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ 46 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
49 RTTEST_LOCKBKL, /* 9 Lock BKL */ 47 /* 9, 10 - reserved for BKL commemoration */
50 RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ 48 RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
51 RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
52 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ 49 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
53 RTTEST_RESET = 99, /* 99 Reset all pending operations */ 50 RTTEST_RESET = 99, /* 99 Reset all pending operations */
54}; 51};
@@ -74,13 +71,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
74 td->mutexes[i] = 0; 71 td->mutexes[i] = 0;
75 } 72 }
76 } 73 }
77
78 if (!lockwakeup && td->bkl == 4) {
79#ifdef CONFIG_LOCK_KERNEL
80 unlock_kernel();
81#endif
82 td->bkl = 0;
83 }
84 return 0; 74 return 0;
85 75
86 case RTTEST_RESETEVENT: 76 case RTTEST_RESETEVENT:
@@ -131,25 +121,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
131 td->mutexes[id] = 0; 121 td->mutexes[id] = 0;
132 return 0; 122 return 0;
133 123
134 case RTTEST_LOCKBKL:
135 if (td->bkl)
136 return 0;
137 td->bkl = 1;
138#ifdef CONFIG_LOCK_KERNEL
139 lock_kernel();
140#endif
141 td->bkl = 4;
142 return 0;
143
144 case RTTEST_UNLOCKBKL:
145 if (td->bkl != 4)
146 break;
147#ifdef CONFIG_LOCK_KERNEL
148 unlock_kernel();
149#endif
150 td->bkl = 0;
151 return 0;
152
153 default: 124 default:
154 break; 125 break;
155 } 126 }
@@ -196,7 +167,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
196 td->event = atomic_add_return(1, &rttest_event); 167 td->event = atomic_add_return(1, &rttest_event);
197 break; 168 break;
198 169
199 case RTTEST_LOCKBKL:
200 default: 170 default:
201 break; 171 break;
202 } 172 }
@@ -229,8 +199,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
229 td->event = atomic_add_return(1, &rttest_event); 199 td->event = atomic_add_return(1, &rttest_event);
230 return; 200 return;
231 201
232 case RTTEST_LOCKBKL:
233 return;
234 default: 202 default:
235 return; 203 return;
236 } 204 }
@@ -380,11 +348,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
380 spin_lock(&rttest_lock); 348 spin_lock(&rttest_lock);
381 349
382 curr += sprintf(curr, 350 curr += sprintf(curr,
383 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", 351 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
384 td->opcode, td->event, tsk->state, 352 td->opcode, td->event, tsk->state,
385 (MAX_RT_PRIO - 1) - tsk->prio, 353 (MAX_RT_PRIO - 1) - tsk->prio,
386 (MAX_RT_PRIO - 1) - tsk->normal_prio, 354 (MAX_RT_PRIO - 1) - tsk->normal_prio,
387 tsk->pi_blocked_on, td->bkl); 355 tsk->pi_blocked_on);
388 356
389 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) 357 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
390 curr += sprintf(curr, "%d", td->mutexes[i]); 358 curr += sprintf(curr, "%d", td->mutexes[i]);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a9604815786a..ab449117aaf2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -20,41 +20,34 @@
20/* 20/*
21 * lock->owner state tracking: 21 * lock->owner state tracking:
22 * 22 *
23 * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 23 * lock->owner holds the task_struct pointer of the owner. Bit 0
24 * are used to keep track of the "owner is pending" and "lock has 24 * is used to keep track of the "lock has waiters" state.
25 * waiters" state.
26 * 25 *
27 * owner bit1 bit0 26 * owner bit0
28 * NULL 0 0 lock is free (fast acquire possible) 27 * NULL 0 lock is free (fast acquire possible)
29 * NULL 0 1 invalid state 28 * NULL 1 lock is free and has waiters and the top waiter
30 * NULL 1 0 Transitional State* 29 * is going to take the lock*
31 * NULL 1 1 invalid state 30 * taskpointer 0 lock is held (fast release possible)
32 * taskpointer 0 0 lock is held (fast release possible) 31 * taskpointer 1 lock is held and has waiters**
33 * taskpointer 0 1 task is pending owner
34 * taskpointer 1 0 lock is held and has waiters
35 * taskpointer 1 1 task is pending owner and lock has more waiters
36 *
37 * Pending ownership is assigned to the top (highest priority)
38 * waiter of the lock, when the lock is released. The thread is woken
39 * up and can now take the lock. Until the lock is taken (bit 0
40 * cleared) a competing higher priority thread can steal the lock
41 * which puts the woken up thread back on the waiters list.
42 * 32 *
43 * The fast atomic compare exchange based acquire and release is only 33 * The fast atomic compare exchange based acquire and release is only
44 * possible when bit 0 and 1 of lock->owner are 0. 34 * possible when bit 0 of lock->owner is 0.
35 *
36 * (*) It also can be a transitional state when grabbing the lock
37 * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
38 * we need to set the bit0 before looking at the lock, and the owner may be
39 * NULL in this small time, hence this can be a transitional state.
45 * 40 *
46 * (*) There's a small time where the owner can be NULL and the 41 * (**) There is a small time when bit 0 is set but there are no
47 * "lock has waiters" bit is set. This can happen when grabbing the lock. 42 * waiters. This can happen when grabbing the lock in the slow path.
48 * To prevent a cmpxchg of the owner releasing the lock, we need to set this 43 * To prevent a cmpxchg of the owner releasing the lock, we need to
49 * bit before looking at the lock, hence the reason this is a transitional 44 * set this bit before looking at the lock.
50 * state.
51 */ 45 */
52 46
53static void 47static void
54rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, 48rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
55 unsigned long mask)
56{ 49{
57 unsigned long val = (unsigned long)owner | mask; 50 unsigned long val = (unsigned long)owner;
58 51
59 if (rt_mutex_has_waiters(lock)) 52 if (rt_mutex_has_waiters(lock))
60 val |= RT_MUTEX_HAS_WAITERS; 53 val |= RT_MUTEX_HAS_WAITERS;
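With the pending-owner state gone, everything fits in a single word: the owner's task_struct pointer, with bit 0 doing double duty as the has-waiters flag (task_struct pointers are word-aligned, so bit 0 is always free). A sketch of the pack/unpack that rt_mutex_set_owner() above and rt_mutex_owner() in rtmutex_common.h perform; the sk_* names and macros are placeholders for this sketch:

#include <linux/sched.h>
#include <linux/types.h>

#define SK_HAS_WAITERS		1UL	/* mirrors RT_MUTEX_HAS_WAITERS */
#define SK_OWNER_MASKALL	1UL	/* mirrors RT_MUTEX_OWNER_MASKALL */

static inline unsigned long sk_pack_owner(struct task_struct *owner, bool has_waiters)
{
	return (unsigned long)owner | (has_waiters ? SK_HAS_WAITERS : 0);
}

static inline struct task_struct *sk_unpack_owner(unsigned long word)
{
	return (struct task_struct *)(word & ~SK_OWNER_MASKALL);
}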
@@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
203 * reached or the state of the chain has changed while we 196 * reached or the state of the chain has changed while we
204 * dropped the locks. 197 * dropped the locks.
205 */ 198 */
206 if (!waiter || !waiter->task) 199 if (!waiter)
207 goto out_unlock_pi; 200 goto out_unlock_pi;
208 201
209 /* 202 /*
210 * Check the orig_waiter state. After we dropped the locks, 203 * Check the orig_waiter state. After we dropped the locks,
211 * the previous owner of the lock might have released the lock 204 * the previous owner of the lock might have released the lock.
212 * and made us the pending owner:
213 */ 205 */
214 if (orig_waiter && !orig_waiter->task) 206 if (orig_waiter && !rt_mutex_owner(orig_lock))
215 goto out_unlock_pi; 207 goto out_unlock_pi;
216 208
217 /* 209 /*
@@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
254 246
255 /* Release the task */ 247 /* Release the task */
256 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 248 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
249 if (!rt_mutex_owner(lock)) {
250 /*
251 * If the requeue above changed the top waiter, then we need
252 * to wake the new top waiter up to try to get the lock.
253 */
254
255 if (top_waiter != rt_mutex_top_waiter(lock))
256 wake_up_process(rt_mutex_top_waiter(lock)->task);
257 raw_spin_unlock(&lock->wait_lock);
258 goto out_put_task;
259 }
257 put_task_struct(task); 260 put_task_struct(task);
258 261
259 /* Grab the next task */ 262 /* Grab the next task */
@@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
296} 299}
297 300
298/* 301/*
299 * Optimization: check if we can steal the lock from the
300 * assigned pending owner [which might not have taken the
301 * lock yet]:
302 */
303static inline int try_to_steal_lock(struct rt_mutex *lock,
304 struct task_struct *task)
305{
306 struct task_struct *pendowner = rt_mutex_owner(lock);
307 struct rt_mutex_waiter *next;
308 unsigned long flags;
309
310 if (!rt_mutex_owner_pending(lock))
311 return 0;
312
313 if (pendowner == task)
314 return 1;
315
316 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
317 if (task->prio >= pendowner->prio) {
318 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 0;
320 }
321
322 /*
323 * Check if a waiter is enqueued on the pending owners
324 * pi_waiters list. Remove it and readjust pending owners
325 * priority.
326 */
327 if (likely(!rt_mutex_has_waiters(lock))) {
328 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
329 return 1;
330 }
331
332 /* No chain handling, pending owner is not blocked on anything: */
333 next = rt_mutex_top_waiter(lock);
334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
335 __rt_mutex_adjust_prio(pendowner);
336 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
337
338 /*
339 * We are going to steal the lock and a waiter was
340 * enqueued on the pending owners pi_waiters queue. So
341 * we have to enqueue this waiter into
342 * task->pi_waiters list. This covers the case,
343 * where task is boosted because it holds another
344 * lock and gets unboosted because the booster is
345 * interrupted, so we would delay a waiter with higher
346 * priority as task->normal_prio.
347 *
348 * Note: in the rare case of a SCHED_OTHER task changing
349 * its priority and thus stealing the lock, next->task
350 * might be task:
351 */
352 if (likely(next->task != task)) {
353 raw_spin_lock_irqsave(&task->pi_lock, flags);
354 plist_add(&next->pi_list_entry, &task->pi_waiters);
355 __rt_mutex_adjust_prio(task);
356 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
357 }
358 return 1;
359}
360
361/*
362 * Try to take an rt-mutex 302 * Try to take an rt-mutex
363 * 303 *
364 * This fails
365 * - when the lock has a real owner
366 * - when a different pending owner exists and has higher priority than current
367 *
368 * Must be called with lock->wait_lock held. 304 * Must be called with lock->wait_lock held.
305 *
306 * @lock: the lock to be acquired.
307 * @task: the task which wants to acquire the lock
308 * @waiter: the waiter that is queued to the lock's wait list. (could be NULL)
369 */ 309 */
370static int try_to_take_rt_mutex(struct rt_mutex *lock) 310static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
311 struct rt_mutex_waiter *waiter)
371{ 312{
372 /* 313 /*
373 * We have to be careful here if the atomic speedups are 314 * We have to be careful here if the atomic speedups are
@@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
390 */ 331 */
391 mark_rt_mutex_waiters(lock); 332 mark_rt_mutex_waiters(lock);
392 333
393 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) 334 if (rt_mutex_owner(lock))
394 return 0; 335 return 0;
395 336
337 /*
338 * It will get the lock because of one of these conditions:
339 * 1) there is no waiter
340 * 2) higher priority than waiters
341 * 3) it is top waiter
342 */
343 if (rt_mutex_has_waiters(lock)) {
344 if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) {
345 if (!waiter || waiter != rt_mutex_top_waiter(lock))
346 return 0;
347 }
348 }
349
350 if (waiter || rt_mutex_has_waiters(lock)) {
351 unsigned long flags;
352 struct rt_mutex_waiter *top;
353
354 raw_spin_lock_irqsave(&task->pi_lock, flags);
355
356 /* remove the queued waiter. */
357 if (waiter) {
358 plist_del(&waiter->list_entry, &lock->wait_list);
359 task->pi_blocked_on = NULL;
360 }
361
362 /*
363 * We have to enqueue the top waiter(if it exists) into
364 * task->pi_waiters list.
365 */
366 if (rt_mutex_has_waiters(lock)) {
367 top = rt_mutex_top_waiter(lock);
368 top->pi_list_entry.prio = top->list_entry.prio;
369 plist_add(&top->pi_list_entry, &task->pi_waiters);
370 }
371 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
372 }
373
396 /* We got the lock. */ 374 /* We got the lock. */
397 debug_rt_mutex_lock(lock); 375 debug_rt_mutex_lock(lock);
398 376
399 rt_mutex_set_owner(lock, current, 0); 377 rt_mutex_set_owner(lock, task);
400 378
401 rt_mutex_deadlock_account_lock(lock, current); 379 rt_mutex_deadlock_account_lock(lock, task);
402 380
403 return 1; 381 return 1;
404} 382}
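The waiter checks added above boil down to one predicate: the lock can be taken only when nobody owns it, and then only if there are no waiters, or the acquirer outranks the top waiter, or the acquirer is itself the top waiter. A condensed restatement as a sketch (illustrative only; the real function must also dequeue the waiter and fix up pi_waiters under task->pi_lock, as shown above):

/* May @task take @lock now?  @waiter is task's own queued waiter, or NULL. */
static bool can_take_sketch(struct rt_mutex *lock, struct task_struct *task,
			    struct rt_mutex_waiter *waiter)
{
	if (rt_mutex_owner(lock))
		return false;				/* really held by someone */
	if (!rt_mutex_has_waiters(lock))
		return true;				/* uncontended */
	if (waiter && waiter == rt_mutex_top_waiter(lock))
		return true;				/* we are the top waiter */
	/* a lower ->prio value means a higher priority */
	return task->prio < rt_mutex_top_waiter(lock)->list_entry.prio;
}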
@@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
436 414
437 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 415 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
438 416
417 if (!owner)
418 return 0;
419
439 if (waiter == rt_mutex_top_waiter(lock)) { 420 if (waiter == rt_mutex_top_waiter(lock)) {
440 raw_spin_lock_irqsave(&owner->pi_lock, flags); 421 raw_spin_lock_irqsave(&owner->pi_lock, flags);
441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 422 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
@@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
472/* 453/*
473 * Wake up the next waiter on the lock. 454 * Wake up the next waiter on the lock.
474 * 455 *
475 * Remove the top waiter from the current tasks waiter list and from 456 * Remove the top waiter from the current tasks waiter list and wake it up.
476 * the lock waiter list. Set it as pending owner. Then wake it up.
477 * 457 *
478 * Called with lock->wait_lock held. 458 * Called with lock->wait_lock held.
479 */ 459 */
480static void wakeup_next_waiter(struct rt_mutex *lock) 460static void wakeup_next_waiter(struct rt_mutex *lock)
481{ 461{
482 struct rt_mutex_waiter *waiter; 462 struct rt_mutex_waiter *waiter;
483 struct task_struct *pendowner;
484 unsigned long flags; 463 unsigned long flags;
485 464
486 raw_spin_lock_irqsave(&current->pi_lock, flags); 465 raw_spin_lock_irqsave(&current->pi_lock, flags);
487 466
488 waiter = rt_mutex_top_waiter(lock); 467 waiter = rt_mutex_top_waiter(lock);
489 plist_del(&waiter->list_entry, &lock->wait_list);
490 468
491 /* 469 /*
492 * Remove it from current->pi_waiters. We do not adjust a 470 * Remove it from current->pi_waiters. We do not adjust a
@@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
495 * lock->wait_lock. 473 * lock->wait_lock.
496 */ 474 */
497 plist_del(&waiter->pi_list_entry, &current->pi_waiters); 475 plist_del(&waiter->pi_list_entry, &current->pi_waiters);
498 pendowner = waiter->task;
499 waiter->task = NULL;
500 476
501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); 477 rt_mutex_set_owner(lock, NULL);
502 478
503 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 479 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
504 480
505 /* 481 wake_up_process(waiter->task);
506 * Clear the pi_blocked_on variable and enqueue a possible
507 * waiter into the pi_waiters list of the pending owner. This
508 * prevents that in case the pending owner gets unboosted a
509 * waiter with higher priority than pending-owner->normal_prio
510 * is blocked on the unboosted (pending) owner.
511 */
512 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
513
514 WARN_ON(!pendowner->pi_blocked_on);
515 WARN_ON(pendowner->pi_blocked_on != waiter);
516 WARN_ON(pendowner->pi_blocked_on->lock != lock);
517
518 pendowner->pi_blocked_on = NULL;
519
520 if (rt_mutex_has_waiters(lock)) {
521 struct rt_mutex_waiter *next;
522
523 next = rt_mutex_top_waiter(lock);
524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
525 }
526 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
527
528 wake_up_process(pendowner);
529} 482}
530 483
531/* 484/*
532 * Remove a waiter from a lock 485 * Remove a waiter from a lock and give up
533 * 486 *
534 * Must be called with lock->wait_lock held 487 * Must be called with lock->wait_lock held and
488 * have just failed to try_to_take_rt_mutex().
535 */ 489 */
536static void remove_waiter(struct rt_mutex *lock, 490static void remove_waiter(struct rt_mutex *lock,
537 struct rt_mutex_waiter *waiter) 491 struct rt_mutex_waiter *waiter)
@@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock,
543 497
544 raw_spin_lock_irqsave(&current->pi_lock, flags); 498 raw_spin_lock_irqsave(&current->pi_lock, flags);
545 plist_del(&waiter->list_entry, &lock->wait_list); 499 plist_del(&waiter->list_entry, &lock->wait_list);
546 waiter->task = NULL;
547 current->pi_blocked_on = NULL; 500 current->pi_blocked_on = NULL;
548 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 501 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
549 502
550 if (first && owner != current) { 503 if (!owner)
504 return;
505
506 if (first) {
551 507
552 raw_spin_lock_irqsave(&owner->pi_lock, flags); 508 raw_spin_lock_irqsave(&owner->pi_lock, flags);
553 509
@@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task)
614 * or TASK_UNINTERRUPTIBLE) 570 * or TASK_UNINTERRUPTIBLE)
615 * @timeout: the pre-initialized and started timer, or NULL for none 571 * @timeout: the pre-initialized and started timer, or NULL for none
616 * @waiter: the pre-initialized rt_mutex_waiter 572 * @waiter: the pre-initialized rt_mutex_waiter
617 * @detect_deadlock: passed to task_blocks_on_rt_mutex
618 * 573 *
619 * lock->wait_lock must be held by the caller. 574 * lock->wait_lock must be held by the caller.
620 */ 575 */
621static int __sched 576static int __sched
622__rt_mutex_slowlock(struct rt_mutex *lock, int state, 577__rt_mutex_slowlock(struct rt_mutex *lock, int state,
623 struct hrtimer_sleeper *timeout, 578 struct hrtimer_sleeper *timeout,
624 struct rt_mutex_waiter *waiter, 579 struct rt_mutex_waiter *waiter)
625 int detect_deadlock)
626{ 580{
627 int ret = 0; 581 int ret = 0;
628 582
629 for (;;) { 583 for (;;) {
630 /* Try to acquire the lock: */ 584 /* Try to acquire the lock: */
631 if (try_to_take_rt_mutex(lock)) 585 if (try_to_take_rt_mutex(lock, current, waiter))
632 break; 586 break;
633 587
634 /* 588 /*
@@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
645 break; 599 break;
646 } 600 }
647 601
648 /*
649 * waiter->task is NULL the first time we come here and
650 * when we have been woken up by the previous owner
651 * but the lock got stolen by a higher prio task.
652 */
653 if (!waiter->task) {
654 ret = task_blocks_on_rt_mutex(lock, waiter, current,
655 detect_deadlock);
656 /*
657 * If we got woken up by the owner then start loop
658 * all over without going into schedule to try
659 * to get the lock now:
660 */
661 if (unlikely(!waiter->task)) {
662 /*
663 * Reset the return value. We might
664 * have returned with -EDEADLK and the
665 * owner released the lock while we
666 * were walking the pi chain.
667 */
668 ret = 0;
669 continue;
670 }
671 if (unlikely(ret))
672 break;
673 }
674
675 raw_spin_unlock(&lock->wait_lock); 602 raw_spin_unlock(&lock->wait_lock);
676 603
677 debug_rt_mutex_print_deadlock(waiter); 604 debug_rt_mutex_print_deadlock(waiter);
678 605
679 if (waiter->task) 606 schedule_rt_mutex(lock);
680 schedule_rt_mutex(lock);
681 607
682 raw_spin_lock(&lock->wait_lock); 608 raw_spin_lock(&lock->wait_lock);
683 set_current_state(state); 609 set_current_state(state);
@@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
698 int ret = 0; 624 int ret = 0;
699 625
700 debug_rt_mutex_init_waiter(&waiter); 626 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL;
702 627
703 raw_spin_lock(&lock->wait_lock); 628 raw_spin_lock(&lock->wait_lock);
704 629
705 /* Try to acquire the lock again: */ 630 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) { 631 if (try_to_take_rt_mutex(lock, current, NULL)) {
707 raw_spin_unlock(&lock->wait_lock); 632 raw_spin_unlock(&lock->wait_lock);
708 return 0; 633 return 0;
709 } 634 }
@@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
717 timeout->task = NULL; 642 timeout->task = NULL;
718 } 643 }
719 644
720 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, 645 ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
721 detect_deadlock); 646
647 if (likely(!ret))
648 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
722 649
723 set_current_state(TASK_RUNNING); 650 set_current_state(TASK_RUNNING);
724 651
725 if (unlikely(waiter.task)) 652 if (unlikely(ret))
726 remove_waiter(lock, &waiter); 653 remove_waiter(lock, &waiter);
727 654
728 /* 655 /*
@@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
737 if (unlikely(timeout)) 664 if (unlikely(timeout))
738 hrtimer_cancel(&timeout->timer); 665 hrtimer_cancel(&timeout->timer);
739 666
740 /*
741 * Readjust priority, when we did not get the lock. We might
742 * have been the pending owner and boosted. Since we did not
743 * take the lock, the PI boost has to go.
744 */
745 if (unlikely(ret))
746 rt_mutex_adjust_prio(current);
747
748 debug_rt_mutex_free_waiter(&waiter); 667 debug_rt_mutex_free_waiter(&waiter);
749 668
750 return ret; 669 return ret;
@@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
762 681
763 if (likely(rt_mutex_owner(lock) != current)) { 682 if (likely(rt_mutex_owner(lock) != current)) {
764 683
765 ret = try_to_take_rt_mutex(lock); 684 ret = try_to_take_rt_mutex(lock, current, NULL);
766 /* 685 /*
767 * try_to_take_rt_mutex() sets the lock waiters 686 * try_to_take_rt_mutex() sets the lock waiters
768 * bit unconditionally. Clean this up. 687 * bit unconditionally. Clean this up.
@@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
992{ 911{
993 __rt_mutex_init(lock, NULL); 912 __rt_mutex_init(lock, NULL);
994 debug_rt_mutex_proxy_lock(lock, proxy_owner); 913 debug_rt_mutex_proxy_lock(lock, proxy_owner);
995 rt_mutex_set_owner(lock, proxy_owner, 0); 914 rt_mutex_set_owner(lock, proxy_owner);
996 rt_mutex_deadlock_account_lock(lock, proxy_owner); 915 rt_mutex_deadlock_account_lock(lock, proxy_owner);
997} 916}
998 917
@@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
1008 struct task_struct *proxy_owner) 927 struct task_struct *proxy_owner)
1009{ 928{
1010 debug_rt_mutex_proxy_unlock(lock); 929 debug_rt_mutex_proxy_unlock(lock);
1011 rt_mutex_set_owner(lock, NULL, 0); 930 rt_mutex_set_owner(lock, NULL);
1012 rt_mutex_deadlock_account_unlock(proxy_owner); 931 rt_mutex_deadlock_account_unlock(proxy_owner);
1013} 932}
1014 933
@@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1034 953
1035 raw_spin_lock(&lock->wait_lock); 954 raw_spin_lock(&lock->wait_lock);
1036 955
1037 mark_rt_mutex_waiters(lock); 956 if (try_to_take_rt_mutex(lock, task, NULL)) {
1038
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0);
1043 raw_spin_unlock(&lock->wait_lock); 957 raw_spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1; 958 return 1;
1046 } 959 }
1047 960
1048 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 961 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1049 962
1050 if (ret && !waiter->task) { 963 if (ret && !rt_mutex_owner(lock)) {
1051 /* 964 /*
1052 * Reset the return value. We might have 965 * Reset the return value. We might have
1053 * returned with -EDEADLK and the owner 966 * returned with -EDEADLK and the owner
@@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1056 */ 969 */
1057 ret = 0; 970 ret = 0;
1058 } 971 }
972
973 if (unlikely(ret))
974 remove_waiter(lock, waiter);
975
1059 raw_spin_unlock(&lock->wait_lock); 976 raw_spin_unlock(&lock->wait_lock);
1060 977
1061 debug_rt_mutex_print_deadlock(waiter); 978 debug_rt_mutex_print_deadlock(waiter);
@@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1110 1027
1111 set_current_state(TASK_INTERRUPTIBLE); 1028 set_current_state(TASK_INTERRUPTIBLE);
1112 1029
1113 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, 1030 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
1114 detect_deadlock);
1115 1031
1116 set_current_state(TASK_RUNNING); 1032 set_current_state(TASK_RUNNING);
1117 1033
1118 if (unlikely(waiter->task)) 1034 if (unlikely(ret))
1119 remove_waiter(lock, waiter); 1035 remove_waiter(lock, waiter);
1120 1036
1121 /* 1037 /*
@@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1126 1042
1127 raw_spin_unlock(&lock->wait_lock); 1043 raw_spin_unlock(&lock->wait_lock);
1128 1044
1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been
1131 * the pending owner and boosted. Since we did not take the lock, the
1132 * PI boost has to go.
1133 */
1134 if (unlikely(ret))
1135 rt_mutex_adjust_prio(current);
1136
1137 return ret; 1045 return ret;
1138} 1046}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 97a2f81866af..53a66c85261b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p)
91/* 91/*
92 * lock->owner state tracking: 92 * lock->owner state tracking:
93 */ 93 */
94#define RT_MUTEX_OWNER_PENDING 1UL 94#define RT_MUTEX_HAS_WAITERS 1UL
95#define RT_MUTEX_HAS_WAITERS 2UL 95#define RT_MUTEX_OWNER_MASKALL 1UL
96#define RT_MUTEX_OWNER_MASKALL 3UL
97 96
98static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) 97static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
99{ 98{
@@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
101 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); 100 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
102} 101}
103 102
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{
106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108}
109
110static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
111{
112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
113}
114
115/* 103/*
116 * PI-futex support (proxy locking functions, etc.): 104 * PI-futex support (proxy locking functions, etc.):
117 */ 105 */
diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4ec7ba..5e43e9dc65d1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/uaccess.h> 33#include <linux/uaccess.h>
34#include <linux/highmem.h> 34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h> 35#include <asm/mmu_context.h>
37#include <linux/interrupt.h> 36#include <linux/interrupt.h>
38#include <linux/capability.h> 37#include <linux/capability.h>
@@ -232,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
232#endif 231#endif
233 232
234/* 233/*
235 * sched_domains_mutex serializes calls to arch_init_sched_domains, 234 * sched_domains_mutex serializes calls to init_sched_domains,
236 * detach_destroy_domains and partition_sched_domains. 235 * detach_destroy_domains and partition_sched_domains.
237 */ 236 */
238static DEFINE_MUTEX(sched_domains_mutex); 237static DEFINE_MUTEX(sched_domains_mutex);
@@ -294,7 +293,7 @@ static DEFINE_SPINLOCK(task_group_lock);
294 * limitation from this.) 293 * limitation from this.)
295 */ 294 */
296#define MIN_SHARES 2 295#define MIN_SHARES 2
297#define MAX_SHARES (1UL << 18) 296#define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION))
298 297
299static int root_task_group_load = ROOT_TASK_GROUP_LOAD; 298static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
300#endif 299#endif
@@ -313,6 +312,9 @@ struct cfs_rq {
313 312
314 u64 exec_clock; 313 u64 exec_clock;
315 u64 min_vruntime; 314 u64 min_vruntime;
315#ifndef CONFIG_64BIT
316 u64 min_vruntime_copy;
317#endif
316 318
317 struct rb_root tasks_timeline; 319 struct rb_root tasks_timeline;
318 struct rb_node *rb_leftmost; 320 struct rb_node *rb_leftmost;
@@ -324,9 +326,11 @@ struct cfs_rq {
324 * 'curr' points to currently running entity on this cfs_rq. 326 * 'curr' points to currently running entity on this cfs_rq.
325 * It is set to NULL otherwise (i.e when none are currently running). 327 * It is set to NULL otherwise (i.e when none are currently running).
326 */ 328 */
327 struct sched_entity *curr, *next, *last; 329 struct sched_entity *curr, *next, *last, *skip;
328 330
331#ifdef CONFIG_SCHED_DEBUG
329 unsigned int nr_spread_over; 332 unsigned int nr_spread_over;
333#endif
330 334
331#ifdef CONFIG_FAIR_GROUP_SCHED 335#ifdef CONFIG_FAIR_GROUP_SCHED
332 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 336 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -418,6 +422,7 @@ struct rt_rq {
418 */ 422 */
419struct root_domain { 423struct root_domain {
420 atomic_t refcount; 424 atomic_t refcount;
425 struct rcu_head rcu;
421 cpumask_var_t span; 426 cpumask_var_t span;
422 cpumask_var_t online; 427 cpumask_var_t online;
423 428
@@ -461,7 +466,7 @@ struct rq {
461 u64 nohz_stamp; 466 u64 nohz_stamp;
462 unsigned char nohz_balance_kick; 467 unsigned char nohz_balance_kick;
463#endif 468#endif
464 unsigned int skip_clock_update; 469 int skip_clock_update;
465 470
466 /* capture load from *all* tasks on this cpu: */ 471 /* capture load from *all* tasks on this cpu: */
467 struct load_weight load; 472 struct load_weight load;
@@ -554,6 +559,10 @@ struct rq {
554 unsigned int ttwu_count; 559 unsigned int ttwu_count;
555 unsigned int ttwu_local; 560 unsigned int ttwu_local;
556#endif 561#endif
562
563#ifdef CONFIG_SMP
564 struct task_struct *wake_list;
565#endif
557}; 566};
558 567
559static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 568static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -572,7 +581,7 @@ static inline int cpu_of(struct rq *rq)
572 581
573#define rcu_dereference_check_sched_domain(p) \ 582#define rcu_dereference_check_sched_domain(p) \
574 rcu_dereference_check((p), \ 583 rcu_dereference_check((p), \
575 rcu_read_lock_sched_held() || \ 584 rcu_read_lock_held() || \
576 lockdep_is_held(&sched_domains_mutex)) 585 lockdep_is_held(&sched_domains_mutex))
577 586
578/* 587/*
@@ -597,7 +606,7 @@ static inline int cpu_of(struct rq *rq)
597 * Return the group to which this tasks belongs. 606 * Return the group to which this tasks belongs.
598 * 607 *
599 * We use task_subsys_state_check() and extend the RCU verification 608 * We use task_subsys_state_check() and extend the RCU verification
600 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() 609 * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
601 * holds that lock for each task it moves into the cgroup. Therefore 610 * holds that lock for each task it moves into the cgroup. Therefore
602 * by holding that lock, we pin the task to the current cgroup. 611 * by holding that lock, we pin the task to the current cgroup.
603 */ 612 */
@@ -606,11 +615,8 @@ static inline struct task_group *task_group(struct task_struct *p)
606 struct task_group *tg; 615 struct task_group *tg;
607 struct cgroup_subsys_state *css; 616 struct cgroup_subsys_state *css;
608 617
609 if (p->flags & PF_EXITING)
610 return &root_task_group;
611
612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 618 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
613 lockdep_is_held(&task_rq(p)->lock)); 619 lockdep_is_held(&p->pi_lock));
614 tg = container_of(css, struct task_group, css); 620 tg = container_of(css, struct task_group, css);
615 621
616 return autogroup_task_group(p, tg); 622 return autogroup_task_group(p, tg);
@@ -646,7 +652,7 @@ static void update_rq_clock(struct rq *rq)
646{ 652{
647 s64 delta; 653 s64 delta;
648 654
649 if (rq->skip_clock_update) 655 if (rq->skip_clock_update > 0)
650 return; 656 return;
651 657
652 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 658 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -664,10 +670,9 @@ static void update_rq_clock(struct rq *rq)
664#endif 670#endif
665 671
666/** 672/**
667 * runqueue_is_locked 673 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
668 * @cpu: the processor in question. 674 * @cpu: the processor in question.
669 * 675 *
670 * Returns true if the current cpu runqueue is locked.
671 * This interface allows printk to be called with the runqueue lock 676 * This interface allows printk to be called with the runqueue lock
672 * held and know whether or not it is OK to wake up the klogd. 677 * held and know whether or not it is OK to wake up the klogd.
673 */ 678 */
@@ -843,18 +848,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
843 return rq->curr == p; 848 return rq->curr == p;
844} 849}
845 850
846#ifndef __ARCH_WANT_UNLOCKED_CTXSW
847static inline int task_running(struct rq *rq, struct task_struct *p) 851static inline int task_running(struct rq *rq, struct task_struct *p)
848{ 852{
853#ifdef CONFIG_SMP
854 return p->on_cpu;
855#else
849 return task_current(rq, p); 856 return task_current(rq, p);
857#endif
850} 858}
851 859
860#ifndef __ARCH_WANT_UNLOCKED_CTXSW
852static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 861static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
853{ 862{
863#ifdef CONFIG_SMP
864 /*
865 * We can optimise this out completely for !SMP, because the
866 * SMP rebalancing from interrupt is the only thing that cares
867 * here.
868 */
869 next->on_cpu = 1;
870#endif
854} 871}
855 872
856static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 873static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
857{ 874{
875#ifdef CONFIG_SMP
876 /*
877 * After ->on_cpu is cleared, the task can be moved to a different CPU.
878 * We must ensure this doesn't happen until the switch is completely
879 * finished.
880 */
881 smp_wmb();
882 prev->on_cpu = 0;
883#endif
858#ifdef CONFIG_DEBUG_SPINLOCK 884#ifdef CONFIG_DEBUG_SPINLOCK
859 /* this is a valid case when another task releases the spinlock */ 885 /* this is a valid case when another task releases the spinlock */
860 rq->lock.owner = current; 886 rq->lock.owner = current;
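
With ->oncpu renamed to ->on_cpu and pulled out of the __ARCH_WANT_UNLOCKED_CTXSW-only path, finish_lock_switch() still orders its final stores with smp_wmb() before clearing the flag, so a remote waker that spins on ->on_cpu sees a fully written task. Below is a rough userspace analogue of that handshake, using C11 release/acquire atomics in place of the kernel barriers; every name is invented and this is a model, not the kernel code.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/*
 * Rough analogue of the ->on_cpu handshake: the outgoing CPU finishes
 * its writes to the task, then clears on_cpu; a waker spins until
 * on_cpu is 0 before it may act on (e.g. migrate) the task.
 */
struct fake_task {
	int state;		/* written by the switching-out CPU */
	atomic_int on_cpu;	/* 1 while still running there */
};

static struct fake_task t = { .state = 0, .on_cpu = 1 };

static void *switch_out(void *arg)
{
	(void)arg;
	t.state = 42;					/* last stores on the old CPU */
	atomic_store_explicit(&t.on_cpu, 0, memory_order_release);
	return NULL;
}

static void *waker(void *arg)
{
	(void)arg;
	while (atomic_load_explicit(&t.on_cpu, memory_order_acquire))
		;					/* cpu_relax() equivalent */
	printf("safe to proceed, state=%d\n", t.state);	/* observes 42 */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, waker, NULL);
	pthread_create(&b, NULL, switch_out, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}
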
@@ -870,15 +896,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
870} 896}
871 897
872#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 898#else /* __ARCH_WANT_UNLOCKED_CTXSW */
873static inline int task_running(struct rq *rq, struct task_struct *p)
874{
875#ifdef CONFIG_SMP
876 return p->oncpu;
877#else
878 return task_current(rq, p);
879#endif
880}
881
882static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 899static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
883{ 900{
884#ifdef CONFIG_SMP 901#ifdef CONFIG_SMP
@@ -887,7 +904,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
887 * SMP rebalancing from interrupt is the only thing that cares 904 * SMP rebalancing from interrupt is the only thing that cares
888 * here. 905 * here.
889 */ 906 */
890 next->oncpu = 1; 907 next->on_cpu = 1;
891#endif 908#endif
892#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 909#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
893 raw_spin_unlock_irq(&rq->lock); 910 raw_spin_unlock_irq(&rq->lock);
@@ -900,12 +917,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
900{ 917{
901#ifdef CONFIG_SMP 918#ifdef CONFIG_SMP
902 /* 919 /*
903 * After ->oncpu is cleared, the task can be moved to a different CPU. 920 * After ->on_cpu is cleared, the task can be moved to a different CPU.
904 * We must ensure this doesn't happen until the switch is completely 921 * We must ensure this doesn't happen until the switch is completely
905 * finished. 922 * finished.
906 */ 923 */
907 smp_wmb(); 924 smp_wmb();
908 prev->oncpu = 0; 925 prev->on_cpu = 0;
909#endif 926#endif
910#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 927#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
911 local_irq_enable(); 928 local_irq_enable();
@@ -914,23 +931,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
914#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 931#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
915 932
916/* 933/*
917 * Check whether the task is waking, we use this to synchronize ->cpus_allowed 934 * __task_rq_lock - lock the rq @p resides on.
918 * against ttwu().
919 */
920static inline int task_is_waking(struct task_struct *p)
921{
922 return unlikely(p->state == TASK_WAKING);
923}
924
925/*
926 * __task_rq_lock - lock the runqueue a given task resides on.
927 * Must be called interrupts disabled.
928 */ 935 */
929static inline struct rq *__task_rq_lock(struct task_struct *p) 936static inline struct rq *__task_rq_lock(struct task_struct *p)
930 __acquires(rq->lock) 937 __acquires(rq->lock)
931{ 938{
932 struct rq *rq; 939 struct rq *rq;
933 940
941 lockdep_assert_held(&p->pi_lock);
942
934 for (;;) { 943 for (;;) {
935 rq = task_rq(p); 944 rq = task_rq(p);
936 raw_spin_lock(&rq->lock); 945 raw_spin_lock(&rq->lock);
@@ -941,22 +950,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
941} 950}
942 951
943/* 952/*
944 * task_rq_lock - lock the runqueue a given task resides on and disable 953 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
945 * interrupts. Note the ordering: we can safely lookup the task_rq without
946 * explicitly disabling preemption.
947 */ 954 */
948static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 955static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
956 __acquires(p->pi_lock)
949 __acquires(rq->lock) 957 __acquires(rq->lock)
950{ 958{
951 struct rq *rq; 959 struct rq *rq;
952 960
953 for (;;) { 961 for (;;) {
954 local_irq_save(*flags); 962 raw_spin_lock_irqsave(&p->pi_lock, *flags);
955 rq = task_rq(p); 963 rq = task_rq(p);
956 raw_spin_lock(&rq->lock); 964 raw_spin_lock(&rq->lock);
957 if (likely(rq == task_rq(p))) 965 if (likely(rq == task_rq(p)))
958 return rq; 966 return rq;
959 raw_spin_unlock_irqrestore(&rq->lock, *flags); 967 raw_spin_unlock(&rq->lock);
968 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
960 } 969 }
961} 970}
962 971
@@ -966,10 +975,13 @@ static void __task_rq_unlock(struct rq *rq)
966 raw_spin_unlock(&rq->lock); 975 raw_spin_unlock(&rq->lock);
967} 976}
968 977
969static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 978static inline void
979task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
970 __releases(rq->lock) 980 __releases(rq->lock)
981 __releases(p->pi_lock)
971{ 982{
972 raw_spin_unlock_irqrestore(&rq->lock, *flags); 983 raw_spin_unlock(&rq->lock);
984 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
973} 985}
974 986
975/* 987/*
@@ -1198,11 +1210,17 @@ int get_nohz_timer_target(void)
1198 int i; 1210 int i;
1199 struct sched_domain *sd; 1211 struct sched_domain *sd;
1200 1212
1213 rcu_read_lock();
1201 for_each_domain(cpu, sd) { 1214 for_each_domain(cpu, sd) {
1202 for_each_cpu(i, sched_domain_span(sd)) 1215 for_each_cpu(i, sched_domain_span(sd)) {
1203 if (!idle_cpu(i)) 1216 if (!idle_cpu(i)) {
1204 return i; 1217 cpu = i;
1218 goto unlock;
1219 }
1220 }
1205 } 1221 }
1222unlock:
1223 rcu_read_unlock();
1206 return cpu; 1224 return cpu;
1207} 1225}
1208/* 1226/*
@@ -1312,15 +1330,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1312{ 1330{
1313 u64 tmp; 1331 u64 tmp;
1314 1332
1333 /*
1334 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1335 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1336 * 2^SCHED_LOAD_RESOLUTION.
1337 */
1338 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1339 tmp = (u64)delta_exec * scale_load_down(weight);
1340 else
1341 tmp = (u64)delta_exec;
1342
1315 if (!lw->inv_weight) { 1343 if (!lw->inv_weight) {
1316 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1344 unsigned long w = scale_load_down(lw->weight);
1345
1346 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1317 lw->inv_weight = 1; 1347 lw->inv_weight = 1;
1348 else if (unlikely(!w))
1349 lw->inv_weight = WMULT_CONST;
1318 else 1350 else
1319 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1351 lw->inv_weight = WMULT_CONST / w;
1320 / (lw->weight+1);
1321 } 1352 }
1322 1353
1323 tmp = (u64)delta_exec * weight;
1324 /* 1354 /*
1325 * Check whether we'd overflow the 64-bit multiplication: 1355 * Check whether we'd overflow the 64-bit multiplication:
1326 */ 1356 */
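
calc_delta_mine() now scales both weights down to the user-visible range and computes inv_weight as a plain reciprocal, so the hot path stays a multiply and a shift rather than a divide. A standalone sketch of that reciprocal trick, assuming a 2^32 scaling constant and skipping the 64-bit overflow guard the kernel keeps:

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch: precompute inv_weight ~= 2^32 / weight once, then turn
 * delta * weight / lw->weight into a multiply and a shift. The kernel
 * additionally guards the 64-bit multiplication against overflow,
 * which this sketch omits.
 */
#define WMULT_SHIFT	32
#define WMULT_CONST	(1ULL << WMULT_SHIFT)	/* assumed scaling constant */

struct load_weight {
	unsigned long weight;
	uint64_t inv_weight;	/* cached ~2^32 / weight */
};

static uint64_t calc_delta(uint64_t delta_exec, unsigned long weight,
			   struct load_weight *lw)
{
	if (!lw->inv_weight)
		lw->inv_weight = WMULT_CONST / lw->weight;

	/* delta * weight / lw->weight, without a per-call division */
	return (delta_exec * weight * lw->inv_weight) >> WMULT_SHIFT;
}

int main(void)
{
	struct load_weight cfs_rq_load = { .weight = 3072, .inv_weight = 0 };

	/* a nice-0 entity (weight 1024) gets ~1/3 of 3ms of runtime */
	printf("scaled delta: %llu ns\n",
	       (unsigned long long)calc_delta(3000000, 1024, &cfs_rq_load));
	return 0;
}
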
@@ -1686,6 +1716,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1686 __release(rq2->lock); 1716 __release(rq2->lock);
1687} 1717}
1688 1718
1719#else /* CONFIG_SMP */
1720
1721/*
1722 * double_rq_lock - safely lock two runqueues
1723 *
1724 * Note this does not disable interrupts like task_rq_lock,
1725 * you need to do so manually before calling.
1726 */
1727static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1728 __acquires(rq1->lock)
1729 __acquires(rq2->lock)
1730{
1731 BUG_ON(!irqs_disabled());
1732 BUG_ON(rq1 != rq2);
1733 raw_spin_lock(&rq1->lock);
1734 __acquire(rq2->lock); /* Fake it out ;) */
1735}
1736
1737/*
1738 * double_rq_unlock - safely unlock two runqueues
1739 *
1740 * Note this does not restore interrupts like task_rq_unlock,
1741 * you need to do so manually after calling.
1742 */
1743static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1744 __releases(rq1->lock)
1745 __releases(rq2->lock)
1746{
1747 BUG_ON(rq1 != rq2);
1748 raw_spin_unlock(&rq1->lock);
1749 __release(rq2->lock);
1750}
1751
1689#endif 1752#endif
1690 1753
1691static void calc_load_account_idle(struct rq *this_rq); 1754static void calc_load_account_idle(struct rq *this_rq);
@@ -1727,17 +1790,20 @@ static void dec_nr_running(struct rq *rq)
1727 1790
1728static void set_load_weight(struct task_struct *p) 1791static void set_load_weight(struct task_struct *p)
1729{ 1792{
1793 int prio = p->static_prio - MAX_RT_PRIO;
1794 struct load_weight *load = &p->se.load;
1795
1730 /* 1796 /*
1731 * SCHED_IDLE tasks get minimal weight: 1797 * SCHED_IDLE tasks get minimal weight:
1732 */ 1798 */
1733 if (p->policy == SCHED_IDLE) { 1799 if (p->policy == SCHED_IDLE) {
1734 p->se.load.weight = WEIGHT_IDLEPRIO; 1800 load->weight = scale_load(WEIGHT_IDLEPRIO);
1735 p->se.load.inv_weight = WMULT_IDLEPRIO; 1801 load->inv_weight = WMULT_IDLEPRIO;
1736 return; 1802 return;
1737 } 1803 }
1738 1804
1739 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1805 load->weight = scale_load(prio_to_weight[prio]);
1740 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1806 load->inv_weight = prio_to_wmult[prio];
1741} 1807}
1742 1808
1743static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1809static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
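
set_load_weight() now indexes the priority once and feeds the table value through scale_load(). The sketch below only approximates that mapping: the kernel uses a precomputed prio_to_weight[] table, whereas here the weights are generated from the roughly 1.25x-per-nice-level rule around the canonical nice-0 weight of 1024, so the numbers are close but not identical.

#include <stdio.h>

/*
 * Approximate sketch of the nice->weight mapping. Values are derived
 * from the ~1.25x-per-nice-level rule, not the kernel's exact table.
 */
#define SCHED_LOAD_RESOLUTION	10	/* assumed, see the earlier sketch */
#define scale_load(w)		((unsigned long)(w) << SCHED_LOAD_RESOLUTION)

static unsigned long nice_to_weight(int nice)
{
	double w = 1024.0;

	for (; nice < 0; nice++)
		w *= 1.25;
	for (; nice > 0; nice--)
		w /= 1.25;
	return (unsigned long)(w + 0.5);
}

int main(void)
{
	int nice;

	for (nice = -5; nice <= 5; nice += 5)
		printf("nice %3d -> weight ~%5lu -> scaled %lu\n",
		       nice, nice_to_weight(nice),
		       scale_load(nice_to_weight(nice)));
	return 0;
}
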
@@ -1745,7 +1811,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1745 update_rq_clock(rq); 1811 update_rq_clock(rq);
1746 sched_info_queued(p); 1812 sched_info_queued(p);
1747 p->sched_class->enqueue_task(rq, p, flags); 1813 p->sched_class->enqueue_task(rq, p, flags);
1748 p->se.on_rq = 1;
1749} 1814}
1750 1815
1751static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1816static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1753,7 +1818,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1753 update_rq_clock(rq); 1818 update_rq_clock(rq);
1754 sched_info_dequeued(p); 1819 sched_info_dequeued(p);
1755 p->sched_class->dequeue_task(rq, p, flags); 1820 p->sched_class->dequeue_task(rq, p, flags);
1756 p->se.on_rq = 0;
1757} 1821}
1758 1822
1759/* 1823/*
@@ -1880,7 +1944,7 @@ void account_system_vtime(struct task_struct *curr)
1880 */ 1944 */
1881 if (hardirq_count()) 1945 if (hardirq_count())
1882 __this_cpu_add(cpu_hardirq_time, delta); 1946 __this_cpu_add(cpu_hardirq_time, delta);
1883 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) 1947 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
1884 __this_cpu_add(cpu_softirq_time, delta); 1948 __this_cpu_add(cpu_softirq_time, delta);
1885 1949
1886 irq_time_write_end(); 1950 irq_time_write_end();
@@ -1920,8 +1984,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
1920 sched_rt_avg_update(rq, irq_delta); 1984 sched_rt_avg_update(rq, irq_delta);
1921} 1985}
1922 1986
1987static int irqtime_account_hi_update(void)
1988{
1989 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1990 unsigned long flags;
1991 u64 latest_ns;
1992 int ret = 0;
1993
1994 local_irq_save(flags);
1995 latest_ns = this_cpu_read(cpu_hardirq_time);
1996 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
1997 ret = 1;
1998 local_irq_restore(flags);
1999 return ret;
2000}
2001
2002static int irqtime_account_si_update(void)
2003{
2004 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2005 unsigned long flags;
2006 u64 latest_ns;
2007 int ret = 0;
2008
2009 local_irq_save(flags);
2010 latest_ns = this_cpu_read(cpu_softirq_time);
2011 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
2012 ret = 1;
2013 local_irq_restore(flags);
2014 return ret;
2015}
2016
1923#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 2017#else /* CONFIG_IRQ_TIME_ACCOUNTING */
1924 2018
2019#define sched_clock_irqtime (0)
2020
1925static void update_rq_clock_task(struct rq *rq, s64 delta) 2021static void update_rq_clock_task(struct rq *rq, s64 delta)
1926{ 2022{
1927 rq->clock_task += delta; 2023 rq->clock_task += delta;
@@ -2025,14 +2121,14 @@ inline int task_curr(const struct task_struct *p)
2025 2121
2026static inline void check_class_changed(struct rq *rq, struct task_struct *p, 2122static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2027 const struct sched_class *prev_class, 2123 const struct sched_class *prev_class,
2028 int oldprio, int running) 2124 int oldprio)
2029{ 2125{
2030 if (prev_class != p->sched_class) { 2126 if (prev_class != p->sched_class) {
2031 if (prev_class->switched_from) 2127 if (prev_class->switched_from)
2032 prev_class->switched_from(rq, p, running); 2128 prev_class->switched_from(rq, p);
2033 p->sched_class->switched_to(rq, p, running); 2129 p->sched_class->switched_to(rq, p);
2034 } else 2130 } else if (oldprio != p->prio)
2035 p->sched_class->prio_changed(rq, p, oldprio, running); 2131 p->sched_class->prio_changed(rq, p, oldprio);
2036} 2132}
2037 2133
2038static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 2134static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@ -2056,7 +2152,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2056 * A queue event has occurred, and we're going to schedule. In 2152 * A queue event has occurred, and we're going to schedule. In
2057 * this case, we can save a useless back to back clock update. 2153 * this case, we can save a useless back to back clock update.
2058 */ 2154 */
2059 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) 2155 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2060 rq->skip_clock_update = 1; 2156 rq->skip_clock_update = 1;
2061} 2157}
2062 2158
@@ -2102,6 +2198,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2102 */ 2198 */
2103 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2199 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2104 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2200 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2201
2202#ifdef CONFIG_LOCKDEP
2203 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2204 lockdep_is_held(&task_rq(p)->lock)));
2205#endif
2105#endif 2206#endif
2106 2207
2107 trace_sched_migrate_task(p, new_cpu); 2208 trace_sched_migrate_task(p, new_cpu);
@@ -2122,19 +2223,6 @@ struct migration_arg {
2122static int migration_cpu_stop(void *data); 2223static int migration_cpu_stop(void *data);
2123 2224
2124/* 2225/*
2125 * The task's runqueue lock must be held.
2126 * Returns true if you have to wait for migration thread.
2127 */
2128static bool migrate_task(struct task_struct *p, struct rq *rq)
2129{
2130 /*
2131 * If the task is not on a runqueue (and not running), then
2132 * the next wake-up will properly place the task.
2133 */
2134 return p->se.on_rq || task_running(rq, p);
2135}
2136
2137/*
2138 * wait_task_inactive - wait for a thread to unschedule. 2226 * wait_task_inactive - wait for a thread to unschedule.
2139 * 2227 *
2140 * If @match_state is nonzero, it's the @p->state value just checked and 2228 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2191,11 +2279,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2191 rq = task_rq_lock(p, &flags); 2279 rq = task_rq_lock(p, &flags);
2192 trace_sched_wait_task(p); 2280 trace_sched_wait_task(p);
2193 running = task_running(rq, p); 2281 running = task_running(rq, p);
2194 on_rq = p->se.on_rq; 2282 on_rq = p->on_rq;
2195 ncsw = 0; 2283 ncsw = 0;
2196 if (!match_state || p->state == match_state) 2284 if (!match_state || p->state == match_state)
2197 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2285 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2198 task_rq_unlock(rq, &flags); 2286 task_rq_unlock(rq, p, &flags);
2199 2287
2200 /* 2288 /*
2201 * If it changed from the expected state, bail out now. 2289 * If it changed from the expected state, bail out now.
@@ -2224,7 +2312,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2224 * yield - it could be a while. 2312 * yield - it could be a while.
2225 */ 2313 */
2226 if (unlikely(on_rq)) { 2314 if (unlikely(on_rq)) {
2227 schedule_timeout_uninterruptible(1); 2315 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2316
2317 set_current_state(TASK_UNINTERRUPTIBLE);
2318 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2228 continue; 2319 continue;
2229 } 2320 }
2230 2321
@@ -2246,7 +2337,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2246 * Cause a process which is running on another CPU to enter 2337 * Cause a process which is running on another CPU to enter
2247 * kernel-mode, without any delay. (to get signals handled.) 2338 * kernel-mode, without any delay. (to get signals handled.)
2248 * 2339 *
2249 * NOTE: this function doesnt have to take the runqueue lock, 2340 * NOTE: this function doesn't have to take the runqueue lock,
2250 * because all it wants to ensure is that the remote task enters 2341 * because all it wants to ensure is that the remote task enters
2251 * the kernel. If the IPI races and the task has been migrated 2342 * the kernel. If the IPI races and the task has been migrated
2252 * to another CPU then no harm is done and the purpose has been 2343 * to another CPU then no harm is done and the purpose has been
@@ -2265,30 +2356,9 @@ void kick_process(struct task_struct *p)
2265EXPORT_SYMBOL_GPL(kick_process); 2356EXPORT_SYMBOL_GPL(kick_process);
2266#endif /* CONFIG_SMP */ 2357#endif /* CONFIG_SMP */
2267 2358
2268/**
2269 * task_oncpu_function_call - call a function on the cpu on which a task runs
2270 * @p: the task to evaluate
2271 * @func: the function to be called
2272 * @info: the function call argument
2273 *
2274 * Calls the function @func when the task is currently running. This might
2275 * be on the current CPU, which just calls the function directly
2276 */
2277void task_oncpu_function_call(struct task_struct *p,
2278 void (*func) (void *info), void *info)
2279{
2280 int cpu;
2281
2282 preempt_disable();
2283 cpu = task_cpu(p);
2284 if (task_curr(p))
2285 smp_call_function_single(cpu, func, info, 1);
2286 preempt_enable();
2287}
2288
2289#ifdef CONFIG_SMP 2359#ifdef CONFIG_SMP
2290/* 2360/*
2291 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2361 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2292 */ 2362 */
2293static int select_fallback_rq(int cpu, struct task_struct *p) 2363static int select_fallback_rq(int cpu, struct task_struct *p)
2294{ 2364{
@@ -2321,12 +2391,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2321} 2391}
2322 2392
2323/* 2393/*
2324 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2394 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2325 */ 2395 */
2326static inline 2396static inline
2327int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2397int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2328{ 2398{
2329 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2399 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2330 2400
2331 /* 2401 /*
2332 * In order not to call set_task_cpu() on a blocking task we need 2402 * In order not to call set_task_cpu() on a blocking task we need
@@ -2352,27 +2422,62 @@ static void update_avg(u64 *avg, u64 sample)
2352} 2422}
2353#endif 2423#endif
2354 2424
2355static inline void ttwu_activate(struct task_struct *p, struct rq *rq, 2425static void
2356 bool is_sync, bool is_migrate, bool is_local, 2426ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2357 unsigned long en_flags)
2358{ 2427{
2428#ifdef CONFIG_SCHEDSTATS
2429 struct rq *rq = this_rq();
2430
2431#ifdef CONFIG_SMP
2432 int this_cpu = smp_processor_id();
2433
2434 if (cpu == this_cpu) {
2435 schedstat_inc(rq, ttwu_local);
2436 schedstat_inc(p, se.statistics.nr_wakeups_local);
2437 } else {
2438 struct sched_domain *sd;
2439
2440 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2441 rcu_read_lock();
2442 for_each_domain(this_cpu, sd) {
2443 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2444 schedstat_inc(sd, ttwu_wake_remote);
2445 break;
2446 }
2447 }
2448 rcu_read_unlock();
2449 }
2450#endif /* CONFIG_SMP */
2451
2452 schedstat_inc(rq, ttwu_count);
2359 schedstat_inc(p, se.statistics.nr_wakeups); 2453 schedstat_inc(p, se.statistics.nr_wakeups);
2360 if (is_sync) 2454
2455 if (wake_flags & WF_SYNC)
2361 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2456 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2362 if (is_migrate) 2457
2458 if (cpu != task_cpu(p))
2363 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2459 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2364 if (is_local)
2365 schedstat_inc(p, se.statistics.nr_wakeups_local);
2366 else
2367 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2368 2460
2461#endif /* CONFIG_SCHEDSTATS */
2462}
2463
2464static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2465{
2369 activate_task(rq, p, en_flags); 2466 activate_task(rq, p, en_flags);
2467 p->on_rq = 1;
2468
2469 /* if a worker is waking up, notify workqueue */
2470 if (p->flags & PF_WQ_WORKER)
2471 wq_worker_waking_up(p, cpu_of(rq));
2370} 2472}
2371 2473
2372static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, 2474/*
2373 int wake_flags, bool success) 2475 * Mark the task runnable and perform wakeup-preemption.
2476 */
2477static void
2478ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2374{ 2479{
2375 trace_sched_wakeup(p, success); 2480 trace_sched_wakeup(p, true);
2376 check_preempt_curr(rq, p, wake_flags); 2481 check_preempt_curr(rq, p, wake_flags);
2377 2482
2378 p->state = TASK_RUNNING; 2483 p->state = TASK_RUNNING;
@@ -2391,9 +2496,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2391 rq->idle_stamp = 0; 2496 rq->idle_stamp = 0;
2392 } 2497 }
2393#endif 2498#endif
2394 /* if a worker is waking up, notify workqueue */ 2499}
2395 if ((p->flags & PF_WQ_WORKER) && success) 2500
2396 wq_worker_waking_up(p, cpu_of(rq)); 2501static void
2502ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2503{
2504#ifdef CONFIG_SMP
2505 if (p->sched_contributes_to_load)
2506 rq->nr_uninterruptible--;
2507#endif
2508
2509 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2510 ttwu_do_wakeup(rq, p, wake_flags);
2511}
2512
2513/*
2514 * Called in case the task @p isn't fully descheduled from its runqueue,
2515 * in this case we must do a remote wakeup. Its a 'light' wakeup though,
2516 * since all we need to do is flip p->state to TASK_RUNNING, since
2517 * the task is still ->on_rq.
2518 */
2519static int ttwu_remote(struct task_struct *p, int wake_flags)
2520{
2521 struct rq *rq;
2522 int ret = 0;
2523
2524 rq = __task_rq_lock(p);
2525 if (p->on_rq) {
2526 ttwu_do_wakeup(rq, p, wake_flags);
2527 ret = 1;
2528 }
2529 __task_rq_unlock(rq);
2530
2531 return ret;
2532}
2533
2534#ifdef CONFIG_SMP
2535static void sched_ttwu_pending(void)
2536{
2537 struct rq *rq = this_rq();
2538 struct task_struct *list = xchg(&rq->wake_list, NULL);
2539
2540 if (!list)
2541 return;
2542
2543 raw_spin_lock(&rq->lock);
2544
2545 while (list) {
2546 struct task_struct *p = list;
2547 list = list->wake_entry;
2548 ttwu_do_activate(rq, p, 0);
2549 }
2550
2551 raw_spin_unlock(&rq->lock);
2552}
2553
2554void scheduler_ipi(void)
2555{
2556 sched_ttwu_pending();
2557}
2558
2559static void ttwu_queue_remote(struct task_struct *p, int cpu)
2560{
2561 struct rq *rq = cpu_rq(cpu);
2562 struct task_struct *next = rq->wake_list;
2563
2564 for (;;) {
2565 struct task_struct *old = next;
2566
2567 p->wake_entry = next;
2568 next = cmpxchg(&rq->wake_list, old, p);
2569 if (next == old)
2570 break;
2571 }
2572
2573 if (!next)
2574 smp_send_reschedule(cpu);
2575}
2576#endif
2577
2578static void ttwu_queue(struct task_struct *p, int cpu)
2579{
2580 struct rq *rq = cpu_rq(cpu);
2581
2582#if defined(CONFIG_SMP)
2583 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2584 ttwu_queue_remote(p, cpu);
2585 return;
2586 }
2587#endif
2588
2589 raw_spin_lock(&rq->lock);
2590 ttwu_do_activate(rq, p, 0);
2591 raw_spin_unlock(&rq->lock);
2397} 2592}
2398 2593
2399/** 2594/**
@@ -2411,92 +2606,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2411 * Returns %true if @p was woken up, %false if it was already running 2606 * Returns %true if @p was woken up, %false if it was already running
2412 * or @state didn't match @p's state. 2607 * or @state didn't match @p's state.
2413 */ 2608 */
2414static int try_to_wake_up(struct task_struct *p, unsigned int state, 2609static int
2415 int wake_flags) 2610try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2416{ 2611{
2417 int cpu, orig_cpu, this_cpu, success = 0;
2418 unsigned long flags; 2612 unsigned long flags;
2419 unsigned long en_flags = ENQUEUE_WAKEUP; 2613 int cpu, success = 0;
2420 struct rq *rq;
2421
2422 this_cpu = get_cpu();
2423 2614
2424 smp_wmb(); 2615 smp_wmb();
2425 rq = task_rq_lock(p, &flags); 2616 raw_spin_lock_irqsave(&p->pi_lock, flags);
2426 if (!(p->state & state)) 2617 if (!(p->state & state))
2427 goto out; 2618 goto out;
2428 2619
2429 if (p->se.on_rq) 2620 success = 1; /* we're going to change ->state */
2430 goto out_running;
2431
2432 cpu = task_cpu(p); 2621 cpu = task_cpu(p);
2433 orig_cpu = cpu;
2434 2622
2435#ifdef CONFIG_SMP 2623 if (p->on_rq && ttwu_remote(p, wake_flags))
2436 if (unlikely(task_running(rq, p))) 2624 goto stat;
2437 goto out_activate;
2438 2625
2626#ifdef CONFIG_SMP
2439 /* 2627 /*
2440 * In order to handle concurrent wakeups and release the rq->lock 2628 * If the owning (remote) cpu is still in the middle of schedule() with
2441 * we put the task in TASK_WAKING state. 2629 * this task as prev, wait until its done referencing the task.
2442 *
2443 * First fix up the nr_uninterruptible count:
2444 */ 2630 */
2445 if (task_contributes_to_load(p)) { 2631 while (p->on_cpu) {
2446 if (likely(cpu_online(orig_cpu))) 2632#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2447 rq->nr_uninterruptible--; 2633 /*
2448 else 2634 * If called from interrupt context we could have landed in the
2449 this_rq()->nr_uninterruptible--; 2635 * middle of schedule(), in this case we should take care not
2450 } 2636 * to spin on ->on_cpu if p is current, since that would
2451 p->state = TASK_WAKING; 2637 * deadlock.
2452 2638 */
2453 if (p->sched_class->task_waking) { 2639 if (p == current) {
2454 p->sched_class->task_waking(rq, p); 2640 ttwu_queue(p, cpu);
2455 en_flags |= ENQUEUE_WAKING; 2641 goto stat;
2642 }
2643#endif
2644 cpu_relax();
2456 } 2645 }
2457
2458 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2459 if (cpu != orig_cpu)
2460 set_task_cpu(p, cpu);
2461 __task_rq_unlock(rq);
2462
2463 rq = cpu_rq(cpu);
2464 raw_spin_lock(&rq->lock);
2465
2466 /* 2646 /*
2467 * We migrated the task without holding either rq->lock, however 2647 * Pairs with the smp_wmb() in finish_lock_switch().
2468 * since the task is not on the task list itself, nobody else
2469 * will try and migrate the task, hence the rq should match the
2470 * cpu we just moved it to.
2471 */ 2648 */
2472 WARN_ON(task_cpu(p) != cpu); 2649 smp_rmb();
2473 WARN_ON(p->state != TASK_WAKING);
2474 2650
2475#ifdef CONFIG_SCHEDSTATS 2651 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2476 schedstat_inc(rq, ttwu_count); 2652 p->state = TASK_WAKING;
2477 if (cpu == this_cpu)
2478 schedstat_inc(rq, ttwu_local);
2479 else {
2480 struct sched_domain *sd;
2481 for_each_domain(this_cpu, sd) {
2482 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2483 schedstat_inc(sd, ttwu_wake_remote);
2484 break;
2485 }
2486 }
2487 }
2488#endif /* CONFIG_SCHEDSTATS */
2489 2653
2490out_activate: 2654 if (p->sched_class->task_waking)
2655 p->sched_class->task_waking(p);
2656
2657 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2658 if (task_cpu(p) != cpu)
2659 set_task_cpu(p, cpu);
2491#endif /* CONFIG_SMP */ 2660#endif /* CONFIG_SMP */
2492 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, 2661
2493 cpu == this_cpu, en_flags); 2662 ttwu_queue(p, cpu);
2494 success = 1; 2663stat:
2495out_running: 2664 ttwu_stat(p, cpu, wake_flags);
2496 ttwu_post_activation(p, rq, wake_flags, success);
2497out: 2665out:
2498 task_rq_unlock(rq, &flags); 2666 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2499 put_cpu();
2500 2667
2501 return success; 2668 return success;
2502} 2669}
@@ -2505,31 +2672,34 @@ out:
2505 * try_to_wake_up_local - try to wake up a local task with rq lock held 2672 * try_to_wake_up_local - try to wake up a local task with rq lock held
2506 * @p: the thread to be awakened 2673 * @p: the thread to be awakened
2507 * 2674 *
2508 * Put @p on the run-queue if it's not already there. The caller must 2675 * Put @p on the run-queue if it's not already there. The caller must
2509 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2676 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2510 * the current task. this_rq() stays locked over invocation. 2677 * the current task.
2511 */ 2678 */
2512static void try_to_wake_up_local(struct task_struct *p) 2679static void try_to_wake_up_local(struct task_struct *p)
2513{ 2680{
2514 struct rq *rq = task_rq(p); 2681 struct rq *rq = task_rq(p);
2515 bool success = false;
2516 2682
2517 BUG_ON(rq != this_rq()); 2683 BUG_ON(rq != this_rq());
2518 BUG_ON(p == current); 2684 BUG_ON(p == current);
2519 lockdep_assert_held(&rq->lock); 2685 lockdep_assert_held(&rq->lock);
2520 2686
2687 if (!raw_spin_trylock(&p->pi_lock)) {
2688 raw_spin_unlock(&rq->lock);
2689 raw_spin_lock(&p->pi_lock);
2690 raw_spin_lock(&rq->lock);
2691 }
2692
2521 if (!(p->state & TASK_NORMAL)) 2693 if (!(p->state & TASK_NORMAL))
2522 return; 2694 goto out;
2523 2695
2524 if (!p->se.on_rq) { 2696 if (!p->on_rq)
2525 if (likely(!task_running(rq, p))) { 2697 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2526 schedstat_inc(rq, ttwu_count); 2698
2527 schedstat_inc(rq, ttwu_local); 2699 ttwu_do_wakeup(rq, p, 0);
2528 } 2700 ttwu_stat(p, smp_processor_id(), 0);
2529 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); 2701out:
2530 success = true; 2702 raw_spin_unlock(&p->pi_lock);
2531 }
2532 ttwu_post_activation(p, rq, 0, success);
2533} 2703}
2534 2704
2535/** 2705/**
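
try_to_wake_up_local() runs with rq->lock already held, which is the wrong side of the new pi_lock-before-rq->lock ordering, so it trylocks pi_lock and, on contention, drops and retakes both locks in the documented order. A pthread sketch of that trylock-then-reorder pattern, with invented lock names:

#include <pthread.h>
#include <stdio.h>

/*
 * Sketch of the trylock-then-reorder pattern: we already hold the
 * inner lock (rq->lock) but the documented order is pi_lock before
 * rq->lock, so on trylock failure we back out and reacquire in order.
 */
static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;	/* outer */
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;	/* inner */

static void take_pi_lock_while_holding_rq(void)
{
	if (pthread_mutex_trylock(&pi_lock)) {
		/* contended: release inner, then lock in documented order */
		pthread_mutex_unlock(&rq_lock);
		pthread_mutex_lock(&pi_lock);
		pthread_mutex_lock(&rq_lock);
	}
}

int main(void)
{
	pthread_mutex_lock(&rq_lock);	/* caller already holds rq->lock */
	take_pi_lock_while_holding_rq();
	printf("holding pi_lock and rq_lock without inverting the order\n");
	pthread_mutex_unlock(&rq_lock);
	pthread_mutex_unlock(&pi_lock);
	return 0;
}
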
@@ -2562,18 +2732,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
2562 */ 2732 */
2563static void __sched_fork(struct task_struct *p) 2733static void __sched_fork(struct task_struct *p)
2564{ 2734{
2735 p->on_rq = 0;
2736
2737 p->se.on_rq = 0;
2565 p->se.exec_start = 0; 2738 p->se.exec_start = 0;
2566 p->se.sum_exec_runtime = 0; 2739 p->se.sum_exec_runtime = 0;
2567 p->se.prev_sum_exec_runtime = 0; 2740 p->se.prev_sum_exec_runtime = 0;
2568 p->se.nr_migrations = 0; 2741 p->se.nr_migrations = 0;
2742 p->se.vruntime = 0;
2743 INIT_LIST_HEAD(&p->se.group_node);
2569 2744
2570#ifdef CONFIG_SCHEDSTATS 2745#ifdef CONFIG_SCHEDSTATS
2571 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2746 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2572#endif 2747#endif
2573 2748
2574 INIT_LIST_HEAD(&p->rt.run_list); 2749 INIT_LIST_HEAD(&p->rt.run_list);
2575 p->se.on_rq = 0;
2576 INIT_LIST_HEAD(&p->se.group_node);
2577 2750
2578#ifdef CONFIG_PREEMPT_NOTIFIERS 2751#ifdef CONFIG_PREEMPT_NOTIFIERS
2579 INIT_HLIST_HEAD(&p->preempt_notifiers); 2752 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2583,8 +2756,9 @@ static void __sched_fork(struct task_struct *p)
2583/* 2756/*
2584 * fork()/clone()-time setup: 2757 * fork()/clone()-time setup:
2585 */ 2758 */
2586void sched_fork(struct task_struct *p, int clone_flags) 2759void sched_fork(struct task_struct *p)
2587{ 2760{
2761 unsigned long flags;
2588 int cpu = get_cpu(); 2762 int cpu = get_cpu();
2589 2763
2590 __sched_fork(p); 2764 __sched_fork(p);
@@ -2635,16 +2809,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
2635 * 2809 *
2636 * Silence PROVE_RCU. 2810 * Silence PROVE_RCU.
2637 */ 2811 */
2638 rcu_read_lock(); 2812 raw_spin_lock_irqsave(&p->pi_lock, flags);
2639 set_task_cpu(p, cpu); 2813 set_task_cpu(p, cpu);
2640 rcu_read_unlock(); 2814 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2641 2815
2642#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2816#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2643 if (likely(sched_info_on())) 2817 if (likely(sched_info_on()))
2644 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2818 memset(&p->sched_info, 0, sizeof(p->sched_info));
2645#endif 2819#endif
2646#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2820#if defined(CONFIG_SMP)
2647 p->oncpu = 0; 2821 p->on_cpu = 0;
2648#endif 2822#endif
2649#ifdef CONFIG_PREEMPT 2823#ifdef CONFIG_PREEMPT
2650 /* Want to start with kernel preemption disabled. */ 2824 /* Want to start with kernel preemption disabled. */
@@ -2664,41 +2838,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
2664 * that must be done for every newly created context, then puts the task 2838 * that must be done for every newly created context, then puts the task
2665 * on the runqueue and wakes it. 2839 * on the runqueue and wakes it.
2666 */ 2840 */
2667void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2841void wake_up_new_task(struct task_struct *p)
2668{ 2842{
2669 unsigned long flags; 2843 unsigned long flags;
2670 struct rq *rq; 2844 struct rq *rq;
2671 int cpu __maybe_unused = get_cpu();
2672 2845
2846 raw_spin_lock_irqsave(&p->pi_lock, flags);
2673#ifdef CONFIG_SMP 2847#ifdef CONFIG_SMP
2674 rq = task_rq_lock(p, &flags);
2675 p->state = TASK_WAKING;
2676
2677 /* 2848 /*
2678 * Fork balancing, do it here and not earlier because: 2849 * Fork balancing, do it here and not earlier because:
2679 * - cpus_allowed can change in the fork path 2850 * - cpus_allowed can change in the fork path
2680 * - any previously selected cpu might disappear through hotplug 2851 * - any previously selected cpu might disappear through hotplug
2681 *
2682 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2683 * without people poking at ->cpus_allowed.
2684 */ 2852 */
2685 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2853 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2686 set_task_cpu(p, cpu);
2687
2688 p->state = TASK_RUNNING;
2689 task_rq_unlock(rq, &flags);
2690#endif 2854#endif
2691 2855
2692 rq = task_rq_lock(p, &flags); 2856 rq = __task_rq_lock(p);
2693 activate_task(rq, p, 0); 2857 activate_task(rq, p, 0);
2694 trace_sched_wakeup_new(p, 1); 2858 p->on_rq = 1;
2859 trace_sched_wakeup_new(p, true);
2695 check_preempt_curr(rq, p, WF_FORK); 2860 check_preempt_curr(rq, p, WF_FORK);
2696#ifdef CONFIG_SMP 2861#ifdef CONFIG_SMP
2697 if (p->sched_class->task_woken) 2862 if (p->sched_class->task_woken)
2698 p->sched_class->task_woken(rq, p); 2863 p->sched_class->task_woken(rq, p);
2699#endif 2864#endif
2700 task_rq_unlock(rq, &flags); 2865 task_rq_unlock(rq, p, &flags);
2701 put_cpu();
2702} 2866}
2703 2867
2704#ifdef CONFIG_PREEMPT_NOTIFIERS 2868#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2776,9 +2940,12 @@ static inline void
2776prepare_task_switch(struct rq *rq, struct task_struct *prev, 2940prepare_task_switch(struct rq *rq, struct task_struct *prev,
2777 struct task_struct *next) 2941 struct task_struct *next)
2778{ 2942{
2943 sched_info_switch(prev, next);
2944 perf_event_task_sched_out(prev, next);
2779 fire_sched_out_preempt_notifiers(prev, next); 2945 fire_sched_out_preempt_notifiers(prev, next);
2780 prepare_lock_switch(rq, next); 2946 prepare_lock_switch(rq, next);
2781 prepare_arch_switch(next); 2947 prepare_arch_switch(next);
2948 trace_sched_switch(prev, next);
2782} 2949}
2783 2950
2784/** 2951/**
@@ -2911,7 +3078,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2911 struct mm_struct *mm, *oldmm; 3078 struct mm_struct *mm, *oldmm;
2912 3079
2913 prepare_task_switch(rq, prev, next); 3080 prepare_task_switch(rq, prev, next);
2914 trace_sched_switch(prev, next); 3081
2915 mm = next->mm; 3082 mm = next->mm;
2916 oldmm = prev->active_mm; 3083 oldmm = prev->active_mm;
2917 /* 3084 /*
@@ -3404,27 +3571,22 @@ void sched_exec(void)
3404{ 3571{
3405 struct task_struct *p = current; 3572 struct task_struct *p = current;
3406 unsigned long flags; 3573 unsigned long flags;
3407 struct rq *rq;
3408 int dest_cpu; 3574 int dest_cpu;
3409 3575
3410 rq = task_rq_lock(p, &flags); 3576 raw_spin_lock_irqsave(&p->pi_lock, flags);
3411 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3577 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3412 if (dest_cpu == smp_processor_id()) 3578 if (dest_cpu == smp_processor_id())
3413 goto unlock; 3579 goto unlock;
3414 3580
3415 /* 3581 if (likely(cpu_active(dest_cpu))) {
3416 * select_task_rq() can race against ->cpus_allowed
3417 */
3418 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3419 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3420 struct migration_arg arg = { p, dest_cpu }; 3582 struct migration_arg arg = { p, dest_cpu };
3421 3583
3422 task_rq_unlock(rq, &flags); 3584 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3423 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 3585 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3424 return; 3586 return;
3425 } 3587 }
3426unlock: 3588unlock:
3427 task_rq_unlock(rq, &flags); 3589 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3428} 3590}
3429 3591
3430#endif 3592#endif
@@ -3461,7 +3623,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
3461 3623
3462 rq = task_rq_lock(p, &flags); 3624 rq = task_rq_lock(p, &flags);
3463 ns = do_task_delta_exec(p, rq); 3625 ns = do_task_delta_exec(p, rq);
3464 task_rq_unlock(rq, &flags); 3626 task_rq_unlock(rq, p, &flags);
3465 3627
3466 return ns; 3628 return ns;
3467} 3629}
@@ -3479,7 +3641,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3479 3641
3480 rq = task_rq_lock(p, &flags); 3642 rq = task_rq_lock(p, &flags);
3481 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3643 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3482 task_rq_unlock(rq, &flags); 3644 task_rq_unlock(rq, p, &flags);
3483 3645
3484 return ns; 3646 return ns;
3485} 3647}
@@ -3503,7 +3665,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
3503 rq = task_rq_lock(p, &flags); 3665 rq = task_rq_lock(p, &flags);
3504 thread_group_cputime(p, &totals); 3666 thread_group_cputime(p, &totals);
3505 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3667 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3506 task_rq_unlock(rq, &flags); 3668 task_rq_unlock(rq, p, &flags);
3507 3669
3508 return ns; 3670 return ns;
3509} 3671}
@@ -3568,6 +3730,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3568} 3730}
3569 3731
3570/* 3732/*
3733 * Account system cpu time to a process and desired cpustat field
3734 * @p: the process that the cpu time gets accounted to
3735 * @cputime: the cpu time spent in kernel space since the last update
3736 * @cputime_scaled: cputime scaled by cpu frequency
3737 * @target_cputime64: pointer to cpustat field that has to be updated
3738 */
3739static inline
3740void __account_system_time(struct task_struct *p, cputime_t cputime,
3741 cputime_t cputime_scaled, cputime64_t *target_cputime64)
3742{
3743 cputime64_t tmp = cputime_to_cputime64(cputime);
3744
3745 /* Add system time to process. */
3746 p->stime = cputime_add(p->stime, cputime);
3747 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3748 account_group_system_time(p, cputime);
3749
3750 /* Add system time to cpustat. */
3751 *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3752 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3753
3754 /* Account for system time used */
3755 acct_update_integrals(p);
3756}
3757
3758/*
3571 * Account system cpu time to a process. 3759 * Account system cpu time to a process.
3572 * @p: the process that the cpu time gets accounted to 3760 * @p: the process that the cpu time gets accounted to
3573 * @hardirq_offset: the offset to subtract from hardirq_count() 3761 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3578,36 +3766,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3578 cputime_t cputime, cputime_t cputime_scaled) 3766 cputime_t cputime, cputime_t cputime_scaled)
3579{ 3767{
3580 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3768 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3581 cputime64_t tmp; 3769 cputime64_t *target_cputime64;
3582 3770
3583 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3771 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3584 account_guest_time(p, cputime, cputime_scaled); 3772 account_guest_time(p, cputime, cputime_scaled);
3585 return; 3773 return;
3586 } 3774 }
3587 3775
3588 /* Add system time to process. */
3589 p->stime = cputime_add(p->stime, cputime);
3590 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3591 account_group_system_time(p, cputime);
3592
3593 /* Add system time to cpustat. */
3594 tmp = cputime_to_cputime64(cputime);
3595 if (hardirq_count() - hardirq_offset) 3776 if (hardirq_count() - hardirq_offset)
3596 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3777 target_cputime64 = &cpustat->irq;
3597 else if (in_serving_softirq()) 3778 else if (in_serving_softirq())
3598 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3779 target_cputime64 = &cpustat->softirq;
3599 else 3780 else
3600 cpustat->system = cputime64_add(cpustat->system, tmp); 3781 target_cputime64 = &cpustat->system;
3601 3782
3602 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); 3783 __account_system_time(p, cputime, cputime_scaled, target_cputime64);
3603
3604 /* Account for system time used */
3605 acct_update_integrals(p);
3606} 3784}
3607 3785
3608/* 3786/*
3609 * Account for involuntary wait time. 3787 * Account for involuntary wait time.
3610 * @steal: the cpu time spent in involuntary wait 3788 * @cputime: the cpu time spent in involuntary wait
3611 */ 3789 */
3612void account_steal_time(cputime_t cputime) 3790void account_steal_time(cputime_t cputime)
3613{ 3791{
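
account_system_time() is reduced to selecting the right cpustat bucket and handing the bookkeeping to the new __account_system_time() helper. A simplified model of that refactor is sketched below; the field names and units are simplified for illustration and do not match the kernel types.

#include <stdint.h>
#include <stdio.h>

/*
 * Simplified model: one helper updates the task and the chosen cpustat
 * bucket; the caller only decides which bucket applies.
 */
struct cpustat { uint64_t irq, softirq, system; };
struct task { uint64_t stime; };

static void account_sys(struct task *p, uint64_t cputime, uint64_t *bucket)
{
	p->stime += cputime;	/* add system time to the process */
	*bucket += cputime;	/* ...and to the selected cpustat field */
}

int main(void)
{
	struct cpustat stat = { 0 };
	struct task p = { 0 };
	int in_hardirq = 0, in_softirq = 1;
	uint64_t *bucket;

	if (in_hardirq)
		bucket = &stat.irq;
	else if (in_softirq)
		bucket = &stat.softirq;
	else
		bucket = &stat.system;

	account_sys(&p, 1000, bucket);
	printf("stime=%llu softirq=%llu\n",
	       (unsigned long long)p.stime,
	       (unsigned long long)stat.softirq);
	return 0;
}
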
@@ -3635,6 +3813,73 @@ void account_idle_time(cputime_t cputime)
3635 3813
3636#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3814#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3637 3815
3816#ifdef CONFIG_IRQ_TIME_ACCOUNTING
3817/*
3818 * Account a tick to a process and cpustat
3819 * @p: the process that the cpu time gets accounted to
3820 * @user_tick: is the tick from userspace
3821 * @rq: the pointer to rq
3822 *
3823 * Tick demultiplexing follows the order
3824 * - pending hardirq update
3825 * - pending softirq update
3826 * - user_time
3827 * - idle_time
3828 * - system time
3829 * - check for guest_time
3830 * - else account as system_time
3831 *
3832 * Check for hardirq is done both for system and user time as there is
3833 * no timer going off while we are on hardirq and hence we may never get an
3834 * opportunity to update it solely in system time.
3835 * p->stime and friends are only updated on system time and not on irq
3836 * softirq as those do not count in task exec_runtime any more.
3837 */
3838static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3839 struct rq *rq)
3840{
3841 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3842 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3843 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3844
3845 if (irqtime_account_hi_update()) {
3846 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3847 } else if (irqtime_account_si_update()) {
3848 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3849 } else if (this_cpu_ksoftirqd() == p) {
3850 /*
3851 * ksoftirqd time do not get accounted in cpu_softirq_time.
3852 * So, we have to handle it separately here.
3853 * Also, p->stime needs to be updated for ksoftirqd.
3854 */
3855 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3856 &cpustat->softirq);
3857 } else if (user_tick) {
3858 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3859 } else if (p == rq->idle) {
3860 account_idle_time(cputime_one_jiffy);
3861 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3862 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3863 } else {
3864 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3865 &cpustat->system);
3866 }
3867}
3868
3869static void irqtime_account_idle_ticks(int ticks)
3870{
3871 int i;
3872 struct rq *rq = this_rq();
3873
3874 for (i = 0; i < ticks; i++)
3875 irqtime_account_process_tick(current, 0, rq);
3876}
3877#else /* CONFIG_IRQ_TIME_ACCOUNTING */
3878static void irqtime_account_idle_ticks(int ticks) {}
3879static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3880 struct rq *rq) {}
3881#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3882
3638/* 3883/*
3639 * Account a single tick of cpu time. 3884 * Account a single tick of cpu time.
3640 * @p: the process that the cpu time gets accounted to 3885 * @p: the process that the cpu time gets accounted to
@@ -3645,6 +3890,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
3645 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3890 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3646 struct rq *rq = this_rq(); 3891 struct rq *rq = this_rq();
3647 3892
3893 if (sched_clock_irqtime) {
3894 irqtime_account_process_tick(p, user_tick, rq);
3895 return;
3896 }
3897
3648 if (user_tick) 3898 if (user_tick)
3649 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3899 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3650 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 3900 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3670,6 +3920,12 @@ void account_steal_ticks(unsigned long ticks)
3670 */ 3920 */
3671void account_idle_ticks(unsigned long ticks) 3921void account_idle_ticks(unsigned long ticks)
3672{ 3922{
3923
3924 if (sched_clock_irqtime) {
3925 irqtime_account_idle_ticks(ticks);
3926 return;
3927 }
3928
3673 account_idle_time(jiffies_to_cputime(ticks)); 3929 account_idle_time(jiffies_to_cputime(ticks));
3674} 3930}
3675 3931
@@ -3763,9 +4019,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3763/* 4019/*
3764 * This function gets called by the timer code, with HZ frequency. 4020 * This function gets called by the timer code, with HZ frequency.
3765 * We call it with interrupts disabled. 4021 * We call it with interrupts disabled.
3766 *
3767 * It also gets called by the fork code, when changing the parent's
3768 * timeslices.
3769 */ 4022 */
3770void scheduler_tick(void) 4023void scheduler_tick(void)
3771{ 4024{
@@ -3885,17 +4138,11 @@ static inline void schedule_debug(struct task_struct *prev)
3885 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4138 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3886 4139
3887 schedstat_inc(this_rq(), sched_count); 4140 schedstat_inc(this_rq(), sched_count);
3888#ifdef CONFIG_SCHEDSTATS
3889 if (unlikely(prev->lock_depth >= 0)) {
3890 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
3891 schedstat_inc(prev, sched_info.bkl_count);
3892 }
3893#endif
3894} 4141}
3895 4142
3896static void put_prev_task(struct rq *rq, struct task_struct *prev) 4143static void put_prev_task(struct rq *rq, struct task_struct *prev)
3897{ 4144{
3898 if (prev->se.on_rq) 4145 if (prev->on_rq || rq->skip_clock_update < 0)
3899 update_rq_clock(rq); 4146 update_rq_clock(rq);
3900 prev->sched_class->put_prev_task(rq, prev); 4147 prev->sched_class->put_prev_task(rq, prev);
3901} 4148}
@@ -3945,9 +4192,6 @@ need_resched:
3945 rcu_note_context_switch(cpu); 4192 rcu_note_context_switch(cpu);
3946 prev = rq->curr; 4193 prev = rq->curr;
3947 4194
3948 release_kernel_lock(prev);
3949need_resched_nonpreemptible:
3950
3951 schedule_debug(prev); 4195 schedule_debug(prev);
3952 4196
3953 if (sched_feat(HRTICK)) 4197 if (sched_feat(HRTICK))
@@ -3960,11 +4204,13 @@ need_resched_nonpreemptible:
3960 if (unlikely(signal_pending_state(prev->state, prev))) { 4204 if (unlikely(signal_pending_state(prev->state, prev))) {
3961 prev->state = TASK_RUNNING; 4205 prev->state = TASK_RUNNING;
3962 } else { 4206 } else {
4207 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4208 prev->on_rq = 0;
4209
3963 /* 4210 /*
3964 * If a worker is going to sleep, notify and 4211 * If a worker went to sleep, notify and ask workqueue
3965 * ask workqueue whether it wants to wake up a 4212 * whether it wants to wake up a task to maintain
3966 * task to maintain concurrency. If so, wake 4213 * concurrency.
3967 * up the task.
3968 */ 4214 */
3969 if (prev->flags & PF_WQ_WORKER) { 4215 if (prev->flags & PF_WQ_WORKER) {
3970 struct task_struct *to_wakeup; 4216 struct task_struct *to_wakeup;
@@ -3973,7 +4219,16 @@ need_resched_nonpreemptible:
3973 if (to_wakeup) 4219 if (to_wakeup)
3974 try_to_wake_up_local(to_wakeup); 4220 try_to_wake_up_local(to_wakeup);
3975 } 4221 }
3976 deactivate_task(rq, prev, DEQUEUE_SLEEP); 4222
4223 /*
4224 * If we are going to sleep and we have plugged IO
4225 * queued, make sure to submit it to avoid deadlocks.
4226 */
4227 if (blk_needs_flush_plug(prev)) {
4228 raw_spin_unlock(&rq->lock);
4229 blk_schedule_flush_plug(prev);
4230 raw_spin_lock(&rq->lock);
4231 }
3977 } 4232 }
3978 switch_count = &prev->nvcsw; 4233 switch_count = &prev->nvcsw;
3979 } 4234 }
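Note: the hunk above makes a task flush any plugged block I/O it still owns before it sleeps, dropping the runqueue lock around the flush to avoid deadlock. A simplified userspace sketch of that drop-flush-relock shape, with made-up names (pending_io, flush_pending) standing in for the blk_* helpers:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;
static int pending_io;

static void flush_pending(void)
{
    printf("submitting %d deferred requests\n", pending_io);
    pending_io = 0;
}

static void prepare_to_sleep(void)
{
    pthread_mutex_lock(&rq_lock);
    if (pending_io) {                   /* cf. blk_needs_flush_plug(prev) */
        pthread_mutex_unlock(&rq_lock); /* cannot submit while holding rq lock */
        flush_pending();                /* cf. blk_schedule_flush_plug(prev) */
        pthread_mutex_lock(&rq_lock);
    }
    /* ... would now pick the next task and switch away ... */
    pthread_mutex_unlock(&rq_lock);
}

int main(void)
{
    pending_io = 3;
    prepare_to_sleep();
    return 0;
}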
@@ -3989,9 +4244,6 @@ need_resched_nonpreemptible:
3989 rq->skip_clock_update = 0; 4244 rq->skip_clock_update = 0;
3990 4245
3991 if (likely(prev != next)) { 4246 if (likely(prev != next)) {
3992 sched_info_switch(prev, next);
3993 perf_event_task_sched_out(prev, next);
3994
3995 rq->nr_switches++; 4247 rq->nr_switches++;
3996 rq->curr = next; 4248 rq->curr = next;
3997 ++*switch_count; 4249 ++*switch_count;
@@ -4010,9 +4262,6 @@ need_resched_nonpreemptible:
4010 4262
4011 post_schedule(rq); 4263 post_schedule(rq);
4012 4264
4013 if (unlikely(reacquire_kernel_lock(prev)))
4014 goto need_resched_nonpreemptible;
4015
4016 preempt_enable_no_resched(); 4265 preempt_enable_no_resched();
4017 if (need_resched()) 4266 if (need_resched())
4018 goto need_resched; 4267 goto need_resched;
@@ -4020,70 +4269,53 @@ need_resched_nonpreemptible:
4020EXPORT_SYMBOL(schedule); 4269EXPORT_SYMBOL(schedule);
4021 4270
4022#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4271#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4023/*
4024 * Look out! "owner" is an entirely speculative pointer
4025 * access and not reliable.
4026 */
4027int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4028{
4029 unsigned int cpu;
4030 struct rq *rq;
4031 4272
4032 if (!sched_feat(OWNER_SPIN)) 4273static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4033 return 0; 4274{
4275 bool ret = false;
4034 4276
4035#ifdef CONFIG_DEBUG_PAGEALLOC 4277 rcu_read_lock();
4036 /* 4278 if (lock->owner != owner)
4037 * Need to access the cpu field knowing that 4279 goto fail;
4038 * DEBUG_PAGEALLOC could have unmapped it if
4039 * the mutex owner just released it and exited.
4040 */
4041 if (probe_kernel_address(&owner->cpu, cpu))
4042 return 0;
4043#else
4044 cpu = owner->cpu;
4045#endif
4046 4280
4047 /* 4281 /*
4048 * Even if the access succeeded (likely case), 4282 * Ensure we emit the owner->on_cpu, dereference _after_ checking
4049 * the cpu field may no longer be valid. 4283 * lock->owner still matches owner, if that fails, owner might
4284 * point to free()d memory, if it still matches, the rcu_read_lock()
4285 * ensures the memory stays valid.
4050 */ 4286 */
4051 if (cpu >= nr_cpumask_bits) 4287 barrier();
4052 return 0;
4053 4288
4054 /* 4289 ret = owner->on_cpu;
4055 * We need to validate that we can do a 4290fail:
4056 * get_cpu() and that we have the percpu area. 4291 rcu_read_unlock();
4057 */
4058 if (!cpu_online(cpu))
4059 return 0;
4060 4292
4061 rq = cpu_rq(cpu); 4293 return ret;
4294}
4062 4295
4063 for (;;) { 4296/*
4064 /* 4297 * Look out! "owner" is an entirely speculative pointer
4065 * Owner changed, break to re-assess state. 4298 * access and not reliable.
4066 */ 4299 */
4067 if (lock->owner != owner) { 4300int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4068 /* 4301{
4069 * If the lock has switched to a different owner, 4302 if (!sched_feat(OWNER_SPIN))
4070 * we likely have heavy contention. Return 0 to quit 4303 return 0;
4071 * optimistic spinning and not contend further:
4072 */
4073 if (lock->owner)
4074 return 0;
4075 break;
4076 }
4077 4304
4078 /* 4305 while (owner_running(lock, owner)) {
4079 * Is that owner really running on that cpu? 4306 if (need_resched())
4080 */
4081 if (task_thread_info(rq->curr) != owner || need_resched())
4082 return 0; 4307 return 0;
4083 4308
4084 arch_mutex_cpu_relax(); 4309 arch_mutex_cpu_relax();
4085 } 4310 }
4086 4311
4312 /*
4313 * If the owner changed to another task there is likely
4314 * heavy contention, stop spinning.
4315 */
4316 if (lock->owner)
4317 return 0;
4318
4087 return 1; 4319 return 1;
4088} 4320}
4089#endif 4321#endif
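Note: the rewritten spinner above checks liveness through lock->owner and owner->on_cpu instead of chasing the owner's CPU number. A userspace approximation of that loop using C11 atomics; it omits the RCU read lock and compiler barrier the kernel relies on, and the task/lock structs are simplified stand-ins:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct task { _Atomic bool on_cpu; };
struct lock { _Atomic(struct task *) owner; };

static bool owner_running(struct lock *l, struct task *owner)
{
    if (atomic_load(&l->owner) != owner)   /* owner changed under us */
        return false;
    return atomic_load(&owner->on_cpu);    /* still executing somewhere? */
}

static int spin_on_owner(struct lock *l, struct task *owner)
{
    while (owner_running(l, owner))
        ;                                  /* cpu_relax() in the real loop */

    /*
     * If ownership moved to another task there is likely heavy
     * contention: stop spinning and block instead.
     */
    if (atomic_load(&l->owner))
        return 0;
    return 1;                              /* lock looks free: worth retrying */
}

int main(void)
{
    struct task t = { .on_cpu = false };
    struct lock l = { .owner = NULL };

    atomic_store(&l.owner, &t);
    printf("owner running: %d\n", owner_running(&l, &t));
    atomic_store(&l.owner, NULL);
    printf("spin result: %d\n", spin_on_owner(&l, &t));
    return 0;
}

The key property is that the spinner keeps burning cycles only while the same owner is still executing; any ownership change ends the spin.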
@@ -4213,6 +4445,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4213{ 4445{
4214 __wake_up_common(q, mode, 1, 0, key); 4446 __wake_up_common(q, mode, 1, 0, key);
4215} 4447}
4448EXPORT_SYMBOL_GPL(__wake_up_locked_key);
4216 4449
4217/** 4450/**
4218 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 4451 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4542,19 +4775,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
4542 */ 4775 */
4543void rt_mutex_setprio(struct task_struct *p, int prio) 4776void rt_mutex_setprio(struct task_struct *p, int prio)
4544{ 4777{
4545 unsigned long flags;
4546 int oldprio, on_rq, running; 4778 int oldprio, on_rq, running;
4547 struct rq *rq; 4779 struct rq *rq;
4548 const struct sched_class *prev_class; 4780 const struct sched_class *prev_class;
4549 4781
4550 BUG_ON(prio < 0 || prio > MAX_PRIO); 4782 BUG_ON(prio < 0 || prio > MAX_PRIO);
4551 4783
4552 rq = task_rq_lock(p, &flags); 4784 rq = __task_rq_lock(p);
4553 4785
4554 trace_sched_pi_setprio(p, prio); 4786 trace_sched_pi_setprio(p, prio);
4555 oldprio = p->prio; 4787 oldprio = p->prio;
4556 prev_class = p->sched_class; 4788 prev_class = p->sched_class;
4557 on_rq = p->se.on_rq; 4789 on_rq = p->on_rq;
4558 running = task_current(rq, p); 4790 running = task_current(rq, p);
4559 if (on_rq) 4791 if (on_rq)
4560 dequeue_task(rq, p, 0); 4792 dequeue_task(rq, p, 0);
@@ -4570,12 +4802,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4570 4802
4571 if (running) 4803 if (running)
4572 p->sched_class->set_curr_task(rq); 4804 p->sched_class->set_curr_task(rq);
4573 if (on_rq) { 4805 if (on_rq)
4574 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4806 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4575 4807
4576 check_class_changed(rq, p, prev_class, oldprio, running); 4808 check_class_changed(rq, p, prev_class, oldprio);
4577 } 4809 __task_rq_unlock(rq);
4578 task_rq_unlock(rq, &flags);
4579} 4810}
4580 4811
4581#endif 4812#endif
@@ -4603,7 +4834,7 @@ void set_user_nice(struct task_struct *p, long nice)
4603 p->static_prio = NICE_TO_PRIO(nice); 4834 p->static_prio = NICE_TO_PRIO(nice);
4604 goto out_unlock; 4835 goto out_unlock;
4605 } 4836 }
4606 on_rq = p->se.on_rq; 4837 on_rq = p->on_rq;
4607 if (on_rq) 4838 if (on_rq)
4608 dequeue_task(rq, p, 0); 4839 dequeue_task(rq, p, 0);
4609 4840
@@ -4623,7 +4854,7 @@ void set_user_nice(struct task_struct *p, long nice)
4623 resched_task(rq->curr); 4854 resched_task(rq->curr);
4624 } 4855 }
4625out_unlock: 4856out_unlock:
4626 task_rq_unlock(rq, &flags); 4857 task_rq_unlock(rq, p, &flags);
4627} 4858}
4628EXPORT_SYMBOL(set_user_nice); 4859EXPORT_SYMBOL(set_user_nice);
4629 4860
@@ -4737,8 +4968,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4737static void 4968static void
4738__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4969__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4739{ 4970{
4740 BUG_ON(p->se.on_rq);
4741
4742 p->policy = policy; 4971 p->policy = policy;
4743 p->rt_priority = prio; 4972 p->rt_priority = prio;
4744 p->normal_prio = normal_prio(p); 4973 p->normal_prio = normal_prio(p);
@@ -4761,8 +4990,11 @@ static bool check_same_owner(struct task_struct *p)
4761 4990
4762 rcu_read_lock(); 4991 rcu_read_lock();
4763 pcred = __task_cred(p); 4992 pcred = __task_cred(p);
4764 match = (cred->euid == pcred->euid || 4993 if (cred->user->user_ns == pcred->user->user_ns)
4765 cred->euid == pcred->uid); 4994 match = (cred->euid == pcred->euid ||
4995 cred->euid == pcred->uid);
4996 else
4997 match = false;
4766 rcu_read_unlock(); 4998 rcu_read_unlock();
4767 return match; 4999 return match;
4768} 5000}
@@ -4822,12 +5054,15 @@ recheck:
4822 param->sched_priority > rlim_rtprio) 5054 param->sched_priority > rlim_rtprio)
4823 return -EPERM; 5055 return -EPERM;
4824 } 5056 }
5057
4825 /* 5058 /*
4826 * Like positive nice levels, dont allow tasks to 5059 * Treat SCHED_IDLE as nice 20. Only allow a switch to
4827 * move out of SCHED_IDLE either: 5060 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4828 */ 5061 */
4829 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 5062 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4830 return -EPERM; 5063 if (!can_nice(p, TASK_NICE(p)))
5064 return -EPERM;
5065 }
4831 5066
4832 /* can't change other user's priorities */ 5067 /* can't change other user's priorities */
4833 if (!check_same_owner(p)) 5068 if (!check_same_owner(p))
@@ -4847,21 +5082,29 @@ recheck:
4847 /* 5082 /*
4848 * make sure no PI-waiters arrive (or leave) while we are 5083 * make sure no PI-waiters arrive (or leave) while we are
4849 * changing the priority of the task: 5084 * changing the priority of the task:
4850 */ 5085 *
4851 raw_spin_lock_irqsave(&p->pi_lock, flags); 5086 * To be able to change p->policy safely, the appropriate
4852 /*
4853 * To be able to change p->policy safely, the apropriate
4854 * runqueue lock must be held. 5087 * runqueue lock must be held.
4855 */ 5088 */
4856 rq = __task_rq_lock(p); 5089 rq = task_rq_lock(p, &flags);
4857 5090
4858 /* 5091 /*
 4859 * Changing the policy of the stop threads is a very bad idea 5092
4860 */ 5093 */
4861 if (p == rq->stop) { 5094 if (p == rq->stop) {
5095 task_rq_unlock(rq, p, &flags);
5096 return -EINVAL;
5097 }
5098
5099 /*
5100 * If not changing anything there's no need to proceed further:
5101 */
5102 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
5103 param->sched_priority == p->rt_priority))) {
5104
4862 __task_rq_unlock(rq); 5105 __task_rq_unlock(rq);
4863 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5106 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4864 return -EINVAL; 5107 return 0;
4865 } 5108 }
4866 5109
4867#ifdef CONFIG_RT_GROUP_SCHED 5110#ifdef CONFIG_RT_GROUP_SCHED
@@ -4873,8 +5116,7 @@ recheck:
4873 if (rt_bandwidth_enabled() && rt_policy(policy) && 5116 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4874 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5117 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4875 !task_group_is_autogroup(task_group(p))) { 5118 !task_group_is_autogroup(task_group(p))) {
4876 __task_rq_unlock(rq); 5119 task_rq_unlock(rq, p, &flags);
4877 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4878 return -EPERM; 5120 return -EPERM;
4879 } 5121 }
4880 } 5122 }
@@ -4883,11 +5125,10 @@ recheck:
4883 /* recheck policy now with rq lock held */ 5125 /* recheck policy now with rq lock held */
4884 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5126 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4885 policy = oldpolicy = -1; 5127 policy = oldpolicy = -1;
4886 __task_rq_unlock(rq); 5128 task_rq_unlock(rq, p, &flags);
4887 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4888 goto recheck; 5129 goto recheck;
4889 } 5130 }
4890 on_rq = p->se.on_rq; 5131 on_rq = p->on_rq;
4891 running = task_current(rq, p); 5132 running = task_current(rq, p);
4892 if (on_rq) 5133 if (on_rq)
4893 deactivate_task(rq, p, 0); 5134 deactivate_task(rq, p, 0);
@@ -4902,13 +5143,11 @@ recheck:
4902 5143
4903 if (running) 5144 if (running)
4904 p->sched_class->set_curr_task(rq); 5145 p->sched_class->set_curr_task(rq);
4905 if (on_rq) { 5146 if (on_rq)
4906 activate_task(rq, p, 0); 5147 activate_task(rq, p, 0);
4907 5148
4908 check_class_changed(rq, p, prev_class, oldprio, running); 5149 check_class_changed(rq, p, prev_class, oldprio);
4909 } 5150 task_rq_unlock(rq, p, &flags);
4910 __task_rq_unlock(rq);
4911 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4912 5151
4913 rt_mutex_adjust_pi(p); 5152 rt_mutex_adjust_pi(p);
4914 5153
@@ -5088,7 +5327,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5088 goto out_free_cpus_allowed; 5327 goto out_free_cpus_allowed;
5089 } 5328 }
5090 retval = -EPERM; 5329 retval = -EPERM;
5091 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5330 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
5092 goto out_unlock; 5331 goto out_unlock;
5093 5332
5094 retval = security_task_setscheduler(p); 5333 retval = security_task_setscheduler(p);
@@ -5159,7 +5398,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5159{ 5398{
5160 struct task_struct *p; 5399 struct task_struct *p;
5161 unsigned long flags; 5400 unsigned long flags;
5162 struct rq *rq;
5163 int retval; 5401 int retval;
5164 5402
5165 get_online_cpus(); 5403 get_online_cpus();
@@ -5174,9 +5412,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5174 if (retval) 5412 if (retval)
5175 goto out_unlock; 5413 goto out_unlock;
5176 5414
5177 rq = task_rq_lock(p, &flags); 5415 raw_spin_lock_irqsave(&p->pi_lock, flags);
5178 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5416 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5179 task_rq_unlock(rq, &flags); 5417 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5180 5418
5181out_unlock: 5419out_unlock:
5182 rcu_read_unlock(); 5420 rcu_read_unlock();
@@ -5323,6 +5561,67 @@ void __sched yield(void)
5323} 5561}
5324EXPORT_SYMBOL(yield); 5562EXPORT_SYMBOL(yield);
5325 5563
5564/**
5565 * yield_to - yield the current processor to another thread in
5566 * your thread group, or accelerate that thread toward the
5567 * processor it's on.
5568 * @p: target task
5569 * @preempt: whether task preemption is allowed or not
5570 *
5571 * It's the caller's job to ensure that the target task struct
5572 * can't go away on us before we can do any checks.
5573 *
5574 * Returns true if we indeed boosted the target task.
5575 */
5576bool __sched yield_to(struct task_struct *p, bool preempt)
5577{
5578 struct task_struct *curr = current;
5579 struct rq *rq, *p_rq;
5580 unsigned long flags;
5581 bool yielded = 0;
5582
5583 local_irq_save(flags);
5584 rq = this_rq();
5585
5586again:
5587 p_rq = task_rq(p);
5588 double_rq_lock(rq, p_rq);
5589 while (task_rq(p) != p_rq) {
5590 double_rq_unlock(rq, p_rq);
5591 goto again;
5592 }
5593
5594 if (!curr->sched_class->yield_to_task)
5595 goto out;
5596
5597 if (curr->sched_class != p->sched_class)
5598 goto out;
5599
5600 if (task_running(p_rq, p) || p->state)
5601 goto out;
5602
5603 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5604 if (yielded) {
5605 schedstat_inc(rq, yld_count);
5606 /*
5607 * Make p's CPU reschedule; pick_next_entity takes care of
5608 * fairness.
5609 */
5610 if (preempt && rq != p_rq)
5611 resched_task(p_rq->curr);
5612 }
5613
5614out:
5615 double_rq_unlock(rq, p_rq);
5616 local_irq_restore(flags);
5617
5618 if (yielded)
5619 schedule();
5620
5621 return yielded;
5622}
5623EXPORT_SYMBOL_GPL(yield_to);
5624
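Note: yield_to() above samples the target's runqueue without a lock, locks both queues, and then revalidates. A small pthread sketch of that lock-both-then-recheck retry, with simplified rq/task types in place of the kernel ones:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct rq { pthread_mutex_t lock; };
struct task { struct rq *rq; };

/* lock two runqueues in a stable (address) order to avoid ABBA deadlock */
static void double_rq_lock(struct rq *a, struct rq *b)
{
    if (a == b) {
        pthread_mutex_lock(&a->lock);
    } else if ((uintptr_t)a < (uintptr_t)b) {
        pthread_mutex_lock(&a->lock);
        pthread_mutex_lock(&b->lock);
    } else {
        pthread_mutex_lock(&b->lock);
        pthread_mutex_lock(&a->lock);
    }
}

static void double_rq_unlock(struct rq *a, struct rq *b)
{
    pthread_mutex_unlock(&a->lock);
    if (a != b)
        pthread_mutex_unlock(&b->lock);
}

static void yield_to_sketch(struct rq *rq, struct task *p)
{
    struct rq *p_rq;
again:
    p_rq = p->rq;                       /* sampled without any lock held */
    double_rq_lock(rq, p_rq);
    if (p->rq != p_rq) {                /* target migrated meanwhile: retry */
        double_rq_unlock(rq, p_rq);
        goto again;
    }
    /* both queues locked and p is known to be on p_rq at this point */
    double_rq_unlock(rq, p_rq);
}

int main(void)
{
    struct rq r0, r1;
    struct task t = { .rq = &r1 };

    pthread_mutex_init(&r0.lock, NULL);
    pthread_mutex_init(&r1.lock, NULL);
    yield_to_sketch(&r0, &t);
    puts("done");
    return 0;
}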
5326/* 5625/*
5327 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5626 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5328 * that process accounting knows that this is a task in IO wait state. 5627 * that process accounting knows that this is a task in IO wait state.
@@ -5333,6 +5632,7 @@ void __sched io_schedule(void)
5333 5632
5334 delayacct_blkio_start(); 5633 delayacct_blkio_start();
5335 atomic_inc(&rq->nr_iowait); 5634 atomic_inc(&rq->nr_iowait);
5635 blk_flush_plug(current);
5336 current->in_iowait = 1; 5636 current->in_iowait = 1;
5337 schedule(); 5637 schedule();
5338 current->in_iowait = 0; 5638 current->in_iowait = 0;
@@ -5348,6 +5648,7 @@ long __sched io_schedule_timeout(long timeout)
5348 5648
5349 delayacct_blkio_start(); 5649 delayacct_blkio_start();
5350 atomic_inc(&rq->nr_iowait); 5650 atomic_inc(&rq->nr_iowait);
5651 blk_flush_plug(current);
5351 current->in_iowait = 1; 5652 current->in_iowait = 1;
5352 ret = schedule_timeout(timeout); 5653 ret = schedule_timeout(timeout);
5353 current->in_iowait = 0; 5654 current->in_iowait = 0;
@@ -5438,7 +5739,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5438 5739
5439 rq = task_rq_lock(p, &flags); 5740 rq = task_rq_lock(p, &flags);
5440 time_slice = p->sched_class->get_rr_interval(rq, p); 5741 time_slice = p->sched_class->get_rr_interval(rq, p);
5441 task_rq_unlock(rq, &flags); 5742 task_rq_unlock(rq, p, &flags);
5442 5743
5443 rcu_read_unlock(); 5744 rcu_read_unlock();
5444 jiffies_to_timespec(time_slice, &t); 5745 jiffies_to_timespec(time_slice, &t);
@@ -5496,7 +5797,7 @@ void show_state_filter(unsigned long state_filter)
5496 do_each_thread(g, p) { 5797 do_each_thread(g, p) {
5497 /* 5798 /*
5498 * reset the NMI-timeout, listing all files on a slow 5799 * reset the NMI-timeout, listing all files on a slow
5499 * console might take alot of time: 5800 * console might take a lot of time:
5500 */ 5801 */
5501 touch_nmi_watchdog(); 5802 touch_nmi_watchdog();
5502 if (!state_filter || (p->state & state_filter)) 5803 if (!state_filter || (p->state & state_filter))
@@ -5556,22 +5857,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5556 rcu_read_unlock(); 5857 rcu_read_unlock();
5557 5858
5558 rq->curr = rq->idle = idle; 5859 rq->curr = rq->idle = idle;
5559#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5860#if defined(CONFIG_SMP)
5560 idle->oncpu = 1; 5861 idle->on_cpu = 1;
5561#endif 5862#endif
5562 raw_spin_unlock_irqrestore(&rq->lock, flags); 5863 raw_spin_unlock_irqrestore(&rq->lock, flags);
5563 5864
5564 /* Set the preempt count _outside_ the spinlocks! */ 5865 /* Set the preempt count _outside_ the spinlocks! */
5565#if defined(CONFIG_PREEMPT)
5566 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5567#else
5568 task_thread_info(idle)->preempt_count = 0; 5866 task_thread_info(idle)->preempt_count = 0;
5569#endif 5867
5570 /* 5868 /*
5571 * The idle tasks have their own, simple scheduling class: 5869 * The idle tasks have their own, simple scheduling class:
5572 */ 5870 */
5573 idle->sched_class = &idle_sched_class; 5871 idle->sched_class = &idle_sched_class;
5574 ftrace_graph_init_task(idle); 5872 ftrace_graph_init_idle_task(idle, cpu);
5575} 5873}
5576 5874
5577/* 5875/*
@@ -5661,26 +5959,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5661 unsigned int dest_cpu; 5959 unsigned int dest_cpu;
5662 int ret = 0; 5960 int ret = 0;
5663 5961
5664 /*
5665 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5666 * drop the rq->lock and still rely on ->cpus_allowed.
5667 */
5668again:
5669 while (task_is_waking(p))
5670 cpu_relax();
5671 rq = task_rq_lock(p, &flags); 5962 rq = task_rq_lock(p, &flags);
5672 if (task_is_waking(p)) { 5963
5673 task_rq_unlock(rq, &flags); 5964 if (cpumask_equal(&p->cpus_allowed, new_mask))
5674 goto again; 5965 goto out;
5675 }
5676 5966
5677 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5967 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5678 ret = -EINVAL; 5968 ret = -EINVAL;
5679 goto out; 5969 goto out;
5680 } 5970 }
5681 5971
5682 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5972 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5683 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5684 ret = -EINVAL; 5973 ret = -EINVAL;
5685 goto out; 5974 goto out;
5686 } 5975 }
@@ -5697,16 +5986,16 @@ again:
5697 goto out; 5986 goto out;
5698 5987
5699 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5988 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5700 if (migrate_task(p, rq)) { 5989 if (p->on_rq) {
5701 struct migration_arg arg = { p, dest_cpu }; 5990 struct migration_arg arg = { p, dest_cpu };
5702 /* Need help from migration thread: drop lock and wait. */ 5991 /* Need help from migration thread: drop lock and wait. */
5703 task_rq_unlock(rq, &flags); 5992 task_rq_unlock(rq, p, &flags);
5704 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 5993 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5705 tlb_migrate_finish(p->mm); 5994 tlb_migrate_finish(p->mm);
5706 return 0; 5995 return 0;
5707 } 5996 }
5708out: 5997out:
5709 task_rq_unlock(rq, &flags); 5998 task_rq_unlock(rq, p, &flags);
5710 5999
5711 return ret; 6000 return ret;
5712} 6001}
@@ -5734,6 +6023,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5734 rq_src = cpu_rq(src_cpu); 6023 rq_src = cpu_rq(src_cpu);
5735 rq_dest = cpu_rq(dest_cpu); 6024 rq_dest = cpu_rq(dest_cpu);
5736 6025
6026 raw_spin_lock(&p->pi_lock);
5737 double_rq_lock(rq_src, rq_dest); 6027 double_rq_lock(rq_src, rq_dest);
5738 /* Already moved. */ 6028 /* Already moved. */
5739 if (task_cpu(p) != src_cpu) 6029 if (task_cpu(p) != src_cpu)
@@ -5746,7 +6036,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5746 * If we're not on a rq, the next wake-up will ensure we're 6036 * If we're not on a rq, the next wake-up will ensure we're
5747 * placed properly. 6037 * placed properly.
5748 */ 6038 */
5749 if (p->se.on_rq) { 6039 if (p->on_rq) {
5750 deactivate_task(rq_src, p, 0); 6040 deactivate_task(rq_src, p, 0);
5751 set_task_cpu(p, dest_cpu); 6041 set_task_cpu(p, dest_cpu);
5752 activate_task(rq_dest, p, 0); 6042 activate_task(rq_dest, p, 0);
@@ -5756,6 +6046,7 @@ done:
5756 ret = 1; 6046 ret = 1;
5757fail: 6047fail:
5758 double_rq_unlock(rq_src, rq_dest); 6048 double_rq_unlock(rq_src, rq_dest);
6049 raw_spin_unlock(&p->pi_lock);
5759 return ret; 6050 return ret;
5760} 6051}
5761 6052
@@ -6096,6 +6387,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6096 6387
6097#ifdef CONFIG_HOTPLUG_CPU 6388#ifdef CONFIG_HOTPLUG_CPU
6098 case CPU_DYING: 6389 case CPU_DYING:
6390 sched_ttwu_pending();
6099 /* Update our root-domain */ 6391 /* Update our root-domain */
6100 raw_spin_lock_irqsave(&rq->lock, flags); 6392 raw_spin_lock_irqsave(&rq->lock, flags);
6101 if (rq->rd) { 6393 if (rq->rd) {
@@ -6111,6 +6403,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6111 break; 6403 break;
6112#endif 6404#endif
6113 } 6405 }
6406
6407 update_max_interval();
6408
6114 return NOTIFY_OK; 6409 return NOTIFY_OK;
6115} 6410}
6116 6411
@@ -6171,6 +6466,8 @@ early_initcall(migration_init);
6171 6466
6172#ifdef CONFIG_SMP 6467#ifdef CONFIG_SMP
6173 6468
6469static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6470
6174#ifdef CONFIG_SCHED_DEBUG 6471#ifdef CONFIG_SCHED_DEBUG
6175 6472
6176static __read_mostly int sched_domain_debug_enabled; 6473static __read_mostly int sched_domain_debug_enabled;
@@ -6245,7 +6542,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6245 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6542 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6246 6543
6247 printk(KERN_CONT " %s", str); 6544 printk(KERN_CONT " %s", str);
6248 if (group->cpu_power != SCHED_LOAD_SCALE) { 6545 if (group->cpu_power != SCHED_POWER_SCALE) {
6249 printk(KERN_CONT " (cpu_power = %d)", 6546 printk(KERN_CONT " (cpu_power = %d)",
6250 group->cpu_power); 6547 group->cpu_power);
6251 } 6548 }
@@ -6266,7 +6563,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6266 6563
6267static void sched_domain_debug(struct sched_domain *sd, int cpu) 6564static void sched_domain_debug(struct sched_domain *sd, int cpu)
6268{ 6565{
6269 cpumask_var_t groupmask;
6270 int level = 0; 6566 int level = 0;
6271 6567
6272 if (!sched_domain_debug_enabled) 6568 if (!sched_domain_debug_enabled)
@@ -6279,20 +6575,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6279 6575
6280 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6576 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6281 6577
6282 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6283 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6284 return;
6285 }
6286
6287 for (;;) { 6578 for (;;) {
6288 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6579 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6289 break; 6580 break;
6290 level++; 6581 level++;
6291 sd = sd->parent; 6582 sd = sd->parent;
6292 if (!sd) 6583 if (!sd)
6293 break; 6584 break;
6294 } 6585 }
6295 free_cpumask_var(groupmask);
6296} 6586}
6297#else /* !CONFIG_SCHED_DEBUG */ 6587#else /* !CONFIG_SCHED_DEBUG */
6298# define sched_domain_debug(sd, cpu) do { } while (0) 6588# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6349,12 +6639,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6349 return 1; 6639 return 1;
6350} 6640}
6351 6641
6352static void free_rootdomain(struct root_domain *rd) 6642static void free_rootdomain(struct rcu_head *rcu)
6353{ 6643{
6354 synchronize_sched(); 6644 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6355 6645
6356 cpupri_cleanup(&rd->cpupri); 6646 cpupri_cleanup(&rd->cpupri);
6357
6358 free_cpumask_var(rd->rto_mask); 6647 free_cpumask_var(rd->rto_mask);
6359 free_cpumask_var(rd->online); 6648 free_cpumask_var(rd->online);
6360 free_cpumask_var(rd->span); 6649 free_cpumask_var(rd->span);
@@ -6395,7 +6684,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6395 raw_spin_unlock_irqrestore(&rq->lock, flags); 6684 raw_spin_unlock_irqrestore(&rq->lock, flags);
6396 6685
6397 if (old_rd) 6686 if (old_rd)
6398 free_rootdomain(old_rd); 6687 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6399} 6688}
6400 6689
6401static int init_rootdomain(struct root_domain *rd) 6690static int init_rootdomain(struct root_domain *rd)
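Note: the change above defers freeing the old root domain to an RCU callback and recovers the object with container_of() from the embedded rcu head, so rq_attach_root() no longer has to wait for a grace period itself. A plain C sketch of that embedded-callback-node pattern; the pending list merely stands in for the RCU grace-period machinery:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct cb_head {
    struct cb_head *next;
    void (*func)(struct cb_head *);
};

struct root_domain {
    int refcount;
    struct cb_head rcu;                /* embedded callback node */
};

static struct cb_head *pending;

static void call_deferred(struct cb_head *head, void (*func)(struct cb_head *))
{
    head->func = func;                 /* cf. call_rcu_sched() */
    head->next = pending;
    pending = head;
}

static void run_deferred(void)         /* would run after a grace period */
{
    while (pending) {
        struct cb_head *head = pending;
        pending = head->next;
        head->func(head);
    }
}

static void free_rootdomain(struct cb_head *head)
{
    struct root_domain *rd = container_of(head, struct root_domain, rcu);
    printf("freeing root_domain with refcount %d\n", rd->refcount);
    free(rd);
}

int main(void)
{
    struct root_domain *old_rd = calloc(1, sizeof(*old_rd));
    call_deferred(&old_rd->rcu, free_rootdomain);
    run_deferred();
    return 0;
}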
@@ -6446,6 +6735,25 @@ static struct root_domain *alloc_rootdomain(void)
6446 return rd; 6735 return rd;
6447} 6736}
6448 6737
6738static void free_sched_domain(struct rcu_head *rcu)
6739{
6740 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6741 if (atomic_dec_and_test(&sd->groups->ref))
6742 kfree(sd->groups);
6743 kfree(sd);
6744}
6745
6746static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6747{
6748 call_rcu(&sd->rcu, free_sched_domain);
6749}
6750
6751static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6752{
6753 for (; sd; sd = sd->parent)
6754 destroy_sched_domain(sd, cpu);
6755}
6756
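Note: free_sched_domain() above drops a shared group's reference and frees the group only when the last domain using it goes away. A compact C11 sketch of that last-reference-frees rule; the struct layout is illustrative only:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct group { atomic_int ref; };
struct domain { struct group *groups; };

static void destroy_domain(struct domain *d)
{
    /* atomic_dec_and_test() equivalent: free once the count hits zero */
    if (atomic_fetch_sub(&d->groups->ref, 1) == 1) {
        printf("last user: freeing shared group\n");
        free(d->groups);
    }
    free(d);
}

int main(void)
{
    struct group *g = malloc(sizeof(*g));
    struct domain *d0 = malloc(sizeof(*d0));
    struct domain *d1 = malloc(sizeof(*d1));

    atomic_init(&g->ref, 2);
    d0->groups = d1->groups = g;

    destroy_domain(d0);     /* drops one reference, group survives */
    destroy_domain(d1);     /* last reference: group is freed too */
    return 0;
}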
6449/* 6757/*
6450 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6758 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6451 * hold the hotplug lock. 6759 * hold the hotplug lock.
@@ -6456,9 +6764,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6456 struct rq *rq = cpu_rq(cpu); 6764 struct rq *rq = cpu_rq(cpu);
6457 struct sched_domain *tmp; 6765 struct sched_domain *tmp;
6458 6766
6459 for (tmp = sd; tmp; tmp = tmp->parent)
6460 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6461
6462 /* Remove the sched domains which do not contribute to scheduling. */ 6767 /* Remove the sched domains which do not contribute to scheduling. */
6463 for (tmp = sd; tmp; ) { 6768 for (tmp = sd; tmp; ) {
6464 struct sched_domain *parent = tmp->parent; 6769 struct sched_domain *parent = tmp->parent;
@@ -6469,12 +6774,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6469 tmp->parent = parent->parent; 6774 tmp->parent = parent->parent;
6470 if (parent->parent) 6775 if (parent->parent)
6471 parent->parent->child = tmp; 6776 parent->parent->child = tmp;
6777 destroy_sched_domain(parent, cpu);
6472 } else 6778 } else
6473 tmp = tmp->parent; 6779 tmp = tmp->parent;
6474 } 6780 }
6475 6781
6476 if (sd && sd_degenerate(sd)) { 6782 if (sd && sd_degenerate(sd)) {
6783 tmp = sd;
6477 sd = sd->parent; 6784 sd = sd->parent;
6785 destroy_sched_domain(tmp, cpu);
6478 if (sd) 6786 if (sd)
6479 sd->child = NULL; 6787 sd->child = NULL;
6480 } 6788 }
@@ -6482,7 +6790,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6482 sched_domain_debug(sd, cpu); 6790 sched_domain_debug(sd, cpu);
6483 6791
6484 rq_attach_root(rq, rd); 6792 rq_attach_root(rq, rd);
6793 tmp = rq->sd;
6485 rcu_assign_pointer(rq->sd, sd); 6794 rcu_assign_pointer(rq->sd, sd);
6795 destroy_sched_domains(tmp, cpu);
6486} 6796}
6487 6797
6488/* cpus with isolated domains */ 6798/* cpus with isolated domains */
@@ -6498,56 +6808,6 @@ static int __init isolated_cpu_setup(char *str)
6498 6808
6499__setup("isolcpus=", isolated_cpu_setup); 6809__setup("isolcpus=", isolated_cpu_setup);
6500 6810
6501/*
6502 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6503 * to a function which identifies what group(along with sched group) a CPU
6504 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6505 * (due to the fact that we keep track of groups covered with a struct cpumask).
6506 *
6507 * init_sched_build_groups will build a circular linked list of the groups
6508 * covered by the given span, and will set each group's ->cpumask correctly,
6509 * and ->cpu_power to 0.
6510 */
6511static void
6512init_sched_build_groups(const struct cpumask *span,
6513 const struct cpumask *cpu_map,
6514 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6515 struct sched_group **sg,
6516 struct cpumask *tmpmask),
6517 struct cpumask *covered, struct cpumask *tmpmask)
6518{
6519 struct sched_group *first = NULL, *last = NULL;
6520 int i;
6521
6522 cpumask_clear(covered);
6523
6524 for_each_cpu(i, span) {
6525 struct sched_group *sg;
6526 int group = group_fn(i, cpu_map, &sg, tmpmask);
6527 int j;
6528
6529 if (cpumask_test_cpu(i, covered))
6530 continue;
6531
6532 cpumask_clear(sched_group_cpus(sg));
6533 sg->cpu_power = 0;
6534
6535 for_each_cpu(j, span) {
6536 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6537 continue;
6538
6539 cpumask_set_cpu(j, covered);
6540 cpumask_set_cpu(j, sched_group_cpus(sg));
6541 }
6542 if (!first)
6543 first = sg;
6544 if (last)
6545 last->next = sg;
6546 last = sg;
6547 }
6548 last->next = first;
6549}
6550
6551#define SD_NODES_PER_DOMAIN 16 6811#define SD_NODES_PER_DOMAIN 16
6552 6812
6553#ifdef CONFIG_NUMA 6813#ifdef CONFIG_NUMA
@@ -6564,7 +6824,7 @@ init_sched_build_groups(const struct cpumask *span,
6564 */ 6824 */
6565static int find_next_best_node(int node, nodemask_t *used_nodes) 6825static int find_next_best_node(int node, nodemask_t *used_nodes)
6566{ 6826{
6567 int i, n, val, min_val, best_node = 0; 6827 int i, n, val, min_val, best_node = -1;
6568 6828
6569 min_val = INT_MAX; 6829 min_val = INT_MAX;
6570 6830
@@ -6588,7 +6848,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6588 } 6848 }
6589 } 6849 }
6590 6850
6591 node_set(best_node, *used_nodes); 6851 if (best_node != -1)
6852 node_set(best_node, *used_nodes);
6592 return best_node; 6853 return best_node;
6593} 6854}
6594 6855
@@ -6614,315 +6875,130 @@ static void sched_domain_node_span(int node, struct cpumask *span)
6614 6875
6615 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6876 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6616 int next_node = find_next_best_node(node, &used_nodes); 6877 int next_node = find_next_best_node(node, &used_nodes);
6617 6878 if (next_node < 0)
6879 break;
6618 cpumask_or(span, span, cpumask_of_node(next_node)); 6880 cpumask_or(span, span, cpumask_of_node(next_node));
6619 } 6881 }
6620} 6882}
6883
6884static const struct cpumask *cpu_node_mask(int cpu)
6885{
6886 lockdep_assert_held(&sched_domains_mutex);
6887
6888 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
6889
6890 return sched_domains_tmpmask;
6891}
6892
6893static const struct cpumask *cpu_allnodes_mask(int cpu)
6894{
6895 return cpu_possible_mask;
6896}
6621#endif /* CONFIG_NUMA */ 6897#endif /* CONFIG_NUMA */
6622 6898
6623int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6899static const struct cpumask *cpu_cpu_mask(int cpu)
6900{
6901 return cpumask_of_node(cpu_to_node(cpu));
6902}
6624 6903
6625/* 6904int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6626 * The cpus mask in sched_group and sched_domain hangs off the end.
6627 *
6628 * ( See the the comments in include/linux/sched.h:struct sched_group
6629 * and struct sched_domain. )
6630 */
6631struct static_sched_group {
6632 struct sched_group sg;
6633 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6634};
6635 6905
6636struct static_sched_domain { 6906struct sd_data {
6637 struct sched_domain sd; 6907 struct sched_domain **__percpu sd;
6638 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 6908 struct sched_group **__percpu sg;
6639}; 6909};
6640 6910
6641struct s_data { 6911struct s_data {
6642#ifdef CONFIG_NUMA 6912 struct sched_domain ** __percpu sd;
6643 int sd_allnodes;
6644 cpumask_var_t domainspan;
6645 cpumask_var_t covered;
6646 cpumask_var_t notcovered;
6647#endif
6648 cpumask_var_t nodemask;
6649 cpumask_var_t this_sibling_map;
6650 cpumask_var_t this_core_map;
6651 cpumask_var_t this_book_map;
6652 cpumask_var_t send_covered;
6653 cpumask_var_t tmpmask;
6654 struct sched_group **sched_group_nodes;
6655 struct root_domain *rd; 6913 struct root_domain *rd;
6656}; 6914};
6657 6915
6658enum s_alloc { 6916enum s_alloc {
6659 sa_sched_groups = 0,
6660 sa_rootdomain, 6917 sa_rootdomain,
6661 sa_tmpmask, 6918 sa_sd,
6662 sa_send_covered, 6919 sa_sd_storage,
6663 sa_this_book_map,
6664 sa_this_core_map,
6665 sa_this_sibling_map,
6666 sa_nodemask,
6667 sa_sched_group_nodes,
6668#ifdef CONFIG_NUMA
6669 sa_notcovered,
6670 sa_covered,
6671 sa_domainspan,
6672#endif
6673 sa_none, 6920 sa_none,
6674}; 6921};
6675 6922
6676/* 6923struct sched_domain_topology_level;
6677 * SMT sched-domains:
6678 */
6679#ifdef CONFIG_SCHED_SMT
6680static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6681static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6682 6924
6683static int 6925typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6684cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6926typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6685 struct sched_group **sg, struct cpumask *unused)
6686{
6687 if (sg)
6688 *sg = &per_cpu(sched_groups, cpu).sg;
6689 return cpu;
6690}
6691#endif /* CONFIG_SCHED_SMT */
6692 6927
6693/* 6928struct sched_domain_topology_level {
6694 * multi-core sched-domains: 6929 sched_domain_init_f init;
6695 */ 6930 sched_domain_mask_f mask;
6696#ifdef CONFIG_SCHED_MC 6931 struct sd_data data;
6697static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6932};
6698static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6699
6700static int
6701cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6702 struct sched_group **sg, struct cpumask *mask)
6703{
6704 int group;
6705#ifdef CONFIG_SCHED_SMT
6706 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6707 group = cpumask_first(mask);
6708#else
6709 group = cpu;
6710#endif
6711 if (sg)
6712 *sg = &per_cpu(sched_group_core, group).sg;
6713 return group;
6714}
6715#endif /* CONFIG_SCHED_MC */
6716 6933
6717/* 6934/*
6718 * book sched-domains: 6935 * Assumes the sched_domain tree is fully constructed
6719 */ 6936 */
6720#ifdef CONFIG_SCHED_BOOK 6937static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6721static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6722static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6723
6724static int
6725cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6726 struct sched_group **sg, struct cpumask *mask)
6727{ 6938{
6728 int group = cpu; 6939 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6729#ifdef CONFIG_SCHED_MC 6940 struct sched_domain *child = sd->child;
6730 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6731 group = cpumask_first(mask);
6732#elif defined(CONFIG_SCHED_SMT)
6733 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6734 group = cpumask_first(mask);
6735#endif
6736 if (sg)
6737 *sg = &per_cpu(sched_group_book, group).sg;
6738 return group;
6739}
6740#endif /* CONFIG_SCHED_BOOK */
6741 6941
6742static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6942 if (child)
6743static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6943 cpu = cpumask_first(sched_domain_span(child));
6744 6944
6745static int
6746cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6747 struct sched_group **sg, struct cpumask *mask)
6748{
6749 int group;
6750#ifdef CONFIG_SCHED_BOOK
6751 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6752 group = cpumask_first(mask);
6753#elif defined(CONFIG_SCHED_MC)
6754 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6755 group = cpumask_first(mask);
6756#elif defined(CONFIG_SCHED_SMT)
6757 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6758 group = cpumask_first(mask);
6759#else
6760 group = cpu;
6761#endif
6762 if (sg) 6945 if (sg)
6763 *sg = &per_cpu(sched_group_phys, group).sg; 6946 *sg = *per_cpu_ptr(sdd->sg, cpu);
6764 return group; 6947
6948 return cpu;
6765} 6949}
6766 6950
6767#ifdef CONFIG_NUMA
6768/* 6951/*
6769 * The init_sched_build_groups can't handle what we want to do with node 6952 * build_sched_groups takes the cpumask we wish to span, and a pointer
6770 * groups, so roll our own. Now each node has its own list of groups which 6953 * to a function which identifies what group (along with sched group) a CPU
6771 * gets dynamically allocated. 6954 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
6955 * (due to the fact that we keep track of groups covered with a struct cpumask).
6956 *
6957 * build_sched_groups will build a circular linked list of the groups
6958 * covered by the given span, and will set each group's ->cpumask correctly,
6959 * and ->cpu_power to 0.
6772 */ 6960 */
6773static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 6961static void
6774static struct sched_group ***sched_group_nodes_bycpu; 6962build_sched_groups(struct sched_domain *sd)
6775
6776static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
6777static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
6778
6779static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
6780 struct sched_group **sg,
6781 struct cpumask *nodemask)
6782{
6783 int group;
6784
6785 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
6786 group = cpumask_first(nodemask);
6787
6788 if (sg)
6789 *sg = &per_cpu(sched_group_allnodes, group).sg;
6790 return group;
6791}
6792
6793static void init_numa_sched_groups_power(struct sched_group *group_head)
6794{
6795 struct sched_group *sg = group_head;
6796 int j;
6797
6798 if (!sg)
6799 return;
6800 do {
6801 for_each_cpu(j, sched_group_cpus(sg)) {
6802 struct sched_domain *sd;
6803
6804 sd = &per_cpu(phys_domains, j).sd;
6805 if (j != group_first_cpu(sd->groups)) {
6806 /*
6807 * Only add "power" once for each
6808 * physical package.
6809 */
6810 continue;
6811 }
6812
6813 sg->cpu_power += sd->groups->cpu_power;
6814 }
6815 sg = sg->next;
6816 } while (sg != group_head);
6817}
6818
6819static int build_numa_sched_groups(struct s_data *d,
6820 const struct cpumask *cpu_map, int num)
6821{ 6963{
6822 struct sched_domain *sd; 6964 struct sched_group *first = NULL, *last = NULL;
6823 struct sched_group *sg, *prev; 6965 struct sd_data *sdd = sd->private;
6824 int n, j; 6966 const struct cpumask *span = sched_domain_span(sd);
6825 6967 struct cpumask *covered;
6826 cpumask_clear(d->covered); 6968 int i;
6827 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
6828 if (cpumask_empty(d->nodemask)) {
6829 d->sched_group_nodes[num] = NULL;
6830 goto out;
6831 }
6832
6833 sched_domain_node_span(num, d->domainspan);
6834 cpumask_and(d->domainspan, d->domainspan, cpu_map);
6835
6836 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6837 GFP_KERNEL, num);
6838 if (!sg) {
6839 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
6840 num);
6841 return -ENOMEM;
6842 }
6843 d->sched_group_nodes[num] = sg;
6844
6845 for_each_cpu(j, d->nodemask) {
6846 sd = &per_cpu(node_domains, j).sd;
6847 sd->groups = sg;
6848 }
6849
6850 sg->cpu_power = 0;
6851 cpumask_copy(sched_group_cpus(sg), d->nodemask);
6852 sg->next = sg;
6853 cpumask_or(d->covered, d->covered, d->nodemask);
6854 6969
6855 prev = sg; 6970 lockdep_assert_held(&sched_domains_mutex);
6856 for (j = 0; j < nr_node_ids; j++) { 6971 covered = sched_domains_tmpmask;
6857 n = (num + j) % nr_node_ids;
6858 cpumask_complement(d->notcovered, d->covered);
6859 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
6860 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
6861 if (cpumask_empty(d->tmpmask))
6862 break;
6863 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
6864 if (cpumask_empty(d->tmpmask))
6865 continue;
6866 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6867 GFP_KERNEL, num);
6868 if (!sg) {
6869 printk(KERN_WARNING
6870 "Can not alloc domain group for node %d\n", j);
6871 return -ENOMEM;
6872 }
6873 sg->cpu_power = 0;
6874 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
6875 sg->next = prev->next;
6876 cpumask_or(d->covered, d->covered, d->tmpmask);
6877 prev->next = sg;
6878 prev = sg;
6879 }
6880out:
6881 return 0;
6882}
6883#endif /* CONFIG_NUMA */
6884 6972
6885#ifdef CONFIG_NUMA 6973 cpumask_clear(covered);
6886/* Free memory allocated for various sched_group structures */
6887static void free_sched_groups(const struct cpumask *cpu_map,
6888 struct cpumask *nodemask)
6889{
6890 int cpu, i;
6891 6974
6892 for_each_cpu(cpu, cpu_map) { 6975 for_each_cpu(i, span) {
6893 struct sched_group **sched_group_nodes 6976 struct sched_group *sg;
6894 = sched_group_nodes_bycpu[cpu]; 6977 int group = get_group(i, sdd, &sg);
6978 int j;
6895 6979
6896 if (!sched_group_nodes) 6980 if (cpumask_test_cpu(i, covered))
6897 continue; 6981 continue;
6898 6982
6899 for (i = 0; i < nr_node_ids; i++) { 6983 cpumask_clear(sched_group_cpus(sg));
6900 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6984 sg->cpu_power = 0;
6901 6985
6902 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 6986 for_each_cpu(j, span) {
6903 if (cpumask_empty(nodemask)) 6987 if (get_group(j, sdd, NULL) != group)
6904 continue; 6988 continue;
6905 6989
6906 if (sg == NULL) 6990 cpumask_set_cpu(j, covered);
6907 continue; 6991 cpumask_set_cpu(j, sched_group_cpus(sg));
6908 sg = sg->next;
6909next_sg:
6910 oldsg = sg;
6911 sg = sg->next;
6912 kfree(oldsg);
6913 if (oldsg != sched_group_nodes[i])
6914 goto next_sg;
6915 } 6992 }
6916 kfree(sched_group_nodes); 6993
6917 sched_group_nodes_bycpu[cpu] = NULL; 6994 if (!first)
6995 first = sg;
6996 if (last)
6997 last->next = sg;
6998 last = sg;
6918 } 6999 }
7000 last->next = first;
6919} 7001}
6920#else /* !CONFIG_NUMA */
6921static void free_sched_groups(const struct cpumask *cpu_map,
6922 struct cpumask *nodemask)
6923{
6924}
6925#endif /* CONFIG_NUMA */
6926 7002
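Note: the new build_sched_groups() above walks the domain span, creates one group per equivalence class (the first CPU of the child domain), fills each group's mask, and closes the groups into a ring. A self-contained sketch of the same construction over plain integers, where get_group() is an illustrative key function, not the kernel helper:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define NCPU 6

struct group {
    bool cpus[NCPU];
    struct group *next;
};

static int get_group(int cpu) { return cpu / 2; }   /* two "cpus" per group */

static struct group *build_groups(void)
{
    struct group *first = NULL, *last = NULL;
    bool covered[NCPU] = { false };

    for (int i = 0; i < NCPU; i++) {
        if (covered[i])
            continue;

        struct group *sg = calloc(1, sizeof(*sg));
        int key = get_group(i);

        for (int j = 0; j < NCPU; j++) {
            if (get_group(j) != key)
                continue;
            covered[j] = true;          /* each cpu lands in exactly one group */
            sg->cpus[j] = true;
        }
        if (!first)
            first = sg;
        if (last)
            last->next = sg;
        last = sg;
    }
    last->next = first;                 /* close the ring */
    return first;
}

int main(void)
{
    struct group *g = build_groups(), *start = g;
    do {
        for (int i = 0; i < NCPU; i++)
            if (g->cpus[i])
                printf("%d ", i);
        printf("| ");
        g = g->next;
    } while (g != start);
    printf("\n");
    return 0;
}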
6927/* 7003/*
6928 * Initialize sched groups cpu_power. 7004 * Initialize sched groups cpu_power.
@@ -6936,11 +7012,6 @@ static void free_sched_groups(const struct cpumask *cpu_map,
6936 */ 7012 */
6937static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7013static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6938{ 7014{
6939 struct sched_domain *child;
6940 struct sched_group *group;
6941 long power;
6942 int weight;
6943
6944 WARN_ON(!sd || !sd->groups); 7015 WARN_ON(!sd || !sd->groups);
6945 7016
6946 if (cpu != group_first_cpu(sd->groups)) 7017 if (cpu != group_first_cpu(sd->groups))
@@ -6948,36 +7019,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6948 7019
6949 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7020 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
6950 7021
6951 child = sd->child; 7022 update_group_power(sd, cpu);
6952
6953 sd->groups->cpu_power = 0;
6954
6955 if (!child) {
6956 power = SCHED_LOAD_SCALE;
6957 weight = cpumask_weight(sched_domain_span(sd));
6958 /*
6959 * SMT siblings share the power of a single core.
6960 * Usually multiple threads get a better yield out of
6961 * that one core than a single thread would have,
6962 * reflect that in sd->smt_gain.
6963 */
6964 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
6965 power *= sd->smt_gain;
6966 power /= weight;
6967 power >>= SCHED_LOAD_SHIFT;
6968 }
6969 sd->groups->cpu_power += power;
6970 return;
6971 }
6972
6973 /*
6974 * Add cpu_power of each child group to this groups cpu_power.
6975 */
6976 group = child->groups;
6977 do {
6978 sd->groups->cpu_power += group->cpu_power;
6979 group = group->next;
6980 } while (group != child->groups);
6981} 7023}
6982 7024
6983/* 7025/*
@@ -6991,15 +7033,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6991# define SD_INIT_NAME(sd, type) do { } while (0) 7033# define SD_INIT_NAME(sd, type) do { } while (0)
6992#endif 7034#endif
6993 7035
6994#define SD_INIT(sd, type) sd_init_##type(sd) 7036#define SD_INIT_FUNC(type) \
6995 7037static noinline struct sched_domain * \
6996#define SD_INIT_FUNC(type) \ 7038sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6997static noinline void sd_init_##type(struct sched_domain *sd) \ 7039{ \
6998{ \ 7040 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
6999 memset(sd, 0, sizeof(*sd)); \ 7041 *sd = SD_##type##_INIT; \
7000 *sd = SD_##type##_INIT; \ 7042 SD_INIT_NAME(sd, type); \
7001 sd->level = SD_LV_##type; \ 7043 sd->private = &tl->data; \
7002 SD_INIT_NAME(sd, type); \ 7044 return sd; \
7003} 7045}
7004 7046
7005SD_INIT_FUNC(CPU) 7047SD_INIT_FUNC(CPU)
@@ -7018,13 +7060,14 @@ SD_INIT_FUNC(CPU)
7018#endif 7060#endif
7019 7061
7020static int default_relax_domain_level = -1; 7062static int default_relax_domain_level = -1;
7063int sched_domain_level_max;
7021 7064
7022static int __init setup_relax_domain_level(char *str) 7065static int __init setup_relax_domain_level(char *str)
7023{ 7066{
7024 unsigned long val; 7067 unsigned long val;
7025 7068
7026 val = simple_strtoul(str, NULL, 0); 7069 val = simple_strtoul(str, NULL, 0);
7027 if (val < SD_LV_MAX) 7070 if (val < sched_domain_level_max)
7028 default_relax_domain_level = val; 7071 default_relax_domain_level = val;
7029 7072
7030 return 1; 7073 return 1;
@@ -7052,37 +7095,20 @@ static void set_domain_attribute(struct sched_domain *sd,
7052 } 7095 }
7053} 7096}
7054 7097
7098static void __sdt_free(const struct cpumask *cpu_map);
7099static int __sdt_alloc(const struct cpumask *cpu_map);
7100
7055static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7101static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7056 const struct cpumask *cpu_map) 7102 const struct cpumask *cpu_map)
7057{ 7103{
7058 switch (what) { 7104 switch (what) {
7059 case sa_sched_groups:
7060 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
7061 d->sched_group_nodes = NULL;
7062 case sa_rootdomain: 7105 case sa_rootdomain:
7063 free_rootdomain(d->rd); /* fall through */ 7106 if (!atomic_read(&d->rd->refcount))
7064 case sa_tmpmask: 7107 free_rootdomain(&d->rd->rcu); /* fall through */
7065 free_cpumask_var(d->tmpmask); /* fall through */ 7108 case sa_sd:
7066 case sa_send_covered: 7109 free_percpu(d->sd); /* fall through */
7067 free_cpumask_var(d->send_covered); /* fall through */ 7110 case sa_sd_storage:
7068 case sa_this_book_map: 7111 __sdt_free(cpu_map); /* fall through */
7069 free_cpumask_var(d->this_book_map); /* fall through */
7070 case sa_this_core_map:
7071 free_cpumask_var(d->this_core_map); /* fall through */
7072 case sa_this_sibling_map:
7073 free_cpumask_var(d->this_sibling_map); /* fall through */
7074 case sa_nodemask:
7075 free_cpumask_var(d->nodemask); /* fall through */
7076 case sa_sched_group_nodes:
7077#ifdef CONFIG_NUMA
7078 kfree(d->sched_group_nodes); /* fall through */
7079 case sa_notcovered:
7080 free_cpumask_var(d->notcovered); /* fall through */
7081 case sa_covered:
7082 free_cpumask_var(d->covered); /* fall through */
7083 case sa_domainspan:
7084 free_cpumask_var(d->domainspan); /* fall through */
7085#endif
7086 case sa_none: 7112 case sa_none:
7087 break; 7113 break;
7088 } 7114 }
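Note: __free_domain_allocs() above keeps the enum-plus-fallthrough unwind: the allocator reports how far it got, and the switch falls through so each state releases everything acquired before it. A userspace sketch of the pattern with invented field names:

#include <stdio.h>
#include <stdlib.h>

enum s_alloc { sa_rootdomain, sa_sd, sa_sd_storage, sa_none };

struct s_data { void *rd, *sd, *storage; };

static void free_domain_allocs(struct s_data *d, enum s_alloc what)
{
    switch (what) {
    case sa_rootdomain:
        free(d->rd);            /* fall through */
    case sa_sd:
        free(d->sd);            /* fall through */
    case sa_sd_storage:
        free(d->storage);       /* fall through */
    case sa_none:
        break;
    }
}

static enum s_alloc alloc_domain_allocs(struct s_data *d)
{
    d->storage = malloc(16);
    if (!d->storage)
        return sa_none;         /* nothing to undo */
    d->sd = malloc(16);
    if (!d->sd)
        return sa_sd_storage;   /* undo storage only */
    d->rd = malloc(16);
    if (!d->rd)
        return sa_sd;           /* undo sd and storage */
    return sa_rootdomain;       /* fully allocated */
}

int main(void)
{
    struct s_data d = { 0 };
    enum s_alloc state = alloc_domain_allocs(&d);

    printf("reached state %d\n", state);
    free_domain_allocs(&d, state);      /* caller unwinds from that state */
    return 0;
}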
@@ -7091,308 +7117,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7091static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7117static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7092 const struct cpumask *cpu_map) 7118 const struct cpumask *cpu_map)
7093{ 7119{
7094#ifdef CONFIG_NUMA 7120 memset(d, 0, sizeof(*d));
7095 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7121
7096 return sa_none; 7122 if (__sdt_alloc(cpu_map))
7097 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7123 return sa_sd_storage;
7098 return sa_domainspan; 7124 d->sd = alloc_percpu(struct sched_domain *);
7099 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7125 if (!d->sd)
7100 return sa_covered; 7126 return sa_sd_storage;
7101 /* Allocate the per-node list of sched groups */
7102 d->sched_group_nodes = kcalloc(nr_node_ids,
7103 sizeof(struct sched_group *), GFP_KERNEL);
7104 if (!d->sched_group_nodes) {
7105 printk(KERN_WARNING "Can not alloc sched group node list\n");
7106 return sa_notcovered;
7107 }
7108 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7109#endif
7110 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7111 return sa_sched_group_nodes;
7112 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7113 return sa_nodemask;
7114 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7115 return sa_this_sibling_map;
7116 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
7117 return sa_this_core_map;
7118 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7119 return sa_this_book_map;
7120 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7121 return sa_send_covered;
7122 d->rd = alloc_rootdomain(); 7127 d->rd = alloc_rootdomain();
7123 if (!d->rd) { 7128 if (!d->rd)
7124 printk(KERN_WARNING "Cannot alloc root domain\n"); 7129 return sa_sd;
7125 return sa_tmpmask;
7126 }
7127 return sa_rootdomain; 7130 return sa_rootdomain;
7128} 7131}
7129 7132
7130static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7133/*
7131 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7134 * NULL the sd_data elements we've used to build the sched_domain and
7135 * sched_group structure so that the subsequent __free_domain_allocs()
7136 * will not free the data we're using.
7137 */
7138static void claim_allocations(int cpu, struct sched_domain *sd)
7132{ 7139{
7133 struct sched_domain *sd = NULL; 7140 struct sd_data *sdd = sd->private;
7134#ifdef CONFIG_NUMA 7141 struct sched_group *sg = sd->groups;
7135 struct sched_domain *parent;
7136
7137 d->sd_allnodes = 0;
7138 if (cpumask_weight(cpu_map) >
7139 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7140 sd = &per_cpu(allnodes_domains, i).sd;
7141 SD_INIT(sd, ALLNODES);
7142 set_domain_attribute(sd, attr);
7143 cpumask_copy(sched_domain_span(sd), cpu_map);
7144 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7145 d->sd_allnodes = 1;
7146 }
7147 parent = sd;
7148
7149 sd = &per_cpu(node_domains, i).sd;
7150 SD_INIT(sd, NODE);
7151 set_domain_attribute(sd, attr);
7152 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7153 sd->parent = parent;
7154 if (parent)
7155 parent->child = sd;
7156 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7157#endif
7158 return sd;
7159}
7160 7142
7161static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7143 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7162 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7144 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7163 struct sched_domain *parent, int i)
7164{
7165 struct sched_domain *sd;
7166 sd = &per_cpu(phys_domains, i).sd;
7167 SD_INIT(sd, CPU);
7168 set_domain_attribute(sd, attr);
7169 cpumask_copy(sched_domain_span(sd), d->nodemask);
7170 sd->parent = parent;
7171 if (parent)
7172 parent->child = sd;
7173 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7174 return sd;
7175}
7176
7177static struct sched_domain *__build_book_sched_domain(struct s_data *d,
7178	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7179	struct sched_domain *parent, int i)
7180{
7181	struct sched_domain *sd = parent;
7182#ifdef CONFIG_SCHED_BOOK
7183	sd = &per_cpu(book_domains, i).sd;
7184	SD_INIT(sd, BOOK);
7185	set_domain_attribute(sd, attr);
7186	cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7187	sd->parent = parent;
7188	parent->child = sd;
7189	cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7190#endif
7191	return sd;
7192}
7193
7194static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
7195	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7196	struct sched_domain *parent, int i)
7197{
7198	struct sched_domain *sd = parent;
7199#ifdef CONFIG_SCHED_MC
7200	sd = &per_cpu(core_domains, i).sd;
7201	SD_INIT(sd, MC);
7202	set_domain_attribute(sd, attr);
7203	cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7204	sd->parent = parent;
7205	parent->child = sd;
7206	cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7207#endif
7208	return sd;
7209}
7210
7211static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7212	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7213	struct sched_domain *parent, int i)
7214{
7215	struct sched_domain *sd = parent;
7216#ifdef CONFIG_SCHED_SMT
7217	sd = &per_cpu(cpu_domains, i).sd;
7218	SD_INIT(sd, SIBLING);
7219	set_domain_attribute(sd, attr);
7220	cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7221	sd->parent = parent;
7222	parent->child = sd;
7223	cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7224#endif
7225	return sd;
7226}
7227
7228static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
7229	const struct cpumask *cpu_map, int cpu)
7230{
7231	switch (l) {
7232#ifdef CONFIG_SCHED_SMT
7233	case SD_LV_SIBLING: /* set up CPU (sibling) groups */
7234		cpumask_and(d->this_sibling_map, cpu_map,
7235			    topology_thread_cpumask(cpu));
7236		if (cpu == cpumask_first(d->this_sibling_map))
7237			init_sched_build_groups(d->this_sibling_map, cpu_map,
7238						&cpu_to_cpu_group,
7239						d->send_covered, d->tmpmask);
7240		break;
7241#endif
7242#ifdef CONFIG_SCHED_MC
7243	case SD_LV_MC: /* set up multi-core groups */
7244		cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7245		if (cpu == cpumask_first(d->this_core_map))
7246			init_sched_build_groups(d->this_core_map, cpu_map,
7247						&cpu_to_core_group,
7248						d->send_covered, d->tmpmask);
7249		break;
7250#endif
7251#ifdef CONFIG_SCHED_BOOK
7252	case SD_LV_BOOK: /* set up book groups */
7253		cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7254		if (cpu == cpumask_first(d->this_book_map))
7255			init_sched_build_groups(d->this_book_map, cpu_map,
7256						&cpu_to_book_group,
7257						d->send_covered, d->tmpmask);
7258		break;
7259#endif
7260	case SD_LV_CPU: /* set up physical groups */
7261		cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7262		if (!cpumask_empty(d->nodemask))
7263			init_sched_build_groups(d->nodemask, cpu_map,
7264						&cpu_to_phys_group,
7265						d->send_covered, d->tmpmask);
7266		break;
7267#ifdef CONFIG_NUMA
7268	case SD_LV_ALLNODES:
7269		init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
7270					d->send_covered, d->tmpmask);
7271		break;
7272#endif
7273	default:
7274		break;
7275	}
7276}
7145
7146	if (cpu == cpumask_first(sched_group_cpus(sg))) {
7147		WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
7148		*per_cpu_ptr(sdd->sg, cpu) = NULL;
7149	}
7150}
7151
7152#ifdef CONFIG_SCHED_SMT
7153static const struct cpumask *cpu_smt_mask(int cpu)
7154{
7155	return topology_thread_cpumask(cpu);
7156}
7157#endif
7158
7159/*
7160 * Topology list, bottom-up.
7161 */
7162static struct sched_domain_topology_level default_topology[] = {
7163#ifdef CONFIG_SCHED_SMT
7164	{ sd_init_SIBLING, cpu_smt_mask, },
7165#endif
7166#ifdef CONFIG_SCHED_MC
7167	{ sd_init_MC, cpu_coregroup_mask, },
7168#endif
7169#ifdef CONFIG_SCHED_BOOK
7170	{ sd_init_BOOK, cpu_book_mask, },
7171#endif
7172	{ sd_init_CPU, cpu_cpu_mask, },
7173#ifdef CONFIG_NUMA
7174	{ sd_init_NODE, cpu_node_mask, },
7175	{ sd_init_ALLNODES, cpu_allnodes_mask, },
7176#endif
7177	{ NULL, },
7178};
7179
7180static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7181
7182static int __sdt_alloc(const struct cpumask *cpu_map)
7183{
7184 struct sched_domain_topology_level *tl;
7185 int j;
7186
7187 for (tl = sched_domain_topology; tl->init; tl++) {
7188 struct sd_data *sdd = &tl->data;
7189
7190 sdd->sd = alloc_percpu(struct sched_domain *);
7191 if (!sdd->sd)
7192 return -ENOMEM;
7193
7194 sdd->sg = alloc_percpu(struct sched_group *);
7195 if (!sdd->sg)
7196 return -ENOMEM;
7197
7198 for_each_cpu(j, cpu_map) {
7199 struct sched_domain *sd;
7200 struct sched_group *sg;
7201
7202 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7203 GFP_KERNEL, cpu_to_node(j));
7204 if (!sd)
7205 return -ENOMEM;
7206
7207 *per_cpu_ptr(sdd->sd, j) = sd;
7208
7209 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7210 GFP_KERNEL, cpu_to_node(j));
7211 if (!sg)
7212 return -ENOMEM;
7213
7214 *per_cpu_ptr(sdd->sg, j) = sg;
7215 }
7216 }
7217
7218 return 0;
7219}
7220
7221static void __sdt_free(const struct cpumask *cpu_map)
7222{
7223 struct sched_domain_topology_level *tl;
7224 int j;
7225
7226 for (tl = sched_domain_topology; tl->init; tl++) {
7227 struct sd_data *sdd = &tl->data;
7228
7229 for_each_cpu(j, cpu_map) {
7230 kfree(*per_cpu_ptr(sdd->sd, j));
7231 kfree(*per_cpu_ptr(sdd->sg, j));
7232 }
7233 free_percpu(sdd->sd);
7234 free_percpu(sdd->sg);
7235	}
7236}
7237
7238struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7239 struct s_data *d, const struct cpumask *cpu_map,
7240 struct sched_domain_attr *attr, struct sched_domain *child,
7241 int cpu)
7242{
7243 struct sched_domain *sd = tl->init(tl, cpu);
7244 if (!sd)
7245 return child;
7246
7247 set_domain_attribute(sd, attr);
7248 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7249 if (child) {
7250 sd->level = child->level + 1;
7251 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7252 child->parent = sd;
7253 }
7254 sd->child = child;
7255
7256 return sd;
7257}
7258
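The added code above replaces one hand-rolled __build_*_sched_domain() helper per level with a generic walk over sched_domain_topology[]: each table entry supplies an init callback and a cpumask callback, and build_sched_domain() stacks the resulting domain on top of the level built before it. A minimal user-space sketch of that pattern, with simplified stand-in types (topology_level, build_domain() and the enabled flag are illustrative, not the kernel's):

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-ins for struct sched_domain and the per-level callbacks. */
struct domain {
	const char *name;
	struct domain *parent, *child;
};

struct topology_level {
	const char *name;	/* which level this is (SMT, MC, ...) */
	int enabled;		/* stands in for the CONFIG_* / mask checks */
};

/* Bottom-up, like default_topology[]: the first entry is the lowest level. */
static struct topology_level topology[] = {
	{ "SMT",  1 },
	{ "MC",   1 },
	{ "CPU",  1 },
	{ "NODE", 0 },		/* a level that does not apply on this machine */
	{ NULL,   0 },
};

/* Mirrors build_sched_domain(): allocate one domain for this level and chain
 * it on top of the previously built (child) level. */
static struct domain *build_domain(struct topology_level *tl, struct domain *child)
{
	struct domain *d;

	if (!tl->enabled)
		return child;	/* skip levels that do not exist here */

	d = calloc(1, sizeof(*d));
	d->name = tl->name;
	d->child = child;
	if (child)
		child->parent = d;
	return d;
}

int main(void)
{
	struct topology_level *tl;
	struct domain *top = NULL, *d;

	for (tl = topology; tl->name; tl++)
		top = build_domain(tl, top);

	/* Walk back down to the lowest level, as build_sched_domains() does. */
	for (d = top; d->child; d = d->child)
		;
	for (; d; d = d->parent)
		printf("%s%s", d->name, d->parent ? " -> " : "\n");
	return 0;
}

With this shape, adding a new domain level becomes a one-line table entry plus its sd_init_*() initializer, rather than another copy of the parent/child plumbing and of build_sched_groups().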
7278/*
7279 * Build sched domains for a given set of cpus and attach the sched domains
7280 * to the individual cpus
7281 */
7282static int __build_sched_domains(const struct cpumask *cpu_map,
7283				 struct sched_domain_attr *attr)
7284{
7285	enum s_alloc alloc_state = sa_none;
7286	struct s_data d;
7287	struct sched_domain *sd;
7288	int i;
7289#ifdef CONFIG_NUMA
7290	d.sd_allnodes = 0;
7291#endif
7292
7293	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7294	if (alloc_state != sa_rootdomain)
7295		goto error;
7296	alloc_state = sa_sched_groups;
7297
7298	/*
7299	 * Set up domains for cpus specified by the cpu_map.
7300	 */
7301	for_each_cpu(i, cpu_map) {
7302		cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
7303			    cpu_map);
7304
7305		sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7306		sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7307		sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
7308		sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7309		sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7310	}
7311
7312	for_each_cpu(i, cpu_map) {
7313		build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7314		build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7315		build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7316	}
7317
7318	/* Set up physical groups */
7319	for (i = 0; i < nr_node_ids; i++)
7320		build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
7321
7322#ifdef CONFIG_NUMA
7323	/* Set up node groups */
7324	if (d.sd_allnodes)
7325		build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7326
7327	for (i = 0; i < nr_node_ids; i++)
7328		if (build_numa_sched_groups(&d, cpu_map, i))
7329			goto error;
7330#endif
7331
7332	/* Calculate CPU power for physical packages and nodes */
7333#ifdef CONFIG_SCHED_SMT
7334	for_each_cpu(i, cpu_map) {
7335		sd = &per_cpu(cpu_domains, i).sd;
7336		init_sched_groups_power(i, sd);
7337	}
7338#endif
7339#ifdef CONFIG_SCHED_MC
7340	for_each_cpu(i, cpu_map) {
7341		sd = &per_cpu(core_domains, i).sd;
7342		init_sched_groups_power(i, sd);
7343	}
7344#endif
7345#ifdef CONFIG_SCHED_BOOK
7346	for_each_cpu(i, cpu_map) {
7347		sd = &per_cpu(book_domains, i).sd;
7348		init_sched_groups_power(i, sd);
7349	}
7350#endif
7351
7352	for_each_cpu(i, cpu_map) {
7353		sd = &per_cpu(phys_domains, i).sd;
7354		init_sched_groups_power(i, sd);
7355	}
7356
7357#ifdef CONFIG_NUMA
7358	for (i = 0; i < nr_node_ids; i++)
7359		init_numa_sched_groups_power(d.sched_group_nodes[i]);
7360
7361	if (d.sd_allnodes) {
7362		struct sched_group *sg;
7363
7364		cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
7365				      d.tmpmask);
7366		init_numa_sched_groups_power(sg);
7367	}
7368#endif
7369
7370	/* Attach the domains */
7371	for_each_cpu(i, cpu_map) {
7372#ifdef CONFIG_SCHED_SMT
7373		sd = &per_cpu(cpu_domains, i).sd;
7374#elif defined(CONFIG_SCHED_MC)
7375		sd = &per_cpu(core_domains, i).sd;
7376#elif defined(CONFIG_SCHED_BOOK)
7377		sd = &per_cpu(book_domains, i).sd;
7378#else
7379		sd = &per_cpu(phys_domains, i).sd;
7380#endif
7381		cpu_attach_domain(sd, d.rd, i);
7382	}
7383
7384	d.sched_group_nodes = NULL; /* don't free this we still need it */
7385	__free_domain_allocs(&d, sa_tmpmask, cpu_map);
7386	return 0;
7387
7388error:
7389	__free_domain_allocs(&d, alloc_state, cpu_map);
7390	return -ENOMEM;
7391}
7392
7393static int build_sched_domains(const struct cpumask *cpu_map)
7394{
7395	return __build_sched_domains(cpu_map, NULL);
7396}
7259/*
7260 * Build sched domains for a given set of cpus and attach the sched domains
7261 * to the individual cpus
7262 */
7263static int build_sched_domains(const struct cpumask *cpu_map,
7264			       struct sched_domain_attr *attr)
7265{
7266	enum s_alloc alloc_state = sa_none;
7267	struct sched_domain *sd;
7268	struct s_data d;
7269	int i, ret = -ENOMEM;
7270
7271	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7272	if (alloc_state != sa_rootdomain)
7273		goto error;
7274
7275	/* Set up domains for cpus specified by the cpu_map. */
7276	for_each_cpu(i, cpu_map) {
7277		struct sched_domain_topology_level *tl;
7278
7279		sd = NULL;
7280		for (tl = sched_domain_topology; tl->init; tl++)
7281			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7282
7283		while (sd->child)
7284			sd = sd->child;
7285
7286		*per_cpu_ptr(d.sd, i) = sd;
7287	}
7288
7289	/* Build the groups for the domains */
7290	for_each_cpu(i, cpu_map) {
7291		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7292			sd->span_weight = cpumask_weight(sched_domain_span(sd));
7293			get_group(i, sd->private, &sd->groups);
7294			atomic_inc(&sd->groups->ref);
7295
7296			if (i != cpumask_first(sched_domain_span(sd)))
7297				continue;
7298
7299			build_sched_groups(sd);
7300		}
7301	}
7302
7303	/* Calculate CPU power for physical packages and nodes */
7304	for (i = nr_cpumask_bits-1; i >= 0; i--) {
7305		if (!cpumask_test_cpu(i, cpu_map))
7306			continue;
7307
7308		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7309			claim_allocations(i, sd);
7310			init_sched_groups_power(i, sd);
7311		}
7312	}
7313
7314	/* Attach the domains */
7315	rcu_read_lock();
7316	for_each_cpu(i, cpu_map) {
7317		sd = *per_cpu_ptr(d.sd, i);
7318		cpu_attach_domain(sd, d.rd, i);
7319	}
7320	rcu_read_unlock();
7321
7322	ret = 0;
7323error:
7324	__free_domain_allocs(&d, alloc_state, cpu_map);
7325	return ret;
7326}
7397 7327
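A smaller change in the rewritten builder above: the success path now sets ret = 0 and falls through into the error: label, so __free_domain_allocs() runs exactly once on every exit and there is a single cleanup point instead of separate success and failure tails. A tiny sketch of that shape (the resource and function names here are hypothetical):

#include <stdio.h>
#include <stdlib.h>

/* Sketch of the "fall through into the cleanup label" shape: the success
 * path stores ret = 0 and then runs the same teardown as the failure paths. */
static int build_something(void)
{
	int ret = -1;			/* assume failure until proven otherwise */
	char *scratch = malloc(64);	/* stands in for the s_data scratch state */

	if (!scratch)
		goto error;

	/* ... do the real work here; any failure just jumps to error ... */

	ret = 0;			/* success: fall through into cleanup */
error:
	free(scratch);			/* runs on both success and failure */
	return ret;
}

int main(void)
{
	printf("build_something() = %d\n", build_something());
	return 0;
}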
7398static cpumask_var_t *doms_cur; /* current sched domains */ 7328static cpumask_var_t *doms_cur; /* current sched domains */
@@ -7447,7 +7377,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7447 * For now this just excludes isolated cpus, but could be used to 7377 * For now this just excludes isolated cpus, but could be used to
7448 * exclude other special cases in the future. 7378 * exclude other special cases in the future.
7449 */ 7379 */
7450static int arch_init_sched_domains(const struct cpumask *cpu_map) 7380static int init_sched_domains(const struct cpumask *cpu_map)
7451{ 7381{
7452 int err; 7382 int err;
7453 7383
@@ -7458,32 +7388,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
7458 doms_cur = &fallback_doms; 7388 doms_cur = &fallback_doms;
7459 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7389 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7460 dattr_cur = NULL; 7390 dattr_cur = NULL;
7461 err = build_sched_domains(doms_cur[0]); 7391 err = build_sched_domains(doms_cur[0], NULL);
7462 register_sched_domain_sysctl(); 7392 register_sched_domain_sysctl();
7463 7393
7464 return err; 7394 return err;
7465} 7395}
7466 7396
7467static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7468 struct cpumask *tmpmask)
7469{
7470 free_sched_groups(cpu_map, tmpmask);
7471}
7472
7473/* 7397/*
7474 * Detach sched domains from a group of cpus specified in cpu_map 7398 * Detach sched domains from a group of cpus specified in cpu_map
7475 * These cpus will now be attached to the NULL domain 7399 * These cpus will now be attached to the NULL domain
7476 */ 7400 */
7477static void detach_destroy_domains(const struct cpumask *cpu_map) 7401static void detach_destroy_domains(const struct cpumask *cpu_map)
7478{ 7402{
7479 /* Save because hotplug lock held. */
7480 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7481 int i; 7403 int i;
7482 7404
7405 rcu_read_lock();
7483 for_each_cpu(i, cpu_map) 7406 for_each_cpu(i, cpu_map)
7484 cpu_attach_domain(NULL, &def_root_domain, i); 7407 cpu_attach_domain(NULL, &def_root_domain, i);
7485 synchronize_sched(); 7408 rcu_read_unlock();
7486 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7487} 7409}
7488 7410
7489/* handle null as "default" */ 7411/* handle null as "default" */
@@ -7572,8 +7494,7 @@ match1:
7572 goto match2; 7494 goto match2;
7573 } 7495 }
7574 /* no match - add a new doms_new */ 7496 /* no match - add a new doms_new */
7575 __build_sched_domains(doms_new[i], 7497 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7576 dattr_new ? dattr_new + i : NULL);
7577match2: 7498match2:
7578 ; 7499 ;
7579 } 7500 }
@@ -7592,7 +7513,7 @@ match2:
7592} 7513}
7593 7514
7594#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7515#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7595static void arch_reinit_sched_domains(void) 7516static void reinit_sched_domains(void)
7596{ 7517{
7597 get_online_cpus(); 7518 get_online_cpus();
7598 7519
@@ -7625,7 +7546,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7625 else 7546 else
7626 sched_mc_power_savings = level; 7547 sched_mc_power_savings = level;
7627 7548
7628 arch_reinit_sched_domains(); 7549 reinit_sched_domains();
7629 7550
7630 return count; 7551 return count;
7631} 7552}
@@ -7744,14 +7665,9 @@ void __init sched_init_smp(void)
7744 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7665 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7745 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7666 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7746 7667
7747#if defined(CONFIG_NUMA)
7748 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7749 GFP_KERNEL);
7750 BUG_ON(sched_group_nodes_bycpu == NULL);
7751#endif
7752 get_online_cpus(); 7668 get_online_cpus();
7753 mutex_lock(&sched_domains_mutex); 7669 mutex_lock(&sched_domains_mutex);
7754 arch_init_sched_domains(cpu_active_mask); 7670 init_sched_domains(cpu_active_mask);
7755 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7671 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7756 if (cpumask_empty(non_isolated_cpus)) 7672 if (cpumask_empty(non_isolated_cpus))
7757 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7673 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -7796,6 +7712,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7796 INIT_LIST_HEAD(&cfs_rq->tasks); 7712 INIT_LIST_HEAD(&cfs_rq->tasks);
7797#ifdef CONFIG_FAIR_GROUP_SCHED 7713#ifdef CONFIG_FAIR_GROUP_SCHED
7798 cfs_rq->rq = rq; 7714 cfs_rq->rq = rq;
7715 /* allow initial update_cfs_load() to truncate */
7716#ifdef CONFIG_SMP
7717 cfs_rq->load_stamp = 1;
7718#endif
7799#endif 7719#endif
7800 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7720 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7801} 7721}
@@ -7997,7 +7917,7 @@ void __init sched_init(void)
7997#ifdef CONFIG_SMP 7917#ifdef CONFIG_SMP
7998 rq->sd = NULL; 7918 rq->sd = NULL;
7999 rq->rd = NULL; 7919 rq->rd = NULL;
8000 rq->cpu_power = SCHED_LOAD_SCALE; 7920 rq->cpu_power = SCHED_POWER_SCALE;
8001 rq->post_schedule = 0; 7921 rq->post_schedule = 0;
8002 rq->active_balance = 0; 7922 rq->active_balance = 0;
8003 rq->next_balance = jiffies; 7923 rq->next_balance = jiffies;
@@ -8054,6 +7974,7 @@ void __init sched_init(void)
8054 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 7974 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8055 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7975 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8056#ifdef CONFIG_SMP 7976#ifdef CONFIG_SMP
7977 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8057#ifdef CONFIG_NO_HZ 7978#ifdef CONFIG_NO_HZ
8058 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 7979 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8059 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 7980 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -8074,7 +7995,7 @@ static inline int preempt_count_equals(int preempt_offset)
8074{ 7995{
8075 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 7996 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
8076 7997
8077 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7998 return (nested == preempt_offset);
8078} 7999}
8079 8000
8080void __might_sleep(const char *file, int line, int preempt_offset) 8001void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8109,9 +8030,11 @@ EXPORT_SYMBOL(__might_sleep);
8109#ifdef CONFIG_MAGIC_SYSRQ 8030#ifdef CONFIG_MAGIC_SYSRQ
8110static void normalize_task(struct rq *rq, struct task_struct *p) 8031static void normalize_task(struct rq *rq, struct task_struct *p)
8111{ 8032{
8033 const struct sched_class *prev_class = p->sched_class;
8034 int old_prio = p->prio;
8112 int on_rq; 8035 int on_rq;
8113 8036
8114 on_rq = p->se.on_rq; 8037 on_rq = p->on_rq;
8115 if (on_rq) 8038 if (on_rq)
8116 deactivate_task(rq, p, 0); 8039 deactivate_task(rq, p, 0);
8117 __setscheduler(rq, p, SCHED_NORMAL, 0); 8040 __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8119,6 +8042,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8119 activate_task(rq, p, 0); 8042 activate_task(rq, p, 0);
8120 resched_task(rq->curr); 8043 resched_task(rq->curr);
8121 } 8044 }
8045
8046 check_class_changed(rq, p, prev_class, old_prio);
8122} 8047}
8123 8048
8124void normalize_rt_tasks(void) 8049void normalize_rt_tasks(void)
@@ -8234,7 +8159,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8234{ 8159{
8235 struct cfs_rq *cfs_rq; 8160 struct cfs_rq *cfs_rq;
8236 struct sched_entity *se; 8161 struct sched_entity *se;
8237 struct rq *rq;
8238 int i; 8162 int i;
8239 8163
8240 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8164 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8247,8 +8171,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8247 tg->shares = NICE_0_LOAD; 8171 tg->shares = NICE_0_LOAD;
8248 8172
8249 for_each_possible_cpu(i) { 8173 for_each_possible_cpu(i) {
8250 rq = cpu_rq(i);
8251
8252 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8174 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8253 GFP_KERNEL, cpu_to_node(i)); 8175 GFP_KERNEL, cpu_to_node(i));
8254 if (!cfs_rq) 8176 if (!cfs_rq)
@@ -8325,7 +8247,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8325{ 8247{
8326 struct rt_rq *rt_rq; 8248 struct rt_rq *rt_rq;
8327 struct sched_rt_entity *rt_se; 8249 struct sched_rt_entity *rt_se;
8328 struct rq *rq;
8329 int i; 8250 int i;
8330 8251
8331 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8252 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8339,8 +8260,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8339 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8260 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8340 8261
8341 for_each_possible_cpu(i) { 8262 for_each_possible_cpu(i) {
8342 rq = cpu_rq(i);
8343
8344 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8263 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8345 GFP_KERNEL, cpu_to_node(i)); 8264 GFP_KERNEL, cpu_to_node(i));
8346 if (!rt_rq) 8265 if (!rt_rq)
@@ -8455,7 +8374,7 @@ void sched_move_task(struct task_struct *tsk)
8455 rq = task_rq_lock(tsk, &flags); 8374 rq = task_rq_lock(tsk, &flags);
8456 8375
8457 running = task_current(rq, tsk); 8376 running = task_current(rq, tsk);
8458 on_rq = tsk->se.on_rq; 8377 on_rq = tsk->on_rq;
8459 8378
8460 if (on_rq) 8379 if (on_rq)
8461 dequeue_task(rq, tsk, 0); 8380 dequeue_task(rq, tsk, 0);
@@ -8474,7 +8393,7 @@ void sched_move_task(struct task_struct *tsk)
8474 if (on_rq) 8393 if (on_rq)
8475 enqueue_task(rq, tsk, 0); 8394 enqueue_task(rq, tsk, 0);
8476 8395
8477 task_rq_unlock(rq, &flags); 8396 task_rq_unlock(rq, tsk, &flags);
8478} 8397}
8479#endif /* CONFIG_CGROUP_SCHED */ 8398#endif /* CONFIG_CGROUP_SCHED */
8480 8399
@@ -8510,7 +8429,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8510 /* Propagate contribution to hierarchy */ 8429 /* Propagate contribution to hierarchy */
8511 raw_spin_lock_irqsave(&rq->lock, flags); 8430 raw_spin_lock_irqsave(&rq->lock, flags);
8512 for_each_sched_entity(se) 8431 for_each_sched_entity(se)
8513 update_cfs_shares(group_cfs_rq(se), 0); 8432 update_cfs_shares(group_cfs_rq(se));
8514 raw_spin_unlock_irqrestore(&rq->lock, flags); 8433 raw_spin_unlock_irqrestore(&rq->lock, flags);
8515 } 8434 }
8516 8435
@@ -8845,46 +8764,15 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8845 return 0; 8764 return 0;
8846} 8765}
8847 8766
8848static int
8849cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8850 struct task_struct *tsk, bool threadgroup)
8851{
8852 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
8853 if (retval)
8854 return retval;
8855 if (threadgroup) {
8856 struct task_struct *c;
8857 rcu_read_lock();
8858 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8859 retval = cpu_cgroup_can_attach_task(cgrp, c);
8860 if (retval) {
8861 rcu_read_unlock();
8862 return retval;
8863 }
8864 }
8865 rcu_read_unlock();
8866 }
8867 return 0;
8868}
8869
8870static void 8767static void
8871cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8768cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8872 struct cgroup *old_cont, struct task_struct *tsk,
8873 bool threadgroup)
8874{ 8769{
8875 sched_move_task(tsk); 8770 sched_move_task(tsk);
8876 if (threadgroup) {
8877 struct task_struct *c;
8878 rcu_read_lock();
8879 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8880 sched_move_task(c);
8881 }
8882 rcu_read_unlock();
8883 }
8884} 8771}
8885 8772
8886static void 8773static void
8887cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) 8774cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
8775 struct cgroup *old_cgrp, struct task_struct *task)
8888{ 8776{
8889 /* 8777 /*
8890 * cgroup_exit() is called in the copy_process() failure path. 8778 * cgroup_exit() is called in the copy_process() failure path.
@@ -8901,14 +8789,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
8901static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8789static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8902 u64 shareval) 8790 u64 shareval)
8903{ 8791{
8904 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 8792 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
8905} 8793}
8906 8794
8907static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 8795static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8908{ 8796{
8909 struct task_group *tg = cgroup_tg(cgrp); 8797 struct task_group *tg = cgroup_tg(cgrp);
8910 8798
8911 return (u64) tg->shares; 8799 return (u64) scale_load_down(tg->shares);
8912} 8800}
8913#endif /* CONFIG_FAIR_GROUP_SCHED */ 8801#endif /* CONFIG_FAIR_GROUP_SCHED */
8914 8802
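The cpu.shares handlers in the hunk above now round-trip the user-visible value through scale_load()/scale_load_down(), so the cgroup file keeps its historical units while the group weight is stored at higher resolution. A small illustration, assuming those helpers are plain shifts by an extra 10 bits of load resolution (the exact definitions live elsewhere in this series, so treat the constants here as an assumption):

#include <stdio.h>

/* Assumption: scale_load()/scale_load_down() shift by an extra load
 * resolution so internal weights gain precision while cpu.shares keeps
 * its old units. */
#define SCHED_LOAD_RESOLUTION 10
#define scale_load(w)       ((unsigned long)(w) << SCHED_LOAD_RESOLUTION)
#define scale_load_down(w)  ((unsigned long)(w) >> SCHED_LOAD_RESOLUTION)

int main(void)
{
	unsigned long shareval = 1024;			/* value written to cpu.shares */
	unsigned long tg_shares = scale_load(shareval);	/* stored internally */

	/* Reading the file scales back down, so userspace sees 1024 again. */
	printf("stored=%lu, reported=%lu\n", tg_shares, scale_load_down(tg_shares));
	return 0;
}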
@@ -8967,8 +8855,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8967 .name = "cpu", 8855 .name = "cpu",
8968 .create = cpu_cgroup_create, 8856 .create = cpu_cgroup_create,
8969 .destroy = cpu_cgroup_destroy, 8857 .destroy = cpu_cgroup_destroy,
8970 .can_attach = cpu_cgroup_can_attach, 8858 .can_attach_task = cpu_cgroup_can_attach_task,
8971 .attach = cpu_cgroup_attach, 8859 .attach_task = cpu_cgroup_attach_task,
8972 .exit = cpu_cgroup_exit, 8860 .exit = cpu_cgroup_exit,
8973 .populate = cpu_cgroup_populate, 8861 .populate = cpu_cgroup_populate,
8974 .subsys_id = cpu_cgroup_subsys_id, 8862 .subsys_id = cpu_cgroup_subsys_id,
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 9fb656283157..429242f3c484 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -12,7 +12,6 @@ static atomic_t autogroup_seq_nr;
12static void __init autogroup_init(struct task_struct *init_task) 12static void __init autogroup_init(struct task_struct *init_task)
13{ 13{
14 autogroup_default.tg = &root_task_group; 14 autogroup_default.tg = &root_task_group;
15 root_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref); 15 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock); 16 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default; 17 init_task->signal->autogroup = &autogroup_default;
@@ -130,7 +129,7 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
130 129
131static inline bool task_group_is_autogroup(struct task_group *tg) 130static inline bool task_group_is_autogroup(struct task_group *tg)
132{ 131{
133 return tg != &root_task_group && tg->autogroup; 132 return !!tg->autogroup;
134} 133}
135 134
136static inline struct task_group * 135static inline struct task_group *
@@ -161,11 +160,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
161 160
162 p->signal->autogroup = autogroup_kref_get(ag); 161 p->signal->autogroup = autogroup_kref_get(ag);
163 162
163 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
164 goto out;
165
164 t = p; 166 t = p;
165 do { 167 do {
166 sched_move_task(t); 168 sched_move_task(t);
167 } while_each_thread(p, t); 169 } while_each_thread(p, t);
168 170
171out:
169 unlock_task_sighand(p, &flags); 172 unlock_task_sighand(p, &flags);
170 autogroup_kref_put(prev); 173 autogroup_kref_put(prev);
171} 174}
@@ -176,7 +179,7 @@ void sched_autogroup_create_attach(struct task_struct *p)
176 struct autogroup *ag = autogroup_create(); 179 struct autogroup *ag = autogroup_create();
177 180
178 autogroup_move_group(p, ag); 181 autogroup_move_group(p, ag);
179 /* drop extra refrence added by autogroup_create() */ 182 /* drop extra reference added by autogroup_create() */
180 autogroup_kref_put(ag); 183 autogroup_kref_put(ag);
181} 184}
182EXPORT_SYMBOL(sched_autogroup_create_attach); 185EXPORT_SYMBOL(sched_autogroup_create_attach);
@@ -247,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
247{ 250{
248 struct autogroup *ag = autogroup_task_get(p); 251 struct autogroup *ag = autogroup_task_get(p);
249 252
253 if (!task_group_is_autogroup(ag->tg))
254 goto out;
255
250 down_read(&ag->lock); 256 down_read(&ag->lock);
251 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); 257 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
252 up_read(&ag->lock); 258 up_read(&ag->lock);
253 259
260out:
254 autogroup_kref_put(ag); 261 autogroup_kref_put(ag);
255} 262}
256#endif /* CONFIG_PROC_FS */ 263#endif /* CONFIG_PROC_FS */
@@ -258,9 +265,7 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
258#ifdef CONFIG_SCHED_DEBUG 265#ifdef CONFIG_SCHED_DEBUG
259static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
260{ 267{
261 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); 268 if (!task_group_is_autogroup(tg))
262
263 if (!enabled || !tg->autogroup)
264 return 0; 269 return 0;
265 270
266 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 271 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 7b859ffe5dad..05577055cfca 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -1,6 +1,11 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3struct autogroup { 3struct autogroup {
4 /*
 5	 * The reference count does not track how many threads are attached
 6	 * to this autogroup right now; it is the number of tasks that
 7	 * could use this autogroup.
8 */
4 struct kref kref; 9 struct kref kref;
5 struct task_group *tg; 10 struct task_group *tg;
6 struct rw_semaphore lock; 11 struct rw_semaphore lock;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index eb6cb8edd075..a6710a112b4f 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
152 read_lock_irqsave(&tasklist_lock, flags); 152 read_lock_irqsave(&tasklist_lock, flags);
153 153
154 do_each_thread(g, p) { 154 do_each_thread(g, p) {
155 if (!p->se.on_rq || task_cpu(p) != rq_cpu) 155 if (!p->on_rq || task_cpu(p) != rq_cpu)
156 continue; 156 continue;
157 157
158 print_task(m, rq, p); 158 print_task(m, rq, p);
@@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
179 179
180 raw_spin_lock_irqsave(&rq->lock, flags); 180 raw_spin_lock_irqsave(&rq->lock, flags);
181 if (cfs_rq->rb_leftmost) 181 if (cfs_rq->rb_leftmost)
182 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 182 MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
183 last = __pick_last_entity(cfs_rq); 183 last = __pick_last_entity(cfs_rq);
184 if (last) 184 if (last)
185 max_vruntime = last->vruntime; 185 max_vruntime = last->vruntime;
@@ -296,9 +296,6 @@ static void print_cpu(struct seq_file *m, int cpu)
296 P(ttwu_count); 296 P(ttwu_count);
297 P(ttwu_local); 297 P(ttwu_local);
298 298
299 SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
300 rq->rq_sched_info.bkl_count);
301
302#undef P 299#undef P
303#undef P64 300#undef P64
304#endif 301#endif
@@ -441,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
441 P(se.statistics.wait_count); 438 P(se.statistics.wait_count);
442 PN(se.statistics.iowait_sum); 439 PN(se.statistics.iowait_sum);
443 P(se.statistics.iowait_count); 440 P(se.statistics.iowait_count);
444 P(sched_info.bkl_count);
445 P(se.nr_migrations); 441 P(se.nr_migrations);
446 P(se.statistics.nr_migrations_cold); 442 P(se.statistics.nr_migrations_cold);
447 P(se.statistics.nr_failed_migrations_affine); 443 P(se.statistics.nr_failed_migrations_affine);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c26e2df450e..e32a9b70ee9c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h>
25 26
26/* 27/*
27 * Targeted preemption latency for CPU-bound tasks: 28 * Targeted preemption latency for CPU-bound tasks:
@@ -69,14 +70,6 @@ static unsigned int sched_nr_latency = 8;
69unsigned int sysctl_sched_child_runs_first __read_mostly; 70unsigned int sysctl_sched_child_runs_first __read_mostly;
70 71
71/* 72/*
72 * sys_sched_yield() compat mode
73 *
74 * This option switches the agressive yield implementation of the
75 * old scheduler back on.
76 */
77unsigned int __read_mostly sysctl_sched_compat_yield;
78
79/*
80 * SCHED_OTHER wake-up granularity. 73 * SCHED_OTHER wake-up granularity.
81 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 74 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
82 * 75 *
@@ -365,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
365 } 358 }
366 359
367 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 360 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
361#ifndef CONFIG_64BIT
362 smp_wmb();
363 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
364#endif
368} 365}
369 366
370/* 367/*
@@ -419,7 +416,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
419 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 416 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
420} 417}
421 418
422static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 419static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
423{ 420{
424 struct rb_node *left = cfs_rq->rb_leftmost; 421 struct rb_node *left = cfs_rq->rb_leftmost;
425 422
@@ -429,6 +426,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
429 return rb_entry(left, struct sched_entity, run_node); 426 return rb_entry(left, struct sched_entity, run_node);
430} 427}
431 428
429static struct sched_entity *__pick_next_entity(struct sched_entity *se)
430{
431 struct rb_node *next = rb_next(&se->run_node);
432
433 if (!next)
434 return NULL;
435
436 return rb_entry(next, struct sched_entity, run_node);
437}
438
439#ifdef CONFIG_SCHED_DEBUG
432static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 440static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
433{ 441{
434 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 442 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -443,7 +451,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
443 * Scheduling class statistics methods: 451 * Scheduling class statistics methods:
444 */ 452 */
445 453
446#ifdef CONFIG_SCHED_DEBUG
447int sched_proc_update_handler(struct ctl_table *table, int write, 454int sched_proc_update_handler(struct ctl_table *table, int write,
448 void __user *buffer, size_t *lenp, 455 void __user *buffer, size_t *lenp,
449 loff_t *ppos) 456 loff_t *ppos)
@@ -540,7 +547,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
540} 547}
541 548
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); 549static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); 550static void update_cfs_shares(struct cfs_rq *cfs_rq);
544 551
545/* 552/*
546 * Update the current task's runtime statistics. Skip current tasks that 553 * Update the current task's runtime statistics. Skip current tasks that
@@ -733,6 +740,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
733 now - cfs_rq->load_last > 4 * period) { 740 now - cfs_rq->load_last > 4 * period) {
734 cfs_rq->load_period = 0; 741 cfs_rq->load_period = 0;
735 cfs_rq->load_avg = 0; 742 cfs_rq->load_avg = 0;
743 delta = period - 1;
736 } 744 }
737 745
738 cfs_rq->load_stamp = now; 746 cfs_rq->load_stamp = now;
@@ -763,16 +771,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
763 list_del_leaf_cfs_rq(cfs_rq); 771 list_del_leaf_cfs_rq(cfs_rq);
764} 772}
765 773
766static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, 774static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
767 long weight_delta)
768{ 775{
769 long load_weight, load, shares; 776 long load_weight, load, shares;
770 777
771 load = cfs_rq->load.weight + weight_delta; 778 load = cfs_rq->load.weight;
772 779
773 load_weight = atomic_read(&tg->load_weight); 780 load_weight = atomic_read(&tg->load_weight);
774 load_weight -= cfs_rq->load_contribution;
775 load_weight += load; 781 load_weight += load;
782 load_weight -= cfs_rq->load_contribution;
776 783
777 shares = (tg->shares * load); 784 shares = (tg->shares * load);
778 if (load_weight) 785 if (load_weight)
@@ -790,7 +797,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
790{ 797{
791 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { 798 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
792 update_cfs_load(cfs_rq, 0); 799 update_cfs_load(cfs_rq, 0);
793 update_cfs_shares(cfs_rq, 0); 800 update_cfs_shares(cfs_rq);
794 } 801 }
795} 802}
796# else /* CONFIG_SMP */ 803# else /* CONFIG_SMP */
@@ -798,8 +805,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
798{ 805{
799} 806}
800 807
801static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, 808static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
802 long weight_delta)
803{ 809{
804 return tg->shares; 810 return tg->shares;
805} 811}
@@ -824,7 +830,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
824 account_entity_enqueue(cfs_rq, se); 830 account_entity_enqueue(cfs_rq, se);
825} 831}
826 832
827static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) 833static void update_cfs_shares(struct cfs_rq *cfs_rq)
828{ 834{
829 struct task_group *tg; 835 struct task_group *tg;
830 struct sched_entity *se; 836 struct sched_entity *se;
@@ -838,7 +844,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
838 if (likely(se->load.weight == tg->shares)) 844 if (likely(se->load.weight == tg->shares))
839 return; 845 return;
840#endif 846#endif
841 shares = calc_cfs_shares(cfs_rq, tg, weight_delta); 847 shares = calc_cfs_shares(cfs_rq, tg);
842 848
843 reweight_entity(cfs_rq_of(se), se, shares); 849 reweight_entity(cfs_rq_of(se), se, shares);
844} 850}
@@ -847,7 +853,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
847{ 853{
848} 854}
849 855
850static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) 856static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
851{ 857{
852} 858}
853 859
@@ -978,8 +984,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
978 */ 984 */
979 update_curr(cfs_rq); 985 update_curr(cfs_rq);
980 update_cfs_load(cfs_rq, 0); 986 update_cfs_load(cfs_rq, 0);
981 update_cfs_shares(cfs_rq, se->load.weight);
982 account_entity_enqueue(cfs_rq, se); 987 account_entity_enqueue(cfs_rq, se);
988 update_cfs_shares(cfs_rq);
983 989
984 if (flags & ENQUEUE_WAKEUP) { 990 if (flags & ENQUEUE_WAKEUP) {
985 place_entity(cfs_rq, se, 0); 991 place_entity(cfs_rq, se, 0);
@@ -996,19 +1002,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
996 list_add_leaf_cfs_rq(cfs_rq); 1002 list_add_leaf_cfs_rq(cfs_rq);
997} 1003}
998 1004
999static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1005static void __clear_buddies_last(struct sched_entity *se)
1000{ 1006{
1001 if (!se || cfs_rq->last == se) 1007 for_each_sched_entity(se) {
1002 cfs_rq->last = NULL; 1008 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1009 if (cfs_rq->last == se)
1010 cfs_rq->last = NULL;
1011 else
1012 break;
1013 }
1014}
1003 1015
1004 if (!se || cfs_rq->next == se) 1016static void __clear_buddies_next(struct sched_entity *se)
1005 cfs_rq->next = NULL; 1017{
1018 for_each_sched_entity(se) {
1019 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1020 if (cfs_rq->next == se)
1021 cfs_rq->next = NULL;
1022 else
1023 break;
1024 }
1025}
1026
1027static void __clear_buddies_skip(struct sched_entity *se)
1028{
1029 for_each_sched_entity(se) {
1030 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1031 if (cfs_rq->skip == se)
1032 cfs_rq->skip = NULL;
1033 else
1034 break;
1035 }
1006} 1036}
1007 1037
1008static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1038static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1009{ 1039{
1010 for_each_sched_entity(se) 1040 if (cfs_rq->last == se)
1011 __clear_buddies(cfs_rq_of(se), se); 1041 __clear_buddies_last(se);
1042
1043 if (cfs_rq->next == se)
1044 __clear_buddies_next(se);
1045
1046 if (cfs_rq->skip == se)
1047 __clear_buddies_skip(se);
1012} 1048}
1013 1049
1014static void 1050static void
@@ -1041,7 +1077,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1041 update_cfs_load(cfs_rq, 0); 1077 update_cfs_load(cfs_rq, 0);
1042 account_entity_dequeue(cfs_rq, se); 1078 account_entity_dequeue(cfs_rq, se);
1043 update_min_vruntime(cfs_rq); 1079 update_min_vruntime(cfs_rq);
1044 update_cfs_shares(cfs_rq, 0); 1080 update_cfs_shares(cfs_rq);
1045 1081
1046 /* 1082 /*
1047 * Normalize the entity after updating the min_vruntime because the 1083 * Normalize the entity after updating the min_vruntime because the
@@ -1084,7 +1120,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1084 return; 1120 return;
1085 1121
1086 if (cfs_rq->nr_running > 1) { 1122 if (cfs_rq->nr_running > 1) {
1087 struct sched_entity *se = __pick_next_entity(cfs_rq); 1123 struct sched_entity *se = __pick_first_entity(cfs_rq);
1088 s64 delta = curr->vruntime - se->vruntime; 1124 s64 delta = curr->vruntime - se->vruntime;
1089 1125
1090 if (delta < 0) 1126 if (delta < 0)
@@ -1128,13 +1164,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1128static int 1164static int
1129wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); 1165wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1130 1166
1167/*
1168 * Pick the next process, keeping these things in mind, in this order:
1169 * 1) keep things fair between processes/task groups
1170 * 2) pick the "next" process, since someone really wants that to run
1171 * 3) pick the "last" process, for cache locality
1172 * 4) do not run the "skip" process, if something else is available
1173 */
1131static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 1174static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1132{ 1175{
1133 struct sched_entity *se = __pick_next_entity(cfs_rq); 1176 struct sched_entity *se = __pick_first_entity(cfs_rq);
1134 struct sched_entity *left = se; 1177 struct sched_entity *left = se;
1135 1178
1136 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) 1179 /*
1137 se = cfs_rq->next; 1180 * Avoid running the skip buddy, if running something else can
1181 * be done without getting too unfair.
1182 */
1183 if (cfs_rq->skip == se) {
1184 struct sched_entity *second = __pick_next_entity(se);
1185 if (second && wakeup_preempt_entity(second, left) < 1)
1186 se = second;
1187 }
1138 1188
1139 /* 1189 /*
1140 * Prefer last buddy, try to return the CPU to a preempted task. 1190 * Prefer last buddy, try to return the CPU to a preempted task.
@@ -1142,6 +1192,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1142 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) 1192 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1143 se = cfs_rq->last; 1193 se = cfs_rq->last;
1144 1194
1195 /*
1196 * Someone really wants this to run. If it's not unfair, run it.
1197 */
1198 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1199 se = cfs_rq->next;
1200
1145 clear_buddies(cfs_rq, se); 1201 clear_buddies(cfs_rq, se);
1146 1202
1147 return se; 1203 return se;
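The rewritten picker above starts from the leftmost entity and then lets the skip, last and next buddies override it, each override gated by wakeup_preempt_entity() so fairness still wins. A compressed user-space sketch of that decision order (the entity fields, gran and too_unfair() are simplified stand-ins, not the kernel's helpers):

#include <stdio.h>

struct entity {
	const char *name;
	long vruntime;
};

/* Rough stand-in for wakeup_preempt_entity(): picking 'curr' instead of the
 * more entitled 'se' is "too unfair" once curr trails se by more than gran. */
static int too_unfair(struct entity *curr, struct entity *se, long gran)
{
	return (se->vruntime + gran) < curr->vruntime;
}

static struct entity *pick(struct entity *leftmost, struct entity *second,
			   struct entity *skip, struct entity *last,
			   struct entity *next, long gran)
{
	struct entity *se = leftmost;

	/* 4) avoid the skip buddy if something else is not too unfair */
	if (skip == se && second && !too_unfair(second, leftmost, gran))
		se = second;

	/* 3) prefer the last buddy, for cache locality */
	if (last && !too_unfair(last, leftmost, gran))
		se = last;

	/* 2) but "next" (someone really wants this to run) overrides "last" */
	if (next && !too_unfair(next, leftmost, gran))
		se = next;

	return se;
}

int main(void)
{
	struct entity a = { "leftmost", 100 }, b = { "second", 104 };

	/* leftmost marked as skip -> second runs, since it is close enough */
	printf("picked: %s\n", pick(&a, &b, &a, NULL, NULL, 10)->name);
	return 0;
}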
@@ -1282,12 +1338,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1282 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1338 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1283 1339
1284 update_cfs_load(cfs_rq, 0); 1340 update_cfs_load(cfs_rq, 0);
1285 update_cfs_shares(cfs_rq, 0); 1341 update_cfs_shares(cfs_rq);
1286 } 1342 }
1287 1343
1288 hrtick_update(rq); 1344 hrtick_update(rq);
1289} 1345}
1290 1346
1347static void set_next_buddy(struct sched_entity *se);
1348
1291/* 1349/*
1292 * The dequeue_task method is called before nr_running is 1350 * The dequeue_task method is called before nr_running is
1293 * decreased. We remove the task from the rbtree and 1351 * decreased. We remove the task from the rbtree and
@@ -1297,14 +1355,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1297{ 1355{
1298 struct cfs_rq *cfs_rq; 1356 struct cfs_rq *cfs_rq;
1299 struct sched_entity *se = &p->se; 1357 struct sched_entity *se = &p->se;
1358 int task_sleep = flags & DEQUEUE_SLEEP;
1300 1359
1301 for_each_sched_entity(se) { 1360 for_each_sched_entity(se) {
1302 cfs_rq = cfs_rq_of(se); 1361 cfs_rq = cfs_rq_of(se);
1303 dequeue_entity(cfs_rq, se, flags); 1362 dequeue_entity(cfs_rq, se, flags);
1304 1363
1305 /* Don't dequeue parent if it has other entities besides us */ 1364 /* Don't dequeue parent if it has other entities besides us */
1306 if (cfs_rq->load.weight) 1365 if (cfs_rq->load.weight) {
1366 /*
1367 * Bias pick_next to pick a task from this cfs_rq, as
1368 * p is sleeping when it is within its sched_slice.
1369 */
1370 if (task_sleep && parent_entity(se))
1371 set_next_buddy(parent_entity(se));
1307 break; 1372 break;
1373 }
1308 flags |= DEQUEUE_SLEEP; 1374 flags |= DEQUEUE_SLEEP;
1309 } 1375 }
1310 1376
@@ -1312,66 +1378,33 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1312 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1378 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1313 1379
1314 update_cfs_load(cfs_rq, 0); 1380 update_cfs_load(cfs_rq, 0);
1315 update_cfs_shares(cfs_rq, 0); 1381 update_cfs_shares(cfs_rq);
1316 } 1382 }
1317 1383
1318 hrtick_update(rq); 1384 hrtick_update(rq);
1319} 1385}
1320 1386
1321/*
1322 * sched_yield() support is very simple - we dequeue and enqueue.
1323 *
1324 * If compat_yield is turned on then we requeue to the end of the tree.
1325 */
1326static void yield_task_fair(struct rq *rq)
1327{
1328 struct task_struct *curr = rq->curr;
1329 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1330 struct sched_entity *rightmost, *se = &curr->se;
1331
1332 /*
1333 * Are we the only task in the tree?
1334 */
1335 if (unlikely(cfs_rq->nr_running == 1))
1336 return;
1337
1338 clear_buddies(cfs_rq, se);
1339
1340 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
1341 update_rq_clock(rq);
1342 /*
1343 * Update run-time statistics of the 'current'.
1344 */
1345 update_curr(cfs_rq);
1346
1347 return;
1348 }
1349 /*
1350 * Find the rightmost entry in the rbtree:
1351 */
1352 rightmost = __pick_last_entity(cfs_rq);
1353 /*
1354 * Already in the rightmost position?
1355 */
1356 if (unlikely(!rightmost || entity_before(rightmost, se)))
1357 return;
1358
1359 /*
1360 * Minimally necessary key value to be last in the tree:
1361 * Upon rescheduling, sched_class::put_prev_task() will place
1362 * 'current' within the tree based on its new key value.
1363 */
1364 se->vruntime = rightmost->vruntime + 1;
1365}
1366
1367#ifdef CONFIG_SMP 1387#ifdef CONFIG_SMP
1368 1388
1369static void task_waking_fair(struct rq *rq, struct task_struct *p) 1389static void task_waking_fair(struct task_struct *p)
1370{ 1390{
1371 struct sched_entity *se = &p->se; 1391 struct sched_entity *se = &p->se;
1372 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1392 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1393 u64 min_vruntime;
1373 1394
1374 se->vruntime -= cfs_rq->min_vruntime; 1395#ifndef CONFIG_64BIT
1396 u64 min_vruntime_copy;
1397
1398 do {
1399 min_vruntime_copy = cfs_rq->min_vruntime_copy;
1400 smp_rmb();
1401 min_vruntime = cfs_rq->min_vruntime;
1402 } while (min_vruntime != min_vruntime_copy);
1403#else
1404 min_vruntime = cfs_rq->min_vruntime;
1405#endif
1406
1407 se->vruntime -= min_vruntime;
1375} 1408}
1376 1409
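Together with the smp_wmb()/min_vruntime_copy store added to update_min_vruntime() above, this read loop gives 32-bit builds an untorn 64-bit snapshot of min_vruntime without taking the runqueue lock: the writer publishes a copy after a write barrier, and the waker re-reads until the value and its copy agree. A user-space sketch of the same publish/retry protocol using C11 fences (the names mirror the kernel fields, but the code is only illustrative and single-threaded here):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Writer-published value plus a copy, as in cfs_rq->min_vruntime{,_copy}. */
static uint64_t min_vruntime;
static uint64_t min_vruntime_copy;

static void writer_update(uint64_t v)
{
	min_vruntime = v;				/* may be torn on 32-bit */
	atomic_thread_fence(memory_order_release);	/* smp_wmb() stand-in */
	min_vruntime_copy = v;
}

static uint64_t reader_snapshot(void)
{
	uint64_t copy, val;

	do {
		copy = min_vruntime_copy;
		atomic_thread_fence(memory_order_acquire);	/* smp_rmb() stand-in */
		val = min_vruntime;
	} while (val != copy);			/* retry if we raced a writer */

	return val;
}

int main(void)
{
	writer_update(42);
	printf("snapshot = %llu\n", (unsigned long long)reader_snapshot());
	return 0;
}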
1377#ifdef CONFIG_FAIR_GROUP_SCHED 1410#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1551,7 +1584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1551 } 1584 }
1552 1585
1553 /* Adjust by relative CPU power of the group */ 1586 /* Adjust by relative CPU power of the group */
1554 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 1587 avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power;
1555 1588
1556 if (local_group) { 1589 if (local_group) {
1557 this_load = avg_load; 1590 this_load = avg_load;
@@ -1616,6 +1649,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1616 /* 1649 /*
1617	 * Otherwise, iterate the domains and find an eligible idle cpu. 1650	 * Otherwise, iterate the domains and find an eligible idle cpu.
1618 */ 1651 */
1652 rcu_read_lock();
1619 for_each_domain(target, sd) { 1653 for_each_domain(target, sd) {
1620 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 1654 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1621 break; 1655 break;
@@ -1635,6 +1669,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1635 cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 1669 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1636 break; 1670 break;
1637 } 1671 }
1672 rcu_read_unlock();
1638 1673
1639 return target; 1674 return target;
1640} 1675}
@@ -1651,7 +1686,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1651 * preempt must be disabled. 1686 * preempt must be disabled.
1652 */ 1687 */
1653static int 1688static int
1654select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) 1689select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1655{ 1690{
1656 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1691 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1657 int cpu = smp_processor_id(); 1692 int cpu = smp_processor_id();
@@ -1667,6 +1702,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1667 new_cpu = prev_cpu; 1702 new_cpu = prev_cpu;
1668 } 1703 }
1669 1704
1705 rcu_read_lock();
1670 for_each_domain(cpu, tmp) { 1706 for_each_domain(cpu, tmp) {
1671 if (!(tmp->flags & SD_LOAD_BALANCE)) 1707 if (!(tmp->flags & SD_LOAD_BALANCE))
1672 continue; 1708 continue;
@@ -1686,7 +1722,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1686 nr_running += cpu_rq(i)->cfs.nr_running; 1722 nr_running += cpu_rq(i)->cfs.nr_running;
1687 } 1723 }
1688 1724
1689 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 1725 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
1690 1726
1691 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1727 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1692 nr_running /= 2; 1728 nr_running /= 2;
@@ -1717,9 +1753,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1717 1753
1718 if (affine_sd) { 1754 if (affine_sd) {
1719 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1755 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1720 return select_idle_sibling(p, cpu); 1756 prev_cpu = cpu;
1721 else 1757
1722 return select_idle_sibling(p, prev_cpu); 1758 new_cpu = select_idle_sibling(p, prev_cpu);
1759 goto unlock;
1723 } 1760 }
1724 1761
1725 while (sd) { 1762 while (sd) {
@@ -1760,6 +1797,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1760 } 1797 }
1761 /* while loop will break here if sd == NULL */ 1798 /* while loop will break here if sd == NULL */
1762 } 1799 }
1800unlock:
1801 rcu_read_unlock();
1763 1802
1764 return new_cpu; 1803 return new_cpu;
1765} 1804}
@@ -1783,10 +1822,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1783 * This is especially important for buddies when the leftmost 1822 * This is especially important for buddies when the leftmost
1784 * task is higher priority than the buddy. 1823 * task is higher priority than the buddy.
1785 */ 1824 */
1786 if (unlikely(se->load.weight != NICE_0_LOAD)) 1825 return calc_delta_fair(gran, se);
1787 gran = calc_delta_fair(gran, se);
1788
1789 return gran;
1790} 1826}
1791 1827
1792/* 1828/*
@@ -1820,18 +1856,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1820 1856
1821static void set_last_buddy(struct sched_entity *se) 1857static void set_last_buddy(struct sched_entity *se)
1822{ 1858{
1823 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1859 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1824 for_each_sched_entity(se) 1860 return;
1825 cfs_rq_of(se)->last = se; 1861
1826 } 1862 for_each_sched_entity(se)
1863 cfs_rq_of(se)->last = se;
1827} 1864}
1828 1865
1829static void set_next_buddy(struct sched_entity *se) 1866static void set_next_buddy(struct sched_entity *se)
1830{ 1867{
1831 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1868 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1832 for_each_sched_entity(se) 1869 return;
1833 cfs_rq_of(se)->next = se; 1870
1834 } 1871 for_each_sched_entity(se)
1872 cfs_rq_of(se)->next = se;
1873}
1874
1875static void set_skip_buddy(struct sched_entity *se)
1876{
1877 for_each_sched_entity(se)
1878 cfs_rq_of(se)->skip = se;
1835} 1879}
1836 1880
1837/* 1881/*
@@ -1843,12 +1887,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1843 struct sched_entity *se = &curr->se, *pse = &p->se; 1887 struct sched_entity *se = &curr->se, *pse = &p->se;
1844 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1888 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1845 int scale = cfs_rq->nr_running >= sched_nr_latency; 1889 int scale = cfs_rq->nr_running >= sched_nr_latency;
1890 int next_buddy_marked = 0;
1846 1891
1847 if (unlikely(se == pse)) 1892 if (unlikely(se == pse))
1848 return; 1893 return;
1849 1894
1850 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) 1895 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1851 set_next_buddy(pse); 1896 set_next_buddy(pse);
1897 next_buddy_marked = 1;
1898 }
1852 1899
1853 /* 1900 /*
1854 * We can come here with TIF_NEED_RESCHED already set from new task 1901 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1857,16 +1904,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1857 if (test_tsk_need_resched(curr)) 1904 if (test_tsk_need_resched(curr))
1858 return; 1905 return;
1859 1906
1907 /* Idle tasks are by definition preempted by non-idle tasks. */
1908 if (unlikely(curr->policy == SCHED_IDLE) &&
1909 likely(p->policy != SCHED_IDLE))
1910 goto preempt;
1911
1860 /* 1912 /*
1861 * Batch and idle tasks do not preempt (their preemption is driven by 1913 * Batch and idle tasks do not preempt non-idle tasks (their preemption
1862 * the tick): 1914 * is driven by the tick):
1863 */ 1915 */
1864 if (unlikely(p->policy != SCHED_NORMAL)) 1916 if (unlikely(p->policy != SCHED_NORMAL))
1865 return; 1917 return;
1866 1918
1867 /* Idle tasks are by definition preempted by everybody. */
1868 if (unlikely(curr->policy == SCHED_IDLE))
1869 goto preempt;
1870 1919
1871 if (!sched_feat(WAKEUP_PREEMPT)) 1920 if (!sched_feat(WAKEUP_PREEMPT))
1872 return; 1921 return;
@@ -1874,8 +1923,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1874 update_curr(cfs_rq); 1923 update_curr(cfs_rq);
1875 find_matching_se(&se, &pse); 1924 find_matching_se(&se, &pse);
1876 BUG_ON(!pse); 1925 BUG_ON(!pse);
1877 if (wakeup_preempt_entity(se, pse) == 1) 1926 if (wakeup_preempt_entity(se, pse) == 1) {
1927 /*
1928 * Bias pick_next to pick the sched entity that is
1929 * triggering this preemption.
1930 */
1931 if (!next_buddy_marked)
1932 set_next_buddy(pse);
1878 goto preempt; 1933 goto preempt;
1934 }
1879 1935
1880 return; 1936 return;
1881 1937
@@ -1932,6 +1988,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1932 } 1988 }
1933} 1989}
1934 1990
1991/*
1992 * sched_yield() is very simple
1993 *
1994 * The magic of dealing with the ->skip buddy is in pick_next_entity.
1995 */
1996static void yield_task_fair(struct rq *rq)
1997{
1998 struct task_struct *curr = rq->curr;
1999 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
2000 struct sched_entity *se = &curr->se;
2001
2002 /*
2003 * Are we the only task in the tree?
2004 */
2005 if (unlikely(rq->nr_running == 1))
2006 return;
2007
2008 clear_buddies(cfs_rq, se);
2009
2010 if (curr->policy != SCHED_BATCH) {
2011 update_rq_clock(rq);
2012 /*
2013 * Update run-time statistics of the 'current'.
2014 */
2015 update_curr(cfs_rq);
2016 }
2017
2018 set_skip_buddy(se);
2019}
2020
2021static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
2022{
2023 struct sched_entity *se = &p->se;
2024
2025 if (!se->on_rq)
2026 return false;
2027
2028 /* Tell the scheduler that we'd really like pse to run next. */
2029 set_next_buddy(se);
2030
2031 yield_task_fair(rq);
2032
2033 return true;
2034}
2035
1935#ifdef CONFIG_SMP 2036#ifdef CONFIG_SMP
1936/************************************************** 2037/**************************************************
1937 * Fair scheduling class load-balancing methods: 2038 * Fair scheduling class load-balancing methods:
@@ -2041,23 +2142,22 @@ static unsigned long
2041balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2142balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2042 unsigned long max_load_move, struct sched_domain *sd, 2143 unsigned long max_load_move, struct sched_domain *sd,
2043 enum cpu_idle_type idle, int *all_pinned, 2144 enum cpu_idle_type idle, int *all_pinned,
2044 int *this_best_prio, struct cfs_rq *busiest_cfs_rq) 2145 struct cfs_rq *busiest_cfs_rq)
2045{ 2146{
2046 int loops = 0, pulled = 0, pinned = 0; 2147 int loops = 0, pulled = 0;
2047 long rem_load_move = max_load_move; 2148 long rem_load_move = max_load_move;
2048 struct task_struct *p, *n; 2149 struct task_struct *p, *n;
2049 2150
2050 if (max_load_move == 0) 2151 if (max_load_move == 0)
2051 goto out; 2152 goto out;
2052 2153
2053 pinned = 1;
2054
2055 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 2154 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
2056 if (loops++ > sysctl_sched_nr_migrate) 2155 if (loops++ > sysctl_sched_nr_migrate)
2057 break; 2156 break;
2058 2157
2059 if ((p->se.load.weight >> 1) > rem_load_move || 2158 if ((p->se.load.weight >> 1) > rem_load_move ||
2060 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) 2159 !can_migrate_task(p, busiest, this_cpu, sd, idle,
2160 all_pinned))
2061 continue; 2161 continue;
2062 2162
2063 pull_task(busiest, p, this_rq, this_cpu); 2163 pull_task(busiest, p, this_rq, this_cpu);
@@ -2080,9 +2180,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2080 */ 2180 */
2081 if (rem_load_move <= 0) 2181 if (rem_load_move <= 0)
2082 break; 2182 break;
2083
2084 if (p->prio < *this_best_prio)
2085 *this_best_prio = p->prio;
2086 } 2183 }
2087out: 2184out:
2088 /* 2185 /*
@@ -2092,9 +2189,6 @@ out:
2092 */ 2189 */
2093 schedstat_add(sd, lb_gained[idle], pulled); 2190 schedstat_add(sd, lb_gained[idle], pulled);
2094 2191
2095 if (all_pinned)
2096 *all_pinned = pinned;
2097
2098 return max_load_move - rem_load_move; 2192 return max_load_move - rem_load_move;
2099} 2193}
2100 2194
@@ -2123,7 +2217,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
2123 * We need to update shares after updating tg->load_weight in 2217 * We need to update shares after updating tg->load_weight in
2124 * order to adjust the weight of groups with long running tasks. 2218 * order to adjust the weight of groups with long running tasks.
2125 */ 2219 */
2126 update_cfs_shares(cfs_rq, 0); 2220 update_cfs_shares(cfs_rq);
2127 2221
2128 raw_spin_unlock_irqrestore(&rq->lock, flags); 2222 raw_spin_unlock_irqrestore(&rq->lock, flags);
2129 2223
@@ -2145,7 +2239,7 @@ static unsigned long
2145load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2239load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2146 unsigned long max_load_move, 2240 unsigned long max_load_move,
2147 struct sched_domain *sd, enum cpu_idle_type idle, 2241 struct sched_domain *sd, enum cpu_idle_type idle,
2148 int *all_pinned, int *this_best_prio) 2242 int *all_pinned)
2149{ 2243{
2150 long rem_load_move = max_load_move; 2244 long rem_load_move = max_load_move;
2151 int busiest_cpu = cpu_of(busiest); 2245 int busiest_cpu = cpu_of(busiest);
@@ -2170,7 +2264,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2170 rem_load = div_u64(rem_load, busiest_h_load + 1); 2264 rem_load = div_u64(rem_load, busiest_h_load + 1);
2171 2265
2172 moved_load = balance_tasks(this_rq, this_cpu, busiest, 2266 moved_load = balance_tasks(this_rq, this_cpu, busiest,
2173 rem_load, sd, idle, all_pinned, this_best_prio, 2267 rem_load, sd, idle, all_pinned,
2174 busiest_cfs_rq); 2268 busiest_cfs_rq);
2175 2269
2176 if (!moved_load) 2270 if (!moved_load)
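For group scheduling, the per-group budget fed to balance_tasks() is rescaled from the domain-wide figure: the remaining load is multiplied by the group's weight on the busiest CPU (that multiplication sits just above the quoted hunk and is stated here as an assumption) and divided by its hierarchical load, with the +1 guarding against a zero divisor. Worked with illustrative numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t rem_load_move  = 2048;	/* weighted load still to pull, domain units */
	uint64_t busiest_weight = 1024;	/* raw weight of the group's cfs_rq on that CPU */
	uint64_t busiest_h_load = 2048;	/* its hierarchical (domain-level) load */

	uint64_t rem_load = rem_load_move * busiest_weight;
	rem_load /= busiest_h_load + 1;	/* as in the div_u64() above */

	/* ~1023: in-group weight worth roughly the whole 2048 domain budget */
	printf("per-group budget: %llu\n", (unsigned long long)rem_load);
	return 0;
}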
@@ -2196,11 +2290,11 @@ static unsigned long
2196load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2290load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2197 unsigned long max_load_move, 2291 unsigned long max_load_move,
2198 struct sched_domain *sd, enum cpu_idle_type idle, 2292 struct sched_domain *sd, enum cpu_idle_type idle,
2199 int *all_pinned, int *this_best_prio) 2293 int *all_pinned)
2200{ 2294{
2201 return balance_tasks(this_rq, this_cpu, busiest, 2295 return balance_tasks(this_rq, this_cpu, busiest,
2202 max_load_move, sd, idle, all_pinned, 2296 max_load_move, sd, idle, all_pinned,
2203 this_best_prio, &busiest->cfs); 2297 &busiest->cfs);
2204} 2298}
2205#endif 2299#endif
2206 2300
@@ -2217,12 +2311,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2217 int *all_pinned) 2311 int *all_pinned)
2218{ 2312{
2219 unsigned long total_load_moved = 0, load_moved; 2313 unsigned long total_load_moved = 0, load_moved;
2220 int this_best_prio = this_rq->curr->prio;
2221 2314
2222 do { 2315 do {
2223 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 2316 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2224 max_load_move - total_load_moved, 2317 max_load_move - total_load_moved,
2225 sd, idle, all_pinned, &this_best_prio); 2318 sd, idle, all_pinned);
2226 2319
2227 total_load_moved += load_moved; 2320 total_load_moved += load_moved;
2228 2321
@@ -2477,7 +2570,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2477 2570
2478unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 2571unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2479{ 2572{
2480 return SCHED_LOAD_SCALE; 2573 return SCHED_POWER_SCALE;
2481} 2574}
2482 2575
2483unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) 2576unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
@@ -2514,10 +2607,10 @@ unsigned long scale_rt_power(int cpu)
2514 available = total - rq->rt_avg; 2607 available = total - rq->rt_avg;
2515 } 2608 }
2516 2609
2517 if (unlikely((s64)total < SCHED_LOAD_SCALE)) 2610 if (unlikely((s64)total < SCHED_POWER_SCALE))
2518 total = SCHED_LOAD_SCALE; 2611 total = SCHED_POWER_SCALE;
2519 2612
2520 total >>= SCHED_LOAD_SHIFT; 2613 total >>= SCHED_POWER_SHIFT;
2521 2614
2522 return div_u64(available, total); 2615 return div_u64(available, total);
2523} 2616}
@@ -2525,7 +2618,7 @@ unsigned long scale_rt_power(int cpu)
2525static void update_cpu_power(struct sched_domain *sd, int cpu) 2618static void update_cpu_power(struct sched_domain *sd, int cpu)
2526{ 2619{
2527 unsigned long weight = sd->span_weight; 2620 unsigned long weight = sd->span_weight;
2528 unsigned long power = SCHED_LOAD_SCALE; 2621 unsigned long power = SCHED_POWER_SCALE;
2529 struct sched_group *sdg = sd->groups; 2622 struct sched_group *sdg = sd->groups;
2530 2623
2531 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 2624 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
@@ -2534,7 +2627,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2534 else 2627 else
2535 power *= default_scale_smt_power(sd, cpu); 2628 power *= default_scale_smt_power(sd, cpu);
2536 2629
2537 power >>= SCHED_LOAD_SHIFT; 2630 power >>= SCHED_POWER_SHIFT;
2538 } 2631 }
2539 2632
2540 sdg->cpu_power_orig = power; 2633 sdg->cpu_power_orig = power;
@@ -2544,10 +2637,10 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2544 else 2637 else
2545 power *= default_scale_freq_power(sd, cpu); 2638 power *= default_scale_freq_power(sd, cpu);
2546 2639
2547 power >>= SCHED_LOAD_SHIFT; 2640 power >>= SCHED_POWER_SHIFT;
2548 2641
2549 power *= scale_rt_power(cpu); 2642 power *= scale_rt_power(cpu);
2550 power >>= SCHED_LOAD_SHIFT; 2643 power >>= SCHED_POWER_SHIFT;
2551 2644
2552 if (!power) 2645 if (!power)
2553 power = 1; 2646 power = 1;
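The rename from SCHED_LOAD_SCALE/SCHED_LOAD_SHIFT to SCHED_POWER_SCALE/SCHED_POWER_SHIFT gives cpu_power its own fixed-point unit, 1024 meaning one full CPU. The chain in update_cpu_power() is then ordinary Q10 arithmetic: each factor is expressed against 1024 and the running product is shifted back down after every step. A self-contained illustration with made-up factors:

#include <stdio.h>

#define SCHED_POWER_SHIFT	10
#define SCHED_POWER_SCALE	(1UL << SCHED_POWER_SHIFT)

int main(void)
{
	unsigned long power = SCHED_POWER_SCALE;	/* start at one full CPU */
	unsigned long smt_factor  = 589;	/* roughly half of a shared core (illustrative) */
	unsigned long freq_factor = 1024;	/* no frequency scaling */
	unsigned long rt_factor   = 922;	/* ~10% of time eaten by RT/irq work */

	power = (power * smt_factor)  >> SCHED_POWER_SHIFT;
	power = (power * freq_factor) >> SCHED_POWER_SHIFT;
	power = (power * rt_factor)   >> SCHED_POWER_SHIFT;
	if (!power)
		power = 1;			/* never advertise zero capacity */

	printf("cpu_power = %lu of %lu\n", power, SCHED_POWER_SCALE);	/* ~530 of 1024 */
	return 0;
}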
@@ -2589,9 +2682,9 @@ static inline int
2589fix_small_capacity(struct sched_domain *sd, struct sched_group *group) 2682fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2590{ 2683{
2591 /* 2684 /*
2592 * Only siblings can have significantly less than SCHED_LOAD_SCALE 2685 * Only siblings can have significantly less than SCHED_POWER_SCALE
2593 */ 2686 */
2594 if (sd->level != SD_LV_SIBLING) 2687 if (!(sd->flags & SD_SHARE_CPUPOWER))
2595 return 0; 2688 return 0;
2596 2689
2597 /* 2690 /*
@@ -2610,7 +2703,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2610 * @this_cpu: Cpu for which load balance is currently performed. 2703 * @this_cpu: Cpu for which load balance is currently performed.
2611 * @idle: Idle status of this_cpu 2704 * @idle: Idle status of this_cpu
2612 * @load_idx: Load index of sched_domain of this_cpu for load calc. 2705 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2613 * @sd_idle: Idle status of the sched_domain containing group.
2614 * @local_group: Does group contain this_cpu. 2706 * @local_group: Does group contain this_cpu.
2615 * @cpus: Set of cpus considered for load balancing. 2707 * @cpus: Set of cpus considered for load balancing.
2616 * @balance: Should we balance. 2708 * @balance: Should we balance.
@@ -2618,7 +2710,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2618 */ 2710 */
2619static inline void update_sg_lb_stats(struct sched_domain *sd, 2711static inline void update_sg_lb_stats(struct sched_domain *sd,
2620 struct sched_group *group, int this_cpu, 2712 struct sched_group *group, int this_cpu,
2621 enum cpu_idle_type idle, int load_idx, int *sd_idle, 2713 enum cpu_idle_type idle, int load_idx,
2622 int local_group, const struct cpumask *cpus, 2714 int local_group, const struct cpumask *cpus,
2623 int *balance, struct sg_lb_stats *sgs) 2715 int *balance, struct sg_lb_stats *sgs)
2624{ 2716{
@@ -2638,9 +2730,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2638 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 2730 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2639 struct rq *rq = cpu_rq(i); 2731 struct rq *rq = cpu_rq(i);
2640 2732
2641 if (*sd_idle && rq->nr_running)
2642 *sd_idle = 0;
2643
2644 /* Bias balancing toward cpus of our domain */ 2733 /* Bias balancing toward cpus of our domain */
2645 if (local_group) { 2734 if (local_group) {
2646 if (idle_cpu(i) && !first_idle_cpu) { 2735 if (idle_cpu(i) && !first_idle_cpu) {
@@ -2681,11 +2770,11 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2681 } 2770 }
2682 2771
2683 /* Adjust by relative CPU power of the group */ 2772 /* Adjust by relative CPU power of the group */
2684 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 2773 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power;
2685 2774
2686 /* 2775 /*
2687 * Consider the group unbalanced when the imbalance is larger 2776 * Consider the group unbalanced when the imbalance is larger
2688 * than the average weight of two tasks. 2777 * than the average weight of a task.
2689 * 2778 *
2690 * APZ: with cgroup the avg task weight can vary wildly and 2779 * APZ: with cgroup the avg task weight can vary wildly and
2691 * might not be a suitable number - should we keep a 2780 * might not be a suitable number - should we keep a
@@ -2695,10 +2784,11 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2695 if (sgs->sum_nr_running) 2784 if (sgs->sum_nr_running)
2696 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 2785 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2697 2786
2698 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) 2787 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
2699 sgs->group_imb = 1; 2788 sgs->group_imb = 1;
2700 2789
2701 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2790 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power,
2791 SCHED_POWER_SCALE);
2702 if (!sgs->group_capacity) 2792 if (!sgs->group_capacity)
2703 sgs->group_capacity = fix_small_capacity(sd, group); 2793 sgs->group_capacity = fix_small_capacity(sd, group);
2704 sgs->group_weight = group->group_weight; 2794 sgs->group_weight = group->group_weight;
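In the new unit, a group's average load is expressed per SCHED_POWER_SCALE worth of cpu_power, and its capacity is its total cpu_power rounded to the nearest whole CPU. A quick numeric check of the two expressions used above, with DIV_ROUND_CLOSEST simplified to the unsigned case and illustrative values:

#include <stdio.h>

#define SCHED_POWER_SCALE	1024UL
#define DIV_ROUND_CLOSEST(x, d)	(((x) + (d) / 2) / (d))

int main(void)
{
	unsigned long group_load = 3072;	/* summed weighted load of the group */
	unsigned long cpu_power  = 2048;	/* two full CPUs' worth of power */

	unsigned long avg_load = group_load * SCHED_POWER_SCALE / cpu_power;
	unsigned long capacity = DIV_ROUND_CLOSEST(cpu_power, SCHED_POWER_SCALE);

	/* avg_load = 1536 (1.5 nice-0 tasks per unit of power), capacity = 2 */
	printf("avg_load=%lu capacity=%lu\n", avg_load, capacity);
	return 0;
}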
@@ -2755,15 +2845,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
2755 * @sd: sched_domain whose statistics are to be updated. 2845 * @sd: sched_domain whose statistics are to be updated.
2756 * @this_cpu: Cpu for which load balance is currently performed. 2846 * @this_cpu: Cpu for which load balance is currently performed.
2757 * @idle: Idle status of this_cpu 2847 * @idle: Idle status of this_cpu
2758 * @sd_idle: Idle status of the sched_domain containing sg.
2759 * @cpus: Set of cpus considered for load balancing. 2848 * @cpus: Set of cpus considered for load balancing.
2760 * @balance: Should we balance. 2849 * @balance: Should we balance.
2761 * @sds: variable to hold the statistics for this sched_domain. 2850 * @sds: variable to hold the statistics for this sched_domain.
2762 */ 2851 */
2763static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 2852static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2764 enum cpu_idle_type idle, int *sd_idle, 2853 enum cpu_idle_type idle, const struct cpumask *cpus,
2765 const struct cpumask *cpus, int *balance, 2854 int *balance, struct sd_lb_stats *sds)
2766 struct sd_lb_stats *sds)
2767{ 2855{
2768 struct sched_domain *child = sd->child; 2856 struct sched_domain *child = sd->child;
2769 struct sched_group *sg = sd->groups; 2857 struct sched_group *sg = sd->groups;
@@ -2781,7 +2869,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2781 2869
2782 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 2870 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2783 memset(&sgs, 0, sizeof(sgs)); 2871 memset(&sgs, 0, sizeof(sgs));
2784 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, 2872 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
2785 local_group, cpus, balance, &sgs); 2873 local_group, cpus, balance, &sgs);
2786 2874
2787 if (local_group && !(*balance)) 2875 if (local_group && !(*balance))
@@ -2874,7 +2962,7 @@ static int check_asym_packing(struct sched_domain *sd,
2874 return 0; 2962 return 0;
2875 2963
2876 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, 2964 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
2877 SCHED_LOAD_SCALE); 2965 SCHED_POWER_SCALE);
2878 return 1; 2966 return 1;
2879} 2967}
2880 2968
@@ -2903,7 +2991,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2903 cpu_avg_load_per_task(this_cpu); 2991 cpu_avg_load_per_task(this_cpu);
2904 2992
2905 scaled_busy_load_per_task = sds->busiest_load_per_task 2993 scaled_busy_load_per_task = sds->busiest_load_per_task
2906 * SCHED_LOAD_SCALE; 2994 * SCHED_POWER_SCALE;
2907 scaled_busy_load_per_task /= sds->busiest->cpu_power; 2995 scaled_busy_load_per_task /= sds->busiest->cpu_power;
2908 2996
2909 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 2997 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
@@ -2922,10 +3010,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2922 min(sds->busiest_load_per_task, sds->max_load); 3010 min(sds->busiest_load_per_task, sds->max_load);
2923 pwr_now += sds->this->cpu_power * 3011 pwr_now += sds->this->cpu_power *
2924 min(sds->this_load_per_task, sds->this_load); 3012 min(sds->this_load_per_task, sds->this_load);
2925 pwr_now /= SCHED_LOAD_SCALE; 3013 pwr_now /= SCHED_POWER_SCALE;
2926 3014
2927 /* Amount of load we'd subtract */ 3015 /* Amount of load we'd subtract */
2928 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3016 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
2929 sds->busiest->cpu_power; 3017 sds->busiest->cpu_power;
2930 if (sds->max_load > tmp) 3018 if (sds->max_load > tmp)
2931 pwr_move += sds->busiest->cpu_power * 3019 pwr_move += sds->busiest->cpu_power *
@@ -2933,15 +3021,15 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2933 3021
2934 /* Amount of load we'd add */ 3022 /* Amount of load we'd add */
2935 if (sds->max_load * sds->busiest->cpu_power < 3023 if (sds->max_load * sds->busiest->cpu_power <
2936 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 3024 sds->busiest_load_per_task * SCHED_POWER_SCALE)
2937 tmp = (sds->max_load * sds->busiest->cpu_power) / 3025 tmp = (sds->max_load * sds->busiest->cpu_power) /
2938 sds->this->cpu_power; 3026 sds->this->cpu_power;
2939 else 3027 else
2940 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3028 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
2941 sds->this->cpu_power; 3029 sds->this->cpu_power;
2942 pwr_move += sds->this->cpu_power * 3030 pwr_move += sds->this->cpu_power *
2943 min(sds->this_load_per_task, sds->this_load + tmp); 3031 min(sds->this_load_per_task, sds->this_load + tmp);
2944 pwr_move /= SCHED_LOAD_SCALE; 3032 pwr_move /= SCHED_POWER_SCALE;
2945 3033
2946 /* Move if we gain throughput */ 3034 /* Move if we gain throughput */
2947 if (pwr_move > pwr_now) 3035 if (pwr_move > pwr_now)
@@ -2983,7 +3071,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2983 load_above_capacity = (sds->busiest_nr_running - 3071 load_above_capacity = (sds->busiest_nr_running -
2984 sds->busiest_group_capacity); 3072 sds->busiest_group_capacity);
2985 3073
2986 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); 3074 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
2987 3075
2988 load_above_capacity /= sds->busiest->cpu_power; 3076 load_above_capacity /= sds->busiest->cpu_power;
2989 } 3077 }
@@ -3003,11 +3091,11 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3003 /* How much load to actually move to equalise the imbalance */ 3091 /* How much load to actually move to equalise the imbalance */
3004 *imbalance = min(max_pull * sds->busiest->cpu_power, 3092 *imbalance = min(max_pull * sds->busiest->cpu_power,
3005 (sds->avg_load - sds->this_load) * sds->this->cpu_power) 3093 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3006 / SCHED_LOAD_SCALE; 3094 / SCHED_POWER_SCALE;
3007 3095
3008 /* 3096 /*
3009 * if *imbalance is less than the average load per runnable task 3097 * if *imbalance is less than the average load per runnable task
3010 * there is no gaurantee that any tasks will be moved so we'll have 3098 * there is no guarantee that any tasks will be moved so we'll have
3011 * a think about bumping its value to force at least one task to be 3099 * a think about bumping its value to force at least one task to be
3012 * moved 3100 * moved
3013 */ 3101 */
@@ -3033,7 +3121,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3033 * @imbalance: Variable which stores amount of weighted load which should 3121 * @imbalance: Variable which stores amount of weighted load which should
3034 * be moved to restore balance/put a group to idle. 3122 * be moved to restore balance/put a group to idle.
3035 * @idle: The idle status of this_cpu. 3123 * @idle: The idle status of this_cpu.
3036 * @sd_idle: The idleness of sd
3037 * @cpus: The set of CPUs under consideration for load-balancing. 3124 * @cpus: The set of CPUs under consideration for load-balancing.
3038 * @balance: Pointer to a variable indicating if this_cpu 3125 * @balance: Pointer to a variable indicating if this_cpu
3039 * is the appropriate cpu to perform load balancing at this_level. 3126 * is the appropriate cpu to perform load balancing at this_level.
@@ -3046,7 +3133,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3046static struct sched_group * 3133static struct sched_group *
3047find_busiest_group(struct sched_domain *sd, int this_cpu, 3134find_busiest_group(struct sched_domain *sd, int this_cpu,
3048 unsigned long *imbalance, enum cpu_idle_type idle, 3135 unsigned long *imbalance, enum cpu_idle_type idle,
3049 int *sd_idle, const struct cpumask *cpus, int *balance) 3136 const struct cpumask *cpus, int *balance)
3050{ 3137{
3051 struct sd_lb_stats sds; 3138 struct sd_lb_stats sds;
3052 3139
@@ -3056,22 +3143,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3056 * Compute the various statistics relevant for load balancing at 3143 * Compute the various statistics relevant for load balancing at
3057 * this level. 3144 * this level.
3058 */ 3145 */
3059 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, 3146 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
3060 balance, &sds); 3147
3061 3148 /*
3062 /* Cases where imbalance does not exist from POV of this_cpu */ 3149 * this_cpu is not the appropriate cpu to perform load balancing at
3063 /* 1) this_cpu is not the appropriate cpu to perform load balancing 3150 * this level.
3064 * at this level.
3065 * 2) There is no busy sibling group to pull from.
3066 * 3) This group is the busiest group.
3067 * 4) This group is more busy than the avg busieness at this
3068 * sched_domain.
3069 * 5) The imbalance is within the specified limit.
3070 *
3071 * Note: when doing newidle balance, if the local group has excess
3072 * capacity (i.e. nr_running < group_capacity) and the busiest group
3073 * does not have any capacity, we force a load balance to pull tasks
3074 * to the local group. In this case, we skip past checks 3, 4 and 5.
3075 */ 3151 */
3076 if (!(*balance)) 3152 if (!(*balance))
3077 goto ret; 3153 goto ret;
@@ -3080,41 +3156,56 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3080 check_asym_packing(sd, &sds, this_cpu, imbalance)) 3156 check_asym_packing(sd, &sds, this_cpu, imbalance))
3081 return sds.busiest; 3157 return sds.busiest;
3082 3158
3159 /* There is no busy sibling group to pull tasks from */
3083 if (!sds.busiest || sds.busiest_nr_running == 0) 3160 if (!sds.busiest || sds.busiest_nr_running == 0)
3084 goto out_balanced; 3161 goto out_balanced;
3085 3162
3086 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 3163 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
3164
3165 /*
3166 * If the busiest group is imbalanced the below checks don't
3167 * work because they assume all things are equal, which typically
3168 * isn't true due to cpus_allowed constraints and the like.
3169 */
3170 if (sds.group_imb)
3171 goto force_balance;
3172
3173 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
3087 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 3174 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
3088 !sds.busiest_has_capacity) 3175 !sds.busiest_has_capacity)
3089 goto force_balance; 3176 goto force_balance;
3090 3177
3178 /*
3179 * If the local group is more busy than the selected busiest group
3180 * don't try and pull any tasks.
3181 */
3091 if (sds.this_load >= sds.max_load) 3182 if (sds.this_load >= sds.max_load)
3092 goto out_balanced; 3183 goto out_balanced;
3093 3184
3094 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; 3185 /*
3095 3186 * Don't pull any tasks if this group is already above the domain
3187 * average load.
3188 */
3096 if (sds.this_load >= sds.avg_load) 3189 if (sds.this_load >= sds.avg_load)
3097 goto out_balanced; 3190 goto out_balanced;
3098 3191
3099 /* 3192 if (idle == CPU_IDLE) {
3100 * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
3101 * And to check for busy balance use !idle_cpu instead of
3102 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
3103 * even when they are idle.
3104 */
3105 if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
3106 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3107 goto out_balanced;
3108 } else {
3109 /* 3193 /*
3110 * This cpu is idle. If the busiest group load doesn't 3194 * This cpu is idle. If the busiest group load doesn't
3111 * have more tasks than the number of available cpu's and 3195 * have more tasks than the number of available cpu's and
3112 * there is no imbalance between this and busiest group 3196 * there is no imbalance between this and busiest group
3113 * wrt idle cpu's, it is balanced. 3197 * wrt idle cpu's, it is balanced.
3114 */ 3198 */
3115 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && 3199 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
3116 sds.busiest_nr_running <= sds.busiest_group_weight) 3200 sds.busiest_nr_running <= sds.busiest_group_weight)
3117 goto out_balanced; 3201 goto out_balanced;
3202 } else {
3203 /*
3204 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
3205 * imbalance_pct to be conservative.
3206 */
3207 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3208 goto out_balanced;
3118 } 3209 }
3119 3210
3120force_balance: 3211force_balance:
@@ -3148,7 +3239,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3148 3239
3149 for_each_cpu(i, sched_group_cpus(group)) { 3240 for_each_cpu(i, sched_group_cpus(group)) {
3150 unsigned long power = power_of(i); 3241 unsigned long power = power_of(i);
3151 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 3242 unsigned long capacity = DIV_ROUND_CLOSEST(power,
3243 SCHED_POWER_SCALE);
3152 unsigned long wl; 3244 unsigned long wl;
3153 3245
3154 if (!capacity) 3246 if (!capacity)
@@ -3173,7 +3265,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3173 * the load can be moved away from the cpu that is potentially 3265 * the load can be moved away from the cpu that is potentially
3174 * running at a lower capacity. 3266 * running at a lower capacity.
3175 */ 3267 */
3176 wl = (wl * SCHED_LOAD_SCALE) / power; 3268 wl = (wl * SCHED_POWER_SCALE) / power;
3177 3269
3178 if (wl > max_load) { 3270 if (wl > max_load) {
3179 max_load = wl; 3271 max_load = wl;
@@ -3193,7 +3285,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3193/* Working cpumask for load_balance and load_balance_newidle. */ 3285/* Working cpumask for load_balance and load_balance_newidle. */
3194static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 3286static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3195 3287
3196static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, 3288static int need_active_balance(struct sched_domain *sd, int idle,
3197 int busiest_cpu, int this_cpu) 3289 int busiest_cpu, int this_cpu)
3198{ 3290{
3199 if (idle == CPU_NEWLY_IDLE) { 3291 if (idle == CPU_NEWLY_IDLE) {
@@ -3225,10 +3317,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
3225 * move_tasks() will succeed. ld_moved will be true and this 3317 * move_tasks() will succeed. ld_moved will be true and this
3226 * active balance code will not be triggered. 3318 * active balance code will not be triggered.
3227 */ 3319 */
3228 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3229 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3230 return 0;
3231
3232 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) 3320 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
3233 return 0; 3321 return 0;
3234 } 3322 }
@@ -3246,7 +3334,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3246 struct sched_domain *sd, enum cpu_idle_type idle, 3334 struct sched_domain *sd, enum cpu_idle_type idle,
3247 int *balance) 3335 int *balance)
3248{ 3336{
3249 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3337 int ld_moved, all_pinned = 0, active_balance = 0;
3250 struct sched_group *group; 3338 struct sched_group *group;
3251 unsigned long imbalance; 3339 unsigned long imbalance;
3252 struct rq *busiest; 3340 struct rq *busiest;
@@ -3255,20 +3343,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3255 3343
3256 cpumask_copy(cpus, cpu_active_mask); 3344 cpumask_copy(cpus, cpu_active_mask);
3257 3345
3258 /*
3259 * When power savings policy is enabled for the parent domain, idle
3260 * sibling can pick up load irrespective of busy siblings. In this case,
3261 * let the state of idle sibling percolate up as CPU_IDLE, instead of
3262 * portraying it as CPU_NOT_IDLE.
3263 */
3264 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3265 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3266 sd_idle = 1;
3267
3268 schedstat_inc(sd, lb_count[idle]); 3346 schedstat_inc(sd, lb_count[idle]);
3269 3347
3270redo: 3348redo:
3271 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3349 group = find_busiest_group(sd, this_cpu, &imbalance, idle,
3272 cpus, balance); 3350 cpus, balance);
3273 3351
3274 if (*balance == 0) 3352 if (*balance == 0)
@@ -3297,6 +3375,7 @@ redo:
3297 * still unbalanced. ld_moved simply stays zero, so it is 3375 * still unbalanced. ld_moved simply stays zero, so it is
3298 * correctly treated as an imbalance. 3376 * correctly treated as an imbalance.
3299 */ 3377 */
3378 all_pinned = 1;
3300 local_irq_save(flags); 3379 local_irq_save(flags);
3301 double_rq_lock(this_rq, busiest); 3380 double_rq_lock(this_rq, busiest);
3302 ld_moved = move_tasks(this_rq, this_cpu, busiest, 3381 ld_moved = move_tasks(this_rq, this_cpu, busiest,
@@ -3330,8 +3409,7 @@ redo:
3330 if (idle != CPU_NEWLY_IDLE) 3409 if (idle != CPU_NEWLY_IDLE)
3331 sd->nr_balance_failed++; 3410 sd->nr_balance_failed++;
3332 3411
3333 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), 3412 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
3334 this_cpu)) {
3335 raw_spin_lock_irqsave(&busiest->lock, flags); 3413 raw_spin_lock_irqsave(&busiest->lock, flags);
3336 3414
3337 /* don't kick the active_load_balance_cpu_stop, 3415 /* don't kick the active_load_balance_cpu_stop,
@@ -3386,10 +3464,6 @@ redo:
3386 sd->balance_interval *= 2; 3464 sd->balance_interval *= 2;
3387 } 3465 }
3388 3466
3389 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3390 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3391 ld_moved = -1;
3392
3393 goto out; 3467 goto out;
3394 3468
3395out_balanced: 3469out_balanced:
@@ -3403,11 +3477,7 @@ out_one_pinned:
3403 (sd->balance_interval < sd->max_interval)) 3477 (sd->balance_interval < sd->max_interval))
3404 sd->balance_interval *= 2; 3478 sd->balance_interval *= 2;
3405 3479
3406 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3480 ld_moved = 0;
3407 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3408 ld_moved = -1;
3409 else
3410 ld_moved = 0;
3411out: 3481out:
3412 return ld_moved; 3482 return ld_moved;
3413} 3483}
@@ -3433,6 +3503,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3433 raw_spin_unlock(&this_rq->lock); 3503 raw_spin_unlock(&this_rq->lock);
3434 3504
3435 update_shares(this_cpu); 3505 update_shares(this_cpu);
3506 rcu_read_lock();
3436 for_each_domain(this_cpu, sd) { 3507 for_each_domain(this_cpu, sd) {
3437 unsigned long interval; 3508 unsigned long interval;
3438 int balance = 1; 3509 int balance = 1;
@@ -3454,6 +3525,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3454 break; 3525 break;
3455 } 3526 }
3456 } 3527 }
3528 rcu_read_unlock();
3457 3529
3458 raw_spin_lock(&this_rq->lock); 3530 raw_spin_lock(&this_rq->lock);
3459 3531
@@ -3502,6 +3574,7 @@ static int active_load_balance_cpu_stop(void *data)
3502 double_lock_balance(busiest_rq, target_rq); 3574 double_lock_balance(busiest_rq, target_rq);
3503 3575
3504 /* Search for an sd spanning us and the target CPU. */ 3576 /* Search for an sd spanning us and the target CPU. */
3577 rcu_read_lock();
3505 for_each_domain(target_cpu, sd) { 3578 for_each_domain(target_cpu, sd) {
3506 if ((sd->flags & SD_LOAD_BALANCE) && 3579 if ((sd->flags & SD_LOAD_BALANCE) &&
3507 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) 3580 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3517,6 +3590,7 @@ static int active_load_balance_cpu_stop(void *data)
3517 else 3590 else
3518 schedstat_inc(sd, alb_failed); 3591 schedstat_inc(sd, alb_failed);
3519 } 3592 }
3593 rcu_read_unlock();
3520 double_unlock_balance(busiest_rq, target_rq); 3594 double_unlock_balance(busiest_rq, target_rq);
3521out_unlock: 3595out_unlock:
3522 busiest_rq->active_balance = 0; 3596 busiest_rq->active_balance = 0;
@@ -3643,6 +3717,7 @@ static int find_new_ilb(int cpu)
3643{ 3717{
3644 struct sched_domain *sd; 3718 struct sched_domain *sd;
3645 struct sched_group *ilb_group; 3719 struct sched_group *ilb_group;
3720 int ilb = nr_cpu_ids;
3646 3721
3647 /* 3722 /*
3648 * Have idle load balancer selection from semi-idle packages only 3723 * Have idle load balancer selection from semi-idle packages only
@@ -3658,20 +3733,25 @@ static int find_new_ilb(int cpu)
3658 if (cpumask_weight(nohz.idle_cpus_mask) < 2) 3733 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3659 goto out_done; 3734 goto out_done;
3660 3735
3736 rcu_read_lock();
3661 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3737 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3662 ilb_group = sd->groups; 3738 ilb_group = sd->groups;
3663 3739
3664 do { 3740 do {
3665 if (is_semi_idle_group(ilb_group)) 3741 if (is_semi_idle_group(ilb_group)) {
3666 return cpumask_first(nohz.grp_idle_mask); 3742 ilb = cpumask_first(nohz.grp_idle_mask);
3743 goto unlock;
3744 }
3667 3745
3668 ilb_group = ilb_group->next; 3746 ilb_group = ilb_group->next;
3669 3747
3670 } while (ilb_group != sd->groups); 3748 } while (ilb_group != sd->groups);
3671 } 3749 }
3750unlock:
3751 rcu_read_unlock();
3672 3752
3673out_done: 3753out_done:
3674 return nr_cpu_ids; 3754 return ilb;
3675} 3755}
3676#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3756#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3677static inline int find_new_ilb(int call_cpu) 3757static inline int find_new_ilb(int call_cpu)
@@ -3786,6 +3866,17 @@ void select_nohz_load_balancer(int stop_tick)
3786 3866
3787static DEFINE_SPINLOCK(balancing); 3867static DEFINE_SPINLOCK(balancing);
3788 3868
3869static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3870
3871/*
3872 * Scale the max load_balance interval with the number of CPUs in the system.
3873 * This trades load-balance latency on larger machines for less cross talk.
3874 */
3875static void update_max_interval(void)
3876{
3877 max_load_balance_interval = HZ*num_online_cpus()/10;
3878}
3879
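update_max_interval() replaces the old hard NR_CPUS-based cap on a domain's balance interval with one that tracks the CPUs actually online, and the next hunk clamps each interval into [1, max_load_balance_interval] jiffies. A quick numeric check, assuming HZ=1000 and 8 online CPUs (clamp() is simplified here):

#include <stdio.h>

#define HZ 1000UL
#define clamp(v, lo, hi)	((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))

int main(void)
{
	unsigned long online = 8;
	unsigned long max_interval = HZ * online / 10;	/* 800 jiffies = 0.8s */
	unsigned long interval = 0;			/* msecs_to_jiffies() rounded down to 0 */

	printf("%lu\n", clamp(interval, 1UL, max_interval));	/* 1 */
	interval = 5000;
	printf("%lu\n", clamp(interval, 1UL, max_interval));	/* 800 */
	return 0;
}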
3789/* 3880/*
3790 * It checks each scheduling domain to see if it is due to be balanced, 3881 * It checks each scheduling domain to see if it is due to be balanced,
3791 * and initiates a balancing operation if so. 3882 * and initiates a balancing operation if so.
@@ -3805,6 +3896,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3805 3896
3806 update_shares(cpu); 3897 update_shares(cpu);
3807 3898
3899 rcu_read_lock();
3808 for_each_domain(cpu, sd) { 3900 for_each_domain(cpu, sd) {
3809 if (!(sd->flags & SD_LOAD_BALANCE)) 3901 if (!(sd->flags & SD_LOAD_BALANCE))
3810 continue; 3902 continue;
@@ -3815,10 +3907,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3815 3907
3816 /* scale ms to jiffies */ 3908 /* scale ms to jiffies */
3817 interval = msecs_to_jiffies(interval); 3909 interval = msecs_to_jiffies(interval);
3818 if (unlikely(!interval)) 3910 interval = clamp(interval, 1UL, max_load_balance_interval);
3819 interval = 1;
3820 if (interval > HZ*NR_CPUS/10)
3821 interval = HZ*NR_CPUS/10;
3822 3911
3823 need_serialize = sd->flags & SD_SERIALIZE; 3912 need_serialize = sd->flags & SD_SERIALIZE;
3824 3913
@@ -3831,8 +3920,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3831 if (load_balance(cpu, rq, sd, idle, &balance)) { 3920 if (load_balance(cpu, rq, sd, idle, &balance)) {
3832 /* 3921 /*
3833 * We've pulled tasks over so either we're no 3922 * We've pulled tasks over so either we're no
3834 * longer idle, or one of our SMT siblings is 3923 * longer idle.
3835 * not idle.
3836 */ 3924 */
3837 idle = CPU_NOT_IDLE; 3925 idle = CPU_NOT_IDLE;
3838 } 3926 }
@@ -3854,6 +3942,7 @@ out:
3854 if (!balance) 3942 if (!balance)
3855 break; 3943 break;
3856 } 3944 }
3945 rcu_read_unlock();
3857 3946
3858 /* 3947 /*
3859 * next_balance will be updated only when there is a need. 3948 * next_balance will be updated only when there is a need.
@@ -4079,33 +4168,62 @@ static void task_fork_fair(struct task_struct *p)
4079 * Priority of the task has changed. Check to see if we preempt 4168 * Priority of the task has changed. Check to see if we preempt
4080 * the current task. 4169 * the current task.
4081 */ 4170 */
4082static void prio_changed_fair(struct rq *rq, struct task_struct *p, 4171static void
4083 int oldprio, int running) 4172prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
4084{ 4173{
4174 if (!p->se.on_rq)
4175 return;
4176
4085 /* 4177 /*
4086 * Reschedule if we are currently running on this runqueue and 4178 * Reschedule if we are currently running on this runqueue and
4087 * our priority decreased, or if we are not currently running on 4179 * our priority decreased, or if we are not currently running on
4088 * this runqueue and our priority is higher than the current's 4180 * this runqueue and our priority is higher than the current's
4089 */ 4181 */
4090 if (running) { 4182 if (rq->curr == p) {
4091 if (p->prio > oldprio) 4183 if (p->prio > oldprio)
4092 resched_task(rq->curr); 4184 resched_task(rq->curr);
4093 } else 4185 } else
4094 check_preempt_curr(rq, p, 0); 4186 check_preempt_curr(rq, p, 0);
4095} 4187}
4096 4188
4189static void switched_from_fair(struct rq *rq, struct task_struct *p)
4190{
4191 struct sched_entity *se = &p->se;
4192 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4193
4194 /*
4195 * Ensure the task's vruntime is normalized, so that when its
4196 * switched back to the fair class the enqueue_entity(.flags=0) will
4197 * do the right thing.
4198 *
4199 * If it was on_rq, then the dequeue_entity(.flags=0) will already
4200 * have normalized the vruntime, if it was !on_rq, then only when
4201 * the task is sleeping will it still have non-normalized vruntime.
4202 */
4203 if (!se->on_rq && p->state != TASK_RUNNING) {
4204 /*
4205 * Fix up our vruntime so that the current sleep doesn't
4206 * cause 'unlimited' sleep bonus.
4207 */
4208 place_entity(cfs_rq, se, 0);
4209 se->vruntime -= cfs_rq->min_vruntime;
4210 }
4211}
4212
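switched_from_fair() keeps a task's vruntime normalized while it is away from the fair class: queued entities carry an absolute vruntime relative to their cfs_rq's min_vruntime, while sleeping or migrating ones carry only the delta, which is re-based on the next enqueue. A self-contained illustration of that invariant with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long min_vruntime = 1000000;	/* per-cfs_rq baseline */
	unsigned long long se_vruntime  = 1000500;	/* task slightly behind the leader */

	/* dequeue (or switched_from_fair() for a sleeper): keep only the delta */
	se_vruntime -= min_vruntime;

	/* ... the task sleeps while min_vruntime advances on that queue ... */
	min_vruntime += 250000;

	/* enqueue back into the fair class: re-base against the new floor */
	se_vruntime += min_vruntime;

	printf("requeued vruntime = %llu\n", se_vruntime);	/* 1250500 */
	return 0;
}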
4097/* 4213/*
4098 * We switched to the sched_fair class. 4214 * We switched to the sched_fair class.
4099 */ 4215 */
4100static void switched_to_fair(struct rq *rq, struct task_struct *p, 4216static void switched_to_fair(struct rq *rq, struct task_struct *p)
4101 int running)
4102{ 4217{
4218 if (!p->se.on_rq)
4219 return;
4220
4103 /* 4221 /*
4104 * We were most likely switched from sched_rt, so 4222 * We were most likely switched from sched_rt, so
4105 * kick off the schedule if running, otherwise just see 4223 * kick off the schedule if running, otherwise just see
4106 * if we can still preempt the current task. 4224 * if we can still preempt the current task.
4107 */ 4225 */
4108 if (running) 4226 if (rq->curr == p)
4109 resched_task(rq->curr); 4227 resched_task(rq->curr);
4110 else 4228 else
4111 check_preempt_curr(rq, p, 0); 4229 check_preempt_curr(rq, p, 0);
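Every scheduling class in this patch loses the running argument from its prio_changed/switched_from/switched_to hooks; a hook now returns early when the task is not queued and derives "currently running" from rq->curr == p, exactly as prio_changed_fair() above does. A skeleton of the resulting callback shape, shown as a fragment rather than a complete sched_class:

static void
prio_changed_example(struct rq *rq, struct task_struct *p, int oldprio)
{
	if (!p->se.on_rq)		/* not queued: nothing to re-evaluate */
		return;

	if (rq->curr == p) {		/* replaces the old 'running' flag */
		if (p->prio > oldprio)	/* deboosted while running: let others in */
			resched_task(rq->curr);
	} else {
		check_preempt_curr(rq, p, 0);
	}
}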
@@ -4171,6 +4289,7 @@ static const struct sched_class fair_sched_class = {
4171 .enqueue_task = enqueue_task_fair, 4289 .enqueue_task = enqueue_task_fair,
4172 .dequeue_task = dequeue_task_fair, 4290 .dequeue_task = dequeue_task_fair,
4173 .yield_task = yield_task_fair, 4291 .yield_task = yield_task_fair,
4292 .yield_to_task = yield_to_task_fair,
4174 4293
4175 .check_preempt_curr = check_preempt_wakeup, 4294 .check_preempt_curr = check_preempt_wakeup,
4176 4295
@@ -4191,6 +4310,7 @@ static const struct sched_class fair_sched_class = {
4191 .task_fork = task_fork_fair, 4310 .task_fork = task_fork_fair,
4192 4311
4193 .prio_changed = prio_changed_fair, 4312 .prio_changed = prio_changed_fair,
4313 .switched_from = switched_from_fair,
4194 .switched_to = switched_to_fair, 4314 .switched_to = switched_to_fair,
4195 4315
4196 .get_rr_interval = get_rr_interval_fair, 4316 .get_rr_interval = get_rr_interval_fair,
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 68e69acc29b9..be40f7371ee1 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1)
64 * Decrement CPU power based on irq activity 64 * Decrement CPU power based on irq activity
65 */ 65 */
66SCHED_FEAT(NONIRQ_POWER, 1) 66SCHED_FEAT(NONIRQ_POWER, 1)
67
68/*
69 * Queue remote wakeups on the target CPU and process them
70 * using the scheduler IPI. Reduces rq->lock contention/bounces.
71 */
72SCHED_FEAT(TTWU_QUEUE, 1)
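SCHED_FEAT() entries are runtime-toggleable bits consulted with sched_feat(NAME), so TTWU_QUEUE can be switched off if queued remote wakeups misbehave. A hedged sketch of how the wakeup path is expected to consult it; the ttwu_queue_remote() helper name is an assumption about the companion kernel/sched.c change, which is not part of this diff:

	if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
		/* hand the wakeup to the target CPU via the scheduler IPI
		 * instead of taking its rq->lock from here */
		ttwu_queue_remote(p, cpu);	/* assumed helper name */
		return;
	}
	/* otherwise lock the remote runqueue and finish the wakeup locally */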
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f402c87c..0a51882534ea 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -7,7 +7,7 @@
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 10select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
11{ 11{
12 return task_cpu(p); /* IDLE tasks are never migrated */ 12 return task_cpu(p); /* IDLE tasks are never migrated */
13} 13}
@@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq)
52{ 52{
53} 53}
54 54
55static void switched_to_idle(struct rq *rq, struct task_struct *p, 55static void switched_to_idle(struct rq *rq, struct task_struct *p)
56 int running)
57{ 56{
58 /* Can this actually happen?? */ 57 BUG();
59 if (running)
60 resched_task(rq->curr);
61 else
62 check_preempt_curr(rq, p, 0);
63} 58}
64 59
65static void prio_changed_idle(struct rq *rq, struct task_struct *p, 60static void
66 int oldprio, int running) 61prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
67{ 62{
68 /* This can happen for hot plug CPUS */ 63 BUG();
69
70 /*
71 * Reschedule if we are currently running on this runqueue and
72 * our priority decreased, or if we are not currently running on
73 * this runqueue and our priority is higher than the current's
74 */
75 if (running) {
76 if (p->prio > oldprio)
77 resched_task(rq->curr);
78 } else
79 check_preempt_curr(rq, p, 0);
80} 64}
81 65
82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 66static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
@@ -110,6 +94,4 @@ static const struct sched_class idle_sched_class = {
110 94
111 .prio_changed = prio_changed_idle, 95 .prio_changed = prio_changed_idle,
112 .switched_to = switched_to_idle, 96 .switched_to = switched_to_idle,
113
114 /* no .task_new for idle tasks */
115}; 97};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index ad6267714c84..64b2a37c07d0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186typedef struct task_group *rt_rq_iter_t;
187
188#define for_each_rt_rq(rt_rq, iter, rq) \
189 for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \
190 (&iter->list != &task_groups) && \
191 (rt_rq = iter->rt_rq[cpu_of(rq)]); \
192 iter = list_entry_rcu(iter->list.next, typeof(*iter), list))
193
186static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 194static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
187{ 195{
188 list_add_rcu(&rt_rq->leaf_rt_rq_list, 196 list_add_rcu(&rt_rq->leaf_rt_rq_list,
@@ -210,11 +218,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
210 218
211static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 219static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212{ 220{
213 int this_cpu = smp_processor_id();
214 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 221 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
215 struct sched_rt_entity *rt_se; 222 struct sched_rt_entity *rt_se;
216 223
217 rt_se = rt_rq->tg->rt_se[this_cpu]; 224 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
225
226 rt_se = rt_rq->tg->rt_se[cpu];
218 227
219 if (rt_rq->rt_nr_running) { 228 if (rt_rq->rt_nr_running) {
220 if (rt_se && !on_rt_rq(rt_se)) 229 if (rt_se && !on_rt_rq(rt_se))
@@ -226,10 +235,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
226 235
227static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 236static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
228{ 237{
229 int this_cpu = smp_processor_id();
230 struct sched_rt_entity *rt_se; 238 struct sched_rt_entity *rt_se;
239 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
231 240
232 rt_se = rt_rq->tg->rt_se[this_cpu]; 241 rt_se = rt_rq->tg->rt_se[cpu];
233 242
234 if (rt_se && on_rt_rq(rt_se)) 243 if (rt_se && on_rt_rq(rt_se))
235 dequeue_rt_entity(rt_se); 244 dequeue_rt_entity(rt_se);
@@ -287,6 +296,11 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
287 return ktime_to_ns(def_rt_bandwidth.rt_period); 296 return ktime_to_ns(def_rt_bandwidth.rt_period);
288} 297}
289 298
299typedef struct rt_rq *rt_rq_iter_t;
300
301#define for_each_rt_rq(rt_rq, iter, rq) \
302 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
303
290static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 304static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
291{ 305{
292} 306}
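The two for_each_rt_rq() definitions above give one iterator for both configurations: with RT_GROUP_SCHED it walks every task group's rt_rq for this runqueue under RCU, and without it it visits just &rq->rt once. The __disable_runtime()/__enable_runtime() conversions below rely on this, since the old leaf list only tracked rt_rqs that currently had queued entities. Usage follows the pattern those hunks show:

	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	for_each_rt_rq(rt_rq, iter, rq) {
		/* visits every rt_rq attached to 'rq', queued or not */
	}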
@@ -401,12 +415,13 @@ next:
401static void __disable_runtime(struct rq *rq) 415static void __disable_runtime(struct rq *rq)
402{ 416{
403 struct root_domain *rd = rq->rd; 417 struct root_domain *rd = rq->rd;
418 rt_rq_iter_t iter;
404 struct rt_rq *rt_rq; 419 struct rt_rq *rt_rq;
405 420
406 if (unlikely(!scheduler_running)) 421 if (unlikely(!scheduler_running))
407 return; 422 return;
408 423
409 for_each_leaf_rt_rq(rt_rq, rq) { 424 for_each_rt_rq(rt_rq, iter, rq) {
410 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 425 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
411 s64 want; 426 s64 want;
412 int i; 427 int i;
@@ -486,6 +501,7 @@ static void disable_runtime(struct rq *rq)
486 501
487static void __enable_runtime(struct rq *rq) 502static void __enable_runtime(struct rq *rq)
488{ 503{
504 rt_rq_iter_t iter;
489 struct rt_rq *rt_rq; 505 struct rt_rq *rt_rq;
490 506
491 if (unlikely(!scheduler_running)) 507 if (unlikely(!scheduler_running))
@@ -494,7 +510,7 @@ static void __enable_runtime(struct rq *rq)
494 /* 510 /*
495 * Reset each runqueue's bandwidth settings 511 * Reset each runqueue's bandwidth settings
496 */ 512 */
497 for_each_leaf_rt_rq(rt_rq, rq) { 513 for_each_rt_rq(rt_rq, iter, rq) {
498 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 514 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
499 515
500 raw_spin_lock(&rt_b->rt_runtime_lock); 516 raw_spin_lock(&rt_b->rt_runtime_lock);
@@ -561,12 +577,22 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
561 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { 577 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
562 rt_rq->rt_throttled = 0; 578 rt_rq->rt_throttled = 0;
563 enqueue = 1; 579 enqueue = 1;
580
581 /*
582 * Force a clock update if the CPU was idle,
583 * lest wakeup -> unthrottle time accumulate.
584 */
585 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
586 rq->skip_clock_update = -1;
564 } 587 }
565 if (rt_rq->rt_time || rt_rq->rt_nr_running) 588 if (rt_rq->rt_time || rt_rq->rt_nr_running)
566 idle = 0; 589 idle = 0;
567 raw_spin_unlock(&rt_rq->rt_runtime_lock); 590 raw_spin_unlock(&rt_rq->rt_runtime_lock);
568 } else if (rt_rq->rt_nr_running) 591 } else if (rt_rq->rt_nr_running) {
569 idle = 0; 592 idle = 0;
593 if (!rt_rq_throttled(rt_rq))
594 enqueue = 1;
595 }
570 596
571 if (enqueue) 597 if (enqueue)
572 sched_rt_rq_enqueue(rt_rq); 598 sched_rt_rq_enqueue(rt_rq);
@@ -973,13 +999,23 @@ static void yield_task_rt(struct rq *rq)
973static int find_lowest_rq(struct task_struct *task); 999static int find_lowest_rq(struct task_struct *task);
974 1000
975static int 1001static int
976select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 1002select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
977{ 1003{
1004 struct task_struct *curr;
1005 struct rq *rq;
1006 int cpu;
1007
978 if (sd_flag != SD_BALANCE_WAKE) 1008 if (sd_flag != SD_BALANCE_WAKE)
979 return smp_processor_id(); 1009 return smp_processor_id();
980 1010
1011 cpu = task_cpu(p);
1012 rq = cpu_rq(cpu);
1013
1014 rcu_read_lock();
1015 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
1016
981 /* 1017 /*
982 * If the current task is an RT task, then 1018 * If the current task on @p's runqueue is an RT task, then
983 * try to see if we can wake this RT task up on another 1019 * try to see if we can wake this RT task up on another
984 * runqueue. Otherwise simply start this RT task 1020 * runqueue. Otherwise simply start this RT task
985 * on its current runqueue. 1021 * on its current runqueue.
@@ -993,21 +1029,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
993 * lock? 1029 * lock?
994 * 1030 *
995 * For equal prio tasks, we just let the scheduler sort it out. 1031 * For equal prio tasks, we just let the scheduler sort it out.
1032 *
1033 * Otherwise, just let it ride on the affined RQ and the
1034 * post-schedule router will push the preempted task away
1035 *
1036 * This test is optimistic, if we get it wrong the load-balancer
1037 * will have to sort it out.
996 */ 1038 */
997 if (unlikely(rt_task(rq->curr)) && 1039 if (curr && unlikely(rt_task(curr)) &&
998 (rq->curr->rt.nr_cpus_allowed < 2 || 1040 (curr->rt.nr_cpus_allowed < 2 ||
999 rq->curr->prio < p->prio) && 1041 curr->prio < p->prio) &&
1000 (p->rt.nr_cpus_allowed > 1)) { 1042 (p->rt.nr_cpus_allowed > 1)) {
1001 int cpu = find_lowest_rq(p); 1043 int target = find_lowest_rq(p);
1002 1044
1003 return (cpu == -1) ? task_cpu(p) : cpu; 1045 if (target != -1)
1046 cpu = target;
1004 } 1047 }
1048 rcu_read_unlock();
1005 1049
1006 /* 1050 return cpu;
1007 * Otherwise, just let it ride on the affined RQ and the
1008 * post-schedule router will push the preempted task away
1009 */
1010 return task_cpu(p);
1011} 1051}
1012 1052
1013static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1053static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
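Since the select_task_rq hooks no longer receive a locked runqueue, select_task_rq_rt() peeks at the remote CPU's current task without its rq->lock: an rcu_read_lock() section plus ACCESS_ONCE() yields a possibly stale pointer that is treated purely as a hint, with the push/pull logic correcting any wrong guess later, as the comment above says. The access pattern in isolation, as a fragment with kernel context assumed:

	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr;

	rcu_read_lock();
	curr = ACCESS_ONCE(rq->curr);	/* may be stale the moment it is read */
	if (curr && rt_task(curr) && curr->prio < p->prio) {
		/* prefer some other CPU; a bad guess is fixed up by push/pull */
	}
	rcu_read_unlock();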
@@ -1132,7 +1172,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1132 * The previous task needs to be made eligible for pushing 1172 * The previous task needs to be made eligible for pushing
1133 * if it is still active 1173 * if it is still active
1134 */ 1174 */
1135 if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) 1175 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
1136 enqueue_pushable_task(rq, p); 1176 enqueue_pushable_task(rq, p);
1137} 1177}
1138 1178
@@ -1283,7 +1323,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1283 !cpumask_test_cpu(lowest_rq->cpu, 1323 !cpumask_test_cpu(lowest_rq->cpu,
1284 &task->cpus_allowed) || 1324 &task->cpus_allowed) ||
1285 task_running(rq, task) || 1325 task_running(rq, task) ||
1286 !task->se.on_rq)) { 1326 !task->on_rq)) {
1287 1327
1288 raw_spin_unlock(&lowest_rq->lock); 1328 raw_spin_unlock(&lowest_rq->lock);
1289 lowest_rq = NULL; 1329 lowest_rq = NULL;
@@ -1317,7 +1357,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1317 BUG_ON(task_current(rq, p)); 1357 BUG_ON(task_current(rq, p));
1318 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1358 BUG_ON(p->rt.nr_cpus_allowed <= 1);
1319 1359
1320 BUG_ON(!p->se.on_rq); 1360 BUG_ON(!p->on_rq);
1321 BUG_ON(!rt_task(p)); 1361 BUG_ON(!rt_task(p));
1322 1362
1323 return p; 1363 return p;
@@ -1374,7 +1414,7 @@ retry:
1374 task = pick_next_pushable_task(rq); 1414 task = pick_next_pushable_task(rq);
1375 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1415 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1376 /* 1416 /*
1377 * If we get here, the task hasnt moved at all, but 1417 * If we get here, the task hasn't moved at all, but
1378 * it has failed to push. We will not try again, 1418 * it has failed to push. We will not try again,
1379 * since the other cpus will pull from us when they 1419 * since the other cpus will pull from us when they
1380 * are ready. 1420 * are ready.
@@ -1463,7 +1503,7 @@ static int pull_rt_task(struct rq *this_rq)
1463 */ 1503 */
1464 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1504 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1465 WARN_ON(p == src_rq->curr); 1505 WARN_ON(p == src_rq->curr);
1466 WARN_ON(!p->se.on_rq); 1506 WARN_ON(!p->on_rq);
1467 1507
1468 /* 1508 /*
1469 * There's a chance that p is higher in priority 1509 * There's a chance that p is higher in priority
@@ -1484,7 +1524,7 @@ static int pull_rt_task(struct rq *this_rq)
1484 /* 1524 /*
1485 * We continue with the search, just in 1525 * We continue with the search, just in
1486 * case there's an even higher prio task 1526 * case there's an even higher prio task
1487 * in another runqueue. (low likelyhood 1527 * in another runqueue. (low likelihood
1488 * but possible) 1528 * but possible)
1489 */ 1529 */
1490 } 1530 }
@@ -1534,7 +1574,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1534 * Update the migration status of the RQ if we have an RT task 1574 * Update the migration status of the RQ if we have an RT task
1535 * which is running AND changing its weight value. 1575 * which is running AND changing its weight value.
1536 */ 1576 */
1537 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { 1577 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1538 struct rq *rq = task_rq(p); 1578 struct rq *rq = task_rq(p);
1539 1579
1540 if (!task_current(rq, p)) { 1580 if (!task_current(rq, p)) {
@@ -1595,8 +1635,7 @@ static void rq_offline_rt(struct rq *rq)
1595 * When switch from the rt queue, we bring ourselves to a position 1635 * When switch from the rt queue, we bring ourselves to a position
1596 * that we might want to pull RT tasks from other runqueues. 1636 * that we might want to pull RT tasks from other runqueues.
1597 */ 1637 */
1598static void switched_from_rt(struct rq *rq, struct task_struct *p, 1638static void switched_from_rt(struct rq *rq, struct task_struct *p)
1599 int running)
1600{ 1639{
1601 /* 1640 /*
1602 * If there are other RT tasks then we will reschedule 1641 * If there are other RT tasks then we will reschedule
@@ -1605,7 +1644,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
1605 * we may need to handle the pulling of RT tasks 1644 * we may need to handle the pulling of RT tasks
1606 * now. 1645 * now.
1607 */ 1646 */
1608 if (!rq->rt.rt_nr_running) 1647 if (p->on_rq && !rq->rt.rt_nr_running)
1609 pull_rt_task(rq); 1648 pull_rt_task(rq);
1610} 1649}
1611 1650
@@ -1624,8 +1663,7 @@ static inline void init_sched_rt_class(void)
1624 * with RT tasks. In this case we try to push them off to 1663 * with RT tasks. In this case we try to push them off to
1625 * other runqueues. 1664 * other runqueues.
1626 */ 1665 */
1627static void switched_to_rt(struct rq *rq, struct task_struct *p, 1666static void switched_to_rt(struct rq *rq, struct task_struct *p)
1628 int running)
1629{ 1667{
1630 int check_resched = 1; 1668 int check_resched = 1;
1631 1669
@@ -1636,7 +1674,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1636 * If that current running task is also an RT task 1674 * If that current running task is also an RT task
1637 * then see if we can move to another run queue. 1675 * then see if we can move to another run queue.
1638 */ 1676 */
1639 if (!running) { 1677 if (p->on_rq && rq->curr != p) {
1640#ifdef CONFIG_SMP 1678#ifdef CONFIG_SMP
1641 if (rq->rt.overloaded && push_rt_task(rq) && 1679 if (rq->rt.overloaded && push_rt_task(rq) &&
1642 /* Don't resched if we changed runqueues */ 1680 /* Don't resched if we changed runqueues */
@@ -1652,10 +1690,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1652 * Priority of the task has changed. This may cause 1690 * Priority of the task has changed. This may cause
1653 * us to initiate a push or pull. 1691 * us to initiate a push or pull.
1654 */ 1692 */
1655static void prio_changed_rt(struct rq *rq, struct task_struct *p, 1693static void
1656 int oldprio, int running) 1694prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1657{ 1695{
1658 if (running) { 1696 if (!p->on_rq)
1697 return;
1698
1699 if (rq->curr == p) {
1659#ifdef CONFIG_SMP 1700#ifdef CONFIG_SMP
1660 /* 1701 /*
1661 * If our priority decreases while running, we 1702 * If our priority decreases while running, we
@@ -1791,10 +1832,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1791 1832
1792static void print_rt_stats(struct seq_file *m, int cpu) 1833static void print_rt_stats(struct seq_file *m, int cpu)
1793{ 1834{
1835 rt_rq_iter_t iter;
1794 struct rt_rq *rt_rq; 1836 struct rt_rq *rt_rq;
1795 1837
1796 rcu_read_lock(); 1838 rcu_read_lock();
1797 for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) 1839 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
1798 print_rt_rq(m, cpu, rt_rq); 1840 print_rt_rq(m, cpu, rt_rq);
1799 rcu_read_unlock(); 1841 rcu_read_unlock();
1800} 1842}
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 2bf6b47058c1..6f437632afab 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -9,8 +9,7 @@
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
11static int 11static int
12select_task_rq_stop(struct rq *rq, struct task_struct *p, 12select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
13 int sd_flag, int flags)
14{ 13{
15 return task_cpu(p); /* stop tasks never migrate */ 14 return task_cpu(p); /* stop tasks never migrate */
16} 15}
@@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
26{ 25{
27 struct task_struct *stop = rq->stop; 26 struct task_struct *stop = rq->stop;
28 27
29 if (stop && stop->se.on_rq) 28 if (stop && stop->on_rq)
30 return stop; 29 return stop;
31 30
32 return NULL; 31 return NULL;
@@ -59,14 +58,13 @@ static void set_curr_task_stop(struct rq *rq)
59{ 58{
60} 59}
61 60
62static void switched_to_stop(struct rq *rq, struct task_struct *p, 61static void switched_to_stop(struct rq *rq, struct task_struct *p)
63 int running)
64{ 62{
65 BUG(); /* it's impossible to change to this class */ 63 BUG(); /* it's impossible to change to this class */
66} 64}
67 65
68static void prio_changed_stop(struct rq *rq, struct task_struct *p, 66static void
69 int oldprio, int running) 67prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
70{ 68{
71 BUG(); /* how!?, what priority? */ 69 BUG(); /* how!?, what priority? */
72} 70}
@@ -103,6 +101,4 @@ static const struct sched_class stop_sched_class = {
103 101
104 .prio_changed = prio_changed_stop, 102 .prio_changed = prio_changed_stop,
105 .switched_to = switched_to_stop, 103 .switched_to = switched_to_stop,
106
107 /* no .task_new for stop tasks */
108}; 104};
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff10fdce..86c32b884f8e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
124 124
125static int recalc_sigpending_tsk(struct task_struct *t) 125static int recalc_sigpending_tsk(struct task_struct *t)
126{ 126{
127 if (t->signal->group_stop_count > 0 || 127 if ((t->group_stop & GROUP_STOP_PENDING) ||
128 PENDING(&t->pending, &t->blocked) || 128 PENDING(&t->pending, &t->blocked) ||
129 PENDING(&t->signal->shared_pending, &t->blocked)) { 129 PENDING(&t->signal->shared_pending, &t->blocked)) {
130 set_tsk_thread_flag(t, TIF_SIGPENDING); 130 set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -223,10 +223,87 @@ static inline void print_dropped_signal(int sig)
223 current->comm, current->pid, sig); 223 current->comm, current->pid, sig);
224} 224}
225 225
226/**
227 * task_clear_group_stop_trapping - clear group stop trapping bit
228 * @task: target task
229 *
230 * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it
231 * and wake up the ptracer. Note that we don't need any further locking.
232 * @task->siglock guarantees that @task->parent points to the ptracer.
233 *
234 * CONTEXT:
235 * Must be called with @task->sighand->siglock held.
236 */
237static void task_clear_group_stop_trapping(struct task_struct *task)
238{
239 if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) {
240 task->group_stop &= ~GROUP_STOP_TRAPPING;
241 __wake_up_sync_key(&task->parent->signal->wait_chldexit,
242 TASK_UNINTERRUPTIBLE, 1, task);
243 }
244}
245
246/**
247 * task_clear_group_stop_pending - clear pending group stop
248 * @task: target task
249 *
250 * Clear group stop states for @task.
251 *
252 * CONTEXT:
253 * Must be called with @task->sighand->siglock held.
254 */
255void task_clear_group_stop_pending(struct task_struct *task)
256{
257 task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME |
258 GROUP_STOP_DEQUEUED);
259}
260
261/**
262 * task_participate_group_stop - participate in a group stop
263 * @task: task participating in a group stop
264 *
265 * @task has GROUP_STOP_PENDING set and is participating in a group stop.
266 * Group stop states are cleared and the group stop count is consumed if
267 * %GROUP_STOP_CONSUME was set. If the consumption completes the group
268 * stop, the appropriate %SIGNAL_* flags are set.
269 *
270 * CONTEXT:
271 * Must be called with @task->sighand->siglock held.
272 *
273 * RETURNS:
274 * %true if group stop completion should be notified to the parent, %false
275 * otherwise.
276 */
277static bool task_participate_group_stop(struct task_struct *task)
278{
279 struct signal_struct *sig = task->signal;
280 bool consume = task->group_stop & GROUP_STOP_CONSUME;
281
282 WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING));
283
284 task_clear_group_stop_pending(task);
285
286 if (!consume)
287 return false;
288
289 if (!WARN_ON_ONCE(sig->group_stop_count == 0))
290 sig->group_stop_count--;
291
292 /*
293 * Tell the caller to notify completion iff we are entering into a
294 * fresh group stop. Read comment in do_signal_stop() for details.
295 */
296 if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) {
297 sig->flags = SIGNAL_STOP_STOPPED;
298 return true;
299 }
300 return false;
301}
302
226/* 303/*
227 * allocate a new signal queue record 304 * allocate a new signal queue record
228 * - this may be called without locks if and only if t == current, otherwise an 305 * - this may be called without locks if and only if t == current, otherwise an
229 * appopriate lock must be held to stop the target task from exiting 306 * appropriate lock must be held to stop the target task from exiting
230 */ 307 */
231static struct sigqueue * 308static struct sigqueue *
232__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) 309__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
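Reading aid for the group-stop helpers added above (task_clear_group_stop_trapping, task_clear_group_stop_pending, task_participate_group_stop): the protocol their comments describe can be modeled as a small state machine in which each participant carries PENDING/CONSUME bits and the last consumer marks the group stopped. The sketch below uses invented names and plain C; it illustrates the documented rule only and is not kernel code.

    #include <stdbool.h>
    #include <stdio.h>

    #define GS_PENDING 0x1
    #define GS_CONSUME 0x2

    struct group  { int stop_count; bool stopped; };
    struct member { unsigned int stop_flags; struct group *grp; };

    /* Mirror of the documented rule: clear the member's state, consume one
     * count if CONSUME was set, and report completion to exactly one caller. */
    static bool participate(struct member *m)
    {
            bool consume = m->stop_flags & GS_CONSUME;

            m->stop_flags &= ~(GS_PENDING | GS_CONSUME);
            if (!consume)
                    return false;
            if (m->grp->stop_count > 0)
                    m->grp->stop_count--;
            if (m->grp->stop_count == 0 && !m->grp->stopped) {
                    m->grp->stopped = true;
                    return true;    /* this participant completed the group stop */
            }
            return false;
    }

    int main(void)
    {
            struct group g = { 2, false };
            struct member a = { GS_PENDING | GS_CONSUME, &g };
            struct member b = { GS_PENDING | GS_CONSUME, &g };
            bool first  = participate(&a);
            bool second = participate(&b);

            printf("%d %d\n", first, second);   /* 0 1: the last consumer completes */
            return 0;
    }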
@@ -375,15 +452,15 @@ int unhandled_signal(struct task_struct *tsk, int sig)
375 return !tracehook_consider_fatal_signal(tsk, sig); 452 return !tracehook_consider_fatal_signal(tsk, sig);
376} 453}
377 454
378 455/*
379/* Notify the system that a driver wants to block all signals for this 456 * Notify the system that a driver wants to block all signals for this
380 * process, and wants to be notified if any signals at all were to be 457 * process, and wants to be notified if any signals at all were to be
381 * sent/acted upon. If the notifier routine returns non-zero, then the 458 * sent/acted upon. If the notifier routine returns non-zero, then the
382 * signal will be acted upon after all. If the notifier routine returns 0, 459 * signal will be acted upon after all. If the notifier routine returns 0,
 383 * then the signal will be blocked. Only one block per process is 460 * then the signal will be blocked. Only one block per process is
384 * allowed. priv is a pointer to private data that the notifier routine 461 * allowed. priv is a pointer to private data that the notifier routine
385 * can use to determine if the signal should be blocked or not. */ 462 * can use to determine if the signal should be blocked or not.
386 463 */
387void 464void
388block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) 465block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
389{ 466{
@@ -434,9 +511,10 @@ still_pending:
434 copy_siginfo(info, &first->info); 511 copy_siginfo(info, &first->info);
435 __sigqueue_free(first); 512 __sigqueue_free(first);
436 } else { 513 } else {
437 /* Ok, it wasn't in the queue. This must be 514 /*
438 a fast-pathed signal or we must have been 515 * Ok, it wasn't in the queue. This must be
439 out of queue space. So zero out the info. 516 * a fast-pathed signal or we must have been
517 * out of queue space. So zero out the info.
440 */ 518 */
441 info->si_signo = sig; 519 info->si_signo = sig;
442 info->si_errno = 0; 520 info->si_errno = 0;
@@ -468,7 +546,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
468} 546}
469 547
470/* 548/*
471 * Dequeue a signal and return the element to the caller, which is 549 * Dequeue a signal and return the element to the caller, which is
472 * expected to free it. 550 * expected to free it.
473 * 551 *
474 * All callers have to hold the siglock. 552 * All callers have to hold the siglock.
@@ -490,7 +568,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
490 * itimers are process shared and we restart periodic 568 * itimers are process shared and we restart periodic
491 * itimers in the signal delivery path to prevent DoS 569 * itimers in the signal delivery path to prevent DoS
492 * attacks in the high resolution timer case. This is 570 * attacks in the high resolution timer case. This is
493 * compliant with the old way of self restarting 571 * compliant with the old way of self-restarting
494 * itimers, as the SIGALRM is a legacy signal and only 572 * itimers, as the SIGALRM is a legacy signal and only
495 * queued once. Changing the restart behaviour to 573 * queued once. Changing the restart behaviour to
496 * restart the timer in the signal dequeue path is 574 * restart the timer in the signal dequeue path is
@@ -526,7 +604,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
526 * is to alert stop-signal processing code when another 604 * is to alert stop-signal processing code when another
527 * processor has come along and cleared the flag. 605 * processor has come along and cleared the flag.
528 */ 606 */
529 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 607 current->group_stop |= GROUP_STOP_DEQUEUED;
530 } 608 }
531 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { 609 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
532 /* 610 /*
@@ -591,7 +669,7 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
591 if (sigisemptyset(&m)) 669 if (sigisemptyset(&m))
592 return 0; 670 return 0;
593 671
594 signandsets(&s->signal, &s->signal, mask); 672 sigandnsets(&s->signal, &s->signal, mask);
595 list_for_each_entry_safe(q, n, &s->list, list) { 673 list_for_each_entry_safe(q, n, &s->list, list) {
596 if (sigismember(mask, q->info.si_signo)) { 674 if (sigismember(mask, q->info.si_signo)) {
597 list_del_init(&q->list); 675 list_del_init(&q->list);
@@ -636,13 +714,33 @@ static inline bool si_fromuser(const struct siginfo *info)
636} 714}
637 715
638/* 716/*
717 * called with RCU read lock from check_kill_permission()
718 */
719static int kill_ok_by_cred(struct task_struct *t)
720{
721 const struct cred *cred = current_cred();
722 const struct cred *tcred = __task_cred(t);
723
724 if (cred->user->user_ns == tcred->user->user_ns &&
725 (cred->euid == tcred->suid ||
726 cred->euid == tcred->uid ||
727 cred->uid == tcred->suid ||
728 cred->uid == tcred->uid))
729 return 1;
730
731 if (ns_capable(tcred->user->user_ns, CAP_KILL))
732 return 1;
733
734 return 0;
735}
736
737/*
639 * Bad permissions for sending the signal 738 * Bad permissions for sending the signal
640 * - the caller must hold the RCU read lock 739 * - the caller must hold the RCU read lock
641 */ 740 */
642static int check_kill_permission(int sig, struct siginfo *info, 741static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 742 struct task_struct *t)
644{ 743{
645 const struct cred *cred, *tcred;
646 struct pid *sid; 744 struct pid *sid;
647 int error; 745 int error;
648 746
@@ -656,14 +754,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
656 if (error) 754 if (error)
657 return error; 755 return error;
658 756
659 cred = current_cred();
660 tcred = __task_cred(t);
661 if (!same_thread_group(current, t) && 757 if (!same_thread_group(current, t) &&
662 (cred->euid ^ tcred->suid) && 758 !kill_ok_by_cred(t)) {
663 (cred->euid ^ tcred->uid) &&
664 (cred->uid ^ tcred->suid) &&
665 (cred->uid ^ tcred->uid) &&
666 !capable(CAP_KILL)) {
667 switch (sig) { 759 switch (sig) {
668 case SIGCONT: 760 case SIGCONT:
669 sid = task_session(t); 761 sid = task_session(t);
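The kill_ok_by_cred() check above is the kernel half of the usual kill(2) permission rule: the sender's real or effective UID must match the target's real or saved UID (within the same user namespace), or the sender needs CAP_KILL there. From user space the rule is only visible as EPERM, which the classic kill(pid, 0) probe exposes:

    #include <errno.h>
    #include <signal.h>
    #include <stdio.h>

    /* Probe whether we may signal PID 1 (usually not, unless root/CAP_KILL). */
    int main(void)
    {
            if (kill(1, 0) == 0)
                    printf("allowed to signal pid 1\n");
            else if (errno == EPERM)
                    printf("kill: EPERM (credential check failed)\n");
            else
                    perror("kill");
            return 0;
    }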
@@ -712,34 +804,14 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
712 } else if (sig == SIGCONT) { 804 } else if (sig == SIGCONT) {
713 unsigned int why; 805 unsigned int why;
714 /* 806 /*
715 * Remove all stop signals from all queues, 807 * Remove all stop signals from all queues, wake all threads.
716 * and wake all threads.
717 */ 808 */
718 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); 809 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
719 t = p; 810 t = p;
720 do { 811 do {
721 unsigned int state; 812 task_clear_group_stop_pending(t);
722 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); 813 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
723 /* 814 wake_up_state(t, __TASK_STOPPED);
724 * If there is a handler for SIGCONT, we must make
725 * sure that no thread returns to user mode before
726 * we post the signal, in case it was the only
727 * thread eligible to run the signal handler--then
728 * it must not do anything between resuming and
729 * running the handler. With the TIF_SIGPENDING
730 * flag set, the thread will pause and acquire the
731 * siglock that we hold now and until we've queued
732 * the pending signal.
733 *
734 * Wake up the stopped thread _after_ setting
735 * TIF_SIGPENDING
736 */
737 state = __TASK_STOPPED;
738 if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) {
739 set_tsk_thread_flag(t, TIF_SIGPENDING);
740 state |= TASK_INTERRUPTIBLE;
741 }
742 wake_up_state(t, state);
743 } while_each_thread(p, t); 815 } while_each_thread(p, t);
744 816
745 /* 817 /*
@@ -765,13 +837,6 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
765 signal->flags = why | SIGNAL_STOP_CONTINUED; 837 signal->flags = why | SIGNAL_STOP_CONTINUED;
766 signal->group_stop_count = 0; 838 signal->group_stop_count = 0;
767 signal->group_exit_code = 0; 839 signal->group_exit_code = 0;
768 } else {
769 /*
770 * We are not stopped, but there could be a stop
771 * signal in the middle of being processed after
772 * being removed from the queue. Clear that too.
773 */
774 signal->flags &= ~SIGNAL_STOP_DEQUEUED;
775 } 840 }
776 } 841 }
777 842
@@ -860,6 +925,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
860 signal->group_stop_count = 0; 925 signal->group_stop_count = 0;
861 t = p; 926 t = p;
862 do { 927 do {
928 task_clear_group_stop_pending(t);
863 sigaddset(&t->pending.signal, SIGKILL); 929 sigaddset(&t->pending.signal, SIGKILL);
864 signal_wake_up(t, 1); 930 signal_wake_up(t, 1);
865 } while_each_thread(p, t); 931 } while_each_thread(p, t);
@@ -909,14 +975,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
909 if (info == SEND_SIG_FORCED) 975 if (info == SEND_SIG_FORCED)
910 goto out_set; 976 goto out_set;
911 977
912 /* Real-time signals must be queued if sent by sigqueue, or 978 /*
913 some other real-time mechanism. It is implementation 979 * Real-time signals must be queued if sent by sigqueue, or
914 defined whether kill() does so. We attempt to do so, on 980 * some other real-time mechanism. It is implementation
915 the principle of least surprise, but since kill is not 981 * defined whether kill() does so. We attempt to do so, on
916 allowed to fail with EAGAIN when low on memory we just 982 * the principle of least surprise, but since kill is not
917 make sure at least one signal gets delivered and don't 983 * allowed to fail with EAGAIN when low on memory we just
918 pass on the info struct. */ 984 * make sure at least one signal gets delivered and don't
919 985 * pass on the info struct.
986 */
920 if (sig < SIGRTMIN) 987 if (sig < SIGRTMIN)
921 override_rlimit = (is_si_special(info) || info->si_code >= 0); 988 override_rlimit = (is_si_special(info) || info->si_code >= 0);
922 else 989 else
@@ -1093,6 +1160,7 @@ int zap_other_threads(struct task_struct *p)
1093 p->signal->group_stop_count = 0; 1160 p->signal->group_stop_count = 0;
1094 1161
1095 while_each_thread(p, t) { 1162 while_each_thread(p, t) {
1163 task_clear_group_stop_pending(t);
1096 count++; 1164 count++;
1097 1165
1098 /* Don't bother with already dead threads */ 1166 /* Don't bother with already dead threads */
@@ -1187,8 +1255,7 @@ retry:
1187 return error; 1255 return error;
1188} 1256}
1189 1257
1190int 1258int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1191kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1192{ 1259{
1193 int error; 1260 int error;
1194 rcu_read_lock(); 1261 rcu_read_lock();
@@ -1285,8 +1352,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1285 * These are for backward compatibility with the rest of the kernel source. 1352 * These are for backward compatibility with the rest of the kernel source.
1286 */ 1353 */
1287 1354
1288int 1355int send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1289send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1290{ 1356{
1291 /* 1357 /*
1292 * Make sure legacy kernel users don't send in bad values 1358 * Make sure legacy kernel users don't send in bad values
@@ -1354,7 +1420,7 @@ EXPORT_SYMBOL(kill_pid);
1354 * These functions support sending signals using preallocated sigqueue 1420 * These functions support sending signals using preallocated sigqueue
1355 * structures. This is needed "because realtime applications cannot 1421 * structures. This is needed "because realtime applications cannot
1356 * afford to lose notifications of asynchronous events, like timer 1422 * afford to lose notifications of asynchronous events, like timer
1357 * expirations or I/O completions". In the case of Posix Timers 1423 * expirations or I/O completions". In the case of POSIX Timers
1358 * we allocate the sigqueue structure from the timer_create. If this 1424 * we allocate the sigqueue structure from the timer_create. If this
1359 * allocation fails we are able to report the failure to the application 1425 * allocation fails we are able to report the failure to the application
1360 * with an EAGAIN error. 1426 * with an EAGAIN error.
@@ -1522,16 +1588,30 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1522 return ret; 1588 return ret;
1523} 1589}
1524 1590
1525static void do_notify_parent_cldstop(struct task_struct *tsk, int why) 1591/**
1592 * do_notify_parent_cldstop - notify parent of stopped/continued state change
1593 * @tsk: task reporting the state change
1594 * @for_ptracer: the notification is for ptracer
1595 * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
1596 *
1597 * Notify @tsk's parent that the stopped/continued state has changed. If
 1598 * @for_ptracer is %false, @tsk's group leader notifies its real parent.
1599 * If %true, @tsk reports to @tsk->parent which should be the ptracer.
1600 *
1601 * CONTEXT:
1602 * Must be called with tasklist_lock at least read locked.
1603 */
1604static void do_notify_parent_cldstop(struct task_struct *tsk,
1605 bool for_ptracer, int why)
1526{ 1606{
1527 struct siginfo info; 1607 struct siginfo info;
1528 unsigned long flags; 1608 unsigned long flags;
1529 struct task_struct *parent; 1609 struct task_struct *parent;
1530 struct sighand_struct *sighand; 1610 struct sighand_struct *sighand;
1531 1611
1532 if (task_ptrace(tsk)) 1612 if (for_ptracer) {
1533 parent = tsk->parent; 1613 parent = tsk->parent;
1534 else { 1614 } else {
1535 tsk = tsk->group_leader; 1615 tsk = tsk->group_leader;
1536 parent = tsk->real_parent; 1616 parent = tsk->real_parent;
1537 } 1617 }
@@ -1539,7 +1619,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1539 info.si_signo = SIGCHLD; 1619 info.si_signo = SIGCHLD;
1540 info.si_errno = 0; 1620 info.si_errno = 0;
1541 /* 1621 /*
1542 * see comment in do_notify_parent() abot the following 3 lines 1622 * see comment in do_notify_parent() about the following 4 lines
1543 */ 1623 */
1544 rcu_read_lock(); 1624 rcu_read_lock();
1545 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1625 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
@@ -1597,7 +1677,7 @@ static inline int may_ptrace_stop(void)
1597} 1677}
1598 1678
1599/* 1679/*
1600 * Return nonzero if there is a SIGKILL that should be waking us up. 1680 * Return non-zero if there is a SIGKILL that should be waking us up.
1601 * Called with the siglock held. 1681 * Called with the siglock held.
1602 */ 1682 */
1603static int sigkill_pending(struct task_struct *tsk) 1683static int sigkill_pending(struct task_struct *tsk)
@@ -1607,6 +1687,15 @@ static int sigkill_pending(struct task_struct *tsk)
1607} 1687}
1608 1688
1609/* 1689/*
1690 * Test whether the target task of the usual cldstop notification - the
1691 * real_parent of @child - is in the same group as the ptracer.
1692 */
1693static bool real_parent_is_ptracer(struct task_struct *child)
1694{
1695 return same_thread_group(child->parent, child->real_parent);
1696}
1697
1698/*
1610 * This must be called with current->sighand->siglock held. 1699 * This must be called with current->sighand->siglock held.
1611 * 1700 *
1612 * This should be the path for all ptrace stops. 1701 * This should be the path for all ptrace stops.
@@ -1617,10 +1706,12 @@ static int sigkill_pending(struct task_struct *tsk)
1617 * If we actually decide not to stop at all because the tracer 1706 * If we actually decide not to stop at all because the tracer
1618 * is gone, we keep current->exit_code unless clear_code. 1707 * is gone, we keep current->exit_code unless clear_code.
1619 */ 1708 */
1620static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) 1709static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1621 __releases(&current->sighand->siglock) 1710 __releases(&current->sighand->siglock)
1622 __acquires(&current->sighand->siglock) 1711 __acquires(&current->sighand->siglock)
1623{ 1712{
1713 bool gstop_done = false;
1714
1624 if (arch_ptrace_stop_needed(exit_code, info)) { 1715 if (arch_ptrace_stop_needed(exit_code, info)) {
1625 /* 1716 /*
1626 * The arch code has something special to do before a 1717 * The arch code has something special to do before a
@@ -1641,21 +1732,49 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1641 } 1732 }
1642 1733
1643 /* 1734 /*
1644 * If there is a group stop in progress, 1735 * If @why is CLD_STOPPED, we're trapping to participate in a group
1645 * we must participate in the bookkeeping. 1736 * stop. Do the bookkeeping. Note that if SIGCONT was delievered
1737 * while siglock was released for the arch hook, PENDING could be
1738 * clear now. We act as if SIGCONT is received after TASK_TRACED
1739 * is entered - ignore it.
1646 */ 1740 */
1647 if (current->signal->group_stop_count > 0) 1741 if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING))
1648 --current->signal->group_stop_count; 1742 gstop_done = task_participate_group_stop(current);
1649 1743
1650 current->last_siginfo = info; 1744 current->last_siginfo = info;
1651 current->exit_code = exit_code; 1745 current->exit_code = exit_code;
1652 1746
1653 /* Let the debugger run. */ 1747 /*
1654 __set_current_state(TASK_TRACED); 1748 * TRACED should be visible before TRAPPING is cleared; otherwise,
1749 * the tracer might fail do_wait().
1750 */
1751 set_current_state(TASK_TRACED);
1752
1753 /*
1754 * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and
1755 * transition to TASK_TRACED should be atomic with respect to
 1756 * siglock. This should be done after the arch hook as siglock is
1757 * released and regrabbed across it.
1758 */
1759 task_clear_group_stop_trapping(current);
1760
1655 spin_unlock_irq(&current->sighand->siglock); 1761 spin_unlock_irq(&current->sighand->siglock);
1656 read_lock(&tasklist_lock); 1762 read_lock(&tasklist_lock);
1657 if (may_ptrace_stop()) { 1763 if (may_ptrace_stop()) {
1658 do_notify_parent_cldstop(current, CLD_TRAPPED); 1764 /*
1765 * Notify parents of the stop.
1766 *
1767 * While ptraced, there are two parents - the ptracer and
1768 * the real_parent of the group_leader. The ptracer should
1769 * know about every stop while the real parent is only
1770 * interested in the completion of group stop. The states
1771 * for the two don't interact with each other. Notify
1772 * separately unless they're gonna be duplicates.
1773 */
1774 do_notify_parent_cldstop(current, true, why);
1775 if (gstop_done && !real_parent_is_ptracer(current))
1776 do_notify_parent_cldstop(current, false, why);
1777
1659 /* 1778 /*
1660 * Don't want to allow preemption here, because 1779 * Don't want to allow preemption here, because
1661 * sys_ptrace() needs this task to be inactive. 1780 * sys_ptrace() needs this task to be inactive.
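The ordering note above ("TRACED should be visible before TRAPPING is cleared") is a publish-then-release pattern: store the state the waiter will read before clearing the flag that lets the waiter proceed. A generic user-space sketch of that ordering with C11 atomics (names invented; this is an analogy, not the scheduler/ptrace mechanism):

    #include <stdatomic.h>
    #include <stdio.h>

    atomic_int state;            /* what the waiter inspects, e.g. "now traced" */
    atomic_int trapping = 1;     /* gate the waiter waits on                    */

    static void publish_then_release(void)
    {
            /* Store the state first ... */
            atomic_store_explicit(&state, 1, memory_order_release);
            /* ... then clear the gate. A waiter whose acquire load sees 0 here
             * is guaranteed to also see state == 1. */
            atomic_store_explicit(&trapping, 0, memory_order_release);
    }

    static int wait_and_read_state(void)
    {
            while (atomic_load_explicit(&trapping, memory_order_acquire))
                    ;                               /* spin until released */
            return atomic_load_explicit(&state, memory_order_relaxed);
    }

    int main(void)
    {
            publish_then_release();                 /* normally another thread */
            printf("observed state %d\n", wait_and_read_state());
            return 0;
    }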
@@ -1670,7 +1789,16 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1670 /* 1789 /*
1671 * By the time we got the lock, our tracer went away. 1790 * By the time we got the lock, our tracer went away.
1672 * Don't drop the lock yet, another tracer may come. 1791 * Don't drop the lock yet, another tracer may come.
1792 *
1793 * If @gstop_done, the ptracer went away between group stop
1794 * completion and here. During detach, it would have set
1795 * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED
1796 * in do_signal_stop() on return, so notifying the real
1797 * parent of the group stop completion is enough.
1673 */ 1798 */
1799 if (gstop_done)
1800 do_notify_parent_cldstop(current, false, why);
1801
1674 __set_current_state(TASK_RUNNING); 1802 __set_current_state(TASK_RUNNING);
1675 if (clear_code) 1803 if (clear_code)
1676 current->exit_code = 0; 1804 current->exit_code = 0;
@@ -1714,79 +1842,128 @@ void ptrace_notify(int exit_code)
1714 1842
1715 /* Let the debugger run. */ 1843 /* Let the debugger run. */
1716 spin_lock_irq(&current->sighand->siglock); 1844 spin_lock_irq(&current->sighand->siglock);
1717 ptrace_stop(exit_code, 1, &info); 1845 ptrace_stop(exit_code, CLD_TRAPPED, 1, &info);
1718 spin_unlock_irq(&current->sighand->siglock); 1846 spin_unlock_irq(&current->sighand->siglock);
1719} 1847}
1720 1848
1721/* 1849/*
1722 * This performs the stopping for SIGSTOP and other stop signals. 1850 * This performs the stopping for SIGSTOP and other stop signals.
1723 * We have to stop all threads in the thread group. 1851 * We have to stop all threads in the thread group.
1724 * Returns nonzero if we've actually stopped and released the siglock. 1852 * Returns non-zero if we've actually stopped and released the siglock.
1725 * Returns zero if we didn't stop and still hold the siglock. 1853 * Returns zero if we didn't stop and still hold the siglock.
1726 */ 1854 */
1727static int do_signal_stop(int signr) 1855static int do_signal_stop(int signr)
1728{ 1856{
1729 struct signal_struct *sig = current->signal; 1857 struct signal_struct *sig = current->signal;
1730 int notify;
1731 1858
1732 if (!sig->group_stop_count) { 1859 if (!(current->group_stop & GROUP_STOP_PENDING)) {
1860 unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME;
1733 struct task_struct *t; 1861 struct task_struct *t;
1734 1862
1735 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || 1863 /* signr will be recorded in task->group_stop for retries */
1864 WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK);
1865
1866 if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) ||
1736 unlikely(signal_group_exit(sig))) 1867 unlikely(signal_group_exit(sig)))
1737 return 0; 1868 return 0;
1738 /* 1869 /*
1739 * There is no group stop already in progress. 1870 * There is no group stop already in progress. We must
1740 * We must initiate one now. 1871 * initiate one now.
1872 *
1873 * While ptraced, a task may be resumed while group stop is
1874 * still in effect and then receive a stop signal and
1875 * initiate another group stop. This deviates from the
1876 * usual behavior as two consecutive stop signals can't
1877 * cause two group stops when !ptraced. That is why we
1878 * also check !task_is_stopped(t) below.
1879 *
1880 * The condition can be distinguished by testing whether
1881 * SIGNAL_STOP_STOPPED is already set. Don't generate
1882 * group_exit_code in such case.
1883 *
1884 * This is not necessary for SIGNAL_STOP_CONTINUED because
1885 * an intervening stop signal is required to cause two
1886 * continued events regardless of ptrace.
1741 */ 1887 */
1742 sig->group_exit_code = signr; 1888 if (!(sig->flags & SIGNAL_STOP_STOPPED))
1889 sig->group_exit_code = signr;
1890 else
1891 WARN_ON_ONCE(!task_ptrace(current));
1743 1892
1893 current->group_stop &= ~GROUP_STOP_SIGMASK;
1894 current->group_stop |= signr | gstop;
1744 sig->group_stop_count = 1; 1895 sig->group_stop_count = 1;
1745 for (t = next_thread(current); t != current; t = next_thread(t)) 1896 for (t = next_thread(current); t != current;
1897 t = next_thread(t)) {
1898 t->group_stop &= ~GROUP_STOP_SIGMASK;
1746 /* 1899 /*
1747 * Setting state to TASK_STOPPED for a group 1900 * Setting state to TASK_STOPPED for a group
1748 * stop is always done with the siglock held, 1901 * stop is always done with the siglock held,
1749 * so this check has no races. 1902 * so this check has no races.
1750 */ 1903 */
1751 if (!(t->flags & PF_EXITING) && 1904 if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) {
1752 !task_is_stopped_or_traced(t)) { 1905 t->group_stop |= signr | gstop;
1753 sig->group_stop_count++; 1906 sig->group_stop_count++;
1754 signal_wake_up(t, 0); 1907 signal_wake_up(t, 0);
1755 } 1908 }
1909 }
1756 } 1910 }
1757 /* 1911retry:
1758 * If there are no other threads in the group, or if there is 1912 if (likely(!task_ptrace(current))) {
1759 * a group stop in progress and we are the last to stop, report 1913 int notify = 0;
1760 * to the parent. When ptraced, every thread reports itself. 1914
1761 */ 1915 /*
1762 notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; 1916 * If there are no other threads in the group, or if there
1763 notify = tracehook_notify_jctl(notify, CLD_STOPPED); 1917 * is a group stop in progress and we are the last to stop,
1764 /* 1918 * report to the parent.
1765 * tracehook_notify_jctl() can drop and reacquire siglock, so 1919 */
1766 * we keep ->group_stop_count != 0 before the call. If SIGCONT 1920 if (task_participate_group_stop(current))
1767 * or SIGKILL comes in between ->group_stop_count == 0. 1921 notify = CLD_STOPPED;
1768 */ 1922
1769 if (sig->group_stop_count) {
1770 if (!--sig->group_stop_count)
1771 sig->flags = SIGNAL_STOP_STOPPED;
1772 current->exit_code = sig->group_exit_code;
1773 __set_current_state(TASK_STOPPED); 1923 __set_current_state(TASK_STOPPED);
1924 spin_unlock_irq(&current->sighand->siglock);
1925
1926 /*
1927 * Notify the parent of the group stop completion. Because
1928 * we're not holding either the siglock or tasklist_lock
 1929 * here, ptracer may attach in between; however, this is for
1930 * group stop and should always be delivered to the real
1931 * parent of the group leader. The new ptracer will get
1932 * its notification when this task transitions into
1933 * TASK_TRACED.
1934 */
1935 if (notify) {
1936 read_lock(&tasklist_lock);
1937 do_notify_parent_cldstop(current, false, notify);
1938 read_unlock(&tasklist_lock);
1939 }
1940
1941 /* Now we don't run again until woken by SIGCONT or SIGKILL */
1942 schedule();
1943
1944 spin_lock_irq(&current->sighand->siglock);
1945 } else {
1946 ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK,
1947 CLD_STOPPED, 0, NULL);
1948 current->exit_code = 0;
1774 } 1949 }
1775 spin_unlock_irq(&current->sighand->siglock);
1776 1950
1777 if (notify) { 1951 /*
1778 read_lock(&tasklist_lock); 1952 * GROUP_STOP_PENDING could be set if another group stop has
1779 do_notify_parent_cldstop(current, notify); 1953 * started since being woken up or ptrace wants us to transit
1780 read_unlock(&tasklist_lock); 1954 * between TASK_STOPPED and TRACED. Retry group stop.
1955 */
1956 if (current->group_stop & GROUP_STOP_PENDING) {
1957 WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK));
1958 goto retry;
1781 } 1959 }
1782 1960
1783 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 1961 /* PTRACE_ATTACH might have raced with task killing, clear trapping */
1784 do { 1962 task_clear_group_stop_trapping(current);
1785 schedule(); 1963
1786 } while (try_to_freeze()); 1964 spin_unlock_irq(&current->sighand->siglock);
1787 1965
1788 tracehook_finish_jctl(); 1966 tracehook_finish_jctl();
1789 current->exit_code = 0;
1790 1967
1791 return 1; 1968 return 1;
1792} 1969}
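Structurally, the rewritten do_signal_stop() above is a loop: initiate or join a pending group stop, stop (or trap for the ptracer), and on wake-up retry if another stop was initiated or a STOPPED/TRACED transition is wanted. A stripped-down control-flow model with invented helpers (an illustration of the shape, not the kernel logic verbatim):

    #include <stdbool.h>

    /* Invented stand-ins for the kernel state; illustration only. */
    static bool stop_pending;
    static bool being_ptraced;

    static bool join_or_start_group_stop(void) { stop_pending = true; return true; }
    static void sleep_stopped(void)            { stop_pending = false; }
    static void trap_for_ptracer(void)         { stop_pending = false; }

    static int do_signal_stop_model(void)
    {
            if (!stop_pending && !join_or_start_group_stop())
                    return 0;               /* nothing to do, caller keeps the lock */
    retry:
            if (!being_ptraced)
                    sleep_stopped();        /* TASK_STOPPED until SIGCONT/SIGKILL   */
            else
                    trap_for_ptracer();     /* TASK_TRACED, reported to the ptracer */

            if (stop_pending)               /* new stop raced in, or STOPPED<->TRACED */
                    goto retry;
            return 1;
    }

    int main(void) { return !do_signal_stop_model(); }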
@@ -1800,7 +1977,7 @@ static int ptrace_signal(int signr, siginfo_t *info,
1800 ptrace_signal_deliver(regs, cookie); 1977 ptrace_signal_deliver(regs, cookie);
1801 1978
1802 /* Let the debugger run. */ 1979 /* Let the debugger run. */
1803 ptrace_stop(signr, 0, info); 1980 ptrace_stop(signr, CLD_TRAPPED, 0, info);
1804 1981
1805 /* We're back. Did the debugger cancel the sig? */ 1982 /* We're back. Did the debugger cancel the sig? */
1806 signr = current->exit_code; 1983 signr = current->exit_code;
@@ -1809,10 +1986,12 @@ static int ptrace_signal(int signr, siginfo_t *info,
1809 1986
1810 current->exit_code = 0; 1987 current->exit_code = 0;
1811 1988
1812 /* Update the siginfo structure if the signal has 1989 /*
1813 changed. If the debugger wanted something 1990 * Update the siginfo structure if the signal has
1814 specific in the siginfo structure then it should 1991 * changed. If the debugger wanted something
1815 have updated *info via PTRACE_SETSIGINFO. */ 1992 * specific in the siginfo structure then it should
1993 * have updated *info via PTRACE_SETSIGINFO.
1994 */
1816 if (signr != info->si_signo) { 1995 if (signr != info->si_signo) {
1817 info->si_signo = signr; 1996 info->si_signo = signr;
1818 info->si_errno = 0; 1997 info->si_errno = 0;
@@ -1853,25 +2032,43 @@ relock:
1853 * the CLD_ si_code into SIGNAL_CLD_MASK bits. 2032 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
1854 */ 2033 */
1855 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { 2034 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
1856 int why = (signal->flags & SIGNAL_STOP_CONTINUED) 2035 struct task_struct *leader;
1857 ? CLD_CONTINUED : CLD_STOPPED; 2036 int why;
2037
2038 if (signal->flags & SIGNAL_CLD_CONTINUED)
2039 why = CLD_CONTINUED;
2040 else
2041 why = CLD_STOPPED;
2042
1858 signal->flags &= ~SIGNAL_CLD_MASK; 2043 signal->flags &= ~SIGNAL_CLD_MASK;
1859 2044
1860 why = tracehook_notify_jctl(why, CLD_CONTINUED);
1861 spin_unlock_irq(&sighand->siglock); 2045 spin_unlock_irq(&sighand->siglock);
1862 2046
1863 if (why) { 2047 /*
1864 read_lock(&tasklist_lock); 2048 * Notify the parent that we're continuing. This event is
1865 do_notify_parent_cldstop(current->group_leader, why); 2049 * always per-process and doesn't make whole lot of sense
1866 read_unlock(&tasklist_lock); 2050 * for ptracers, who shouldn't consume the state via
1867 } 2051 * wait(2) either, but, for backward compatibility, notify
2052 * the ptracer of the group leader too unless it's gonna be
2053 * a duplicate.
2054 */
2055 read_lock(&tasklist_lock);
2056
2057 do_notify_parent_cldstop(current, false, why);
2058
2059 leader = current->group_leader;
2060 if (task_ptrace(leader) && !real_parent_is_ptracer(leader))
2061 do_notify_parent_cldstop(leader, true, why);
2062
2063 read_unlock(&tasklist_lock);
2064
1868 goto relock; 2065 goto relock;
1869 } 2066 }
1870 2067
1871 for (;;) { 2068 for (;;) {
1872 struct k_sigaction *ka; 2069 struct k_sigaction *ka;
1873 /* 2070 /*
1874 * Tracing can induce an artifical signal and choose sigaction. 2071 * Tracing can induce an artificial signal and choose sigaction.
1875 * The return value in @signr determines the default action, 2072 * The return value in @signr determines the default action,
1876 * but @info->si_signo is the signal number we will report. 2073 * but @info->si_signo is the signal number we will report.
1877 */ 2074 */
@@ -1881,8 +2078,8 @@ relock:
1881 if (unlikely(signr != 0)) 2078 if (unlikely(signr != 0))
1882 ka = return_ka; 2079 ka = return_ka;
1883 else { 2080 else {
1884 if (unlikely(signal->group_stop_count > 0) && 2081 if (unlikely(current->group_stop &
1885 do_signal_stop(0)) 2082 GROUP_STOP_PENDING) && do_signal_stop(0))
1886 goto relock; 2083 goto relock;
1887 2084
1888 signr = dequeue_signal(current, &current->blocked, 2085 signr = dequeue_signal(current, &current->blocked,
@@ -2001,10 +2198,42 @@ relock:
2001 return signr; 2198 return signr;
2002} 2199}
2003 2200
2201/*
2202 * It could be that complete_signal() picked us to notify about the
2203 * group-wide signal. Other threads should be notified now to take
2204 * the shared signals in @which since we will not.
2205 */
2206static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
2207{
2208 sigset_t retarget;
2209 struct task_struct *t;
2210
2211 sigandsets(&retarget, &tsk->signal->shared_pending.signal, which);
2212 if (sigisemptyset(&retarget))
2213 return;
2214
2215 t = tsk;
2216 while_each_thread(tsk, t) {
2217 if (t->flags & PF_EXITING)
2218 continue;
2219
2220 if (!has_pending_signals(&retarget, &t->blocked))
2221 continue;
2222 /* Remove the signals this thread can handle. */
2223 sigandsets(&retarget, &retarget, &t->blocked);
2224
2225 if (!signal_pending(t))
2226 signal_wake_up(t, 0);
2227
2228 if (sigisemptyset(&retarget))
2229 break;
2230 }
2231}
2232
2004void exit_signals(struct task_struct *tsk) 2233void exit_signals(struct task_struct *tsk)
2005{ 2234{
2006 int group_stop = 0; 2235 int group_stop = 0;
2007 struct task_struct *t; 2236 sigset_t unblocked;
2008 2237
2009 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { 2238 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
2010 tsk->flags |= PF_EXITING; 2239 tsk->flags |= PF_EXITING;
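The retargeting added above is a set computation: intersect the group-wide pending signals with @which, then wake any sibling that does not block some of what is left, dropping from the set whatever that sibling will take. A tiny bitmask model of that narrowing (plain C masks instead of the kernel sigset API):

    #include <stdio.h>

    /* Model signal sets as bitmasks; bit n set == signal n pending/blocked. */
    int main(void)
    {
            unsigned long shared_pending = 0x0f;  /* signals 0-3 pending group-wide */
            unsigned long which          = 0x0a;  /* the exiting thread cared about 1,3 */
            unsigned long retarget       = shared_pending & which;

            unsigned long blocked[2] = { 0x02, 0x00 };  /* per-sibling blocked masks */

            for (int i = 0; i < 2 && retarget; i++) {
                    if (!(retarget & ~blocked[i]))
                            continue;               /* sibling blocks everything left */
                    retarget &= blocked[i];         /* it will take the unblocked part */
                    printf("wake thread %d\n", i);
            }
            return 0;
    }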
@@ -2020,25 +2249,23 @@ void exit_signals(struct task_struct *tsk)
2020 if (!signal_pending(tsk)) 2249 if (!signal_pending(tsk))
2021 goto out; 2250 goto out;
2022 2251
2023 /* It could be that __group_complete_signal() choose us to 2252 unblocked = tsk->blocked;
2024 * notify about group-wide signal. Another thread should be 2253 signotset(&unblocked);
2025 * woken now to take the signal since we will not. 2254 retarget_shared_pending(tsk, &unblocked);
2026 */
2027 for (t = tsk; (t = next_thread(t)) != tsk; )
2028 if (!signal_pending(t) && !(t->flags & PF_EXITING))
2029 recalc_sigpending_and_wake(t);
2030 2255
2031 if (unlikely(tsk->signal->group_stop_count) && 2256 if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) &&
2032 !--tsk->signal->group_stop_count) { 2257 task_participate_group_stop(tsk))
2033 tsk->signal->flags = SIGNAL_STOP_STOPPED; 2258 group_stop = CLD_STOPPED;
2034 group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
2035 }
2036out: 2259out:
2037 spin_unlock_irq(&tsk->sighand->siglock); 2260 spin_unlock_irq(&tsk->sighand->siglock);
2038 2261
2262 /*
2263 * If group stop has completed, deliver the notification. This
2264 * should always go to the real parent of the group leader.
2265 */
2039 if (unlikely(group_stop)) { 2266 if (unlikely(group_stop)) {
2040 read_lock(&tasklist_lock); 2267 read_lock(&tasklist_lock);
2041 do_notify_parent_cldstop(tsk, group_stop); 2268 do_notify_parent_cldstop(tsk, false, group_stop);
2042 read_unlock(&tasklist_lock); 2269 read_unlock(&tasklist_lock);
2043 } 2270 }
2044} 2271}
@@ -2058,6 +2285,9 @@ EXPORT_SYMBOL(unblock_all_signals);
2058 * System call entry points. 2285 * System call entry points.
2059 */ 2286 */
2060 2287
2288/**
2289 * sys_restart_syscall - restart a system call
2290 */
2061SYSCALL_DEFINE0(restart_syscall) 2291SYSCALL_DEFINE0(restart_syscall)
2062{ 2292{
2063 struct restart_block *restart = &current_thread_info()->restart_block; 2293 struct restart_block *restart = &current_thread_info()->restart_block;
@@ -2069,11 +2299,33 @@ long do_no_restart_syscall(struct restart_block *param)
2069 return -EINTR; 2299 return -EINTR;
2070} 2300}
2071 2301
2072/* 2302static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
2073 * We don't need to get the kernel lock - this is all local to this 2303{
2074 * particular thread.. (and that's good, because this is _heavily_ 2304 if (signal_pending(tsk) && !thread_group_empty(tsk)) {
2075 * used by various programs) 2305 sigset_t newblocked;
2306 /* A set of now blocked but previously unblocked signals. */
2307 sigandnsets(&newblocked, newset, &current->blocked);
2308 retarget_shared_pending(tsk, &newblocked);
2309 }
2310 tsk->blocked = *newset;
2311 recalc_sigpending();
2312}
2313
2314/**
2315 * set_current_blocked - change current->blocked mask
2316 * @newset: new mask
2317 *
2318 * It is wrong to change ->blocked directly, this helper should be used
2319 * to ensure the process can't miss a shared signal we are going to block.
2076 */ 2320 */
2321void set_current_blocked(const sigset_t *newset)
2322{
2323 struct task_struct *tsk = current;
2324
2325 spin_lock_irq(&tsk->sighand->siglock);
2326 __set_task_blocked(tsk, newset);
2327 spin_unlock_irq(&tsk->sighand->siglock);
2328}
2077 2329
2078/* 2330/*
2079 * This is also useful for kernel threads that want to temporarily 2331 * This is also useful for kernel threads that want to temporarily
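set_current_blocked() above exists because the blocked mask must only change in one place, under siglock, so that shared pending signals can be retargeted before they become invisible to this thread. User space follows the same "never poke the mask directly" rule; the supported interface is sigprocmask()/pthread_sigmask(), for example:

    #include <signal.h>
    #include <stdio.h>

    int main(void)
    {
            sigset_t block, old;

            sigemptyset(&block);
            sigaddset(&block, SIGUSR1);

            /* One atomic request to the kernel; never modify the mask by hand. */
            if (sigprocmask(SIG_BLOCK, &block, &old) < 0) {
                    perror("sigprocmask");
                    return 1;
            }
            printf("SIGUSR1 was %sblocked before\n",
                   sigismember(&old, SIGUSR1) ? "" : "not ");
            return 0;
    }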
@@ -2085,66 +2337,66 @@ long do_no_restart_syscall(struct restart_block *param)
2085 */ 2337 */
2086int sigprocmask(int how, sigset_t *set, sigset_t *oldset) 2338int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2087{ 2339{
2088 int error; 2340 struct task_struct *tsk = current;
2341 sigset_t newset;
2089 2342
2090 spin_lock_irq(&current->sighand->siglock); 2343 /* Lockless, only current can change ->blocked, never from irq */
2091 if (oldset) 2344 if (oldset)
2092 *oldset = current->blocked; 2345 *oldset = tsk->blocked;
2093 2346
2094 error = 0;
2095 switch (how) { 2347 switch (how) {
2096 case SIG_BLOCK: 2348 case SIG_BLOCK:
2097 sigorsets(&current->blocked, &current->blocked, set); 2349 sigorsets(&newset, &tsk->blocked, set);
2098 break; 2350 break;
2099 case SIG_UNBLOCK: 2351 case SIG_UNBLOCK:
2100 signandsets(&current->blocked, &current->blocked, set); 2352 sigandnsets(&newset, &tsk->blocked, set);
2101 break; 2353 break;
2102 case SIG_SETMASK: 2354 case SIG_SETMASK:
2103 current->blocked = *set; 2355 newset = *set;
2104 break; 2356 break;
2105 default: 2357 default:
2106 error = -EINVAL; 2358 return -EINVAL;
2107 } 2359 }
2108 recalc_sigpending();
2109 spin_unlock_irq(&current->sighand->siglock);
2110 2360
2111 return error; 2361 set_current_blocked(&newset);
2362 return 0;
2112} 2363}
2113 2364
2114SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, 2365/**
2366 * sys_rt_sigprocmask - change the list of currently blocked signals
2367 * @how: whether to add, remove, or set signals
 2368 * @nset: signals to add, remove, or set (if non-null)
2369 * @oset: previous value of signal mask if non-null
2370 * @sigsetsize: size of sigset_t type
2371 */
2372SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
2115 sigset_t __user *, oset, size_t, sigsetsize) 2373 sigset_t __user *, oset, size_t, sigsetsize)
2116{ 2374{
2117 int error = -EINVAL;
2118 sigset_t old_set, new_set; 2375 sigset_t old_set, new_set;
2376 int error;
2119 2377
2120 /* XXX: Don't preclude handling different sized sigset_t's. */ 2378 /* XXX: Don't preclude handling different sized sigset_t's. */
2121 if (sigsetsize != sizeof(sigset_t)) 2379 if (sigsetsize != sizeof(sigset_t))
2122 goto out; 2380 return -EINVAL;
2123 2381
2124 if (set) { 2382 old_set = current->blocked;
2125 error = -EFAULT; 2383
2126 if (copy_from_user(&new_set, set, sizeof(*set))) 2384 if (nset) {
2127 goto out; 2385 if (copy_from_user(&new_set, nset, sizeof(sigset_t)))
2386 return -EFAULT;
2128 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); 2387 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
2129 2388
2130 error = sigprocmask(how, &new_set, &old_set); 2389 error = sigprocmask(how, &new_set, NULL);
2131 if (error) 2390 if (error)
2132 goto out; 2391 return error;
2133 if (oset) 2392 }
2134 goto set_old;
2135 } else if (oset) {
2136 spin_lock_irq(&current->sighand->siglock);
2137 old_set = current->blocked;
2138 spin_unlock_irq(&current->sighand->siglock);
2139 2393
2140 set_old: 2394 if (oset) {
2141 error = -EFAULT; 2395 if (copy_to_user(oset, &old_set, sizeof(sigset_t)))
2142 if (copy_to_user(oset, &old_set, sizeof(*oset))) 2396 return -EFAULT;
2143 goto out;
2144 } 2397 }
2145 error = 0; 2398
2146out: 2399 return 0;
2147 return error;
2148} 2400}
2149 2401
2150long do_sigpending(void __user *set, unsigned long sigsetsize) 2402long do_sigpending(void __user *set, unsigned long sigsetsize)
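The sys_rt_sigprocmask() rework above makes the three 'how' modes explicit set algebra on a scratch copy before publishing it through set_current_blocked(). In isolation the modes reduce to the following (illustrative C, plain bitmasks standing in for sigset_t):

    #include <stdio.h>

    enum { HOW_BLOCK, HOW_UNBLOCK, HOW_SETMASK };

    /* Compute the new blocked mask; mirrors SIG_BLOCK/SIG_UNBLOCK/SIG_SETMASK. */
    static unsigned long apply_how(int how, unsigned long blocked, unsigned long set)
    {
            switch (how) {
            case HOW_BLOCK:   return blocked | set;    /* union                    */
            case HOW_UNBLOCK: return blocked & ~set;   /* and-not                  */
            case HOW_SETMASK: return set;              /* wholesale replace        */
            default:          return blocked;          /* -EINVAL in the real call */
            }
    }

    int main(void)
    {
            unsigned long blocked = 0x5;

            printf("%#lx %#lx %#lx\n",
                   apply_how(HOW_BLOCK, blocked, 0x2),
                   apply_how(HOW_UNBLOCK, blocked, 0x4),
                   apply_how(HOW_SETMASK, blocked, 0x8));  /* 0x7 0x1 0x8 */
            return 0;
    }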
@@ -2169,8 +2421,14 @@ long do_sigpending(void __user *set, unsigned long sigsetsize)
2169 2421
2170out: 2422out:
2171 return error; 2423 return error;
2172} 2424}
2173 2425
2426/**
 2427 * sys_rt_sigpending - examine pending signals that have been raised
2428 * while blocked
2429 * @set: stores pending signals
2430 * @sigsetsize: size of sigset_t type or larger
2431 */
2174SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) 2432SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize)
2175{ 2433{
2176 return do_sigpending(set, sigsetsize); 2434 return do_sigpending(set, sigsetsize);
@@ -2219,9 +2477,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2219 err |= __put_user(from->si_trapno, &to->si_trapno); 2477 err |= __put_user(from->si_trapno, &to->si_trapno);
2220#endif 2478#endif
2221#ifdef BUS_MCEERR_AO 2479#ifdef BUS_MCEERR_AO
2222 /* 2480 /*
2223 * Other callers might not initialize the si_lsb field, 2481 * Other callers might not initialize the si_lsb field,
2224 * so check explicitely for the right codes here. 2482 * so check explicitly for the right codes here.
2225 */ 2483 */
2226 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) 2484 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
2227 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); 2485 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
@@ -2250,15 +2508,82 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2250 2508
2251#endif 2509#endif
2252 2510
2511/**
2512 * do_sigtimedwait - wait for queued signals specified in @which
2513 * @which: queued signals to wait for
2514 * @info: if non-null, the signal's siginfo is returned here
2515 * @ts: upper bound on process time suspension
2516 */
2517int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2518 const struct timespec *ts)
2519{
2520 struct task_struct *tsk = current;
2521 long timeout = MAX_SCHEDULE_TIMEOUT;
2522 sigset_t mask = *which;
2523 int sig;
2524
2525 if (ts) {
2526 if (!timespec_valid(ts))
2527 return -EINVAL;
2528 timeout = timespec_to_jiffies(ts);
2529 /*
2530 * We can be close to the next tick, add another one
2531 * to ensure we will wait at least the time asked for.
2532 */
2533 if (ts->tv_sec || ts->tv_nsec)
2534 timeout++;
2535 }
2536
2537 /*
2538 * Invert the set of allowed signals to get those we want to block.
2539 */
2540 sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
2541 signotset(&mask);
2542
2543 spin_lock_irq(&tsk->sighand->siglock);
2544 sig = dequeue_signal(tsk, &mask, info);
2545 if (!sig && timeout) {
2546 /*
 2547 * None ready, temporarily unblock those we're interested in
 2548 * while we are sleeping so that we'll be awakened when
2549 * they arrive. Unblocking is always fine, we can avoid
2550 * set_current_blocked().
2551 */
2552 tsk->real_blocked = tsk->blocked;
2553 sigandsets(&tsk->blocked, &tsk->blocked, &mask);
2554 recalc_sigpending();
2555 spin_unlock_irq(&tsk->sighand->siglock);
2556
2557 timeout = schedule_timeout_interruptible(timeout);
2558
2559 spin_lock_irq(&tsk->sighand->siglock);
2560 __set_task_blocked(tsk, &tsk->real_blocked);
2561 siginitset(&tsk->real_blocked, 0);
2562 sig = dequeue_signal(tsk, &mask, info);
2563 }
2564 spin_unlock_irq(&tsk->sighand->siglock);
2565
2566 if (sig)
2567 return sig;
2568 return timeout ? -EINTR : -EAGAIN;
2569}
2570
2571/**
2572 * sys_rt_sigtimedwait - synchronously wait for queued signals specified
2573 * in @uthese
2574 * @uthese: queued signals to wait for
2575 * @uinfo: if non-null, the signal's siginfo is returned here
2576 * @uts: upper bound on process time suspension
2577 * @sigsetsize: size of sigset_t type
2578 */
2253SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, 2579SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2254 siginfo_t __user *, uinfo, const struct timespec __user *, uts, 2580 siginfo_t __user *, uinfo, const struct timespec __user *, uts,
2255 size_t, sigsetsize) 2581 size_t, sigsetsize)
2256{ 2582{
2257 int ret, sig;
2258 sigset_t these; 2583 sigset_t these;
2259 struct timespec ts; 2584 struct timespec ts;
2260 siginfo_t info; 2585 siginfo_t info;
2261 long timeout = 0; 2586 int ret;
2262 2587
2263 /* XXX: Don't preclude handling different sized sigset_t's. */ 2588 /* XXX: Don't preclude handling different sized sigset_t's. */
2264 if (sigsetsize != sizeof(sigset_t)) 2589 if (sigsetsize != sizeof(sigset_t))
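do_sigtimedwait() above is the backend of sigtimedwait(2); the extra jiffy noted in its comment is why a short timeout still waits at least as long as requested. The user-space counterpart, using only standard POSIX calls, looks like this:

    #define _POSIX_C_SOURCE 200809L
    #include <signal.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            sigset_t set;
            siginfo_t info;
            struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

            sigemptyset(&set);
            sigaddset(&set, SIGUSR1);
            /* Block SIGUSR1 so it stays queued for sigtimedwait(). */
            sigprocmask(SIG_BLOCK, &set, NULL);

            int sig = sigtimedwait(&set, &info, &ts);
            if (sig < 0)
                    perror("sigtimedwait");   /* EAGAIN on timeout, EINTR possible */
            else
                    printf("got signal %d from pid %d\n", sig, (int)info.si_pid);
            return 0;
    }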
@@ -2266,65 +2591,27 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2266 2591
2267 if (copy_from_user(&these, uthese, sizeof(these))) 2592 if (copy_from_user(&these, uthese, sizeof(these)))
2268 return -EFAULT; 2593 return -EFAULT;
2269
2270 /*
2271 * Invert the set of allowed signals to get those we
2272 * want to block.
2273 */
2274 sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP));
2275 signotset(&these);
2276 2594
2277 if (uts) { 2595 if (uts) {
2278 if (copy_from_user(&ts, uts, sizeof(ts))) 2596 if (copy_from_user(&ts, uts, sizeof(ts)))
2279 return -EFAULT; 2597 return -EFAULT;
2280 if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0
2281 || ts.tv_sec < 0)
2282 return -EINVAL;
2283 } 2598 }
2284 2599
2285 spin_lock_irq(&current->sighand->siglock); 2600 ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);
2286 sig = dequeue_signal(current, &these, &info);
2287 if (!sig) {
2288 timeout = MAX_SCHEDULE_TIMEOUT;
2289 if (uts)
2290 timeout = (timespec_to_jiffies(&ts)
2291 + (ts.tv_sec || ts.tv_nsec));
2292
2293 if (timeout) {
2294 /* None ready -- temporarily unblock those we're
2295 * interested while we are sleeping in so that we'll
2296 * be awakened when they arrive. */
2297 current->real_blocked = current->blocked;
2298 sigandsets(&current->blocked, &current->blocked, &these);
2299 recalc_sigpending();
2300 spin_unlock_irq(&current->sighand->siglock);
2301
2302 timeout = schedule_timeout_interruptible(timeout);
2303
2304 spin_lock_irq(&current->sighand->siglock);
2305 sig = dequeue_signal(current, &these, &info);
2306 current->blocked = current->real_blocked;
2307 siginitset(&current->real_blocked, 0);
2308 recalc_sigpending();
2309 }
2310 }
2311 spin_unlock_irq(&current->sighand->siglock);
2312 2601
2313 if (sig) { 2602 if (ret > 0 && uinfo) {
2314 ret = sig; 2603 if (copy_siginfo_to_user(uinfo, &info))
2315 if (uinfo) { 2604 ret = -EFAULT;
2316 if (copy_siginfo_to_user(uinfo, &info))
2317 ret = -EFAULT;
2318 }
2319 } else {
2320 ret = -EAGAIN;
2321 if (timeout)
2322 ret = -EINTR;
2323 } 2605 }
2324 2606
2325 return ret; 2607 return ret;
2326} 2608}
2327 2609
2610/**
2611 * sys_kill - send a signal to a process
2612 * @pid: the PID of the process
2613 * @sig: signal to be sent
2614 */
2328SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) 2615SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2329{ 2616{
2330 struct siginfo info; 2617 struct siginfo info;
@@ -2400,7 +2687,11 @@ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
2400 return do_tkill(tgid, pid, sig); 2687 return do_tkill(tgid, pid, sig);
2401} 2688}
2402 2689
2403/* 2690/**
2691 * sys_tkill - send signal to one specific task
2692 * @pid: the PID of the task
2693 * @sig: signal to be sent
2694 *
2404 * Send a signal to only one task, even if it's a CLONE_THREAD task. 2695 * Send a signal to only one task, even if it's a CLONE_THREAD task.
2405 */ 2696 */
2406SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) 2697SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
@@ -2412,6 +2703,12 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
2412 return do_tkill(0, pid, sig); 2703 return do_tkill(0, pid, sig);
2413} 2704}
2414 2705
2706/**
2707 * sys_rt_sigqueueinfo - send signal information to a signal
2708 * @pid: the PID of the thread
2709 * @sig: signal to be sent
2710 * @uinfo: signal info to be sent
2711 */
2415SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, 2712SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2416 siginfo_t __user *, uinfo) 2713 siginfo_t __user *, uinfo)
2417{ 2714{
@@ -2421,9 +2718,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2421 return -EFAULT; 2718 return -EFAULT;
2422 2719
2423 /* Not even root can pretend to send signals from the kernel. 2720 /* Not even root can pretend to send signals from the kernel.
2424 Nor can they impersonate a kill(), which adds source info. */ 2721 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2425 if (info.si_code >= 0) 2722 */
2723 if (info.si_code >= 0 || info.si_code == SI_TKILL) {
2724 /* We used to allow any < 0 si_code */
2725 WARN_ON_ONCE(info.si_code < 0);
2426 return -EPERM; 2726 return -EPERM;
2727 }
2427 info.si_signo = sig; 2728 info.si_signo = sig;
2428 2729
2429 /* POSIX.1b doesn't mention process groups. */ 2730 /* POSIX.1b doesn't mention process groups. */
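The si_code check above stops user space from forging SI_TKILL through rt_sigqueueinfo; legitimate queuing with a payload goes through sigqueue(3), which the C library issues as rt_sigqueueinfo with si_code = SI_QUEUE. For example:

    #define _POSIX_C_SOURCE 200809L
    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            sigset_t set;
            siginfo_t info;
            union sigval v = { .sival_int = 42 };

            sigemptyset(&set);
            sigaddset(&set, SIGUSR1);
            sigprocmask(SIG_BLOCK, &set, NULL);   /* keep it queued, don't die */

            if (sigqueue(getpid(), SIGUSR1, v) < 0) {
                    perror("sigqueue");
                    return 1;
            }
            sigwaitinfo(&set, &info);
            printf("si_code=%d payload=%d\n", info.si_code, info.si_value.sival_int);
            return 0;
    }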
@@ -2437,9 +2738,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2437 return -EINVAL; 2738 return -EINVAL;
2438 2739
2439 /* Not even root can pretend to send signals from the kernel. 2740 /* Not even root can pretend to send signals from the kernel.
2440 Nor can they impersonate a kill(), which adds source info. */ 2741 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2441 if (info->si_code >= 0) 2742 */
2743 if (info->si_code >= 0 || info->si_code == SI_TKILL) {
2744 /* We used to allow any < 0 si_code */
2745 WARN_ON_ONCE(info->si_code < 0);
2442 return -EPERM; 2746 return -EPERM;
2747 }
2443 info->si_signo = sig; 2748 info->si_signo = sig;
2444 2749
2445 return do_send_specific(tgid, pid, sig, info); 2750 return do_send_specific(tgid, pid, sig, info);
@@ -2531,12 +2836,11 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2531 2836
2532 error = -EINVAL; 2837 error = -EINVAL;
2533 /* 2838 /*
2534 * 2839 * Note - this code used to test ss_flags incorrectly:
2535 * Note - this code used to test ss_flags incorrectly
2536 * old code may have been written using ss_flags==0 2840 * old code may have been written using ss_flags==0
2537 * to mean ss_flags==SS_ONSTACK (as this was the only 2841 * to mean ss_flags==SS_ONSTACK (as this was the only
2538 * way that worked) - this fix preserves that older 2842 * way that worked) - this fix preserves that older
2539 * mechanism 2843 * mechanism.
2540 */ 2844 */
2541 if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) 2845 if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
2542 goto out; 2846 goto out;
@@ -2570,6 +2874,10 @@ out:
2570 2874
2571#ifdef __ARCH_WANT_SYS_SIGPENDING 2875#ifdef __ARCH_WANT_SYS_SIGPENDING
2572 2876
2877/**
2878 * sys_sigpending - examine pending signals
 2879 * @set: where the mask of pending signals is returned
2880 */
2573SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) 2881SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
2574{ 2882{
2575 return do_sigpending(set, sizeof(*set)); 2883 return do_sigpending(set, sizeof(*set));
@@ -2578,60 +2886,65 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
2578#endif 2886#endif
2579 2887
2580#ifdef __ARCH_WANT_SYS_SIGPROCMASK 2888#ifdef __ARCH_WANT_SYS_SIGPROCMASK
2581/* Some platforms have their own version with special arguments others 2889/**
2582 support only sys_rt_sigprocmask. */ 2890 * sys_sigprocmask - examine and change blocked signals
2891 * @how: whether to add, remove, or set signals
2892 * @nset: signals to add or remove (if non-null)
2893 * @oset: previous value of signal mask if non-null
2894 *
2895 * Some platforms have their own version with special arguments;
2896 * others support only sys_rt_sigprocmask.
2897 */
2583 2898
2584SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, 2899SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
2585 old_sigset_t __user *, oset) 2900 old_sigset_t __user *, oset)
2586{ 2901{
2587 int error;
2588 old_sigset_t old_set, new_set; 2902 old_sigset_t old_set, new_set;
2903 sigset_t new_blocked;
2589 2904
2590 if (set) { 2905 old_set = current->blocked.sig[0];
2591 error = -EFAULT; 2906
2592 if (copy_from_user(&new_set, set, sizeof(*set))) 2907 if (nset) {
2593 goto out; 2908 if (copy_from_user(&new_set, nset, sizeof(*nset)))
2909 return -EFAULT;
2594 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); 2910 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
2595 2911
2596 spin_lock_irq(&current->sighand->siglock); 2912 new_blocked = current->blocked;
2597 old_set = current->blocked.sig[0];
2598 2913
2599 error = 0;
2600 switch (how) { 2914 switch (how) {
2601 default:
2602 error = -EINVAL;
2603 break;
2604 case SIG_BLOCK: 2915 case SIG_BLOCK:
2605 sigaddsetmask(&current->blocked, new_set); 2916 sigaddsetmask(&new_blocked, new_set);
2606 break; 2917 break;
2607 case SIG_UNBLOCK: 2918 case SIG_UNBLOCK:
2608 sigdelsetmask(&current->blocked, new_set); 2919 sigdelsetmask(&new_blocked, new_set);
2609 break; 2920 break;
2610 case SIG_SETMASK: 2921 case SIG_SETMASK:
2611 current->blocked.sig[0] = new_set; 2922 new_blocked.sig[0] = new_set;
2612 break; 2923 break;
2924 default:
2925 return -EINVAL;
2613 } 2926 }
2614 2927
2615 recalc_sigpending(); 2928 set_current_blocked(&new_blocked);
2616 spin_unlock_irq(&current->sighand->siglock); 2929 }
2617 if (error) 2930
2618 goto out; 2931 if (oset) {
2619 if (oset)
2620 goto set_old;
2621 } else if (oset) {
2622 old_set = current->blocked.sig[0];
2623 set_old:
2624 error = -EFAULT;
2625 if (copy_to_user(oset, &old_set, sizeof(*oset))) 2932 if (copy_to_user(oset, &old_set, sizeof(*oset)))
2626 goto out; 2933 return -EFAULT;
2627 } 2934 }
2628 error = 0; 2935
2629out: 2936 return 0;
2630 return error;
2631} 2937}
2632#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ 2938#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
2633 2939
2634#ifdef __ARCH_WANT_SYS_RT_SIGACTION 2940#ifdef __ARCH_WANT_SYS_RT_SIGACTION
2941/**
2942 * sys_rt_sigaction - alter an action taken by a process
2943 * @sig: signal to be sent
2944 * @act: new sigaction
2945 * @oact: used to save the previous sigaction
2946 * @sigsetsize: size of sigset_t type
2947 */
2635SYSCALL_DEFINE4(rt_sigaction, int, sig, 2948SYSCALL_DEFINE4(rt_sigaction, int, sig,
2636 const struct sigaction __user *, act, 2949 const struct sigaction __user *, act,
2637 struct sigaction __user *, oact, 2950 struct sigaction __user *, oact,
@@ -2710,14 +3023,22 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
2710 3023
2711SYSCALL_DEFINE0(pause) 3024SYSCALL_DEFINE0(pause)
2712{ 3025{
2713 current->state = TASK_INTERRUPTIBLE; 3026 while (!signal_pending(current)) {
2714 schedule(); 3027 current->state = TASK_INTERRUPTIBLE;
3028 schedule();
3029 }
2715 return -ERESTARTNOHAND; 3030 return -ERESTARTNOHAND;
2716} 3031}
2717 3032
2718#endif 3033#endif
2719 3034
2720#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND 3035#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
3036/**
 3037 * sys_rt_sigsuspend - replace the signal mask with the value given in
 3038 * @unewset until a signal is received
3039 * @unewset: new signal mask value
3040 * @sigsetsize: size of sigset_t type
3041 */
2721SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) 3042SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
2722{ 3043{
2723 sigset_t newset; 3044 sigset_t newset;
diff --git a/kernel/smp.c b/kernel/smp.c
index 9910744f0856..73a195193558 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -194,7 +194,7 @@ void generic_smp_call_function_interrupt(void)
194 */ 194 */
195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
196 int refs; 196 int refs;
197 void (*func) (void *info); 197 smp_call_func_t func;
198 198
199 /* 199 /*
200 * Since we walk the list without any locks, we might 200 * Since we walk the list without any locks, we might
@@ -214,17 +214,17 @@ void generic_smp_call_function_interrupt(void)
214 if (atomic_read(&data->refs) == 0) 214 if (atomic_read(&data->refs) == 0)
215 continue; 215 continue;
216 216
217 func = data->csd.func; /* for later warn */ 217 func = data->csd.func; /* save for later warn */
218 data->csd.func(data->csd.info); 218 func(data->csd.info);
219 219
220 /* 220 /*
221 * If the cpu mask is not still set then it enabled interrupts, 221 * If the cpu mask is not still set then func enabled
222 * we took another smp interrupt, and executed the function 222 * interrupts (BUG), and this cpu took another smp call
223 * twice on this cpu. In theory that copy decremented refs. 223 * function interrupt and executed func(info) twice
224 * on this cpu. That nested execution decremented refs.
224 */ 225 */
225 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { 226 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
226 WARN(1, "%pS enabled interrupts and double executed\n", 227 WARN(1, "%pf enabled interrupts and double executed\n", func);
227 func);
228 continue; 228 continue;
229 } 229 }
230 230
@@ -450,7 +450,7 @@ void smp_call_function_many(const struct cpumask *mask,
450{ 450{
451 struct call_function_data *data; 451 struct call_function_data *data;
452 unsigned long flags; 452 unsigned long flags;
453 int cpu, next_cpu, this_cpu = smp_processor_id(); 453 int refs, cpu, next_cpu, this_cpu = smp_processor_id();
454 454
455 /* 455 /*
456 * Can deadlock when called with interrupts disabled. 456 * Can deadlock when called with interrupts disabled.
@@ -461,7 +461,7 @@ void smp_call_function_many(const struct cpumask *mask,
461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
462 && !oops_in_progress && !early_boot_irqs_disabled); 462 && !oops_in_progress && !early_boot_irqs_disabled);
463 463
464 /* So, what's a CPU they want? Ignoring this one. */ 464 /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
465 cpu = cpumask_first_and(mask, cpu_online_mask); 465 cpu = cpumask_first_and(mask, cpu_online_mask);
466 if (cpu == this_cpu) 466 if (cpu == this_cpu)
467 cpu = cpumask_next_and(cpu, mask, cpu_online_mask); 467 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
@@ -483,22 +483,49 @@ void smp_call_function_many(const struct cpumask *mask,
483 483
484 data = &__get_cpu_var(cfd_data); 484 data = &__get_cpu_var(cfd_data);
485 csd_lock(&data->csd); 485 csd_lock(&data->csd);
486
487 /* This BUG_ON verifies our reuse assertions and can be removed */
486 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); 488 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
487 489
490 /*
491 * The global call function queue list add and delete are protected
492 * by a lock, but the list is traversed without any lock, relying
493 * on the rcu list add and delete to allow safe concurrent traversal.
494 * We reuse the call function data without waiting for any grace
495 * period after some other cpu removes it from the global queue.
496 * This means a cpu might find our data block as it is being
497 * filled out.
498 *
499 * We hold off the interrupt handler on the other cpu by
500 * ordering our writes to the cpu mask vs our setting of the
501 * refs counter. We assert only the cpu owning the data block
502 * will set a bit in cpumask, and each bit will only be cleared
503 * by the subject cpu. Each cpu must first find its bit is
504 * set and then check that refs is set indicating the element is
505 * ready to be processed, otherwise it must skip the entry.
506 *
507 * On the previous iteration refs was set to 0 by another cpu.
508 * To avoid the use of transitivity, set the counter to 0 here
509 * so the wmb will pair with the rmb in the interrupt handler.
510 */
511 atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
512
488 data->csd.func = func; 513 data->csd.func = func;
489 data->csd.info = info; 514 data->csd.info = info;
490 cpumask_and(data->cpumask, mask, cpu_online_mask);
491 cpumask_clear_cpu(this_cpu, data->cpumask);
492 515
493 /* 516 /* Ensure 0 refs is visible before mask. Also orders func and info */
494 * To ensure the interrupt handler gets an complete view
495 * we order the cpumask and refs writes and order the read
496 * of them in the interrupt handler. In addition we may
497 * only clear our own cpu bit from the mask.
498 */
499 smp_wmb(); 517 smp_wmb();
500 518
501 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 519 /* We rely on the "and" being processed before the store */
520 cpumask_and(data->cpumask, mask, cpu_online_mask);
521 cpumask_clear_cpu(this_cpu, data->cpumask);
522 refs = cpumask_weight(data->cpumask);
523
524 /* Some callers race with other cpus changing the passed mask */
525 if (unlikely(!refs)) {
526 csd_unlock(&data->csd);
527 return;
528 }
502 529
503 raw_spin_lock_irqsave(&call_function.lock, flags); 530 raw_spin_lock_irqsave(&call_function.lock, flags);
504 /* 531 /*
@@ -507,6 +534,12 @@ void smp_call_function_many(const struct cpumask *mask,
507 * will not miss any other list entries: 534 * will not miss any other list entries:
508 */ 535 */
509 list_add_rcu(&data->csd.list, &call_function.queue); 536 list_add_rcu(&data->csd.list, &call_function.queue);
537 /*
538 * We rely on the wmb() in list_add_rcu to complete our writes
539 * to the cpumask before this write to refs, which indicates
540 * data is on the list and is ready to be processed.
541 */
542 atomic_set(&data->refs, refs);
510 raw_spin_unlock_irqrestore(&call_function.lock, flags); 543 raw_spin_unlock_irqrestore(&call_function.lock, flags);
511 544
512 /* 545 /*
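
[Editor's note] The comments in this hunk describe a classic publish/consume protocol: the producer fills the data block, issues a write barrier, then sets refs; the consumer reads refs, issues a read barrier, then reads the data. A userspace analogue of that barrier pairing, sketched with C11 release/acquire atomics (an illustration of the ordering only, not the kernel code itself; build with -pthread):

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static int payload;		/* plays the role of cpumask/func/info */
static atomic_int refs;		/* plays the role of data->refs */

static void *producer(void *arg)
{
	payload = 42;						/* fill the data block */
	atomic_store_explicit(&refs, 1, memory_order_release);	/* "smp_wmb(); set refs" */
	return NULL;
}

static void *consumer(void *arg)
{
	while (atomic_load_explicit(&refs, memory_order_acquire) == 0)
		;						/* "read refs; smp_rmb()" */
	printf("payload = %d\n", payload);			/* guaranteed to observe 42 */
	return NULL;
}

int main(void)
{
	pthread_t p, c;

	pthread_create(&c, NULL, consumer, NULL);
	pthread_create(&p, NULL, producer, NULL);
	pthread_join(p, NULL);
	pthread_join(c, NULL);
	return 0;
}
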
@@ -571,6 +604,87 @@ void ipi_call_unlock_irq(void)
571} 604}
572#endif /* USE_GENERIC_SMP_HELPERS */ 605#endif /* USE_GENERIC_SMP_HELPERS */
573 606
607/* Setup configured maximum number of CPUs to activate */
608unsigned int setup_max_cpus = NR_CPUS;
609EXPORT_SYMBOL(setup_max_cpus);
610
611
612/*
613 * Setup routine for controlling SMP activation
614 *
615 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
616 * activation entirely (the MPS table probe still happens, though).
617 *
618 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
619 * greater than 0, limits the maximum number of CPUs activated in
620 * SMP mode to <NUM>.
621 */
622
623void __weak arch_disable_smp_support(void) { }
624
625static int __init nosmp(char *str)
626{
627 setup_max_cpus = 0;
628 arch_disable_smp_support();
629
630 return 0;
631}
632
633early_param("nosmp", nosmp);
634
 635/* this is a hard limit */
636static int __init nrcpus(char *str)
637{
638 int nr_cpus;
639
640 get_option(&str, &nr_cpus);
641 if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
642 nr_cpu_ids = nr_cpus;
643
644 return 0;
645}
646
647early_param("nr_cpus", nrcpus);
648
649static int __init maxcpus(char *str)
650{
651 get_option(&str, &setup_max_cpus);
652 if (setup_max_cpus == 0)
653 arch_disable_smp_support();
654
655 return 0;
656}
657
658early_param("maxcpus", maxcpus);
659
660/* Setup number of possible processor ids */
661int nr_cpu_ids __read_mostly = NR_CPUS;
662EXPORT_SYMBOL(nr_cpu_ids);
663
664/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
665void __init setup_nr_cpu_ids(void)
666{
667 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
668}
669
670/* Called by boot processor to activate the rest. */
671void __init smp_init(void)
672{
673 unsigned int cpu;
674
675 /* FIXME: This should be done in userspace --RR */
676 for_each_present_cpu(cpu) {
677 if (num_online_cpus() >= setup_max_cpus)
678 break;
679 if (!cpu_online(cpu))
680 cpu_up(cpu);
681 }
682
683 /* Any cleanup work */
684 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
685 smp_cpus_done(setup_max_cpus);
686}
687
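
[Editor's note] The nosmp/nr_cpus/maxcpus handlers above follow the standard early_param() pattern. A minimal sketch of registering such a boot parameter — the "myfeature" name and variable are hypothetical, only the wiring mirrors the handlers shown:

#include <linux/init.h>
#include <linux/kernel.h>

static int myfeature_level;		/* hypothetical tunable */

static int __init myfeature_setup(char *str)
{
	get_option(&str, &myfeature_level);
	return 0;			/* 0 reports the option as handled */
}
early_param("myfeature", myfeature_setup);

On the command line the handlers in this hunk are driven the same way, e.g. booting with "maxcpus=2" or "nosmp".
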
574/* 688/*
575 * Call a function on all processors. May be used during early boot while 689 * Call a function on all processors. May be used during early boot while
576 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead 690 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 68eb5efec388..13960170cad4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -54,11 +54,11 @@ EXPORT_SYMBOL(irq_stat);
54 54
55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; 55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
56 56
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER"
62}; 62};
63 63
64/* 64/*
@@ -311,9 +311,21 @@ void irq_enter(void)
311} 311}
312 312
313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
314# define invoke_softirq() __do_softirq() 314static inline void invoke_softirq(void)
315{
316 if (!force_irqthreads)
317 __do_softirq();
318 else
319 wakeup_softirqd();
320}
315#else 321#else
316# define invoke_softirq() do_softirq() 322static inline void invoke_softirq(void)
323{
324 if (!force_irqthreads)
325 do_softirq();
326 else
327 wakeup_softirqd();
328}
317#endif 329#endif
318 330
319/* 331/*
@@ -555,7 +567,7 @@ static void __tasklet_hrtimer_trampoline(unsigned long data)
555/** 567/**
556 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks 568 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
557 * @ttimer: tasklet_hrtimer which is initialized 569 * @ttimer: tasklet_hrtimer which is initialized
558 * @function: hrtimer callback funtion which gets called from softirq context 570 * @function: hrtimer callback function which gets called from softirq context
559 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) 571 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
560 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) 572 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
561 */ 573 */
@@ -721,7 +733,6 @@ static int run_ksoftirqd(void * __bind_cpu)
721{ 733{
722 set_current_state(TASK_INTERRUPTIBLE); 734 set_current_state(TASK_INTERRUPTIBLE);
723 735
724 current->flags |= PF_KSOFTIRQD;
725 while (!kthread_should_stop()) { 736 while (!kthread_should_stop()) {
726 preempt_disable(); 737 preempt_disable();
727 if (!local_softirq_pending()) { 738 if (!local_softirq_pending()) {
@@ -738,7 +749,10 @@ static int run_ksoftirqd(void * __bind_cpu)
738 don't process */ 749 don't process */
739 if (cpu_is_offline((long)__bind_cpu)) 750 if (cpu_is_offline((long)__bind_cpu))
740 goto wait_to_die; 751 goto wait_to_die;
741 do_softirq(); 752 local_irq_disable();
753 if (local_softirq_pending())
754 __do_softirq();
755 local_irq_enable();
742 preempt_enable_no_resched(); 756 preempt_enable_no_resched();
743 cond_resched(); 757 cond_resched();
744 preempt_disable(); 758 preempt_disable();
@@ -831,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
831 switch (action) { 845 switch (action) {
832 case CPU_UP_PREPARE: 846 case CPU_UP_PREPARE:
833 case CPU_UP_PREPARE_FROZEN: 847 case CPU_UP_PREPARE_FROZEN:
834 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 848 p = kthread_create_on_node(run_ksoftirqd,
849 hcpu,
850 cpu_to_node(hotcpu),
851 "ksoftirqd/%d", hotcpu);
835 if (IS_ERR(p)) { 852 if (IS_ERR(p)) {
836 printk("ksoftirqd for %i failed\n", hotcpu); 853 printk("ksoftirqd for %i failed\n", hotcpu);
837 return notifier_from_errno(PTR_ERR(p)); 854 return notifier_from_errno(PTR_ERR(p));
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2df820b03beb..e3516b29076c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
301 case CPU_UP_PREPARE: 301 case CPU_UP_PREPARE:
302 BUG_ON(stopper->thread || stopper->enabled || 302 BUG_ON(stopper->thread || stopper->enabled ||
303 !list_empty(&stopper->works)); 303 !list_empty(&stopper->works));
304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", 304 p = kthread_create_on_node(cpu_stopper_thread,
305 cpu); 305 stopper,
306 cpu_to_node(cpu),
307 "migration/%d", cpu);
306 if (IS_ERR(p)) 308 if (IS_ERR(p))
307 return notifier_from_errno(PTR_ERR(p)); 309 return notifier_from_errno(PTR_ERR(p));
308 get_task_struct(p); 310 get_task_struct(p);
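
[Editor's note] Both hunks switch per-cpu kthread creation to kthread_create_on_node() so the task_struct and stack are allocated from the CPU's home node. A sketch of the call pattern, assuming a kernel-module context (the thread function and name are hypothetical):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/err.h>

static int my_percpu_thread(void *data)	/* hypothetical worker */
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static struct task_struct *start_thread_for(unsigned int cpu)
{
	struct task_struct *p;

	p = kthread_create_on_node(my_percpu_thread, NULL,
				   cpu_to_node(cpu), "mythread/%u", cpu);
	if (IS_ERR(p))
		return p;
	kthread_bind(p, cpu);		/* bind before the first wakeup */
	wake_up_process(p);
	return p;
}
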
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702ec813..e4128b278f23 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -37,6 +37,7 @@
37#include <linux/ptrace.h> 37#include <linux/ptrace.h>
38#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
39#include <linux/gfp.h> 39#include <linux/gfp.h>
40#include <linux/syscore_ops.h>
40 41
41#include <linux/compat.h> 42#include <linux/compat.h>
42#include <linux/syscalls.h> 43#include <linux/syscalls.h>
@@ -119,16 +120,33 @@ EXPORT_SYMBOL(cad_pid);
119void (*pm_power_off_prepare)(void); 120void (*pm_power_off_prepare)(void);
120 121
121/* 122/*
123 * Returns true if current's euid is same as p's uid or euid,
124 * or has CAP_SYS_NICE to p's user_ns.
125 *
126 * Called with rcu_read_lock, creds are safe
127 */
128static bool set_one_prio_perm(struct task_struct *p)
129{
130 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
131
132 if (pcred->user->user_ns == cred->user->user_ns &&
133 (pcred->uid == cred->euid ||
134 pcred->euid == cred->euid))
135 return true;
136 if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
137 return true;
138 return false;
139}
140
141/*
122 * set the priority of a task 142 * set the priority of a task
123 * - the caller must hold the RCU read lock 143 * - the caller must hold the RCU read lock
124 */ 144 */
125static int set_one_prio(struct task_struct *p, int niceval, int error) 145static int set_one_prio(struct task_struct *p, int niceval, int error)
126{ 146{
127 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
128 int no_nice; 147 int no_nice;
129 148
130 if (pcred->uid != cred->euid && 149 if (!set_one_prio_perm(p)) {
131 pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
132 error = -EPERM; 150 error = -EPERM;
133 goto out; 151 goto out;
134 } 152 }
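
[Editor's note] set_one_prio_perm() centralizes the uid/euid/CAP_SYS_NICE check that gates setpriority(2). For context, a sketch of the userspace call it protects (standard POSIX, nothing new here):

#include <sys/resource.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	/* Renice the calling process to nice 10. Renicing another user's
	 * task is what the check above rejects with EPERM unless
	 * CAP_SYS_NICE is held towards the target's user namespace. */
	if (setpriority(PRIO_PROCESS, 0, 10) < 0) {
		fprintf(stderr, "setpriority: %s\n", strerror(errno));
		return 1;
	}
	printf("new nice value: %d\n", getpriority(PRIO_PROCESS, 0));
	return 0;
}
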
@@ -296,8 +314,9 @@ void kernel_restart_prepare(char *cmd)
296{ 314{
297 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 315 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
298 system_state = SYSTEM_RESTART; 316 system_state = SYSTEM_RESTART;
317 usermodehelper_disable();
299 device_shutdown(); 318 device_shutdown();
300 sysdev_shutdown(); 319 syscore_shutdown();
301} 320}
302 321
303/** 322/**
@@ -325,6 +344,7 @@ static void kernel_shutdown_prepare(enum system_states state)
325 blocking_notifier_call_chain(&reboot_notifier_list, 344 blocking_notifier_call_chain(&reboot_notifier_list,
326 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); 345 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
327 system_state = state; 346 system_state = state;
347 usermodehelper_disable();
328 device_shutdown(); 348 device_shutdown();
329} 349}
330/** 350/**
@@ -335,7 +355,7 @@ static void kernel_shutdown_prepare(enum system_states state)
335void kernel_halt(void) 355void kernel_halt(void)
336{ 356{
337 kernel_shutdown_prepare(SYSTEM_HALT); 357 kernel_shutdown_prepare(SYSTEM_HALT);
338 sysdev_shutdown(); 358 syscore_shutdown();
339 printk(KERN_EMERG "System halted.\n"); 359 printk(KERN_EMERG "System halted.\n");
340 kmsg_dump(KMSG_DUMP_HALT); 360 kmsg_dump(KMSG_DUMP_HALT);
341 machine_halt(); 361 machine_halt();
@@ -354,7 +374,7 @@ void kernel_power_off(void)
354 if (pm_power_off_prepare) 374 if (pm_power_off_prepare)
355 pm_power_off_prepare(); 375 pm_power_off_prepare();
356 disable_nonboot_cpus(); 376 disable_nonboot_cpus();
357 sysdev_shutdown(); 377 syscore_shutdown();
358 printk(KERN_EMERG "Power down.\n"); 378 printk(KERN_EMERG "Power down.\n");
359 kmsg_dump(KMSG_DUMP_POWEROFF); 379 kmsg_dump(KMSG_DUMP_POWEROFF);
360 machine_power_off(); 380 machine_power_off();
@@ -502,7 +522,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
502 if (rgid != (gid_t) -1) { 522 if (rgid != (gid_t) -1) {
503 if (old->gid == rgid || 523 if (old->gid == rgid ||
504 old->egid == rgid || 524 old->egid == rgid ||
505 capable(CAP_SETGID)) 525 nsown_capable(CAP_SETGID))
506 new->gid = rgid; 526 new->gid = rgid;
507 else 527 else
508 goto error; 528 goto error;
@@ -511,7 +531,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
511 if (old->gid == egid || 531 if (old->gid == egid ||
512 old->egid == egid || 532 old->egid == egid ||
513 old->sgid == egid || 533 old->sgid == egid ||
514 capable(CAP_SETGID)) 534 nsown_capable(CAP_SETGID))
515 new->egid = egid; 535 new->egid = egid;
516 else 536 else
517 goto error; 537 goto error;
@@ -546,7 +566,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
546 old = current_cred(); 566 old = current_cred();
547 567
548 retval = -EPERM; 568 retval = -EPERM;
549 if (capable(CAP_SETGID)) 569 if (nsown_capable(CAP_SETGID))
550 new->gid = new->egid = new->sgid = new->fsgid = gid; 570 new->gid = new->egid = new->sgid = new->fsgid = gid;
551 else if (gid == old->gid || gid == old->sgid) 571 else if (gid == old->gid || gid == old->sgid)
552 new->egid = new->fsgid = gid; 572 new->egid = new->fsgid = gid;
@@ -613,7 +633,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
613 new->uid = ruid; 633 new->uid = ruid;
614 if (old->uid != ruid && 634 if (old->uid != ruid &&
615 old->euid != ruid && 635 old->euid != ruid &&
616 !capable(CAP_SETUID)) 636 !nsown_capable(CAP_SETUID))
617 goto error; 637 goto error;
618 } 638 }
619 639
@@ -622,7 +642,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
622 if (old->uid != euid && 642 if (old->uid != euid &&
623 old->euid != euid && 643 old->euid != euid &&
624 old->suid != euid && 644 old->suid != euid &&
625 !capable(CAP_SETUID)) 645 !nsown_capable(CAP_SETUID))
626 goto error; 646 goto error;
627 } 647 }
628 648
@@ -670,7 +690,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
670 old = current_cred(); 690 old = current_cred();
671 691
672 retval = -EPERM; 692 retval = -EPERM;
673 if (capable(CAP_SETUID)) { 693 if (nsown_capable(CAP_SETUID)) {
674 new->suid = new->uid = uid; 694 new->suid = new->uid = uid;
675 if (uid != old->uid) { 695 if (uid != old->uid) {
676 retval = set_user(new); 696 retval = set_user(new);
@@ -712,7 +732,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
712 old = current_cred(); 732 old = current_cred();
713 733
714 retval = -EPERM; 734 retval = -EPERM;
715 if (!capable(CAP_SETUID)) { 735 if (!nsown_capable(CAP_SETUID)) {
716 if (ruid != (uid_t) -1 && ruid != old->uid && 736 if (ruid != (uid_t) -1 && ruid != old->uid &&
717 ruid != old->euid && ruid != old->suid) 737 ruid != old->euid && ruid != old->suid)
718 goto error; 738 goto error;
@@ -776,7 +796,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
776 old = current_cred(); 796 old = current_cred();
777 797
778 retval = -EPERM; 798 retval = -EPERM;
779 if (!capable(CAP_SETGID)) { 799 if (!nsown_capable(CAP_SETGID)) {
780 if (rgid != (gid_t) -1 && rgid != old->gid && 800 if (rgid != (gid_t) -1 && rgid != old->gid &&
781 rgid != old->egid && rgid != old->sgid) 801 rgid != old->egid && rgid != old->sgid)
782 goto error; 802 goto error;
@@ -836,7 +856,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
836 856
837 if (uid == old->uid || uid == old->euid || 857 if (uid == old->uid || uid == old->euid ||
838 uid == old->suid || uid == old->fsuid || 858 uid == old->suid || uid == old->fsuid ||
839 capable(CAP_SETUID)) { 859 nsown_capable(CAP_SETUID)) {
840 if (uid != old_fsuid) { 860 if (uid != old_fsuid) {
841 new->fsuid = uid; 861 new->fsuid = uid;
842 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 862 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -869,7 +889,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
869 889
870 if (gid == old->gid || gid == old->egid || 890 if (gid == old->gid || gid == old->egid ||
871 gid == old->sgid || gid == old->fsgid || 891 gid == old->sgid || gid == old->fsgid ||
872 capable(CAP_SETGID)) { 892 nsown_capable(CAP_SETGID)) {
873 if (gid != old_fsgid) { 893 if (gid != old_fsgid) {
874 new->fsgid = gid; 894 new->fsgid = gid;
875 goto change_okay; 895 goto change_okay;
@@ -1177,8 +1197,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1177 int errno; 1197 int errno;
1178 char tmp[__NEW_UTS_LEN]; 1198 char tmp[__NEW_UTS_LEN];
1179 1199
1180 if (!capable(CAP_SYS_ADMIN)) 1200 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1181 return -EPERM; 1201 return -EPERM;
1202
1182 if (len < 0 || len > __NEW_UTS_LEN) 1203 if (len < 0 || len > __NEW_UTS_LEN)
1183 return -EINVAL; 1204 return -EINVAL;
1184 down_write(&uts_sem); 1205 down_write(&uts_sem);
@@ -1226,7 +1247,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1226 int errno; 1247 int errno;
1227 char tmp[__NEW_UTS_LEN]; 1248 char tmp[__NEW_UTS_LEN];
1228 1249
1229 if (!capable(CAP_SYS_ADMIN)) 1250 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1230 return -EPERM; 1251 return -EPERM;
1231 if (len < 0 || len > __NEW_UTS_LEN) 1252 if (len < 0 || len > __NEW_UTS_LEN)
1232 return -EINVAL; 1253 return -EINVAL;
@@ -1341,6 +1362,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
1341 rlim = tsk->signal->rlim + resource; 1362 rlim = tsk->signal->rlim + resource;
1342 task_lock(tsk->group_leader); 1363 task_lock(tsk->group_leader);
1343 if (new_rlim) { 1364 if (new_rlim) {
1365 /* Keep the capable check against init_user_ns until
1366 cgroups can contain all limits */
1344 if (new_rlim->rlim_max > rlim->rlim_max && 1367 if (new_rlim->rlim_max > rlim->rlim_max &&
1345 !capable(CAP_SYS_RESOURCE)) 1368 !capable(CAP_SYS_RESOURCE))
1346 retval = -EPERM; 1369 retval = -EPERM;
@@ -1384,19 +1407,22 @@ static int check_prlimit_permission(struct task_struct *task)
1384{ 1407{
1385 const struct cred *cred = current_cred(), *tcred; 1408 const struct cred *cred = current_cred(), *tcred;
1386 1409
1387 tcred = __task_cred(task); 1410 if (current == task)
1388 if (current != task && 1411 return 0;
1389 (cred->uid != tcred->euid ||
1390 cred->uid != tcred->suid ||
1391 cred->uid != tcred->uid ||
1392 cred->gid != tcred->egid ||
1393 cred->gid != tcred->sgid ||
1394 cred->gid != tcred->gid) &&
1395 !capable(CAP_SYS_RESOURCE)) {
1396 return -EPERM;
1397 }
1398 1412
1399 return 0; 1413 tcred = __task_cred(task);
1414 if (cred->user->user_ns == tcred->user->user_ns &&
1415 (cred->uid == tcred->euid &&
1416 cred->uid == tcred->suid &&
1417 cred->uid == tcred->uid &&
1418 cred->gid == tcred->egid &&
1419 cred->gid == tcred->sgid &&
1420 cred->gid == tcred->gid))
1421 return 0;
1422 if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
1423 return 0;
1424
1425 return -EPERM;
1400} 1426}
1401 1427
1402SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, 1428SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
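
[Editor's note] The rewritten check_prlimit_permission() is what stands between an unprivileged caller and another task's limits. A short userspace sketch of the syscall it guards, using the glibc prlimit() wrapper and acting on the caller itself, which the "current == task" fast path always allows:

#define _GNU_SOURCE
#include <sys/resource.h>
#include <stdio.h>

int main(void)
{
	struct rlimit new_limit = { .rlim_cur = 1024, .rlim_max = 4096 };
	struct rlimit old_limit;

	/* pid 0 means the calling process; for another pid the permission
	 * check above applies (same user namespace and matching ids, or
	 * CAP_SYS_RESOURCE towards the target's user namespace). */
	if (prlimit(0, RLIMIT_NOFILE, &new_limit, &old_limit) != 0) {
		perror("prlimit");
		return 1;
	}
	printf("RLIMIT_NOFILE was %llu/%llu, now 1024/4096\n",
	       (unsigned long long)old_limit.rlim_cur,
	       (unsigned long long)old_limit.rlim_max);
	return 0;
}
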
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c782fe9924c7..62cbc8877fef 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -46,7 +46,9 @@ cond_syscall(sys_getsockopt);
46cond_syscall(compat_sys_getsockopt); 46cond_syscall(compat_sys_getsockopt);
47cond_syscall(sys_shutdown); 47cond_syscall(sys_shutdown);
48cond_syscall(sys_sendmsg); 48cond_syscall(sys_sendmsg);
49cond_syscall(sys_sendmmsg);
49cond_syscall(compat_sys_sendmsg); 50cond_syscall(compat_sys_sendmsg);
51cond_syscall(compat_sys_sendmmsg);
50cond_syscall(sys_recvmsg); 52cond_syscall(sys_recvmsg);
51cond_syscall(sys_recvmmsg); 53cond_syscall(sys_recvmmsg);
52cond_syscall(compat_sys_recvmsg); 54cond_syscall(compat_sys_recvmsg);
@@ -69,15 +71,22 @@ cond_syscall(compat_sys_epoll_pwait);
69cond_syscall(sys_semget); 71cond_syscall(sys_semget);
70cond_syscall(sys_semop); 72cond_syscall(sys_semop);
71cond_syscall(sys_semtimedop); 73cond_syscall(sys_semtimedop);
74cond_syscall(compat_sys_semtimedop);
72cond_syscall(sys_semctl); 75cond_syscall(sys_semctl);
76cond_syscall(compat_sys_semctl);
73cond_syscall(sys_msgget); 77cond_syscall(sys_msgget);
74cond_syscall(sys_msgsnd); 78cond_syscall(sys_msgsnd);
79cond_syscall(compat_sys_msgsnd);
75cond_syscall(sys_msgrcv); 80cond_syscall(sys_msgrcv);
81cond_syscall(compat_sys_msgrcv);
76cond_syscall(sys_msgctl); 82cond_syscall(sys_msgctl);
83cond_syscall(compat_sys_msgctl);
77cond_syscall(sys_shmget); 84cond_syscall(sys_shmget);
78cond_syscall(sys_shmat); 85cond_syscall(sys_shmat);
86cond_syscall(compat_sys_shmat);
79cond_syscall(sys_shmdt); 87cond_syscall(sys_shmdt);
80cond_syscall(sys_shmctl); 88cond_syscall(sys_shmctl);
89cond_syscall(compat_sys_shmctl);
81cond_syscall(sys_mq_open); 90cond_syscall(sys_mq_open);
82cond_syscall(sys_mq_unlink); 91cond_syscall(sys_mq_unlink);
83cond_syscall(sys_mq_timedsend); 92cond_syscall(sys_mq_timedsend);
@@ -186,3 +195,8 @@ cond_syscall(sys_perf_event_open);
186/* fanotify! */ 195/* fanotify! */
187cond_syscall(sys_fanotify_init); 196cond_syscall(sys_fanotify_init);
188cond_syscall(sys_fanotify_mark); 197cond_syscall(sys_fanotify_mark);
198
199/* open by handle */
200cond_syscall(sys_name_to_handle_at);
201cond_syscall(sys_open_by_handle_at);
202cond_syscall(compat_sys_open_by_handle_at);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0f1bd83db985..4fc92445a29c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -56,6 +56,7 @@
56#include <linux/kprobes.h> 56#include <linux/kprobes.h>
57#include <linux/pipe_fs_i.h> 57#include <linux/pipe_fs_i.h>
58#include <linux/oom.h> 58#include <linux/oom.h>
59#include <linux/kmod.h>
59 60
60#include <asm/uaccess.h> 61#include <asm/uaccess.h>
61#include <asm/processor.h> 62#include <asm/processor.h>
@@ -117,6 +118,7 @@ static int neg_one = -1;
117static int zero; 118static int zero;
118static int __maybe_unused one = 1; 119static int __maybe_unused one = 1;
119static int __maybe_unused two = 2; 120static int __maybe_unused two = 2;
121static int __maybe_unused three = 3;
120static unsigned long one_ul = 1; 122static unsigned long one_ul = 1;
121static int one_hundred = 100; 123static int one_hundred = 100;
122#ifdef CONFIG_PRINTK 124#ifdef CONFIG_PRINTK
@@ -169,6 +171,11 @@ static int proc_taint(struct ctl_table *table, int write,
169 void __user *buffer, size_t *lenp, loff_t *ppos); 171 void __user *buffer, size_t *lenp, loff_t *ppos);
170#endif 172#endif
171 173
174#ifdef CONFIG_PRINTK
175static int proc_dmesg_restrict(struct ctl_table *table, int write,
176 void __user *buffer, size_t *lenp, loff_t *ppos);
177#endif
178
172#ifdef CONFIG_MAGIC_SYSRQ 179#ifdef CONFIG_MAGIC_SYSRQ
173/* Note: sysrq code uses it's own private copy */ 180/* Note: sysrq code uses it's own private copy */
174static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 181static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -194,9 +201,9 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
194static struct ctl_table root_table[]; 201static struct ctl_table root_table[];
195static struct ctl_table_root sysctl_table_root; 202static struct ctl_table_root sysctl_table_root;
196static struct ctl_table_header root_table_header = { 203static struct ctl_table_header root_table_header = {
197 .count = 1, 204 {{.count = 1,
198 .ctl_table = root_table, 205 .ctl_table = root_table,
199 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), 206 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
200 .root = &sysctl_table_root, 207 .root = &sysctl_table_root,
201 .set = &sysctl_table_root.default_set, 208 .set = &sysctl_table_root.default_set,
202}; 209};
@@ -361,20 +368,13 @@ static struct ctl_table kern_table[] = {
361 .mode = 0644, 368 .mode = 0644,
362 .proc_handler = sched_rt_handler, 369 .proc_handler = sched_rt_handler,
363 }, 370 },
364 {
365 .procname = "sched_compat_yield",
366 .data = &sysctl_sched_compat_yield,
367 .maxlen = sizeof(unsigned int),
368 .mode = 0644,
369 .proc_handler = proc_dointvec,
370 },
371#ifdef CONFIG_SCHED_AUTOGROUP 371#ifdef CONFIG_SCHED_AUTOGROUP
372 { 372 {
373 .procname = "sched_autogroup_enabled", 373 .procname = "sched_autogroup_enabled",
374 .data = &sysctl_sched_autogroup_enabled, 374 .data = &sysctl_sched_autogroup_enabled,
375 .maxlen = sizeof(unsigned int), 375 .maxlen = sizeof(unsigned int),
376 .mode = 0644, 376 .mode = 0644,
377 .proc_handler = proc_dointvec, 377 .proc_handler = proc_dointvec_minmax,
378 .extra1 = &zero, 378 .extra1 = &zero,
379 .extra2 = &one, 379 .extra2 = &one,
380 }, 380 },
@@ -617,6 +617,11 @@ static struct ctl_table kern_table[] = {
617 .child = random_table, 617 .child = random_table,
618 }, 618 },
619 { 619 {
620 .procname = "usermodehelper",
621 .mode = 0555,
622 .child = usermodehelper_table,
623 },
624 {
620 .procname = "overflowuid", 625 .procname = "overflowuid",
621 .data = &overflowuid, 626 .data = &overflowuid,
622 .maxlen = sizeof(int), 627 .maxlen = sizeof(int),
@@ -713,7 +718,7 @@ static struct ctl_table kern_table[] = {
713 .data = &kptr_restrict, 718 .data = &kptr_restrict,
714 .maxlen = sizeof(int), 719 .maxlen = sizeof(int),
715 .mode = 0644, 720 .mode = 0644,
716 .proc_handler = proc_dointvec_minmax, 721 .proc_handler = proc_dmesg_restrict,
717 .extra1 = &zero, 722 .extra1 = &zero,
718 .extra2 = &two, 723 .extra2 = &two,
719 }, 724 },
@@ -731,14 +736,16 @@ static struct ctl_table kern_table[] = {
731 .data = &watchdog_enabled, 736 .data = &watchdog_enabled,
732 .maxlen = sizeof (int), 737 .maxlen = sizeof (int),
733 .mode = 0644, 738 .mode = 0644,
734 .proc_handler = proc_dowatchdog_enabled, 739 .proc_handler = proc_dowatchdog,
740 .extra1 = &zero,
741 .extra2 = &one,
735 }, 742 },
736 { 743 {
737 .procname = "watchdog_thresh", 744 .procname = "watchdog_thresh",
738 .data = &softlockup_thresh, 745 .data = &watchdog_thresh,
739 .maxlen = sizeof(int), 746 .maxlen = sizeof(int),
740 .mode = 0644, 747 .mode = 0644,
741 .proc_handler = proc_dowatchdog_thresh, 748 .proc_handler = proc_dowatchdog,
742 .extra1 = &neg_one, 749 .extra1 = &neg_one,
743 .extra2 = &sixty, 750 .extra2 = &sixty,
744 }, 751 },
@@ -756,7 +763,9 @@ static struct ctl_table kern_table[] = {
756 .data = &watchdog_enabled, 763 .data = &watchdog_enabled,
757 .maxlen = sizeof (int), 764 .maxlen = sizeof (int),
758 .mode = 0644, 765 .mode = 0644,
759 .proc_handler = proc_dowatchdog_enabled, 766 .proc_handler = proc_dowatchdog,
767 .extra1 = &zero,
768 .extra2 = &one,
760 }, 769 },
761#endif 770#endif
762#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 771#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
@@ -948,7 +957,7 @@ static struct ctl_table kern_table[] = {
948 .data = &sysctl_perf_event_sample_rate, 957 .data = &sysctl_perf_event_sample_rate,
949 .maxlen = sizeof(sysctl_perf_event_sample_rate), 958 .maxlen = sizeof(sysctl_perf_event_sample_rate),
950 .mode = 0644, 959 .mode = 0644,
951 .proc_handler = proc_dointvec, 960 .proc_handler = perf_proc_update_handler,
952 }, 961 },
953#endif 962#endif
954#ifdef CONFIG_KMEMCHECK 963#ifdef CONFIG_KMEMCHECK
@@ -978,14 +987,18 @@ static struct ctl_table vm_table[] = {
978 .data = &sysctl_overcommit_memory, 987 .data = &sysctl_overcommit_memory,
979 .maxlen = sizeof(sysctl_overcommit_memory), 988 .maxlen = sizeof(sysctl_overcommit_memory),
980 .mode = 0644, 989 .mode = 0644,
981 .proc_handler = proc_dointvec, 990 .proc_handler = proc_dointvec_minmax,
991 .extra1 = &zero,
992 .extra2 = &two,
982 }, 993 },
983 { 994 {
984 .procname = "panic_on_oom", 995 .procname = "panic_on_oom",
985 .data = &sysctl_panic_on_oom, 996 .data = &sysctl_panic_on_oom,
986 .maxlen = sizeof(sysctl_panic_on_oom), 997 .maxlen = sizeof(sysctl_panic_on_oom),
987 .mode = 0644, 998 .mode = 0644,
988 .proc_handler = proc_dointvec, 999 .proc_handler = proc_dointvec_minmax,
1000 .extra1 = &zero,
1001 .extra2 = &two,
989 }, 1002 },
990 { 1003 {
991 .procname = "oom_kill_allocating_task", 1004 .procname = "oom_kill_allocating_task",
@@ -1013,7 +1026,8 @@ static struct ctl_table vm_table[] = {
1013 .data = &page_cluster, 1026 .data = &page_cluster,
1014 .maxlen = sizeof(int), 1027 .maxlen = sizeof(int),
1015 .mode = 0644, 1028 .mode = 0644,
1016 .proc_handler = proc_dointvec, 1029 .proc_handler = proc_dointvec_minmax,
1030 .extra1 = &zero,
1017 }, 1031 },
1018 { 1032 {
1019 .procname = "dirty_background_ratio", 1033 .procname = "dirty_background_ratio",
@@ -1061,7 +1075,8 @@ static struct ctl_table vm_table[] = {
1061 .data = &dirty_expire_interval, 1075 .data = &dirty_expire_interval,
1062 .maxlen = sizeof(dirty_expire_interval), 1076 .maxlen = sizeof(dirty_expire_interval),
1063 .mode = 0644, 1077 .mode = 0644,
1064 .proc_handler = proc_dointvec, 1078 .proc_handler = proc_dointvec_minmax,
1079 .extra1 = &zero,
1065 }, 1080 },
1066 { 1081 {
1067 .procname = "nr_pdflush_threads", 1082 .procname = "nr_pdflush_threads",
@@ -1137,6 +1152,8 @@ static struct ctl_table vm_table[] = {
1137 .maxlen = sizeof(int), 1152 .maxlen = sizeof(int),
1138 .mode = 0644, 1153 .mode = 0644,
1139 .proc_handler = drop_caches_sysctl_handler, 1154 .proc_handler = drop_caches_sysctl_handler,
1155 .extra1 = &one,
1156 .extra2 = &three,
1140 }, 1157 },
1141#ifdef CONFIG_COMPACTION 1158#ifdef CONFIG_COMPACTION
1142 { 1159 {
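
[Editor's note] Several entries in this file switch from proc_dointvec to proc_dointvec_minmax so out-of-range writes are rejected with -EINVAL against extra1/extra2. A sketch of the pattern for a hypothetical tunable (names invented; registration boilerplate omitted; only the handler/extra1/extra2 wiring mirrors the hunks above):

#include <linux/sysctl.h>

static int my_tunable = 1;		/* hypothetical */
static int my_min = 0;
static int my_max = 3;

static struct ctl_table my_table[] = {
	{
		.procname	= "my_tunable",
		.data		= &my_tunable,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &my_min,	/* writes below this fail with -EINVAL */
		.extra2		= &my_max,	/* writes above this fail with -EINVAL */
	},
	{ }
};
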
@@ -1489,7 +1506,7 @@ static struct ctl_table fs_table[] = {
1489 1506
1490static struct ctl_table debug_table[] = { 1507static struct ctl_table debug_table[] = {
1491#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ 1508#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
1492 defined(CONFIG_S390) 1509 defined(CONFIG_S390) || defined(CONFIG_TILE)
1493 { 1510 {
1494 .procname = "exception-trace", 1511 .procname = "exception-trace",
1495 .data = &show_unhandled_signals, 1512 .data = &show_unhandled_signals,
@@ -1567,11 +1584,16 @@ void sysctl_head_get(struct ctl_table_header *head)
1567 spin_unlock(&sysctl_lock); 1584 spin_unlock(&sysctl_lock);
1568} 1585}
1569 1586
1587static void free_head(struct rcu_head *rcu)
1588{
1589 kfree(container_of(rcu, struct ctl_table_header, rcu));
1590}
1591
1570void sysctl_head_put(struct ctl_table_header *head) 1592void sysctl_head_put(struct ctl_table_header *head)
1571{ 1593{
1572 spin_lock(&sysctl_lock); 1594 spin_lock(&sysctl_lock);
1573 if (!--head->count) 1595 if (!--head->count)
1574 kfree(head); 1596 call_rcu(&head->rcu, free_head);
1575 spin_unlock(&sysctl_lock); 1597 spin_unlock(&sysctl_lock);
1576} 1598}
1577 1599
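
[Editor's note] sysctl_head_put() now defers the kfree() through call_rcu() so lockless readers still traversing the header can finish their RCU read-side critical sections first. The general shape of that pattern, sketched for a hypothetical object (not the sysctl code itself):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_obj {				/* hypothetical RCU-freed object */
	int value;
	struct rcu_head rcu;
};

static void my_obj_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct my_obj, rcu));
}

static void my_obj_release(struct my_obj *obj)
{
	/* Readers found via rcu_dereference() may still hold a pointer;
	 * the actual kfree() happens only after a grace period elapses. */
	call_rcu(&obj->rcu, my_obj_free_rcu);
}
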
@@ -1685,13 +1707,8 @@ static int test_perm(int mode, int op)
1685 1707
1686int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) 1708int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1687{ 1709{
1688 int error;
1689 int mode; 1710 int mode;
1690 1711
1691 error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
1692 if (error)
1693 return error;
1694
1695 if (root->permissions) 1712 if (root->permissions)
1696 mode = root->permissions(root, current->nsproxy, table); 1713 mode = root->permissions(root, current->nsproxy, table);
1697 else 1714 else
@@ -1948,10 +1965,10 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1948 start_unregistering(header); 1965 start_unregistering(header);
1949 if (!--header->parent->count) { 1966 if (!--header->parent->count) {
1950 WARN_ON(1); 1967 WARN_ON(1);
1951 kfree(header->parent); 1968 call_rcu(&header->parent->rcu, free_head);
1952 } 1969 }
1953 if (!--header->count) 1970 if (!--header->count)
1954 kfree(header); 1971 call_rcu(&header->rcu, free_head);
1955 spin_unlock(&sysctl_lock); 1972 spin_unlock(&sysctl_lock);
1956} 1973}
1957 1974
@@ -2392,6 +2409,17 @@ static int proc_taint(struct ctl_table *table, int write,
2392 return err; 2409 return err;
2393} 2410}
2394 2411
2412#ifdef CONFIG_PRINTK
2413static int proc_dmesg_restrict(struct ctl_table *table, int write,
2414 void __user *buffer, size_t *lenp, loff_t *ppos)
2415{
2416 if (write && !capable(CAP_SYS_ADMIN))
2417 return -EPERM;
2418
2419 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2420}
2421#endif
2422
2395struct do_proc_dointvec_minmax_conv_param { 2423struct do_proc_dointvec_minmax_conv_param {
2396 int *min; 2424 int *min;
2397 int *max; 2425 int *max;
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index b875bedf7c9a..3b8e028b9601 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1321 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1321 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1322{ 1322{
1323 const struct bin_table *table = NULL; 1323 const struct bin_table *table = NULL;
1324 struct nameidata nd;
1325 struct vfsmount *mnt; 1324 struct vfsmount *mnt;
1326 struct file *file; 1325 struct file *file;
1327 ssize_t result; 1326 ssize_t result;
1328 char *pathname; 1327 char *pathname;
1329 int flags; 1328 int flags;
1330 int acc_mode;
1331 1329
1332 pathname = sysctl_getname(name, nlen, &table); 1330 pathname = sysctl_getname(name, nlen, &table);
1333 result = PTR_ERR(pathname); 1331 result = PTR_ERR(pathname);
@@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1337 /* How should the sysctl be accessed? */ 1335 /* How should the sysctl be accessed? */
1338 if (oldval && oldlen && newval && newlen) { 1336 if (oldval && oldlen && newval && newlen) {
1339 flags = O_RDWR; 1337 flags = O_RDWR;
1340 acc_mode = MAY_READ | MAY_WRITE;
1341 } else if (newval && newlen) { 1338 } else if (newval && newlen) {
1342 flags = O_WRONLY; 1339 flags = O_WRONLY;
1343 acc_mode = MAY_WRITE;
1344 } else if (oldval && oldlen) { 1340 } else if (oldval && oldlen) {
1345 flags = O_RDONLY; 1341 flags = O_RDONLY;
1346 acc_mode = MAY_READ;
1347 } else { 1342 } else {
1348 result = 0; 1343 result = 0;
1349 goto out_putname; 1344 goto out_putname;
1350 } 1345 }
1351 1346
1352 mnt = current->nsproxy->pid_ns->proc_mnt; 1347 mnt = current->nsproxy->pid_ns->proc_mnt;
1353 result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); 1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
1354 if (result)
1355 goto out_putname;
1356
1357 result = may_open(&nd.path, acc_mode, flags);
1358 if (result)
1359 goto out_putpath;
1360
1361 file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
1362 result = PTR_ERR(file); 1349 result = PTR_ERR(file);
1363 if (IS_ERR(file)) 1350 if (IS_ERR(file))
1364 goto out_putname; 1351 goto out_putname;
@@ -1370,10 +1357,6 @@ out_putname:
1370 putname(pathname); 1357 putname(pathname);
1371out: 1358out:
1372 return result; 1359 return result;
1373
1374out_putpath:
1375 path_put(&nd.path);
1376 goto out_putname;
1377} 1360}
1378 1361
1379 1362
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 10b90d8a03c4..4e4932a7b360 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
111 const char *fail = NULL; 111 const char *fail = NULL;
112 112
113 if (table->parent) { 113 if (table->parent) {
114 if (table->procname && !table->parent->procname) 114 if (!table->parent->procname)
115 set_fail(&fail, table, "Parent without procname"); 115 set_fail(&fail, table, "Parent without procname");
116 } 116 }
117 if (!table->procname)
118 set_fail(&fail, table, "No procname");
119 if (table->child) { 117 if (table->child) {
120 if (table->data) 118 if (table->data)
121 set_fail(&fail, table, "Directory with data?"); 119 set_fail(&fail, table, "Directory with data?");
@@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
144 set_fail(&fail, table, "No maxlen"); 142 set_fail(&fail, table, "No maxlen");
145 } 143 }
146#ifdef CONFIG_PROC_SYSCTL 144#ifdef CONFIG_PROC_SYSCTL
147 if (table->procname && !table->proc_handler) 145 if (!table->proc_handler)
148 set_fail(&fail, table, "No proc_handler"); 146 set_fail(&fail, table, "No proc_handler");
149#endif 147#endif
150#if 0
151 if (!table->procname && table->proc_handler)
152 set_fail(&fail, table, "proc_handler without procname");
153#endif
154 sysctl_check_leaf(namespaces, table, &fail); 148 sysctl_check_leaf(namespaces, table, &fail);
155 } 149 }
156 if (table->mode > 0777) 150 if (table->mode > 0777)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 3971c6b9d58d..9ffea360a778 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -685,7 +685,7 @@ static int __init taskstats_init(void)
685 goto err_cgroup_ops; 685 goto err_cgroup_ops;
686 686
687 family_registered = 1; 687 family_registered = 1;
688 printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 688 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
689 return 0; 689 return 0;
690err_cgroup_ops: 690err_cgroup_ops:
691 genl_unregister_ops(&family, &taskstats_ops); 691 genl_unregister_ops(&family, &taskstats_ops);
diff --git a/kernel/time.c b/kernel/time.c
index 32174359576f..8e8dc6d705c9 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -150,7 +150,7 @@ static inline void warp_clock(void)
150 * various programs will get confused when the clock gets warped. 150 * various programs will get confused when the clock gets warped.
151 */ 151 */
152 152
153int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) 153int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
154{ 154{
155 static int firsttime = 1; 155 static int firsttime = 1;
156 int error = 0; 156 int error = 0;
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x)
645} 645}
646 646
647/** 647/**
648 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies 648 * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
649 * 649 *
650 * @n: nsecs in u64 650 * @n: nsecs in u64
651 * 651 *
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x)
657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) 657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years 658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
659 */ 659 */
660unsigned long nsecs_to_jiffies(u64 n) 660u64 nsecs_to_jiffies64(u64 n)
661{ 661{
662#if (NSEC_PER_SEC % HZ) == 0 662#if (NSEC_PER_SEC % HZ) == 0
663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ 663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
@@ -674,22 +674,23 @@ unsigned long nsecs_to_jiffies(u64 n)
674#endif 674#endif
675} 675}
676 676
677#if (BITS_PER_LONG < 64) 677/**
678u64 get_jiffies_64(void) 678 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
679 *
680 * @n: nsecs in u64
681 *
682 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
683 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
684 * for scheduler, not for use in device drivers to calculate timeout value.
685 *
686 * note:
687 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
688 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
689 */
690unsigned long nsecs_to_jiffies(u64 n)
679{ 691{
680 unsigned long seq; 692 return (unsigned long)nsecs_to_jiffies64(n);
681 u64 ret;
682
683 do {
684 seq = read_seqbegin(&xtime_lock);
685 ret = jiffies_64;
686 } while (read_seqretry(&xtime_lock, seq));
687 return ret;
688} 693}
689EXPORT_SYMBOL(get_jiffies_64);
690#endif
691
692EXPORT_SYMBOL(jiffies);
693 694
694/* 695/*
695 * Add two timespec values and do a safety check for overflow. 696 * Add two timespec values and do a safety check for overflow.
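
[Editor's note] A quick worked example of the conversion described above, for the common HZ = 1000 case (so NSEC_PER_SEC / HZ = 1,000,000 ns per jiffy):

	nsecs_to_jiffies64(5,000,000 ns) = 5,000,000 / 1,000,000 = 5 jiffies
	nsecs_to_jiffies64(1,500,000 ns) = 1,500,000 / 1,000,000 = 1 jiffy (truncated)
	nsecs_to_jiffies(n)              = (unsigned long)nsecs_to_jiffies64(n)
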
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ee266620b06c..e2fd74b8e8c2 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,5 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2obj-y += timeconv.o posix-clock.o alarmtimer.o
2 3
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
new file mode 100644
index 000000000000..2d966244ea60
--- /dev/null
+++ b/kernel/time/alarmtimer.c
@@ -0,0 +1,702 @@
1/*
2 * Alarmtimer interface
3 *
 4 * This interface provides a timer which is similar to hrtimers,
 5 * but triggers an RTC alarm if the box is suspended.
6 *
7 * This interface is influenced by the Android RTC Alarm timer
8 * interface.
9 *
 10 * Copyright (C) 2010 IBM Corporation
11 *
12 * Author: John Stultz <john.stultz@linaro.org>
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License version 2 as
16 * published by the Free Software Foundation.
17 */
18#include <linux/time.h>
19#include <linux/hrtimer.h>
20#include <linux/timerqueue.h>
21#include <linux/rtc.h>
22#include <linux/alarmtimer.h>
23#include <linux/mutex.h>
24#include <linux/platform_device.h>
25#include <linux/posix-timers.h>
26#include <linux/workqueue.h>
27#include <linux/freezer.h>
28
29/**
30 * struct alarm_base - Alarm timer bases
 31 * @lock: Lock for synchronized access to the base
32 * @timerqueue: Timerqueue head managing the list of events
33 * @timer: hrtimer used to schedule events while running
34 * @gettime: Function to read the time correlating to the base
35 * @base_clockid: clockid for the base
36 */
37static struct alarm_base {
38 spinlock_t lock;
39 struct timerqueue_head timerqueue;
40 struct hrtimer timer;
41 ktime_t (*gettime)(void);
42 clockid_t base_clockid;
43} alarm_bases[ALARM_NUMTYPE];
44
45#ifdef CONFIG_RTC_CLASS
46/* rtc timer and device for setting alarm wakeups at suspend */
47static struct rtc_timer rtctimer;
48static struct rtc_device *rtcdev;
49#endif
50
51/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */
52static ktime_t freezer_delta;
53static DEFINE_SPINLOCK(freezer_delta_lock);
54
55
56/**
57 * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
58 * @base: pointer to the base where the timer is being run
59 * @alarm: pointer to alarm being enqueued.
60 *
 61 * Adds the alarm to the alarm_base timerqueue and, if necessary, sets
 62 * an hrtimer to run.
63 *
64 * Must hold base->lock when calling.
65 */
66static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
67{
68 timerqueue_add(&base->timerqueue, &alarm->node);
69 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
70 hrtimer_try_to_cancel(&base->timer);
71 hrtimer_start(&base->timer, alarm->node.expires,
72 HRTIMER_MODE_ABS);
73 }
74}
75
76/**
77 * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue
78 * @base: pointer to the base where the timer is running
79 * @alarm: pointer to alarm being removed
80 *
 81 * Removes the alarm from the alarm_base timerqueue and, if necessary, sets
 82 * a new hrtimer to run.
83 *
84 * Must hold base->lock when calling.
85 */
86static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
87{
88 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
89
90 timerqueue_del(&base->timerqueue, &alarm->node);
91 if (next == &alarm->node) {
92 hrtimer_try_to_cancel(&base->timer);
93 next = timerqueue_getnext(&base->timerqueue);
94 if (!next)
95 return;
96 hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
97 }
98}
99
100
101/**
102 * alarmtimer_fired - Handles alarm hrtimer being fired.
103 * @timer: pointer to hrtimer being run
104 *
 105 * When an alarm timer fires, this runs through the timerqueue to
 106 * see which alarms expired, and runs those. If there are more alarm
 107 * timers queued for the future, we set the hrtimer to fire when
 108 * the next future alarm timer expires.
109 */
110static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
111{
112 struct alarm_base *base = container_of(timer, struct alarm_base, timer);
113 struct timerqueue_node *next;
114 unsigned long flags;
115 ktime_t now;
116 int ret = HRTIMER_NORESTART;
117
118 spin_lock_irqsave(&base->lock, flags);
119 now = base->gettime();
120 while ((next = timerqueue_getnext(&base->timerqueue))) {
121 struct alarm *alarm;
122 ktime_t expired = next->expires;
123
124 if (expired.tv64 >= now.tv64)
125 break;
126
127 alarm = container_of(next, struct alarm, node);
128
129 timerqueue_del(&base->timerqueue, &alarm->node);
130 alarm->enabled = 0;
131 /* Re-add periodic timers */
132 if (alarm->period.tv64) {
133 alarm->node.expires = ktime_add(expired, alarm->period);
134 timerqueue_add(&base->timerqueue, &alarm->node);
135 alarm->enabled = 1;
136 }
137 spin_unlock_irqrestore(&base->lock, flags);
138 if (alarm->function)
139 alarm->function(alarm);
140 spin_lock_irqsave(&base->lock, flags);
141 }
142
143 if (next) {
144 hrtimer_set_expires(&base->timer, next->expires);
145 ret = HRTIMER_RESTART;
146 }
147 spin_unlock_irqrestore(&base->lock, flags);
148
149 return ret;
150
151}
152
153#ifdef CONFIG_RTC_CLASS
154/**
155 * alarmtimer_suspend - Suspend time callback
156 * @dev: unused
157 * @state: unused
158 *
159 * When we are going into suspend, we look through the bases
160 * to see which is the soonest timer to expire. We then
161 * set an rtc timer to fire that far into the future, which
162 * will wake us from suspend.
163 */
164static int alarmtimer_suspend(struct device *dev)
165{
166 struct rtc_time tm;
167 ktime_t min, now;
168 unsigned long flags;
169 int i;
170
171 spin_lock_irqsave(&freezer_delta_lock, flags);
172 min = freezer_delta;
173 freezer_delta = ktime_set(0, 0);
174 spin_unlock_irqrestore(&freezer_delta_lock, flags);
175
176 /* If we have no rtcdev, just return */
177 if (!rtcdev)
178 return 0;
179
 180 /* Find the soonest timer to expire */
181 for (i = 0; i < ALARM_NUMTYPE; i++) {
182 struct alarm_base *base = &alarm_bases[i];
183 struct timerqueue_node *next;
184 ktime_t delta;
185
186 spin_lock_irqsave(&base->lock, flags);
187 next = timerqueue_getnext(&base->timerqueue);
188 spin_unlock_irqrestore(&base->lock, flags);
189 if (!next)
190 continue;
191 delta = ktime_sub(next->expires, base->gettime());
192 if (!min.tv64 || (delta.tv64 < min.tv64))
193 min = delta;
194 }
195 if (min.tv64 == 0)
196 return 0;
197
198 /* XXX - Should we enforce a minimum sleep time? */
199 WARN_ON(min.tv64 < NSEC_PER_SEC);
200
201 /* Setup an rtc timer to fire that far in the future */
202 rtc_timer_cancel(rtcdev, &rtctimer);
203 rtc_read_time(rtcdev, &tm);
204 now = rtc_tm_to_ktime(tm);
205 now = ktime_add(now, min);
206
207 rtc_timer_start(rtcdev, &rtctimer, now, ktime_set(0, 0));
208
209 return 0;
210}
211#else
212static int alarmtimer_suspend(struct device *dev)
213{
214 return 0;
215}
216#endif
217
218static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
219{
220 ktime_t delta;
221 unsigned long flags;
222 struct alarm_base *base = &alarm_bases[type];
223
224 delta = ktime_sub(absexp, base->gettime());
225
226 spin_lock_irqsave(&freezer_delta_lock, flags);
227 if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64))
228 freezer_delta = delta;
229 spin_unlock_irqrestore(&freezer_delta_lock, flags);
230}
231
232
233/**
234 * alarm_init - Initialize an alarm structure
235 * @alarm: ptr to alarm to be initialized
236 * @type: the type of the alarm
237 * @function: callback that is run when the alarm fires
238 */
239void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
240 void (*function)(struct alarm *))
241{
242 timerqueue_init(&alarm->node);
243 alarm->period = ktime_set(0, 0);
244 alarm->function = function;
245 alarm->type = type;
246 alarm->enabled = 0;
247}
248
249/**
250 * alarm_start - Sets an alarm to fire
251 * @alarm: ptr to alarm to set
252 * @start: time to run the alarm
253 * @period: period at which the alarm will recur
254 */
255void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period)
256{
257 struct alarm_base *base = &alarm_bases[alarm->type];
258 unsigned long flags;
259
260 spin_lock_irqsave(&base->lock, flags);
261 if (alarm->enabled)
262 alarmtimer_remove(base, alarm);
263 alarm->node.expires = start;
264 alarm->period = period;
265 alarmtimer_enqueue(base, alarm);
266 alarm->enabled = 1;
267 spin_unlock_irqrestore(&base->lock, flags);
268}
269
270/**
271 * alarm_cancel - Tries to cancel an alarm timer
272 * @alarm: ptr to alarm to be canceled
273 */
274void alarm_cancel(struct alarm *alarm)
275{
276 struct alarm_base *base = &alarm_bases[alarm->type];
277 unsigned long flags;
278
279 spin_lock_irqsave(&base->lock, flags);
280 if (alarm->enabled)
281 alarmtimer_remove(base, alarm);
282 alarm->enabled = 0;
283 spin_unlock_irqrestore(&base->lock, flags);
284}
285
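
[Editor's note] A sketch of how a kernel-side user might drive the alarm_init()/alarm_start()/alarm_cancel() API introduced here, firing once five seconds from now on the realtime base. The callback and function names are hypothetical, and it assumes ALARM_REALTIME is backed by CLOCK_REALTIME as the base_clockid field suggests:

#include <linux/alarmtimer.h>
#include <linux/ktime.h>
#include <linux/printk.h>

static struct alarm my_alarm;		/* hypothetical user of the new API */

static void my_alarm_fn(struct alarm *alarm)
{
	pr_info("alarm fired\n");
}

static void my_alarm_arm(void)
{
	alarm_init(&my_alarm, ALARM_REALTIME, my_alarm_fn);
	/* fire once, 5 seconds from now; a non-zero period would re-arm it */
	alarm_start(&my_alarm, ktime_add(ktime_get_real(), ktime_set(5, 0)),
		    ktime_set(0, 0));
}

static void my_alarm_disarm(void)
{
	alarm_cancel(&my_alarm);
}
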
286
287/**
288 * clock2alarm - helper that converts from clockid to alarmtypes
289 * @clockid: clockid.
290 */
291static enum alarmtimer_type clock2alarm(clockid_t clockid)
292{
293 if (clockid == CLOCK_REALTIME_ALARM)
294 return ALARM_REALTIME;
295 if (clockid == CLOCK_BOOTTIME_ALARM)
296 return ALARM_BOOTTIME;
297 return -1;
298}
299
300/**
301 * alarm_handle_timer - Callback for posix timers
302 * @alarm: alarm that fired
303 *
304 * Posix timer callback for expired alarm timers.
305 */
306static void alarm_handle_timer(struct alarm *alarm)
307{
308 struct k_itimer *ptr = container_of(alarm, struct k_itimer,
309 it.alarmtimer);
310 if (posix_timer_event(ptr, 0) != 0)
311 ptr->it_overrun++;
312}
313
314/**
315 * alarm_clock_getres - posix getres interface
316 * @which_clock: clockid
317 * @tp: timespec to fill
318 *
319 * Returns the granularity of underlying alarm base clock
320 */
321static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
322{
323 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
324
325 return hrtimer_get_res(baseid, tp);
326}
327
328/**
329 * alarm_clock_get - posix clock_get interface
330 * @which_clock: clockid
331 * @tp: timespec to fill.
332 *
333 * Provides the underlying alarm base time.
334 */
335static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
336{
337 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
338
339 *tp = ktime_to_timespec(base->gettime());
340 return 0;
341}
342
343/**
344 * alarm_timer_create - posix timer_create interface
345 * @new_timer: k_itimer pointer to manage
346 *
347 * Initializes the k_itimer structure.
348 */
349static int alarm_timer_create(struct k_itimer *new_timer)
350{
351 enum alarmtimer_type type;
352 struct alarm_base *base;
353
354 if (!capable(CAP_WAKE_ALARM))
355 return -EPERM;
356
357 type = clock2alarm(new_timer->it_clock);
358 base = &alarm_bases[type];
359 alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer);
360 return 0;
361}
362
363/**
364 * alarm_timer_get - posix timer_get interface
 365 * @timr: k_itimer pointer
366 * @cur_setting: itimerspec data to fill
367 *
368 * Copies the itimerspec data out from the k_itimer
369 */
370static void alarm_timer_get(struct k_itimer *timr,
371 struct itimerspec *cur_setting)
372{
373 cur_setting->it_interval =
374 ktime_to_timespec(timr->it.alarmtimer.period);
375 cur_setting->it_value =
376 ktime_to_timespec(timr->it.alarmtimer.node.expires);
377 return;
378}
379
380/**
381 * alarm_timer_del - posix timer_del interface
382 * @timr: k_itimer pointer to be deleted
383 *
384 * Cancels any programmed alarms for the given timer.
385 */
386static int alarm_timer_del(struct k_itimer *timr)
387{
388 alarm_cancel(&timr->it.alarmtimer);
389 return 0;
390}
391
392/**
393 * alarm_timer_set - posix timer_set interface
 394 * @timr: k_itimer pointer to be set
395 * @flags: timer flags
396 * @new_setting: itimerspec to be used
397 * @old_setting: itimerspec being replaced
398 *
399 * Sets the timer to new_setting, and starts the timer.
400 */
401static int alarm_timer_set(struct k_itimer *timr, int flags,
402 struct itimerspec *new_setting,
403 struct itimerspec *old_setting)
404{
405 /* Save old values */
406 old_setting->it_interval =
407 ktime_to_timespec(timr->it.alarmtimer.period);
408 old_setting->it_value =
409 ktime_to_timespec(timr->it.alarmtimer.node.expires);
410
411 /* If the timer was already set, cancel it */
412 alarm_cancel(&timr->it.alarmtimer);
413
414 /* start the timer */
415 alarm_start(&timr->it.alarmtimer,
416 timespec_to_ktime(new_setting->it_value),
417 timespec_to_ktime(new_setting->it_interval));
418 return 0;
419}
420
421/**
422 * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep
423 * @alarm: ptr to alarm that fired
424 *
425 * Wakes up the task that set the alarmtimer
426 */
427static void alarmtimer_nsleep_wakeup(struct alarm *alarm)
428{
429 struct task_struct *task = (struct task_struct *)alarm->data;
430
431 alarm->data = NULL;
432 if (task)
433 wake_up_process(task);
434}
435
436/**
437 * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation
438 * @alarm: ptr to alarmtimer
439 * @absexp: absolute expiration time
440 *
441 * Sets the alarm timer and sleeps until it is fired or interrupted.
442 */
443static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
444{
445 alarm->data = (void *)current;
446 do {
447 set_current_state(TASK_INTERRUPTIBLE);
448 alarm_start(alarm, absexp, ktime_set(0, 0));
449 if (likely(alarm->data))
450 schedule();
451
452 alarm_cancel(alarm);
453 } while (alarm->data && !signal_pending(current));
454
455 __set_current_state(TASK_RUNNING);
456
457 return (alarm->data == NULL);
458}
459
460
461/**
462 * update_rmtp - Update remaining timespec value
463 * @exp: expiration time
464 * @type: timer type
465 * @rmtp: user pointer to remaining timespec value
466 *
467 * Helper function that fills in rmtp value with time between
468 * now and the exp value
469 */
470static int update_rmtp(ktime_t exp, enum alarmtimer_type type,
471 struct timespec __user *rmtp)
472{
473 struct timespec rmt;
474 ktime_t rem;
475
476 rem = ktime_sub(exp, alarm_bases[type].gettime());
477
478 if (rem.tv64 <= 0)
479 return 0;
480 rmt = ktime_to_timespec(rem);
481
482 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
483 return -EFAULT;
484
485 return 1;
486
487}
488
489/**
490 * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep
491 * @restart: ptr to restart block
492 *
493 * Handles restarted clock_nanosleep calls
494 */
495static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
496{
497 enum alarmtimer_type type = restart->nanosleep.clockid;
498 ktime_t exp;
499 struct timespec __user *rmtp;
500 struct alarm alarm;
501 int ret = 0;
502
503 exp.tv64 = restart->nanosleep.expires;
504 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
505
506 if (alarmtimer_do_nsleep(&alarm, exp))
507 goto out;
508
509 if (freezing(current))
510 alarmtimer_freezerset(exp, type);
511
512 rmtp = restart->nanosleep.rmtp;
513 if (rmtp) {
514 ret = update_rmtp(exp, type, rmtp);
515 if (ret <= 0)
516 goto out;
517 }
518
519
520 /* The other values in restart are already filled in */
521 ret = -ERESTART_RESTARTBLOCK;
522out:
523 return ret;
524}
525
526/**
527 * alarm_timer_nsleep - alarmtimer nanosleep
528 * @which_clock: clockid
529 * @flags: determines abstime or relative
530 * @tsreq: requested sleep time (abs or rel)
531 * @rmtp: remaining sleep time saved
532 *
533 * Handles clock_nanosleep calls against _ALARM clockids
534 */
535static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
536 struct timespec *tsreq, struct timespec __user *rmtp)
537{
538 enum alarmtimer_type type = clock2alarm(which_clock);
539 struct alarm alarm;
540 ktime_t exp;
541 int ret = 0;
542 struct restart_block *restart;
543
544 if (!capable(CAP_WAKE_ALARM))
545 return -EPERM;
546
547 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
548
549 exp = timespec_to_ktime(*tsreq);
550 /* Convert (if necessary) to absolute time */
551 if (flags != TIMER_ABSTIME) {
552 ktime_t now = alarm_bases[type].gettime();
553 exp = ktime_add(now, exp);
554 }
555
556 if (alarmtimer_do_nsleep(&alarm, exp))
557 goto out;
558
559 if (freezing(current))
560 alarmtimer_freezerset(exp, type);
561
562 /* abs timers don't set remaining time or restart */
563 if (flags == TIMER_ABSTIME) {
564 ret = -ERESTARTNOHAND;
565 goto out;
566 }
567
568 if (rmtp) {
569 ret = update_rmtp(exp, type, rmtp);
570 if (ret <= 0)
571 goto out;
572 }
573
574 restart = &current_thread_info()->restart_block;
575 restart->fn = alarm_timer_nsleep_restart;
576 restart->nanosleep.clockid = type;
577 restart->nanosleep.expires = exp.tv64;
578 restart->nanosleep.rmtp = rmtp;
579 ret = -ERESTART_RESTARTBLOCK;
580
581out:
582 return ret;
583}
584
585
586/* Suspend hook structures */
587static const struct dev_pm_ops alarmtimer_pm_ops = {
588 .suspend = alarmtimer_suspend,
589};
590
591static struct platform_driver alarmtimer_driver = {
592 .driver = {
593 .name = "alarmtimer",
594 .pm = &alarmtimer_pm_ops,
595 }
596};
597
598/**
599 * alarmtimer_init - Initialize alarm timer code
600 *
601 * This function initializes the alarm bases and registers
602 * the posix clock ids.
603 */
604static int __init alarmtimer_init(void)
605{
606 int error = 0;
607 int i;
608 struct k_clock alarm_clock = {
609 .clock_getres = alarm_clock_getres,
610 .clock_get = alarm_clock_get,
611 .timer_create = alarm_timer_create,
612 .timer_set = alarm_timer_set,
613 .timer_del = alarm_timer_del,
614 .timer_get = alarm_timer_get,
615 .nsleep = alarm_timer_nsleep,
616 };
617
618 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
619 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
620
621 /* Initialize alarm bases */
622 alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
623 alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
624 alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
625 alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime;
626 for (i = 0; i < ALARM_NUMTYPE; i++) {
627 timerqueue_init_head(&alarm_bases[i].timerqueue);
628 spin_lock_init(&alarm_bases[i].lock);
629 hrtimer_init(&alarm_bases[i].timer,
630 alarm_bases[i].base_clockid,
631 HRTIMER_MODE_ABS);
632 alarm_bases[i].timer.function = alarmtimer_fired;
633 }
634 error = platform_driver_register(&alarmtimer_driver);
635 platform_device_register_simple("alarmtimer", -1, NULL, 0);
636
637 return error;
638}
639device_initcall(alarmtimer_init);
640
641#ifdef CONFIG_RTC_CLASS
642/**
643 * has_wakealarm - check whether an RTC device has wakealarm ability
644 * @dev: current device
645 * @name_ptr: name to be returned
646 *
647 * This helper function checks to see if the rtc device can wake
648 * from suspend.
649 */
650static int __init has_wakealarm(struct device *dev, void *name_ptr)
651{
652 struct rtc_device *candidate = to_rtc_device(dev);
653
654 if (!candidate->ops->set_alarm)
655 return 0;
656 if (!device_may_wakeup(candidate->dev.parent))
657 return 0;
658
659 *(const char **)name_ptr = dev_name(dev);
660 return 1;
661}
662
663/**
664 * alarmtimer_init_late - Late initializing of alarmtimer code
665 *
666 * This function locates an RTC device to use for wakealarms.
667 * Run as late_initcall to make sure rtc devices have been
668 * registered.
669 */
670static int __init alarmtimer_init_late(void)
671{
672 struct device *dev;
673 char *str;
674
675 /* Find an rtc device and init the rtc_timer */
676 dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
677 /* If we have a device then str is valid. See has_wakealarm() */
678 if (dev) {
679 rtcdev = rtc_class_open(str);
680 /*
681 * Drop the reference we got in class_find_device,
682 * rtc_open takes its own.
683 */
684 put_device(dev);
685 }
686 if (!rtcdev) {
687 printk(KERN_WARNING "No RTC device found, ALARM timers will"
688 " not wake from suspend\n");
689 }
690 rtc_timer_init(&rtctimer, NULL, NULL);
691
692 return 0;
693}
694#else
695static int __init alarmtimer_init_late(void)
696{
697 printk(KERN_WARNING "Kernel not built with RTC support, ALARM timers"
698 " will not wake from suspend\n");
699 return 0;
700}
701#endif
702late_initcall(alarmtimer_init_late);
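
As a rough illustration of how the new _ALARM clockids registered above are meant to be consumed, a minimal userspace sketch follows; the file layout and error handling are illustrative, the CLOCK_BOOTTIME_ALARM fallback value matches this kernel series, and the CAP_WAKE_ALARM requirement comes from alarm_timer_nsleep().

#include <stdio.h>
#include <string.h>
#include <time.h>

#ifndef CLOCK_BOOTTIME_ALARM
#define CLOCK_BOOTTIME_ALARM 9	/* clockid used by this series; illustrative fallback */
#endif

int main(void)
{
	struct timespec t = { .tv_sec = 30, .tv_nsec = 0 };
	int err;

	/*
	 * Relative sleep: alarm_timer_nsleep() converts it to an absolute
	 * expiry on the ALARM_BOOTTIME base before queueing the alarm, so
	 * the sleep keeps counting (and can wake the box) across suspend.
	 */
	err = clock_nanosleep(CLOCK_BOOTTIME_ALARM, 0, &t, NULL);
	if (err)
		fprintf(stderr, "clock_nanosleep: %s\n", strerror(err));
	return err ? 1 : 0;
}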
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d7395fdfb9f3..c027d4f602f1 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,7 +18,6 @@
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
@@ -195,6 +194,70 @@ void clockevents_register_device(struct clock_event_device *dev)
195} 194}
196EXPORT_SYMBOL_GPL(clockevents_register_device); 195EXPORT_SYMBOL_GPL(clockevents_register_device);
197 196
197static void clockevents_config(struct clock_event_device *dev,
198 u32 freq)
199{
200 u64 sec;
201
202 if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
203 return;
204
205 /*
206 * Calculate the maximum number of seconds we can sleep. Limit
207 * to 10 minutes for hardware which can program more than
208 * 32bit ticks so we still get reasonable conversion values.
209 */
210 sec = dev->max_delta_ticks;
211 do_div(sec, freq);
212 if (!sec)
213 sec = 1;
214 else if (sec > 600 && dev->max_delta_ticks > UINT_MAX)
215 sec = 600;
216
217 clockevents_calc_mult_shift(dev, freq, sec);
218 dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev);
219 dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev);
220}
221
222/**
223 * clockevents_config_and_register - Configure and register a clock event device
224 * @dev: device to register
225 * @freq: The clock frequency
226 * @min_delta: The minimum clock ticks to program in oneshot mode
227 * @max_delta: The maximum clock ticks to program in oneshot mode
228 *
229 * min/max_delta can be 0 for devices which do not support oneshot mode.
230 */
231void clockevents_config_and_register(struct clock_event_device *dev,
232 u32 freq, unsigned long min_delta,
233 unsigned long max_delta)
234{
235 dev->min_delta_ticks = min_delta;
236 dev->max_delta_ticks = max_delta;
237 clockevents_config(dev, freq);
238 clockevents_register_device(dev);
239}
240
241/**
242 * clockevents_update_freq - Update frequency and reprogram a clock event device.
243 * @dev: device to modify
244 * @freq: new device frequency
245 *
246 * Reconfigure and reprogram a clock event device in oneshot
247 * mode. Must be called on the cpu for which the device delivers per
248 * cpu timer events with interrupts disabled! Returns 0 on success,
249 * -ETIME when the event is in the past.
250 */
251int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
252{
253 clockevents_config(dev, freq);
254
255 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
256 return 0;
257
258 return clockevents_program_event(dev, dev->next_event, ktime_get());
259}
260
198/* 261/*
199 * Noop handler when we shut down an event device 262 * Noop handler when we shut down an event device
200 */ 263 */
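
A sketch of how a hypothetical timer driver would use the new clockevents_config_and_register() helper; the foo_* names, the 2..0xffffffff tick limits and the stubbed callbacks are illustrative only.

#include <linux/clockchips.h>
#include <linux/init.h>

static int foo_set_next_event(unsigned long delta, struct clock_event_device *evt)
{
	/* a real driver would program its hardware comparator here */
	return 0;
}

static void foo_set_mode(enum clock_event_mode mode, struct clock_event_device *evt)
{
	/* a real driver would switch the hardware between periodic/oneshot here */
}

static struct clock_event_device foo_clockevent = {
	.name		= "foo-timer",
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.rating		= 300,
	.set_next_event	= foo_set_next_event,
	.set_mode	= foo_set_mode,
};

static void __init foo_timer_init(u32 timer_hz)
{
	/*
	 * The driver only supplies raw tick limits and the frequency;
	 * mult/shift and min/max_delta_ns are derived by clockevents_config().
	 */
	clockevents_config_and_register(&foo_clockevent, timer_hz,
					2, 0xffffffff);
}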
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 6519cf62d9cd..1c95fd677328 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -626,19 +626,6 @@ static void clocksource_enqueue(struct clocksource *cs)
626 list_add(&cs->list, entry); 626 list_add(&cs->list, entry);
627} 627}
628 628
629
630/*
631 * Maximum time we expect to go between ticks. This includes idle
632 * tickless time. It provides the trade off between selecting a
633 * mult/shift pair that is very precise but can only handle a short
634 * period of time, vs. a mult/shift pair that can handle long periods
635 * of time but isn't as precise.
636 *
637 * This is a subsystem constant, and actual hardware limitations
638 * may override it (ie: clocksources that wrap every 3 seconds).
639 */
640#define MAX_UPDATE_LENGTH 5 /* Seconds */
641
642/** 629/**
643 * __clocksource_updatefreq_scale - Used update clocksource with new freq 630 * __clocksource_updatefreq_scale - Used update clocksource with new freq
644 * @t: clocksource to be registered 631 * @t: clocksource to be registered
@@ -652,15 +639,28 @@ static void clocksource_enqueue(struct clocksource *cs)
652 */ 639 */
653void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) 640void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
654{ 641{
642 u64 sec;
643
655 /* 644 /*
656 * Ideally we want to use some of the limits used in 645 * Calc the maximum number of seconds which we can run before
657 * clocksource_max_deferment, to provide a more informed 646 * wrapping around. For clocksources which have a mask > 32bit
658 * MAX_UPDATE_LENGTH. But for now this just gets the 647 * we need to limit the max sleep time to have a good
659 * register interface working properly. 648 * conversion precision. 10 minutes is still a reasonable
649 * amount. That results in a shift value of 24 for a
650 * clocksource with mask >= 40bit and f >= 4GHz. That maps to
651 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
652 * margin as we do in clocksource_max_deferment()
660 */ 653 */
654 sec = (cs->mask - (cs->mask >> 5));
655 do_div(sec, freq);
656 do_div(sec, scale);
657 if (!sec)
658 sec = 1;
659 else if (sec > 600 && cs->mask > UINT_MAX)
660 sec = 600;
661
661 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, 662 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
662 NSEC_PER_SEC/scale, 663 NSEC_PER_SEC / scale, sec * scale);
663 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs); 664 cs->max_idle_ns = clocksource_max_deferment(cs);
665} 665}
666EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); 666EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
@@ -685,8 +685,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
685 /* Add clocksource to the clocksource list */ 685 /* Add clocksource to the clocksource list */
686 mutex_lock(&clocksource_mutex); 686 mutex_lock(&clocksource_mutex);
687 clocksource_enqueue(cs); 687 clocksource_enqueue(cs);
688 clocksource_select();
689 clocksource_enqueue_watchdog(cs); 688 clocksource_enqueue_watchdog(cs);
689 clocksource_select();
690 mutex_unlock(&clocksource_mutex); 690 mutex_unlock(&clocksource_mutex);
691 return 0; 691 return 0;
692} 692}
@@ -706,8 +706,8 @@ int clocksource_register(struct clocksource *cs)
706 706
707 mutex_lock(&clocksource_mutex); 707 mutex_lock(&clocksource_mutex);
708 clocksource_enqueue(cs); 708 clocksource_enqueue(cs);
709 clocksource_select();
710 clocksource_enqueue_watchdog(cs); 709 clocksource_enqueue_watchdog(cs);
710 clocksource_select();
711 mutex_unlock(&clocksource_mutex); 711 mutex_unlock(&clocksource_mutex);
712 return 0; 712 return 0;
713} 713}
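
Correspondingly, a sketch of a hypothetical clocksource driver registering by frequency and letting __clocksource_updatefreq_scale() pick the mult/shift pair; the foo_* names and the 32-bit memory-mapped counter are illustrative.

#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/io.h>

static void __iomem *foo_counter_base;	/* mapped by platform code; illustrative */

static cycle_t foo_cs_read(struct clocksource *cs)
{
	return (cycle_t)readl(foo_counter_base);
}

static struct clocksource foo_clocksource = {
	.name	= "foo-counter",
	.rating	= 300,
	.read	= foo_cs_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

static void __init foo_clocksource_init(u32 counter_hz)
{
	/* convenience wrapper around __clocksource_register_scale(cs, 1, hz) */
	clocksource_register_hz(&foo_clocksource, counter_hz);
}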
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 5404a8456909..a470154e0408 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -22,8 +22,11 @@
22************************************************************************/ 22************************************************************************/
23#include <linux/clocksource.h> 23#include <linux/clocksource.h>
24#include <linux/jiffies.h> 24#include <linux/jiffies.h>
25#include <linux/module.h>
25#include <linux/init.h> 26#include <linux/init.h>
26 27
28#include "tick-internal.h"
29
27/* The Jiffies based clocksource is the lowest common 30/* The Jiffies based clocksource is the lowest common
28 * denominator clock source which should function on 31 * denominator clock source which should function on
29 * all systems. It has the same coarse resolution as 32 * all systems. It has the same coarse resolution as
@@ -31,7 +34,7 @@
31 * inaccuracies caused by missed or lost timer 34 * inaccuracies caused by missed or lost timer
32 * interrupts and the inability for the timer 35 * interrupts and the inability for the timer
33 * interrupt hardware to accuratly tick at the 36 * interrupt hardware to accuratly tick at the
34 * requested HZ value. It is also not reccomended 37 * requested HZ value. It is also not recommended
35 * for "tick-less" systems. 38 * for "tick-less" systems.
36 */ 39 */
37#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) 40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
@@ -64,6 +67,23 @@ struct clocksource clocksource_jiffies = {
64 .shift = JIFFIES_SHIFT, 67 .shift = JIFFIES_SHIFT,
65}; 68};
66 69
70#if (BITS_PER_LONG < 64)
71u64 get_jiffies_64(void)
72{
73 unsigned long seq;
74 u64 ret;
75
76 do {
77 seq = read_seqbegin(&xtime_lock);
78 ret = jiffies_64;
79 } while (read_seqretry(&xtime_lock, seq));
80 return ret;
81}
82EXPORT_SYMBOL(get_jiffies_64);
83#endif
84
85EXPORT_SYMBOL(jiffies);
86
67static int __init init_jiffies_clocksource(void) 87static int __init init_jiffies_clocksource(void)
68{ 88{
69 return clocksource_register(&clocksource_jiffies); 89 return clocksource_register(&clocksource_jiffies);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5c00242fa921..f6117a4c7cb8 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -16,6 +16,8 @@
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h> 17#include <linux/module.h>
18 18
19#include "tick-internal.h"
20
19/* 21/*
20 * NTP timekeeping variables: 22 * NTP timekeeping variables:
21 */ 23 */
@@ -646,6 +648,19 @@ int do_adjtimex(struct timex *txc)
646 hrtimer_cancel(&leap_timer); 648 hrtimer_cancel(&leap_timer);
647 } 649 }
648 650
651 if (txc->modes & ADJ_SETOFFSET) {
652 struct timespec delta;
653 delta.tv_sec = txc->time.tv_sec;
654 delta.tv_nsec = txc->time.tv_usec;
655 if (!capable(CAP_SYS_TIME))
656 return -EPERM;
657 if (!(txc->modes & ADJ_NANO))
658 delta.tv_nsec *= 1000;
659 result = timekeeping_inject_offset(&delta);
660 if (result)
661 return result;
662 }
663
649 getnstimeofday(&ts); 664 getnstimeofday(&ts);
650 665
651 write_seqlock_irq(&xtime_lock); 666 write_seqlock_irq(&xtime_lock);
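
A minimal userspace sketch of the new ADJ_SETOFFSET mode added above; the fallback #defines are illustrative in case the installed headers predate this series, and CAP_SYS_TIME is required as enforced in do_adjtimex().

#include <stdio.h>
#include <string.h>
#include <sys/timex.h>

#ifndef ADJ_SETOFFSET
#define ADJ_SETOFFSET	0x0100	/* mode bit introduced with this series; illustrative fallback */
#endif
#ifndef ADJ_NANO
#define ADJ_NANO	0x2000
#endif

int main(void)
{
	struct timex tx;

	memset(&tx, 0, sizeof(tx));
	tx.modes = ADJ_SETOFFSET | ADJ_NANO;	/* step by sec + nsec */
	tx.time.tv_sec = 1;			/* step the clock forward by 1.5s */
	tx.time.tv_usec = 500000000;		/* field carries nanoseconds with ADJ_NANO */

	if (adjtimex(&tx) < 0) {		/* needs CAP_SYS_TIME */
		perror("adjtimex");
		return 1;
	}
	return 0;
}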
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
new file mode 100644
index 000000000000..c340ca658f37
--- /dev/null
+++ b/kernel/time/posix-clock.c
@@ -0,0 +1,445 @@
1/*
2 * posix-clock.c - support for dynamic clock devices
3 *
4 * Copyright (C) 2010 OMICRON electronics GmbH
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20#include <linux/device.h>
21#include <linux/file.h>
22#include <linux/posix-clock.h>
23#include <linux/slab.h>
24#include <linux/syscalls.h>
25#include <linux/uaccess.h>
26
27static void delete_clock(struct kref *kref);
28
29/*
30 * Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
31 */
32static struct posix_clock *get_posix_clock(struct file *fp)
33{
34 struct posix_clock *clk = fp->private_data;
35
36 down_read(&clk->rwsem);
37
38 if (!clk->zombie)
39 return clk;
40
41 up_read(&clk->rwsem);
42
43 return NULL;
44}
45
46static void put_posix_clock(struct posix_clock *clk)
47{
48 up_read(&clk->rwsem);
49}
50
51static ssize_t posix_clock_read(struct file *fp, char __user *buf,
52 size_t count, loff_t *ppos)
53{
54 struct posix_clock *clk = get_posix_clock(fp);
55 int err = -EINVAL;
56
57 if (!clk)
58 return -ENODEV;
59
60 if (clk->ops.read)
61 err = clk->ops.read(clk, fp->f_flags, buf, count);
62
63 put_posix_clock(clk);
64
65 return err;
66}
67
68static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
69{
70 struct posix_clock *clk = get_posix_clock(fp);
71 int result = 0;
72
73 if (!clk)
74 return -ENODEV;
75
76 if (clk->ops.poll)
77 result = clk->ops.poll(clk, fp, wait);
78
79 put_posix_clock(clk);
80
81 return result;
82}
83
84static int posix_clock_fasync(int fd, struct file *fp, int on)
85{
86 struct posix_clock *clk = get_posix_clock(fp);
87 int err = 0;
88
89 if (!clk)
90 return -ENODEV;
91
92 if (clk->ops.fasync)
93 err = clk->ops.fasync(clk, fd, fp, on);
94
95 put_posix_clock(clk);
96
97 return err;
98}
99
100static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma)
101{
102 struct posix_clock *clk = get_posix_clock(fp);
103 int err = -ENODEV;
104
105 if (!clk)
106 return -ENODEV;
107
108 if (clk->ops.mmap)
109 err = clk->ops.mmap(clk, vma);
110
111 put_posix_clock(clk);
112
113 return err;
114}
115
116static long posix_clock_ioctl(struct file *fp,
117 unsigned int cmd, unsigned long arg)
118{
119 struct posix_clock *clk = get_posix_clock(fp);
120 int err = -ENOTTY;
121
122 if (!clk)
123 return -ENODEV;
124
125 if (clk->ops.ioctl)
126 err = clk->ops.ioctl(clk, cmd, arg);
127
128 put_posix_clock(clk);
129
130 return err;
131}
132
133#ifdef CONFIG_COMPAT
134static long posix_clock_compat_ioctl(struct file *fp,
135 unsigned int cmd, unsigned long arg)
136{
137 struct posix_clock *clk = get_posix_clock(fp);
138 int err = -ENOTTY;
139
140 if (!clk)
141 return -ENODEV;
142
143 if (clk->ops.ioctl)
144 err = clk->ops.ioctl(clk, cmd, arg);
145
146 put_posix_clock(clk);
147
148 return err;
149}
150#endif
151
152static int posix_clock_open(struct inode *inode, struct file *fp)
153{
154 int err;
155 struct posix_clock *clk =
156 container_of(inode->i_cdev, struct posix_clock, cdev);
157
158 down_read(&clk->rwsem);
159
160 if (clk->zombie) {
161 err = -ENODEV;
162 goto out;
163 }
164 if (clk->ops.open)
165 err = clk->ops.open(clk, fp->f_mode);
166 else
167 err = 0;
168
169 if (!err) {
170 kref_get(&clk->kref);
171 fp->private_data = clk;
172 }
173out:
174 up_read(&clk->rwsem);
175 return err;
176}
177
178static int posix_clock_release(struct inode *inode, struct file *fp)
179{
180 struct posix_clock *clk = fp->private_data;
181 int err = 0;
182
183 if (clk->ops.release)
184 err = clk->ops.release(clk);
185
186 kref_put(&clk->kref, delete_clock);
187
188 fp->private_data = NULL;
189
190 return err;
191}
192
193static const struct file_operations posix_clock_file_operations = {
194 .owner = THIS_MODULE,
195 .llseek = no_llseek,
196 .read = posix_clock_read,
197 .poll = posix_clock_poll,
198 .unlocked_ioctl = posix_clock_ioctl,
199 .open = posix_clock_open,
200 .release = posix_clock_release,
201 .fasync = posix_clock_fasync,
202 .mmap = posix_clock_mmap,
203#ifdef CONFIG_COMPAT
204 .compat_ioctl = posix_clock_compat_ioctl,
205#endif
206};
207
208int posix_clock_register(struct posix_clock *clk, dev_t devid)
209{
210 int err;
211
212 kref_init(&clk->kref);
213 init_rwsem(&clk->rwsem);
214
215 cdev_init(&clk->cdev, &posix_clock_file_operations);
216 clk->cdev.owner = clk->ops.owner;
217 err = cdev_add(&clk->cdev, devid, 1);
218
219 return err;
220}
221EXPORT_SYMBOL_GPL(posix_clock_register);
222
223static void delete_clock(struct kref *kref)
224{
225 struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
226
227 if (clk->release)
228 clk->release(clk);
229}
230
231void posix_clock_unregister(struct posix_clock *clk)
232{
233 cdev_del(&clk->cdev);
234
235 down_write(&clk->rwsem);
236 clk->zombie = true;
237 up_write(&clk->rwsem);
238
239 kref_put(&clk->kref, delete_clock);
240}
241EXPORT_SYMBOL_GPL(posix_clock_unregister);
242
243struct posix_clock_desc {
244 struct file *fp;
245 struct posix_clock *clk;
246};
247
248static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
249{
250 struct file *fp = fget(CLOCKID_TO_FD(id));
251 int err = -EINVAL;
252
253 if (!fp)
254 return err;
255
256 if (fp->f_op->open != posix_clock_open || !fp->private_data)
257 goto out;
258
259 cd->fp = fp;
260 cd->clk = get_posix_clock(fp);
261
262 err = cd->clk ? 0 : -ENODEV;
263out:
264 if (err)
265 fput(fp);
266 return err;
267}
268
269static void put_clock_desc(struct posix_clock_desc *cd)
270{
271 put_posix_clock(cd->clk);
272 fput(cd->fp);
273}
274
275static int pc_clock_adjtime(clockid_t id, struct timex *tx)
276{
277 struct posix_clock_desc cd;
278 int err;
279
280 err = get_clock_desc(id, &cd);
281 if (err)
282 return err;
283
284 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
285 err = -EACCES;
286 goto out;
287 }
288
289 if (cd.clk->ops.clock_adjtime)
290 err = cd.clk->ops.clock_adjtime(cd.clk, tx);
291 else
292 err = -EOPNOTSUPP;
293out:
294 put_clock_desc(&cd);
295
296 return err;
297}
298
299static int pc_clock_gettime(clockid_t id, struct timespec *ts)
300{
301 struct posix_clock_desc cd;
302 int err;
303
304 err = get_clock_desc(id, &cd);
305 if (err)
306 return err;
307
308 if (cd.clk->ops.clock_gettime)
309 err = cd.clk->ops.clock_gettime(cd.clk, ts);
310 else
311 err = -EOPNOTSUPP;
312
313 put_clock_desc(&cd);
314
315 return err;
316}
317
318static int pc_clock_getres(clockid_t id, struct timespec *ts)
319{
320 struct posix_clock_desc cd;
321 int err;
322
323 err = get_clock_desc(id, &cd);
324 if (err)
325 return err;
326
327 if (cd.clk->ops.clock_getres)
328 err = cd.clk->ops.clock_getres(cd.clk, ts);
329 else
330 err = -EOPNOTSUPP;
331
332 put_clock_desc(&cd);
333
334 return err;
335}
336
337static int pc_clock_settime(clockid_t id, const struct timespec *ts)
338{
339 struct posix_clock_desc cd;
340 int err;
341
342 err = get_clock_desc(id, &cd);
343 if (err)
344 return err;
345
346 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
347 err = -EACCES;
348 goto out;
349 }
350
351 if (cd.clk->ops.clock_settime)
352 err = cd.clk->ops.clock_settime(cd.clk, ts);
353 else
354 err = -EOPNOTSUPP;
355out:
356 put_clock_desc(&cd);
357
358 return err;
359}
360
361static int pc_timer_create(struct k_itimer *kit)
362{
363 clockid_t id = kit->it_clock;
364 struct posix_clock_desc cd;
365 int err;
366
367 err = get_clock_desc(id, &cd);
368 if (err)
369 return err;
370
371 if (cd.clk->ops.timer_create)
372 err = cd.clk->ops.timer_create(cd.clk, kit);
373 else
374 err = -EOPNOTSUPP;
375
376 put_clock_desc(&cd);
377
378 return err;
379}
380
381static int pc_timer_delete(struct k_itimer *kit)
382{
383 clockid_t id = kit->it_clock;
384 struct posix_clock_desc cd;
385 int err;
386
387 err = get_clock_desc(id, &cd);
388 if (err)
389 return err;
390
391 if (cd.clk->ops.timer_delete)
392 err = cd.clk->ops.timer_delete(cd.clk, kit);
393 else
394 err = -EOPNOTSUPP;
395
396 put_clock_desc(&cd);
397
398 return err;
399}
400
401static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
402{
403 clockid_t id = kit->it_clock;
404 struct posix_clock_desc cd;
405
406 if (get_clock_desc(id, &cd))
407 return;
408
409 if (cd.clk->ops.timer_gettime)
410 cd.clk->ops.timer_gettime(cd.clk, kit, ts);
411
412 put_clock_desc(&cd);
413}
414
415static int pc_timer_settime(struct k_itimer *kit, int flags,
416 struct itimerspec *ts, struct itimerspec *old)
417{
418 clockid_t id = kit->it_clock;
419 struct posix_clock_desc cd;
420 int err;
421
422 err = get_clock_desc(id, &cd);
423 if (err)
424 return err;
425
426 if (cd.clk->ops.timer_settime)
427 err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
428 else
429 err = -EOPNOTSUPP;
430
431 put_clock_desc(&cd);
432
433 return err;
434}
435
436struct k_clock clock_posix_dynamic = {
437 .clock_getres = pc_clock_getres,
438 .clock_set = pc_clock_settime,
439 .clock_get = pc_clock_gettime,
440 .clock_adj = pc_clock_adjtime,
441 .timer_create = pc_timer_create,
442 .timer_set = pc_timer_settime,
443 .timer_del = pc_timer_delete,
444 .timer_get = pc_timer_gettime,
445};
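
A sketch of a hypothetical driver exposing itself through this new dynamic posix-clock layer; the foo_* names, the stubbed hardware read and the assumption that devid was allocated beforehand (e.g. with alloc_chrdev_region()) are illustrative. Any op left NULL falls back to -EOPNOTSUPP in the pc_*() wrappers above.

#include <linux/module.h>
#include <linux/posix-clock.h>
#include <linux/time.h>

static int foo_clock_gettime(struct posix_clock *pc, struct timespec *ts)
{
	/* a real driver would read its hardware clock here */
	getnstimeofday(ts);
	return 0;
}

static struct posix_clock foo_clock = {
	.ops = {
		.owner		= THIS_MODULE,
		.clock_gettime	= foo_clock_gettime,
	},
};

static int foo_clock_probe(dev_t devid)
{
	/*
	 * devid is a char device number the caller allocated; userspace
	 * reaches the clock through the fd of the matching /dev node,
	 * encoded into a clockid via CLOCKID_TO_FD()'s inverse.
	 */
	return posix_clock_register(&foo_clock, devid);
}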
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 48b2761b5668..c7218d132738 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
@@ -457,23 +456,27 @@ void tick_broadcast_oneshot_control(unsigned long reason)
457 unsigned long flags; 456 unsigned long flags;
458 int cpu; 457 int cpu;
459 458
460 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
461
462 /* 459 /*
463 * Periodic mode does not care about the enter/exit of power 460 * Periodic mode does not care about the enter/exit of power
464 * states 461 * states
465 */ 462 */
466 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 463 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
467 goto out; 464 return;
468 465
469 bc = tick_broadcast_device.evtdev; 466 /*
467 * We are called with preemtion disabled from the depth of the
468 * idle code, so we can't be moved away.
469 */
470 cpu = smp_processor_id(); 470 cpu = smp_processor_id();
471 td = &per_cpu(tick_cpu_device, cpu); 471 td = &per_cpu(tick_cpu_device, cpu);
472 dev = td->evtdev; 472 dev = td->evtdev;
473 473
474 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) 474 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
475 goto out; 475 return;
476
477 bc = tick_broadcast_device.evtdev;
476 478
479 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
477 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 480 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
478 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { 481 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
479 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); 482 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
@@ -490,8 +493,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)
490 tick_program_event(dev->next_event, 1); 493 tick_program_event(dev->next_event, 1);
491 } 494 }
492 } 495 }
493
494out:
495 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 496 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
496} 497}
497 498
@@ -523,10 +524,11 @@ static void tick_broadcast_init_next_event(struct cpumask *mask,
523 */ 524 */
524void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 525void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
525{ 526{
527 int cpu = smp_processor_id();
528
526 /* Set it up only once ! */ 529 /* Set it up only once ! */
527 if (bc->event_handler != tick_handle_oneshot_broadcast) { 530 if (bc->event_handler != tick_handle_oneshot_broadcast) {
528 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 531 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
529 int cpu = smp_processor_id();
530 532
531 bc->event_handler = tick_handle_oneshot_broadcast; 533 bc->event_handler = tick_handle_oneshot_broadcast;
532 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 534 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -552,6 +554,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
552 tick_broadcast_set_event(tick_next_period, 1); 554 tick_broadcast_set_event(tick_next_period, 1);
553 } else 555 } else
554 bc->next_event.tv64 = KTIME_MAX; 556 bc->next_event.tv64 = KTIME_MAX;
557 } else {
558 /*
559 * The first cpu which switches to oneshot mode sets
560 * the bit for all other cpus which are in the general
561 * (periodic) broadcast mask. So the bit is set and
562 * would prevent the first broadcast enter after this
563 * from programming the bc device.
564 */
565 tick_broadcast_clear_oneshot(cpu);
555 } 566 }
556} 567}
557 568
@@ -600,4 +611,14 @@ int tick_broadcast_oneshot_active(void)
600 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; 611 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
601} 612}
602 613
614/*
615 * Check whether the broadcast device supports oneshot.
616 */
617bool tick_broadcast_oneshot_available(void)
618{
619 struct clock_event_device *bc = tick_broadcast_device.evtdev;
620
621 return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
622}
623
603#endif 624#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 051bc80a0c43..119528de8235 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include <asm/irq_regs.h> 22#include <asm/irq_regs.h>
24 23
@@ -51,7 +50,11 @@ int tick_is_oneshot_available(void)
51{ 50{
52 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 51 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
53 52
54 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); 53 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT))
54 return 0;
55 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
56 return 1;
57 return tick_broadcast_oneshot_available();
55} 58}
56 59
57/* 60/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 290eefbc1f60..1009b06d6f89 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
1/* 1/*
2 * tick internal variable and functions used by low/high res code 2 * tick internal variable and functions used by low/high res code
3 */ 3 */
4#include <linux/hrtimer.h>
5#include <linux/tick.h>
6
7#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
4 8
5#define TICK_DO_TIMER_NONE -1 9#define TICK_DO_TIMER_NONE -1
6#define TICK_DO_TIMER_BOOT -2 10#define TICK_DO_TIMER_BOOT -2
@@ -36,6 +40,7 @@ extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
36extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 40extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
37extern int tick_broadcast_oneshot_active(void); 41extern int tick_broadcast_oneshot_active(void);
38extern void tick_check_oneshot_broadcast(int cpu); 42extern void tick_check_oneshot_broadcast(int cpu);
43bool tick_broadcast_oneshot_available(void);
39# else /* BROADCAST */ 44# else /* BROADCAST */
40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 45static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
41{ 46{
@@ -46,6 +51,7 @@ static inline void tick_broadcast_switch_to_oneshot(void) { }
46static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 51static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
47static inline int tick_broadcast_oneshot_active(void) { return 0; } 52static inline int tick_broadcast_oneshot_active(void) { return 0; }
48static inline void tick_check_oneshot_broadcast(int cpu) { } 53static inline void tick_check_oneshot_broadcast(int cpu) { }
54static inline bool tick_broadcast_oneshot_available(void) { return true; }
49# endif /* !BROADCAST */ 55# endif /* !BROADCAST */
50 56
51#else /* !ONESHOT */ 57#else /* !ONESHOT */
@@ -76,6 +82,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
76 return 0; 82 return 0;
77} 83}
78static inline int tick_broadcast_oneshot_active(void) { return 0; } 84static inline int tick_broadcast_oneshot_active(void) { return 0; }
85static inline bool tick_broadcast_oneshot_available(void) { return false; }
79#endif /* !TICK_ONESHOT */ 86#endif /* !TICK_ONESHOT */
80 87
81/* 88/*
@@ -132,3 +139,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
132{ 139{
133 return !(dev->features & CLOCK_EVT_FEAT_DUMMY); 140 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
134} 141}
142
143#endif
144
145extern void do_timer(unsigned long ticks);
146extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 5cbc101f908b..2d04411a5f05 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c55ea2433471..d5097c44b407 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -19,7 +19,6 @@
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/tick.h>
23#include <linux/module.h> 22#include <linux/module.h>
24 23
25#include <asm/irq_regs.h> 24#include <asm/irq_regs.h>
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d27c7562902c..342408cf68dd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -14,7 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/sysdev.h> 17#include <linux/syscore_ops.h>
18#include <linux/clocksource.h> 18#include <linux/clocksource.h>
19#include <linux/jiffies.h> 19#include <linux/jiffies.h>
20#include <linux/time.h> 20#include <linux/time.h>
@@ -353,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday);
353 * 353 *
354 * Sets the time of day to the new time and update NTP and notify hrtimers 354 * Sets the time of day to the new time and update NTP and notify hrtimers
355 */ 355 */
356int do_settimeofday(struct timespec *tv) 356int do_settimeofday(const struct timespec *tv)
357{ 357{
358 struct timespec ts_delta; 358 struct timespec ts_delta;
359 unsigned long flags; 359 unsigned long flags;
@@ -387,6 +387,42 @@ int do_settimeofday(struct timespec *tv)
387 387
388EXPORT_SYMBOL(do_settimeofday); 388EXPORT_SYMBOL(do_settimeofday);
389 389
390
391/**
392 * timekeeping_inject_offset - Adds or subtracts from the current time.
393 * @tv: pointer to the timespec variable containing the offset
394 *
395 * Adds or subtracts an offset value from the current time.
396 */
397int timekeeping_inject_offset(struct timespec *ts)
398{
399 unsigned long flags;
400
401 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
402 return -EINVAL;
403
404 write_seqlock_irqsave(&xtime_lock, flags);
405
406 timekeeping_forward_now();
407
408 xtime = timespec_add(xtime, *ts);
409 wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
410
411 timekeeper.ntp_error = 0;
412 ntp_clear();
413
414 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
415 timekeeper.mult);
416
417 write_sequnlock_irqrestore(&xtime_lock, flags);
418
419 /* signal hrtimers about time change */
420 clock_was_set();
421
422 return 0;
423}
424EXPORT_SYMBOL(timekeeping_inject_offset);
425
390/** 426/**
391 * change_clocksource - Swaps clocksources if a new one is available 427 * change_clocksource - Swaps clocksources if a new one is available
392 * 428 *
@@ -560,14 +596,65 @@ void __init timekeeping_init(void)
560static struct timespec timekeeping_suspend_time; 596static struct timespec timekeeping_suspend_time;
561 597
562/** 598/**
599 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
600 * @delta: pointer to a timespec delta value
601 *
602 * Takes a timespec offset measuring a suspend interval and properly
603 * adds the sleep offset to the timekeeping variables.
604 */
605static void __timekeeping_inject_sleeptime(struct timespec *delta)
606{
607 xtime = timespec_add(xtime, *delta);
608 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
609 total_sleep_time = timespec_add(total_sleep_time, *delta);
610}
611
612
613/**
614 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values
615 * @delta: pointer to a timespec delta value
616 *
617 * This hook is for architectures that cannot support read_persistent_clock
618 * because their RTC/persistent clock is only accessible when irqs are enabled.
619 *
620 * This function should only be called by rtc_resume(), and allows
621 * a suspend offset to be injected into the timekeeping values.
622 */
623void timekeeping_inject_sleeptime(struct timespec *delta)
624{
625 unsigned long flags;
626 struct timespec ts;
627
628 /* Make sure we don't set the clock twice */
629 read_persistent_clock(&ts);
630 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
631 return;
632
633 write_seqlock_irqsave(&xtime_lock, flags);
634 timekeeping_forward_now();
635
636 __timekeeping_inject_sleeptime(delta);
637
638 timekeeper.ntp_error = 0;
639 ntp_clear();
640 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
641 timekeeper.mult);
642
643 write_sequnlock_irqrestore(&xtime_lock, flags);
644
645 /* signal hrtimers about time change */
646 clock_was_set();
647}
648
649
650/**
563 * timekeeping_resume - Resumes the generic timekeeping subsystem. 651 * timekeeping_resume - Resumes the generic timekeeping subsystem.
564 * @dev: unused
565 * 652 *
566 * This is for the generic clocksource timekeeping. 653 * This is for the generic clocksource timekeeping.
567 * xtime/wall_to_monotonic/jiffies/etc are 654 * xtime/wall_to_monotonic/jiffies/etc are
568 * still managed by arch specific suspend/resume code. 655 * still managed by arch specific suspend/resume code.
569 */ 656 */
570static int timekeeping_resume(struct sys_device *dev) 657static void timekeeping_resume(void)
571{ 658{
572 unsigned long flags; 659 unsigned long flags;
573 struct timespec ts; 660 struct timespec ts;
@@ -580,9 +667,7 @@ static int timekeeping_resume(struct sys_device *dev)
580 667
581 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 668 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
582 ts = timespec_sub(ts, timekeeping_suspend_time); 669 ts = timespec_sub(ts, timekeeping_suspend_time);
583 xtime = timespec_add(xtime, ts); 670 __timekeeping_inject_sleeptime(&ts);
584 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
585 total_sleep_time = timespec_add(total_sleep_time, ts);
586 } 671 }
587 /* re-base the last cycle value */ 672 /* re-base the last cycle value */
588 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 673 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
@@ -595,12 +680,10 @@ static int timekeeping_resume(struct sys_device *dev)
595 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); 680 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
596 681
597 /* Resume hrtimers */ 682 /* Resume hrtimers */
598 hres_timers_resume(); 683 hrtimers_resume();
599
600 return 0;
601} 684}
602 685
603static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) 686static int timekeeping_suspend(void)
604{ 687{
605 unsigned long flags; 688 unsigned long flags;
606 689
@@ -618,26 +701,18 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
618} 701}
619 702
620/* sysfs resume/suspend bits for timekeeping */ 703/* sysfs resume/suspend bits for timekeeping */
621static struct sysdev_class timekeeping_sysclass = { 704static struct syscore_ops timekeeping_syscore_ops = {
622 .name = "timekeeping",
623 .resume = timekeeping_resume, 705 .resume = timekeeping_resume,
624 .suspend = timekeeping_suspend, 706 .suspend = timekeeping_suspend,
625}; 707};
626 708
627static struct sys_device device_timer = { 709static int __init timekeeping_init_ops(void)
628 .id = 0,
629 .cls = &timekeeping_sysclass,
630};
631
632static int __init timekeeping_init_device(void)
633{ 710{
634 int error = sysdev_class_register(&timekeeping_sysclass); 711 register_syscore_ops(&timekeeping_syscore_ops);
635 if (!error) 712 return 0;
636 error = sysdev_register(&device_timer);
637 return error;
638} 713}
639 714
640device_initcall(timekeeping_init_device); 715device_initcall(timekeeping_init_ops);
641 716
642/* 717/*
643 * If the error is already larger, we look ahead even further 718 * If the error is already larger, we look ahead even further
@@ -779,7 +854,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
779 * 854 *
780 * Called from the timer interrupt, must hold a write on xtime_lock. 855 * Called from the timer interrupt, must hold a write on xtime_lock.
781 */ 856 */
782void update_wall_time(void) 857static void update_wall_time(void)
783{ 858{
784 struct clocksource *clock; 859 struct clocksource *clock;
785 cycle_t offset; 860 cycle_t offset;
@@ -871,7 +946,7 @@ void update_wall_time(void)
871 * getboottime - Return the real time of system boot. 946 * getboottime - Return the real time of system boot.
872 * @ts: pointer to the timespec to be set 947 * @ts: pointer to the timespec to be set
873 * 948 *
874 * Returns the time of day in a timespec. 949 * Returns the wall-time of boot in a timespec.
875 * 950 *
876 * This is based on the wall_to_monotonic offset and the total suspend 951 * This is based on the wall_to_monotonic offset and the total suspend
877 * time. Calls to settimeofday will affect the value returned (which 952 * time. Calls to settimeofday will affect the value returned (which
@@ -889,6 +964,55 @@ void getboottime(struct timespec *ts)
889} 964}
890EXPORT_SYMBOL_GPL(getboottime); 965EXPORT_SYMBOL_GPL(getboottime);
891 966
967
968/**
969 * get_monotonic_boottime - Returns monotonic time since boot
970 * @ts: pointer to the timespec to be set
971 *
972 * Returns the monotonic time since boot in a timespec.
973 *
974 * This is similar to CLOCK_MONOTONIC/ktime_get_ts, but also
975 * includes the time spent in suspend.
976 */
977void get_monotonic_boottime(struct timespec *ts)
978{
979 struct timespec tomono, sleep;
980 unsigned int seq;
981 s64 nsecs;
982
983 WARN_ON(timekeeping_suspended);
984
985 do {
986 seq = read_seqbegin(&xtime_lock);
987 *ts = xtime;
988 tomono = wall_to_monotonic;
989 sleep = total_sleep_time;
990 nsecs = timekeeping_get_ns();
991
992 } while (read_seqretry(&xtime_lock, seq));
993
994 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
995 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
996}
997EXPORT_SYMBOL_GPL(get_monotonic_boottime);
998
999/**
1000 * ktime_get_boottime - Returns monotonic time since boot in a ktime
1001 *
1002 * Returns the monotonic time since boot in a ktime
1003 *
1004 * This is similar to CLOCK_MONOTONIC/ktime_get, but also
1005 * includes the time spent in suspend.
1006 */
1007ktime_t ktime_get_boottime(void)
1008{
1009 struct timespec ts;
1010
1011 get_monotonic_boottime(&ts);
1012 return timespec_to_ktime(ts);
1013}
1014EXPORT_SYMBOL_GPL(ktime_get_boottime);
1015
892/** 1016/**
893 * monotonic_to_bootbased - Convert the monotonic time to boot based. 1017 * monotonic_to_bootbased - Convert the monotonic time to boot based.
894 * @ts: pointer to the timespec to be converted 1018 * @ts: pointer to the timespec to be converted
@@ -910,11 +1034,6 @@ struct timespec __current_kernel_time(void)
910 return xtime; 1034 return xtime;
911} 1035}
912 1036
913struct timespec __get_wall_to_monotonic(void)
914{
915 return wall_to_monotonic;
916}
917
918struct timespec current_kernel_time(void) 1037struct timespec current_kernel_time(void)
919{ 1038{
920 struct timespec now; 1039 struct timespec now;
@@ -946,3 +1065,63 @@ struct timespec get_monotonic_coarse(void)
946 now.tv_nsec + mono.tv_nsec); 1065 now.tv_nsec + mono.tv_nsec);
947 return now; 1066 return now;
948} 1067}
1068
1069/*
1070 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1071 * without sampling the sequence number in xtime_lock.
1072 * jiffies is defined in the linker script...
1073 */
1074void do_timer(unsigned long ticks)
1075{
1076 jiffies_64 += ticks;
1077 update_wall_time();
1078 calc_global_load(ticks);
1079}
1080
1081/**
1082 * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
1083 * and sleep offsets.
1084 * @xtim: pointer to timespec to be set with xtime
1085 * @wtom: pointer to timespec to be set with wall_to_monotonic
1086 * @sleep: pointer to timespec to be set with time in suspend
1087 */
1088void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1089 struct timespec *wtom, struct timespec *sleep)
1090{
1091 unsigned long seq;
1092
1093 do {
1094 seq = read_seqbegin(&xtime_lock);
1095 *xtim = xtime;
1096 *wtom = wall_to_monotonic;
1097 *sleep = total_sleep_time;
1098 } while (read_seqretry(&xtime_lock, seq));
1099}
1100
1101/**
1102 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1103 */
1104ktime_t ktime_get_monotonic_offset(void)
1105{
1106 unsigned long seq;
1107 struct timespec wtom;
1108
1109 do {
1110 seq = read_seqbegin(&xtime_lock);
1111 wtom = wall_to_monotonic;
1112 } while (read_seqretry(&xtime_lock, seq));
1113 return timespec_to_ktime(wtom);
1114}
1115
1116/**
1117 * xtime_update() - advances the timekeeping infrastructure
1118 * @ticks: number of ticks, that have elapsed since the last call.
1119 *
1120 * Must be called with interrupts disabled.
1121 */
1122void xtime_update(unsigned long ticks)
1123{
1124 write_seqlock(&xtime_lock);
1125 do_timer(ticks);
1126 write_sequnlock(&xtime_lock);
1127}
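
A small userspace sketch contrasting CLOCK_MONOTONIC with the suspend-aware boot time now provided by get_monotonic_boottime(); the CLOCK_BOOTTIME fallback value is illustrative and assumes the clockid is wired up elsewhere in this series.

#include <stdio.h>
#include <time.h>

#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7	/* clockid used by this series; illustrative fallback */
#endif

int main(void)
{
	struct timespec mono, boot;

	clock_gettime(CLOCK_MONOTONIC, &mono);
	clock_gettime(CLOCK_BOOTTIME, &boot);

	/* The difference is (roughly) the accumulated suspend time */
	printf("monotonic %ld.%09ld boottime %ld.%09ld\n",
	       (long)mono.tv_sec, (long)mono.tv_nsec,
	       (long)boot.tv_sec, (long)boot.tv_nsec);
	return 0;
}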
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 32a19f9397fc..3258455549f4 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -41,7 +41,7 @@ static void print_name_offset(struct seq_file *m, void *sym)
41 char symname[KSYM_NAME_LEN]; 41 char symname[KSYM_NAME_LEN];
42 42
43 if (lookup_symbol_name((unsigned long)sym, symname) < 0) 43 if (lookup_symbol_name((unsigned long)sym, symname) < 0)
44 SEQ_printf(m, "<%p>", sym); 44 SEQ_printf(m, "<%pK>", sym);
45 else 45 else
46 SEQ_printf(m, "%s", symname); 46 SEQ_printf(m, "%s", symname);
47} 47}
@@ -112,7 +112,7 @@ next_one:
112static void 112static void
113print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) 113print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
114{ 114{
115 SEQ_printf(m, " .base: %p\n", base); 115 SEQ_printf(m, " .base: %pK\n", base);
116 SEQ_printf(m, " .index: %d\n", 116 SEQ_printf(m, " .index: %d\n",
117 base->index); 117 base->index);
118 SEQ_printf(m, " .resolution: %Lu nsecs\n", 118 SEQ_printf(m, " .resolution: %Lu nsecs\n",
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 2f3b585b8d7d..a5d0a3a85dd8 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
236 unsigned int timer_flag) 236 unsigned int timer_flag)
237{ 237{
238 /* 238 /*
239 * It doesnt matter which lock we take: 239 * It doesn't matter which lock we take:
240 */ 240 */
241 raw_spinlock_t *lock; 241 raw_spinlock_t *lock;
242 struct entry *entry, input; 242 struct entry *entry, input;
diff --git a/kernel/timer.c b/kernel/timer.c
index d53ce66daea0..fd6198692b57 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -404,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {}
404 404
405static struct debug_obj_descr timer_debug_descr; 405static struct debug_obj_descr timer_debug_descr;
406 406
407static void *timer_debug_hint(void *addr)
408{
409 return ((struct timer_list *) addr)->function;
410}
411
407/* 412/*
408 * fixup_init is called when: 413 * fixup_init is called when:
409 * - an active object is initialized 414 * - an active object is initialized
@@ -477,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
477 482
478static struct debug_obj_descr timer_debug_descr = { 483static struct debug_obj_descr timer_debug_descr = {
479 .name = "timer_list", 484 .name = "timer_list",
485 .debug_hint = timer_debug_hint,
480 .fixup_init = timer_fixup_init, 486 .fixup_init = timer_fixup_init,
481 .fixup_activate = timer_fixup_activate, 487 .fixup_activate = timer_fixup_activate,
482 .fixup_free = timer_fixup_free, 488 .fixup_free = timer_fixup_free,
@@ -959,11 +965,30 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
959 * 965 *
960 * Synchronization rules: Callers must prevent restarting of the timer, 966 * Synchronization rules: Callers must prevent restarting of the timer,
961 * otherwise this function is meaningless. It must not be called from 967 * otherwise this function is meaningless. It must not be called from
962 * hardirq contexts. The caller must not hold locks which would prevent 968 * interrupt contexts. The caller must not hold locks which would prevent
963 * completion of the timer's handler. The timer's handler must not call 969 * completion of the timer's handler. The timer's handler must not call
964 * add_timer_on(). Upon exit the timer is not queued and the handler is 970 * add_timer_on(). Upon exit the timer is not queued and the handler is
965 * not running on any CPU. 971 * not running on any CPU.
966 * 972 *
973 * Note: You must not hold locks that are held in interrupt context
974 * while calling this function. Even if the lock has nothing to do
975 * with the timer in question. Here's why:
976 *
977 * CPU0 CPU1
978 * ---- ----
979 * <SOFTIRQ>
980 * call_timer_fn();
981 * base->running_timer = mytimer;
982 * spin_lock_irq(somelock);
983 * <IRQ>
984 * spin_lock(somelock);
985 * del_timer_sync(mytimer);
986 * while (base->running_timer == mytimer);
987 *
988 * Now del_timer_sync() will never return and never release somelock.
989 * The interrupt on the other CPU is waiting to grab somelock but
990 * it has interrupted the softirq that CPU0 is waiting to finish.
991 *
967 * The function returns whether it has deactivated a pending timer or not. 992 * The function returns whether it has deactivated a pending timer or not.
968 */ 993 */
969int del_timer_sync(struct timer_list *timer) 994int del_timer_sync(struct timer_list *timer)
@@ -971,12 +996,14 @@ int del_timer_sync(struct timer_list *timer)
971#ifdef CONFIG_LOCKDEP 996#ifdef CONFIG_LOCKDEP
972 unsigned long flags; 997 unsigned long flags;
973 998
974 raw_local_irq_save(flags); 999 /*
975 local_bh_disable(); 1000 * If lockdep gives a backtrace here, please reference
1001 * the synchronization rules above.
1002 */
1003 local_irq_save(flags);
976 lock_map_acquire(&timer->lockdep_map); 1004 lock_map_acquire(&timer->lockdep_map);
977 lock_map_release(&timer->lockdep_map); 1005 lock_map_release(&timer->lockdep_map);
978 _local_bh_enable(); 1006 local_irq_restore(flags);
979 raw_local_irq_restore(flags);
980#endif 1007#endif
981 /* 1008 /*
982 * don't use it in hardirq context, because it 1009 * don't use it in hardirq context, because it
@@ -1297,19 +1324,6 @@ void run_local_timers(void)
1297 raise_softirq(TIMER_SOFTIRQ); 1324 raise_softirq(TIMER_SOFTIRQ);
1298} 1325}
1299 1326
1300/*
1301 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1302 * without sampling the sequence number in xtime_lock.
1303 * jiffies is defined in the linker script...
1304 */
1305
1306void do_timer(unsigned long ticks)
1307{
1308 jiffies_64 += ticks;
1309 update_wall_time();
1310 calc_global_load(ticks);
1311}
1312
1313#ifdef __ARCH_WANT_SYS_ALARM 1327#ifdef __ARCH_WANT_SYS_ALARM
1314 1328
1315/* 1329/*
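
A sketch of the locking rule spelled out in the del_timer_sync() comment above, with foo_* names being illustrative: drop any lock an interrupt handler might take before waiting for the timer callback to finish.

#include <linux/spinlock.h>
#include <linux/timer.h>

static struct timer_list foo_timer;	/* illustrative; set up elsewhere */
static DEFINE_SPINLOCK(foo_lock);	/* also taken from foo's interrupt handler */

static void foo_shutdown(void)
{
	unsigned long flags;

	spin_lock_irqsave(&foo_lock, flags);
	/* ...tear down the state that the timer callback inspects... */
	spin_unlock_irqrestore(&foo_lock, flags);

	/* Safe: no interrupt-side lock is held while waiting for the handler */
	del_timer_sync(&foo_timer);
}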
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 14674dce77a6..2ad39e556cb4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -141,7 +141,7 @@ if FTRACE
141config FUNCTION_TRACER 141config FUNCTION_TRACER
142 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
143 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
144 select FRAME_POINTER if !ARM_UNWIND && !S390 144 select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE
145 select KALLSYMS 145 select KALLSYMS
146 select GENERIC_TRACER 146 select GENERIC_TRACER
147 select CONTEXT_SWITCH_TRACER 147 select CONTEXT_SWITCH_TRACER
@@ -275,7 +275,7 @@ config PROFILE_ANNOTATED_BRANCHES
275 This tracer profiles all the the likely and unlikely macros 275 This tracer profiles all the the likely and unlikely macros
276 in the kernel. It will display the results in: 276 in the kernel. It will display the results in:
277 277
278 /sys/kernel/debug/tracing/profile_annotated_branch 278 /sys/kernel/debug/tracing/trace_stat/branch_annotated
279 279
280 Note: this will add a significant overhead; only turn this 280 Note: this will add a significant overhead; only turn this
281 on if you need to profile the system's use of these macros. 281 on if you need to profile the system's use of these macros.
@@ -288,7 +288,7 @@ config PROFILE_ALL_BRANCHES
288 taken in the kernel is recorded whether it hit or miss. 288 taken in the kernel is recorded whether it hit or miss.
289 The results will be displayed in: 289 The results will be displayed in:
290 290
291 /sys/kernel/debug/tracing/profile_branch 291 /sys/kernel/debug/tracing/trace_stat/branch_all
292 292
293 This option also enables the likely/unlikely profiler. 293 This option also enables the likely/unlikely profiler.
294 294
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 153562d0b93c..6957aa298dfa 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -138,6 +138,13 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
138 !blk_tracer_enabled)) 138 !blk_tracer_enabled))
139 return; 139 return;
140 140
141 /*
142 * If the BLK_TC_NOTIFY action mask isn't set, don't send any note
143 * message to the trace.
144 */
145 if (!(bt->act_mask & BLK_TC_NOTIFY))
146 return;
147
141 local_irq_save(flags); 148 local_irq_save(flags);
142 buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); 149 buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
143 va_start(args, fmt); 150 va_start(args, fmt);
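Editor's note: a hedged sketch of a caller, not part of this patch. With the new BLK_TC_NOTIFY check above, a note like this is only copied into the trace buffer when userspace requested notify events in the blktrace act_mask.

        #include <linux/blkdev.h>
        #include <linux/blktrace_api.h>

        /* Hypothetical in-kernel caller emitting a trace note. */
        static void example_note_depth(struct request_queue *q, unsigned int depth)
        {
                blk_add_trace_msg(q, "example: queue depth now %u", depth);
        }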
@@ -696,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q)
696 * 703 *
697 **/ 704 **/
698static void blk_add_trace_rq(struct request_queue *q, struct request *rq, 705static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
699 u32 what) 706 u32 what)
700{ 707{
701 struct blk_trace *bt = q->blk_trace; 708 struct blk_trace *bt = q->blk_trace;
702 int rw = rq->cmd_flags & 0x03;
703 709
704 if (likely(!bt)) 710 if (likely(!bt))
705 return; 711 return;
706 712
707 if (rq->cmd_flags & REQ_DISCARD)
708 rw |= REQ_DISCARD;
709
710 if (rq->cmd_flags & REQ_SECURE)
711 rw |= REQ_SECURE;
712
713 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 713 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
714 what |= BLK_TC_ACT(BLK_TC_PC); 714 what |= BLK_TC_ACT(BLK_TC_PC);
715 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, 715 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags,
716 what, rq->errors, rq->cmd_len, rq->cmd); 716 what, rq->errors, rq->cmd_len, rq->cmd);
717 } else { 717 } else {
718 what |= BLK_TC_ACT(BLK_TC_FS); 718 what |= BLK_TC_ACT(BLK_TC_FS);
719 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, 719 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
720 what, rq->errors, 0, NULL); 720 rq->cmd_flags, what, rq->errors, 0, NULL);
721 } 721 }
722} 722}
723 723
@@ -850,29 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
850 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 850 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
851} 851}
852 852
853static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) 853static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
854 unsigned int depth, bool explicit)
854{ 855{
855 struct blk_trace *bt = q->blk_trace; 856 struct blk_trace *bt = q->blk_trace;
856 857
857 if (bt) { 858 if (bt) {
858 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; 859 __be64 rpdu = cpu_to_be64(depth);
859 __be64 rpdu = cpu_to_be64(pdu); 860 u32 what;
860
861 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
862 sizeof(rpdu), &rpdu);
863 }
864}
865
866static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
867{
868 struct blk_trace *bt = q->blk_trace;
869 861
870 if (bt) { 862 if (explicit)
871 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; 863 what = BLK_TA_UNPLUG_IO;
872 __be64 rpdu = cpu_to_be64(pdu); 864 else
865 what = BLK_TA_UNPLUG_TIMER;
873 866
874 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, 867 __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
875 sizeof(rpdu), &rpdu);
876 } 868 }
877} 869}
878 870
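Editor's note: a minimal sketch of how the consolidated tracepoint would be driven by the block core (the trace_block_unplug() call below is assumed, not shown in this hunk). An explicit unplug and a schedule-driven one now differ only in the boolean argument instead of using two separate trace points.

        #include <linux/blkdev.h>
        #include <trace/events/block.h>

        static void example_flush_plug(struct request_queue *q, unsigned int depth,
                                       bool from_schedule)
        {
                /* one tracepoint; 'explicit' distinguishes the two old events */
                trace_block_unplug(q, depth, !from_schedule);
                /* ... dispatch the 'depth' queued requests ... */
        }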
@@ -1015,9 +1007,7 @@ static void blk_register_tracepoints(void)
1015 WARN_ON(ret); 1007 WARN_ON(ret);
1016 ret = register_trace_block_plug(blk_add_trace_plug, NULL); 1008 ret = register_trace_block_plug(blk_add_trace_plug, NULL);
1017 WARN_ON(ret); 1009 WARN_ON(ret);
1018 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); 1010 ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
1019 WARN_ON(ret);
1020 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
1021 WARN_ON(ret); 1011 WARN_ON(ret);
1022 ret = register_trace_block_split(blk_add_trace_split, NULL); 1012 ret = register_trace_block_split(blk_add_trace_split, NULL);
1023 WARN_ON(ret); 1013 WARN_ON(ret);
@@ -1032,8 +1022,7 @@ static void blk_unregister_tracepoints(void)
1032 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1022 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1033 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); 1023 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1034 unregister_trace_block_split(blk_add_trace_split, NULL); 1024 unregister_trace_block_split(blk_add_trace_split, NULL);
1035 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); 1025 unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
1036 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
1037 unregister_trace_block_plug(blk_add_trace_plug, NULL); 1026 unregister_trace_block_plug(blk_add_trace_plug, NULL);
1038 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); 1027 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
1039 unregister_trace_block_getrq(blk_add_trace_getrq, NULL); 1028 unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
@@ -1820,21 +1809,5 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1820 rwbs[i] = '\0'; 1809 rwbs[i] = '\0';
1821} 1810}
1822 1811
1823void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
1824{
1825 int rw = rq->cmd_flags & 0x03;
1826 int bytes;
1827
1828 if (rq->cmd_flags & REQ_DISCARD)
1829 rw |= REQ_DISCARD;
1830
1831 if (rq->cmd_flags & REQ_SECURE)
1832 rw |= REQ_SECURE;
1833
1834 bytes = blk_rq_bytes(rq);
1835
1836 blk_fill_rwbs(rwbs, rw, bytes);
1837}
1838
1839#endif /* CONFIG_EVENT_TRACING */ 1812#endif /* CONFIG_EVENT_TRACING */
1840 1813
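Editor's note: with blk_fill_rwbs_rq() gone, callers feed the request's cmd_flags to the generic helper directly, since the flags already carry REQ_DISCARD and REQ_SECURE. A hypothetical caller (example_print_rq is invented for illustration):

        #include <linux/blkdev.h>
        #include <linux/blktrace_api.h>

        static void example_print_rq(struct request *rq)
        {
                char rwbs[8];

                blk_fill_rwbs(rwbs, rq->cmd_flags, blk_rq_bytes(rq));
                pr_debug("request %s, %u bytes\n", rwbs, blk_rq_bytes(rq));
        }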
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f3dadae83883..d017c2c82c44 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -39,20 +39,26 @@
39#include "trace_stat.h" 39#include "trace_stat.h"
40 40
41#define FTRACE_WARN_ON(cond) \ 41#define FTRACE_WARN_ON(cond) \
42 do { \ 42 ({ \
43 if (WARN_ON(cond)) \ 43 int ___r = cond; \
44 if (WARN_ON(___r)) \
44 ftrace_kill(); \ 45 ftrace_kill(); \
45 } while (0) 46 ___r; \
47 })
46 48
47#define FTRACE_WARN_ON_ONCE(cond) \ 49#define FTRACE_WARN_ON_ONCE(cond) \
48 do { \ 50 ({ \
49 if (WARN_ON_ONCE(cond)) \ 51 int ___r = cond; \
52 if (WARN_ON_ONCE(___r)) \
50 ftrace_kill(); \ 53 ftrace_kill(); \
51 } while (0) 54 ___r; \
55 })
52 56
53/* hash bits for specific function selection */ 57/* hash bits for specific function selection */
54#define FTRACE_HASH_BITS 7 58#define FTRACE_HASH_BITS 7
55#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) 59#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
60#define FTRACE_HASH_DEFAULT_BITS 10
61#define FTRACE_HASH_MAX_BITS 12
56 62
57/* ftrace_enabled is a method to turn ftrace on or off */ 63/* ftrace_enabled is a method to turn ftrace on or off */
58int ftrace_enabled __read_mostly; 64int ftrace_enabled __read_mostly;
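Editor's note: the macros become GNU statement expressions so they yield the tested condition. A hypothetical caller can now use them directly in a conditional while still getting ftrace_kill() on failure:

        static int example_check_hash(struct ftrace_hash *hash)
        {
                /* warns, kills ftrace, and reports the failure in one step */
                if (FTRACE_WARN_ON(!hash))
                        return -EINVAL;
                return 0;
        }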
@@ -81,23 +87,29 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
81 .func = ftrace_stub, 87 .func = ftrace_stub,
82}; 88};
83 89
84static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 90static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
91static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
85ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 92ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
86ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 93ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
87ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 94ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
95static struct ftrace_ops global_ops;
96
97static void
98ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
88 99
89/* 100/*
90 * Traverse the ftrace_list, invoking all entries. The reason that we 101 * Traverse the ftrace_global_list, invoking all entries. The reason that we
91 * can use rcu_dereference_raw() is that elements removed from this list 102 * can use rcu_dereference_raw() is that elements removed from this list
92 * are simply leaked, so there is no need to interact with a grace-period 103 * are simply leaked, so there is no need to interact with a grace-period
93 * mechanism. The rcu_dereference_raw() calls are needed to handle 104 * mechanism. The rcu_dereference_raw() calls are needed to handle
94 * concurrent insertions into the ftrace_list. 105 * concurrent insertions into the ftrace_global_list.
95 * 106 *
96 * Silly Alpha and silly pointer-speculation compiler optimizations! 107 * Silly Alpha and silly pointer-speculation compiler optimizations!
97 */ 108 */
98static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 109static void ftrace_global_list_func(unsigned long ip,
110 unsigned long parent_ip)
99{ 111{
100 struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ 112 struct ftrace_ops *op = rcu_dereference_raw(ftrace_global_list); /*see above*/
101 113
102 while (op != &ftrace_list_end) { 114 while (op != &ftrace_list_end) {
103 op->func(ip, parent_ip); 115 op->func(ip, parent_ip);
@@ -147,46 +159,69 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
147} 159}
148#endif 160#endif
149 161
150static int __register_ftrace_function(struct ftrace_ops *ops) 162static void update_global_ops(void)
151{ 163{
152 ops->next = ftrace_list; 164 ftrace_func_t func;
165
153 /* 166 /*
154 * We are entering ops into the ftrace_list but another 167 * If there's only one function registered, then call that
155 * CPU might be walking that list. We need to make sure 168 * function directly. Otherwise, we need to iterate over the
156 * the ops->next pointer is valid before another CPU sees 169 * registered callers.
157 * the ops pointer included into the ftrace_list.
158 */ 170 */
159 rcu_assign_pointer(ftrace_list, ops); 171 if (ftrace_global_list == &ftrace_list_end ||
172 ftrace_global_list->next == &ftrace_list_end)
173 func = ftrace_global_list->func;
174 else
175 func = ftrace_global_list_func;
160 176
161 if (ftrace_enabled) { 177 /* If we filter on pids, update to use the pid function */
162 ftrace_func_t func; 178 if (!list_empty(&ftrace_pids)) {
179 set_ftrace_pid_function(func);
180 func = ftrace_pid_func;
181 }
163 182
164 if (ops->next == &ftrace_list_end) 183 global_ops.func = func;
165 func = ops->func; 184}
166 else
167 func = ftrace_list_func;
168 185
169 if (!list_empty(&ftrace_pids)) { 186static void update_ftrace_function(void)
170 set_ftrace_pid_function(func); 187{
171 func = ftrace_pid_func; 188 ftrace_func_t func;
172 } 189
190 update_global_ops();
191
192 /*
193 * If we are at the end of the list and this ops is
194 * not dynamic, then have the mcount trampoline call
195 * the function directly
196 */
197 if (ftrace_ops_list == &ftrace_list_end ||
198 (ftrace_ops_list->next == &ftrace_list_end &&
199 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC)))
200 func = ftrace_ops_list->func;
201 else
202 func = ftrace_ops_list_func;
173 203
174 /*
175 * For one func, simply call it directly.
176 * For more than one func, call the chain.
177 */
178#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 204#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
179 ftrace_trace_function = func; 205 ftrace_trace_function = func;
180#else 206#else
181 __ftrace_trace_function = func; 207 __ftrace_trace_function = func;
182 ftrace_trace_function = ftrace_test_stop_func; 208 ftrace_trace_function = ftrace_test_stop_func;
183#endif 209#endif
184 } 210}
185 211
186 return 0; 212static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
213{
214 ops->next = *list;
215 /*
216 * We are entering ops into the list but another
217 * CPU might be walking that list. We need to make sure
218 * the ops->next pointer is valid before another CPU sees
219 * the ops pointer included into the list.
220 */
221 rcu_assign_pointer(*list, ops);
187} 222}
188 223
189static int __unregister_ftrace_function(struct ftrace_ops *ops) 224static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
190{ 225{
191 struct ftrace_ops **p; 226 struct ftrace_ops **p;
192 227
@@ -194,13 +229,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
194 * If we are removing the last function, then simply point 229 * If we are removing the last function, then simply point
195 * to the ftrace_stub. 230 * to the ftrace_stub.
196 */ 231 */
197 if (ftrace_list == ops && ops->next == &ftrace_list_end) { 232 if (*list == ops && ops->next == &ftrace_list_end) {
198 ftrace_trace_function = ftrace_stub; 233 *list = &ftrace_list_end;
199 ftrace_list = &ftrace_list_end;
200 return 0; 234 return 0;
201 } 235 }
202 236
203 for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) 237 for (p = list; *p != &ftrace_list_end; p = &(*p)->next)
204 if (*p == ops) 238 if (*p == ops)
205 break; 239 break;
206 240
@@ -208,53 +242,83 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
208 return -1; 242 return -1;
209 243
210 *p = (*p)->next; 244 *p = (*p)->next;
245 return 0;
246}
211 247
212 if (ftrace_enabled) { 248static int __register_ftrace_function(struct ftrace_ops *ops)
213 /* If we only have one func left, then call that directly */ 249{
214 if (ftrace_list->next == &ftrace_list_end) { 250 if (ftrace_disabled)
215 ftrace_func_t func = ftrace_list->func; 251 return -ENODEV;
216 252
217 if (!list_empty(&ftrace_pids)) { 253 if (FTRACE_WARN_ON(ops == &global_ops))
218 set_ftrace_pid_function(func); 254 return -EINVAL;
219 func = ftrace_pid_func; 255
220 } 256 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
221#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 257 return -EBUSY;
222 ftrace_trace_function = func; 258
223#else 259 if (!core_kernel_data((unsigned long)ops))
224 __ftrace_trace_function = func; 260 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
225#endif 261
226 } 262 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
227 } 263 int first = ftrace_global_list == &ftrace_list_end;
264 add_ftrace_ops(&ftrace_global_list, ops);
265 ops->flags |= FTRACE_OPS_FL_ENABLED;
266 if (first)
267 add_ftrace_ops(&ftrace_ops_list, &global_ops);
268 } else
269 add_ftrace_ops(&ftrace_ops_list, ops);
270
271 if (ftrace_enabled)
272 update_ftrace_function();
228 273
229 return 0; 274 return 0;
230} 275}
231 276
232static void ftrace_update_pid_func(void) 277static int __unregister_ftrace_function(struct ftrace_ops *ops)
233{ 278{
234 ftrace_func_t func; 279 int ret;
235 280
236 if (ftrace_trace_function == ftrace_stub) 281 if (ftrace_disabled)
237 return; 282 return -ENODEV;
238 283
239#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 284 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
240 func = ftrace_trace_function; 285 return -EBUSY;
241#else
242 func = __ftrace_trace_function;
243#endif
244 286
245 if (!list_empty(&ftrace_pids)) { 287 if (FTRACE_WARN_ON(ops == &global_ops))
246 set_ftrace_pid_function(func); 288 return -EINVAL;
247 func = ftrace_pid_func;
248 } else {
249 if (func == ftrace_pid_func)
250 func = ftrace_pid_function;
251 }
252 289
253#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 290 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
254 ftrace_trace_function = func; 291 ret = remove_ftrace_ops(&ftrace_global_list, ops);
255#else 292 if (!ret && ftrace_global_list == &ftrace_list_end)
256 __ftrace_trace_function = func; 293 ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops);
257#endif 294 if (!ret)
295 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
296 } else
297 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
298
299 if (ret < 0)
300 return ret;
301
302 if (ftrace_enabled)
303 update_ftrace_function();
304
305 /*
306 * Dynamic ops may be freed, we must make sure that all
307 * callers are done before leaving this function.
308 */
309 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
310 synchronize_sched();
311
312 return 0;
313}
314
315static void ftrace_update_pid_func(void)
316{
317 /* Only do something if we are tracing something */
318 if (ftrace_trace_function == ftrace_stub)
319 return;
320
321 update_ftrace_function();
258} 322}
259 323
260#ifdef CONFIG_FUNCTION_PROFILER 324#ifdef CONFIG_FUNCTION_PROFILER
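Editor's note: a sketch of the two registration flavors under the new scheme, assuming the FTRACE_OPS_FL_GLOBAL flag added alongside this patch in linux/ftrace.h; the ops and callback names are invented. A GLOBAL ops shares the global filter hashes and is funneled through global_ops on ftrace_ops_list, while a plain ops is linked into ftrace_ops_list directly and keeps its own hashes.

        #include <linux/ftrace.h>

        static void my_private_callback(unsigned long ip, unsigned long parent_ip)
        {
                /* hit only for locations selected by my_private_ops' own hashes */
        }

        static void my_global_callback(unsigned long ip, unsigned long parent_ip)
        {
                /* hit for locations selected by the shared global filter hashes */
        }

        static struct ftrace_ops my_private_ops = {
                .func   = my_private_callback,
        };

        static struct ftrace_ops my_global_ops = {
                .func   = my_global_callback,
                .flags  = FTRACE_OPS_FL_GLOBAL,
        };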
@@ -888,8 +952,35 @@ enum {
888 FTRACE_START_FUNC_RET = (1 << 3), 952 FTRACE_START_FUNC_RET = (1 << 3),
889 FTRACE_STOP_FUNC_RET = (1 << 4), 953 FTRACE_STOP_FUNC_RET = (1 << 4),
890}; 954};
955struct ftrace_func_entry {
956 struct hlist_node hlist;
957 unsigned long ip;
958};
959
960struct ftrace_hash {
961 unsigned long size_bits;
962 struct hlist_head *buckets;
963 unsigned long count;
964 struct rcu_head rcu;
965};
891 966
892static int ftrace_filtered; 967/*
968 * We make these constant because no one should touch them,
969 * but they are used as the default "empty hash", to avoid allocating
970 * it all the time. These are in a read only section such that if
971 * anyone does try to modify it, it will cause an exception.
972 */
973static const struct hlist_head empty_buckets[1];
974static const struct ftrace_hash empty_hash = {
975 .buckets = (struct hlist_head *)empty_buckets,
976};
977#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
978
979static struct ftrace_ops global_ops = {
980 .func = ftrace_stub,
981 .notrace_hash = EMPTY_HASH,
982 .filter_hash = EMPTY_HASH,
983};
893 984
894static struct dyn_ftrace *ftrace_new_addrs; 985static struct dyn_ftrace *ftrace_new_addrs;
895 986
@@ -912,6 +1003,269 @@ static struct ftrace_page *ftrace_pages;
912 1003
913static struct dyn_ftrace *ftrace_free_records; 1004static struct dyn_ftrace *ftrace_free_records;
914 1005
1006static struct ftrace_func_entry *
1007ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1008{
1009 unsigned long key;
1010 struct ftrace_func_entry *entry;
1011 struct hlist_head *hhd;
1012 struct hlist_node *n;
1013
1014 if (!hash->count)
1015 return NULL;
1016
1017 if (hash->size_bits > 0)
1018 key = hash_long(ip, hash->size_bits);
1019 else
1020 key = 0;
1021
1022 hhd = &hash->buckets[key];
1023
1024 hlist_for_each_entry_rcu(entry, n, hhd, hlist) {
1025 if (entry->ip == ip)
1026 return entry;
1027 }
1028 return NULL;
1029}
1030
1031static void __add_hash_entry(struct ftrace_hash *hash,
1032 struct ftrace_func_entry *entry)
1033{
1034 struct hlist_head *hhd;
1035 unsigned long key;
1036
1037 if (hash->size_bits)
1038 key = hash_long(entry->ip, hash->size_bits);
1039 else
1040 key = 0;
1041
1042 hhd = &hash->buckets[key];
1043 hlist_add_head(&entry->hlist, hhd);
1044 hash->count++;
1045}
1046
1047static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
1048{
1049 struct ftrace_func_entry *entry;
1050
1051 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
1052 if (!entry)
1053 return -ENOMEM;
1054
1055 entry->ip = ip;
1056 __add_hash_entry(hash, entry);
1057
1058 return 0;
1059}
1060
1061static void
1062free_hash_entry(struct ftrace_hash *hash,
1063 struct ftrace_func_entry *entry)
1064{
1065 hlist_del(&entry->hlist);
1066 kfree(entry);
1067 hash->count--;
1068}
1069
1070static void
1071remove_hash_entry(struct ftrace_hash *hash,
1072 struct ftrace_func_entry *entry)
1073{
1074 hlist_del(&entry->hlist);
1075 hash->count--;
1076}
1077
1078static void ftrace_hash_clear(struct ftrace_hash *hash)
1079{
1080 struct hlist_head *hhd;
1081 struct hlist_node *tp, *tn;
1082 struct ftrace_func_entry *entry;
1083 int size = 1 << hash->size_bits;
1084 int i;
1085
1086 if (!hash->count)
1087 return;
1088
1089 for (i = 0; i < size; i++) {
1090 hhd = &hash->buckets[i];
1091 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist)
1092 free_hash_entry(hash, entry);
1093 }
1094 FTRACE_WARN_ON(hash->count);
1095}
1096
1097static void free_ftrace_hash(struct ftrace_hash *hash)
1098{
1099 if (!hash || hash == EMPTY_HASH)
1100 return;
1101 ftrace_hash_clear(hash);
1102 kfree(hash->buckets);
1103 kfree(hash);
1104}
1105
1106static void __free_ftrace_hash_rcu(struct rcu_head *rcu)
1107{
1108 struct ftrace_hash *hash;
1109
1110 hash = container_of(rcu, struct ftrace_hash, rcu);
1111 free_ftrace_hash(hash);
1112}
1113
1114static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1115{
1116 if (!hash || hash == EMPTY_HASH)
1117 return;
1118 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
1119}
1120
1121static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1122{
1123 struct ftrace_hash *hash;
1124 int size;
1125
1126 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
1127 if (!hash)
1128 return NULL;
1129
1130 size = 1 << size_bits;
1131 hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL);
1132
1133 if (!hash->buckets) {
1134 kfree(hash);
1135 return NULL;
1136 }
1137
1138 hash->size_bits = size_bits;
1139
1140 return hash;
1141}
1142
1143static struct ftrace_hash *
1144alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1145{
1146 struct ftrace_func_entry *entry;
1147 struct ftrace_hash *new_hash;
1148 struct hlist_node *tp;
1149 int size;
1150 int ret;
1151 int i;
1152
1153 new_hash = alloc_ftrace_hash(size_bits);
1154 if (!new_hash)
1155 return NULL;
1156
1157 /* Empty hash? */
1158 if (!hash || !hash->count)
1159 return new_hash;
1160
1161 size = 1 << hash->size_bits;
1162 for (i = 0; i < size; i++) {
1163 hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) {
1164 ret = add_hash_entry(new_hash, entry->ip);
1165 if (ret < 0)
1166 goto free_hash;
1167 }
1168 }
1169
1170 FTRACE_WARN_ON(new_hash->count != hash->count);
1171
1172 return new_hash;
1173
1174 free_hash:
1175 free_ftrace_hash(new_hash);
1176 return NULL;
1177}
1178
1179static int
1180ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1181{
1182 struct ftrace_func_entry *entry;
1183 struct hlist_node *tp, *tn;
1184 struct hlist_head *hhd;
1185 struct ftrace_hash *old_hash;
1186 struct ftrace_hash *new_hash;
1187 unsigned long key;
1188 int size = src->count;
1189 int bits = 0;
1190 int i;
1191
1192 /*
1193 * If the new source is empty, just free dst and assign it
1194 * the empty_hash.
1195 */
1196 if (!src->count) {
1197 free_ftrace_hash_rcu(*dst);
1198 rcu_assign_pointer(*dst, EMPTY_HASH);
1199 return 0;
1200 }
1201
1202 /*
1203 * Make the hash size about 1/2 the # found
1204 */
1205 for (size /= 2; size; size >>= 1)
1206 bits++;
1207
1208 /* Don't allocate too much */
1209 if (bits > FTRACE_HASH_MAX_BITS)
1210 bits = FTRACE_HASH_MAX_BITS;
1211
1212 new_hash = alloc_ftrace_hash(bits);
1213 if (!new_hash)
1214 return -ENOMEM;
1215
1216 size = 1 << src->size_bits;
1217 for (i = 0; i < size; i++) {
1218 hhd = &src->buckets[i];
1219 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) {
1220 if (bits > 0)
1221 key = hash_long(entry->ip, bits);
1222 else
1223 key = 0;
1224 remove_hash_entry(src, entry);
1225 __add_hash_entry(new_hash, entry);
1226 }
1227 }
1228
1229 old_hash = *dst;
1230 rcu_assign_pointer(*dst, new_hash);
1231 free_ftrace_hash_rcu(old_hash);
1232
1233 return 0;
1234}
1235
1236/*
1237 * Test the hashes for this ops to see if we want to call
1238 * the ops->func or not.
1239 *
1240 * It's a match if the ip is in the ops->filter_hash or
1241 * the filter_hash does not exist or is empty,
1242 * AND
1243 * the ip is not in the ops->notrace_hash.
1244 *
1245 * This needs to be called with preemption disabled as
1246 * the hashes are freed with call_rcu_sched().
1247 */
1248static int
1249ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1250{
1251 struct ftrace_hash *filter_hash;
1252 struct ftrace_hash *notrace_hash;
1253 int ret;
1254
1255 filter_hash = rcu_dereference_raw(ops->filter_hash);
1256 notrace_hash = rcu_dereference_raw(ops->notrace_hash);
1257
1258 if ((!filter_hash || !filter_hash->count ||
1259 ftrace_lookup_ip(filter_hash, ip)) &&
1260 (!notrace_hash || !notrace_hash->count ||
1261 !ftrace_lookup_ip(notrace_hash, ip)))
1262 ret = 1;
1263 else
1264 ret = 0;
1265
1266 return ret;
1267}
1268
915/* 1269/*
916 * This is a double for. Do not use 'break' to break out of the loop, 1270 * This is a double for. Do not use 'break' to break out of the loop,
917 * you must use a goto. 1271 * you must use a goto.
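Editor's note: a minimal sketch of how these hashes are meant to be consulted by a list-walking callback; the real ftrace_ops_list_func() is declared earlier but its body is not shown in this hunk, so example_ops_list_func below is only illustrative. Preemption stays disabled because the hashes are freed with call_rcu_sched().

        static void example_ops_list_func(unsigned long ip, unsigned long parent_ip)
        {
                struct ftrace_ops *op;

                preempt_disable_notrace();
                op = rcu_dereference_raw(ftrace_ops_list);
                while (op != &ftrace_list_end) {
                        /* call the ops only if its filter/notrace hashes allow it */
                        if (ftrace_ops_test(op, ip))
                                op->func(ip, parent_ip);
                        op = rcu_dereference_raw(op->next);
                }
                preempt_enable_notrace();
        }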
@@ -926,6 +1280,105 @@ static struct dyn_ftrace *ftrace_free_records;
926 } \ 1280 } \
927 } 1281 }
928 1282
1283static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1284 int filter_hash,
1285 bool inc)
1286{
1287 struct ftrace_hash *hash;
1288 struct ftrace_hash *other_hash;
1289 struct ftrace_page *pg;
1290 struct dyn_ftrace *rec;
1291 int count = 0;
1292 int all = 0;
1293
1294 /* Only update if the ops has been registered */
1295 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
1296 return;
1297
1298 /*
1299 * In the filter_hash case:
1300 * If the count is zero, we update all records.
1301 * Otherwise we just update the items in the hash.
1302 *
1303 * In the notrace_hash case:
1304 * We enable the update in the hash.
1305 * As disabling notrace means enabling the tracing,
1306 * and enabling notrace means disabling, the inc variable
1307 * gets inversed.
1308 */
1309 if (filter_hash) {
1310 hash = ops->filter_hash;
1311 other_hash = ops->notrace_hash;
1312 if (!hash || !hash->count)
1313 all = 1;
1314 } else {
1315 inc = !inc;
1316 hash = ops->notrace_hash;
1317 other_hash = ops->filter_hash;
1318 /*
1319 * If the notrace hash has no items,
1320 * then there's nothing to do.
1321 */
1322 if (hash && !hash->count)
1323 return;
1324 }
1325
1326 do_for_each_ftrace_rec(pg, rec) {
1327 int in_other_hash = 0;
1328 int in_hash = 0;
1329 int match = 0;
1330
1331 if (all) {
1332 /*
1333 * Only the filter_hash affects all records.
1334 * Update if the record is not in the notrace hash.
1335 */
1336 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
1337 match = 1;
1338 } else {
1339 in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip);
1340 in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip);
1341
1342 /*
1343	 * Update the record only when this ops's own hashes select it.
1344 */
1345 if (filter_hash && in_hash && !in_other_hash)
1346 match = 1;
1347 else if (!filter_hash && in_hash &&
1348 (in_other_hash || !other_hash->count))
1349 match = 1;
1350 }
1351 if (!match)
1352 continue;
1353
1354 if (inc) {
1355 rec->flags++;
1356 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
1357 return;
1358 } else {
1359 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
1360 return;
1361 rec->flags--;
1362 }
1363 count++;
1364 /* Shortcut, if we handled all records, we are done. */
1365 if (!all && count == hash->count)
1366 return;
1367 } while_for_each_ftrace_rec();
1368}
1369
1370static void ftrace_hash_rec_disable(struct ftrace_ops *ops,
1371 int filter_hash)
1372{
1373 __ftrace_hash_rec_update(ops, filter_hash, 0);
1374}
1375
1376static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1377 int filter_hash)
1378{
1379 __ftrace_hash_rec_update(ops, filter_hash, 1);
1380}
1381
929static void ftrace_free_rec(struct dyn_ftrace *rec) 1382static void ftrace_free_rec(struct dyn_ftrace *rec)
930{ 1383{
931 rec->freelist = ftrace_free_records; 1384 rec->freelist = ftrace_free_records;
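Editor's note: a worked example of the reference counting above, assuming two hypothetical ops that are both registered/enabled and filter on the same function. Each enable bumps the matching record's reference bits in rec->flags, so the call site stays patched until the last user is gone.

        static void example_two_filter_users(struct ftrace_ops *ops_a,
                                             struct ftrace_ops *ops_b)
        {
                ftrace_hash_rec_enable(ops_a, 1);       /* rec refcount 0 -> 1 */
                ftrace_hash_rec_enable(ops_b, 1);       /* rec refcount 1 -> 2 */
                ftrace_hash_rec_disable(ops_a, 1);      /* 2 -> 1, still traced */
                ftrace_hash_rec_disable(ops_b, 1);      /* 1 -> 0, can be NOPed */
        }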
@@ -1047,18 +1500,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1047 ftrace_addr = (unsigned long)FTRACE_ADDR; 1500 ftrace_addr = (unsigned long)FTRACE_ADDR;
1048 1501
1049 /* 1502 /*
1050 * If this record is not to be traced or we want to disable it, 1503 * If we are enabling tracing:
1051 * then disable it.
1052 * 1504 *
1053 * If we want to enable it and filtering is off, then enable it. 1505 * If the record has a ref count, then we need to enable it
1506 * because someone is using it.
1054 * 1507 *
1055 * If we want to enable it and filtering is on, enable it only if 1508 * Otherwise we make sure its disabled.
1056 * it's filtered 1509 *
1510 * If we are disabling tracing, then disable all records that
1511 * are enabled.
1057 */ 1512 */
1058 if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { 1513 if (enable && (rec->flags & ~FTRACE_FL_MASK))
1059 if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) 1514 flag = FTRACE_FL_ENABLED;
1060 flag = FTRACE_FL_ENABLED;
1061 }
1062 1515
1063 /* If the state of this record hasn't changed, then do nothing */ 1516 /* If the state of this record hasn't changed, then do nothing */
1064 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1517 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
@@ -1079,19 +1532,16 @@ static void ftrace_replace_code(int enable)
1079 struct ftrace_page *pg; 1532 struct ftrace_page *pg;
1080 int failed; 1533 int failed;
1081 1534
1535 if (unlikely(ftrace_disabled))
1536 return;
1537
1082 do_for_each_ftrace_rec(pg, rec) { 1538 do_for_each_ftrace_rec(pg, rec) {
1083 /* 1539 /* Skip over free records */
1084 * Skip over free records, records that have 1540 if (rec->flags & FTRACE_FL_FREE)
1085 * failed and not converted.
1086 */
1087 if (rec->flags & FTRACE_FL_FREE ||
1088 rec->flags & FTRACE_FL_FAILED ||
1089 !(rec->flags & FTRACE_FL_CONVERTED))
1090 continue; 1541 continue;
1091 1542
1092 failed = __ftrace_replace_code(rec, enable); 1543 failed = __ftrace_replace_code(rec, enable);
1093 if (failed) { 1544 if (failed) {
1094 rec->flags |= FTRACE_FL_FAILED;
1095 ftrace_bug(failed, rec->ip); 1545 ftrace_bug(failed, rec->ip);
1096 /* Stop processing */ 1546 /* Stop processing */
1097 return; 1547 return;
@@ -1107,10 +1557,12 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
1107 1557
1108 ip = rec->ip; 1558 ip = rec->ip;
1109 1559
1560 if (unlikely(ftrace_disabled))
1561 return 0;
1562
1110 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); 1563 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
1111 if (ret) { 1564 if (ret) {
1112 ftrace_bug(ret, ip); 1565 ftrace_bug(ret, ip);
1113 rec->flags |= FTRACE_FL_FAILED;
1114 return 0; 1566 return 0;
1115 } 1567 }
1116 return 1; 1568 return 1;
@@ -1171,6 +1623,7 @@ static void ftrace_run_update_code(int command)
1171 1623
1172static ftrace_func_t saved_ftrace_func; 1624static ftrace_func_t saved_ftrace_func;
1173static int ftrace_start_up; 1625static int ftrace_start_up;
1626static int global_start_up;
1174 1627
1175static void ftrace_startup_enable(int command) 1628static void ftrace_startup_enable(int command)
1176{ 1629{
@@ -1185,19 +1638,36 @@ static void ftrace_startup_enable(int command)
1185 ftrace_run_update_code(command); 1638 ftrace_run_update_code(command);
1186} 1639}
1187 1640
1188static void ftrace_startup(int command) 1641static void ftrace_startup(struct ftrace_ops *ops, int command)
1189{ 1642{
1643 bool hash_enable = true;
1644
1190 if (unlikely(ftrace_disabled)) 1645 if (unlikely(ftrace_disabled))
1191 return; 1646 return;
1192 1647
1193 ftrace_start_up++; 1648 ftrace_start_up++;
1194 command |= FTRACE_ENABLE_CALLS; 1649 command |= FTRACE_ENABLE_CALLS;
1195 1650
1651 /* ops marked global share the filter hashes */
1652 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
1653 ops = &global_ops;
1654 /* Don't update hash if global is already set */
1655 if (global_start_up)
1656 hash_enable = false;
1657 global_start_up++;
1658 }
1659
1660 ops->flags |= FTRACE_OPS_FL_ENABLED;
1661 if (hash_enable)
1662 ftrace_hash_rec_enable(ops, 1);
1663
1196 ftrace_startup_enable(command); 1664 ftrace_startup_enable(command);
1197} 1665}
1198 1666
1199static void ftrace_shutdown(int command) 1667static void ftrace_shutdown(struct ftrace_ops *ops, int command)
1200{ 1668{
1669 bool hash_disable = true;
1670
1201 if (unlikely(ftrace_disabled)) 1671 if (unlikely(ftrace_disabled))
1202 return; 1672 return;
1203 1673
@@ -1209,6 +1679,23 @@ static void ftrace_shutdown(int command)
1209 */ 1679 */
1210 WARN_ON_ONCE(ftrace_start_up < 0); 1680 WARN_ON_ONCE(ftrace_start_up < 0);
1211 1681
1682 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
1683 ops = &global_ops;
1684 global_start_up--;
1685 WARN_ON_ONCE(global_start_up < 0);
1686 /* Don't update hash if global still has users */
1687 if (global_start_up) {
1688 WARN_ON_ONCE(!ftrace_start_up);
1689 hash_disable = false;
1690 }
1691 }
1692
1693 if (hash_disable)
1694 ftrace_hash_rec_disable(ops, 1);
1695
1696 if (ops != &global_ops || !global_start_up)
1697 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
1698
1212 if (!ftrace_start_up) 1699 if (!ftrace_start_up)
1213 command |= FTRACE_DISABLE_CALLS; 1700 command |= FTRACE_DISABLE_CALLS;
1214 1701
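Editor's note: a sketch of the intended nesting, assuming both hypothetical ops carry FTRACE_OPS_FL_GLOBAL. Only the first startup and the last shutdown touch the shared hash accounting, tracked by global_start_up.

        static void example_global_nesting(struct ftrace_ops *a, struct ftrace_ops *b)
        {
                ftrace_startup(a, 0);   /* global_start_up 0 -> 1, hash recs enabled  */
                ftrace_startup(b, 0);   /* global_start_up 1 -> 2, hash left alone    */
                ftrace_shutdown(b, 0);  /* global_start_up 2 -> 1, hash left alone    */
                ftrace_shutdown(a, 0);  /* global_start_up 1 -> 0, hash recs disabled */
        }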
@@ -1268,15 +1755,15 @@ static int ftrace_update_code(struct module *mod)
1268 p->flags = 0L; 1755 p->flags = 0L;
1269 1756
1270 /* 1757 /*
1271 * Do the initial record convertion from mcount jump 1758 * Do the initial record conversion from mcount jump
1272 * to the NOP instructions. 1759 * to the NOP instructions.
1273 */ 1760 */
1274 if (!ftrace_code_disable(mod, p)) { 1761 if (!ftrace_code_disable(mod, p)) {
1275 ftrace_free_rec(p); 1762 ftrace_free_rec(p);
1276 continue; 1763 /* Game over */
1764 break;
1277 } 1765 }
1278 1766
1279 p->flags |= FTRACE_FL_CONVERTED;
1280 ftrace_update_cnt++; 1767 ftrace_update_cnt++;
1281 1768
1282 /* 1769 /*
@@ -1351,9 +1838,9 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1351enum { 1838enum {
1352 FTRACE_ITER_FILTER = (1 << 0), 1839 FTRACE_ITER_FILTER = (1 << 0),
1353 FTRACE_ITER_NOTRACE = (1 << 1), 1840 FTRACE_ITER_NOTRACE = (1 << 1),
1354 FTRACE_ITER_FAILURES = (1 << 2), 1841 FTRACE_ITER_PRINTALL = (1 << 2),
1355 FTRACE_ITER_PRINTALL = (1 << 3), 1842 FTRACE_ITER_HASH = (1 << 3),
1356 FTRACE_ITER_HASH = (1 << 4), 1843 FTRACE_ITER_ENABLED = (1 << 4),
1357}; 1844};
1358 1845
1359#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1846#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1365,6 +1852,8 @@ struct ftrace_iterator {
1365 struct dyn_ftrace *func; 1852 struct dyn_ftrace *func;
1366 struct ftrace_func_probe *probe; 1853 struct ftrace_func_probe *probe;
1367 struct trace_parser parser; 1854 struct trace_parser parser;
1855 struct ftrace_hash *hash;
1856 struct ftrace_ops *ops;
1368 int hidx; 1857 int hidx;
1369 int idx; 1858 int idx;
1370 unsigned flags; 1859 unsigned flags;
@@ -1461,13 +1950,17 @@ static void *
1461t_next(struct seq_file *m, void *v, loff_t *pos) 1950t_next(struct seq_file *m, void *v, loff_t *pos)
1462{ 1951{
1463 struct ftrace_iterator *iter = m->private; 1952 struct ftrace_iterator *iter = m->private;
1953 struct ftrace_ops *ops = &global_ops;
1464 struct dyn_ftrace *rec = NULL; 1954 struct dyn_ftrace *rec = NULL;
1465 1955
1956 if (unlikely(ftrace_disabled))
1957 return NULL;
1958
1466 if (iter->flags & FTRACE_ITER_HASH) 1959 if (iter->flags & FTRACE_ITER_HASH)
1467 return t_hash_next(m, pos); 1960 return t_hash_next(m, pos);
1468 1961
1469 (*pos)++; 1962 (*pos)++;
1470 iter->pos = *pos; 1963 iter->pos = iter->func_pos = *pos;
1471 1964
1472 if (iter->flags & FTRACE_ITER_PRINTALL) 1965 if (iter->flags & FTRACE_ITER_PRINTALL)
1473 return t_hash_start(m, pos); 1966 return t_hash_start(m, pos);
@@ -1483,17 +1976,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1483 rec = &iter->pg->records[iter->idx++]; 1976 rec = &iter->pg->records[iter->idx++];
1484 if ((rec->flags & FTRACE_FL_FREE) || 1977 if ((rec->flags & FTRACE_FL_FREE) ||
1485 1978
1486 (!(iter->flags & FTRACE_ITER_FAILURES) &&
1487 (rec->flags & FTRACE_FL_FAILED)) ||
1488
1489 ((iter->flags & FTRACE_ITER_FAILURES) &&
1490 !(rec->flags & FTRACE_FL_FAILED)) ||
1491
1492 ((iter->flags & FTRACE_ITER_FILTER) && 1979 ((iter->flags & FTRACE_ITER_FILTER) &&
1493 !(rec->flags & FTRACE_FL_FILTER)) || 1980 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
1494 1981
1495 ((iter->flags & FTRACE_ITER_NOTRACE) && 1982 ((iter->flags & FTRACE_ITER_NOTRACE) &&
1496 !(rec->flags & FTRACE_FL_NOTRACE))) { 1983 !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||
1984
1985 ((iter->flags & FTRACE_ITER_ENABLED) &&
1986 !(rec->flags & ~FTRACE_FL_MASK))) {
1987
1497 rec = NULL; 1988 rec = NULL;
1498 goto retry; 1989 goto retry;
1499 } 1990 }
@@ -1502,7 +1993,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1502 if (!rec) 1993 if (!rec)
1503 return t_hash_start(m, pos); 1994 return t_hash_start(m, pos);
1504 1995
1505 iter->func_pos = *pos;
1506 iter->func = rec; 1996 iter->func = rec;
1507 1997
1508 return iter; 1998 return iter;
@@ -1518,10 +2008,15 @@ static void reset_iter_read(struct ftrace_iterator *iter)
1518static void *t_start(struct seq_file *m, loff_t *pos) 2008static void *t_start(struct seq_file *m, loff_t *pos)
1519{ 2009{
1520 struct ftrace_iterator *iter = m->private; 2010 struct ftrace_iterator *iter = m->private;
2011 struct ftrace_ops *ops = &global_ops;
1521 void *p = NULL; 2012 void *p = NULL;
1522 loff_t l; 2013 loff_t l;
1523 2014
1524 mutex_lock(&ftrace_lock); 2015 mutex_lock(&ftrace_lock);
2016
2017 if (unlikely(ftrace_disabled))
2018 return NULL;
2019
1525 /* 2020 /*
1526 * If an lseek was done, then reset and start from beginning. 2021 * If an lseek was done, then reset and start from beginning.
1527 */ 2022 */
@@ -1533,7 +2028,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1533 * off, we can short cut and just print out that all 2028 * off, we can short cut and just print out that all
1534 * functions are enabled. 2029 * functions are enabled.
1535 */ 2030 */
1536 if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { 2031 if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) {
1537 if (*pos > 0) 2032 if (*pos > 0)
1538 return t_hash_start(m, pos); 2033 return t_hash_start(m, pos);
1539 iter->flags |= FTRACE_ITER_PRINTALL; 2034 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -1591,7 +2086,11 @@ static int t_show(struct seq_file *m, void *v)
1591 if (!rec) 2086 if (!rec)
1592 return 0; 2087 return 0;
1593 2088
1594 seq_printf(m, "%ps\n", (void *)rec->ip); 2089 seq_printf(m, "%ps", (void *)rec->ip);
2090 if (iter->flags & FTRACE_ITER_ENABLED)
2091 seq_printf(m, " (%ld)",
2092 rec->flags & ~FTRACE_FL_MASK);
2093 seq_printf(m, "\n");
1595 2094
1596 return 0; 2095 return 0;
1597} 2096}
@@ -1631,44 +2130,46 @@ ftrace_avail_open(struct inode *inode, struct file *file)
1631} 2130}
1632 2131
1633static int 2132static int
1634ftrace_failures_open(struct inode *inode, struct file *file) 2133ftrace_enabled_open(struct inode *inode, struct file *file)
1635{ 2134{
1636 int ret;
1637 struct seq_file *m;
1638 struct ftrace_iterator *iter; 2135 struct ftrace_iterator *iter;
2136 int ret;
1639 2137
1640 ret = ftrace_avail_open(inode, file); 2138 if (unlikely(ftrace_disabled))
2139 return -ENODEV;
2140
2141 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2142 if (!iter)
2143 return -ENOMEM;
2144
2145 iter->pg = ftrace_pages_start;
2146 iter->flags = FTRACE_ITER_ENABLED;
2147
2148 ret = seq_open(file, &show_ftrace_seq_ops);
1641 if (!ret) { 2149 if (!ret) {
1642 m = file->private_data; 2150 struct seq_file *m = file->private_data;
1643 iter = m->private; 2151
1644 iter->flags = FTRACE_ITER_FAILURES; 2152 m->private = iter;
2153 } else {
2154 kfree(iter);
1645 } 2155 }
1646 2156
1647 return ret; 2157 return ret;
1648} 2158}
1649 2159
1650 2160static void ftrace_filter_reset(struct ftrace_hash *hash)
1651static void ftrace_filter_reset(int enable)
1652{ 2161{
1653 struct ftrace_page *pg;
1654 struct dyn_ftrace *rec;
1655 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1656
1657 mutex_lock(&ftrace_lock); 2162 mutex_lock(&ftrace_lock);
1658 if (enable) 2163 ftrace_hash_clear(hash);
1659 ftrace_filtered = 0;
1660 do_for_each_ftrace_rec(pg, rec) {
1661 if (rec->flags & FTRACE_FL_FAILED)
1662 continue;
1663 rec->flags &= ~type;
1664 } while_for_each_ftrace_rec();
1665 mutex_unlock(&ftrace_lock); 2164 mutex_unlock(&ftrace_lock);
1666} 2165}
1667 2166
1668static int 2167static int
1669ftrace_regex_open(struct inode *inode, struct file *file, int enable) 2168ftrace_regex_open(struct ftrace_ops *ops, int flag,
2169 struct inode *inode, struct file *file)
1670{ 2170{
1671 struct ftrace_iterator *iter; 2171 struct ftrace_iterator *iter;
2172 struct ftrace_hash *hash;
1672 int ret = 0; 2173 int ret = 0;
1673 2174
1674 if (unlikely(ftrace_disabled)) 2175 if (unlikely(ftrace_disabled))
@@ -1683,21 +2184,42 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1683 return -ENOMEM; 2184 return -ENOMEM;
1684 } 2185 }
1685 2186
2187 if (flag & FTRACE_ITER_NOTRACE)
2188 hash = ops->notrace_hash;
2189 else
2190 hash = ops->filter_hash;
2191
2192 iter->ops = ops;
2193 iter->flags = flag;
2194
2195 if (file->f_mode & FMODE_WRITE) {
2196 mutex_lock(&ftrace_lock);
2197 iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
2198 mutex_unlock(&ftrace_lock);
2199
2200 if (!iter->hash) {
2201 trace_parser_put(&iter->parser);
2202 kfree(iter);
2203 return -ENOMEM;
2204 }
2205 }
2206
1686 mutex_lock(&ftrace_regex_lock); 2207 mutex_lock(&ftrace_regex_lock);
2208
1687 if ((file->f_mode & FMODE_WRITE) && 2209 if ((file->f_mode & FMODE_WRITE) &&
1688 (file->f_flags & O_TRUNC)) 2210 (file->f_flags & O_TRUNC))
1689 ftrace_filter_reset(enable); 2211 ftrace_filter_reset(iter->hash);
1690 2212
1691 if (file->f_mode & FMODE_READ) { 2213 if (file->f_mode & FMODE_READ) {
1692 iter->pg = ftrace_pages_start; 2214 iter->pg = ftrace_pages_start;
1693 iter->flags = enable ? FTRACE_ITER_FILTER :
1694 FTRACE_ITER_NOTRACE;
1695 2215
1696 ret = seq_open(file, &show_ftrace_seq_ops); 2216 ret = seq_open(file, &show_ftrace_seq_ops);
1697 if (!ret) { 2217 if (!ret) {
1698 struct seq_file *m = file->private_data; 2218 struct seq_file *m = file->private_data;
1699 m->private = iter; 2219 m->private = iter;
1700 } else { 2220 } else {
2221 /* Failed */
2222 free_ftrace_hash(iter->hash);
1701 trace_parser_put(&iter->parser); 2223 trace_parser_put(&iter->parser);
1702 kfree(iter); 2224 kfree(iter);
1703 } 2225 }
@@ -1711,13 +2233,15 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1711static int 2233static int
1712ftrace_filter_open(struct inode *inode, struct file *file) 2234ftrace_filter_open(struct inode *inode, struct file *file)
1713{ 2235{
1714 return ftrace_regex_open(inode, file, 1); 2236 return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER,
2237 inode, file);
1715} 2238}
1716 2239
1717static int 2240static int
1718ftrace_notrace_open(struct inode *inode, struct file *file) 2241ftrace_notrace_open(struct inode *inode, struct file *file)
1719{ 2242{
1720 return ftrace_regex_open(inode, file, 0); 2243 return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE,
2244 inode, file);
1721} 2245}
1722 2246
1723static loff_t 2247static loff_t
@@ -1762,86 +2286,99 @@ static int ftrace_match(char *str, char *regex, int len, int type)
1762} 2286}
1763 2287
1764static int 2288static int
1765ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) 2289enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not)
2290{
2291 struct ftrace_func_entry *entry;
2292 int ret = 0;
2293
2294 entry = ftrace_lookup_ip(hash, rec->ip);
2295 if (not) {
2296 /* Do nothing if it doesn't exist */
2297 if (!entry)
2298 return 0;
2299
2300 free_hash_entry(hash, entry);
2301 } else {
2302 /* Do nothing if it exists */
2303 if (entry)
2304 return 0;
2305
2306 ret = add_hash_entry(hash, rec->ip);
2307 }
2308 return ret;
2309}
2310
2311static int
2312ftrace_match_record(struct dyn_ftrace *rec, char *mod,
2313 char *regex, int len, int type)
1766{ 2314{
1767 char str[KSYM_SYMBOL_LEN]; 2315 char str[KSYM_SYMBOL_LEN];
2316 char *modname;
2317
2318 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
2319
2320 if (mod) {
2321 /* module lookup requires matching the module */
2322 if (!modname || strcmp(modname, mod))
2323 return 0;
2324
2325 /* blank search means to match all funcs in the mod */
2326 if (!len)
2327 return 1;
2328 }
1768 2329
1769 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1770 return ftrace_match(str, regex, len, type); 2330 return ftrace_match(str, regex, len, type);
1771} 2331}
1772 2332
1773static int ftrace_match_records(char *buff, int len, int enable) 2333static int
2334match_records(struct ftrace_hash *hash, char *buff,
2335 int len, char *mod, int not)
1774{ 2336{
1775 unsigned int search_len; 2337 unsigned search_len = 0;
1776 struct ftrace_page *pg; 2338 struct ftrace_page *pg;
1777 struct dyn_ftrace *rec; 2339 struct dyn_ftrace *rec;
1778 unsigned long flag; 2340 int type = MATCH_FULL;
1779 char *search; 2341 char *search = buff;
1780 int type;
1781 int not;
1782 int found = 0; 2342 int found = 0;
2343 int ret;
1783 2344
1784 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 2345 if (len) {
1785 type = filter_parse_regex(buff, len, &search, &not); 2346 type = filter_parse_regex(buff, len, &search, &not);
1786 2347 search_len = strlen(search);
1787 search_len = strlen(search); 2348 }
1788 2349
1789 mutex_lock(&ftrace_lock); 2350 mutex_lock(&ftrace_lock);
1790 do_for_each_ftrace_rec(pg, rec) {
1791 2351
1792 if (rec->flags & FTRACE_FL_FAILED) 2352 if (unlikely(ftrace_disabled))
1793 continue; 2353 goto out_unlock;
1794 2354
1795 if (ftrace_match_record(rec, search, search_len, type)) { 2355 do_for_each_ftrace_rec(pg, rec) {
1796 if (not) 2356
1797 rec->flags &= ~flag; 2357 if (ftrace_match_record(rec, mod, search, search_len, type)) {
1798 else 2358 ret = enter_record(hash, rec, not);
1799 rec->flags |= flag; 2359 if (ret < 0) {
2360 found = ret;
2361 goto out_unlock;
2362 }
1800 found = 1; 2363 found = 1;
1801 } 2364 }
1802 /*
1803 * Only enable filtering if we have a function that
1804 * is filtered on.
1805 */
1806 if (enable && (rec->flags & FTRACE_FL_FILTER))
1807 ftrace_filtered = 1;
1808 } while_for_each_ftrace_rec(); 2365 } while_for_each_ftrace_rec();
2366 out_unlock:
1809 mutex_unlock(&ftrace_lock); 2367 mutex_unlock(&ftrace_lock);
1810 2368
1811 return found; 2369 return found;
1812} 2370}
1813 2371
1814static int 2372static int
1815ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, 2373ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
1816 char *regex, int len, int type)
1817{ 2374{
1818 char str[KSYM_SYMBOL_LEN]; 2375 return match_records(hash, buff, len, NULL, 0);
1819 char *modname;
1820
1821 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
1822
1823 if (!modname || strcmp(modname, mod))
1824 return 0;
1825
1826 /* blank search means to match all funcs in the mod */
1827 if (len)
1828 return ftrace_match(str, regex, len, type);
1829 else
1830 return 1;
1831} 2376}
1832 2377
1833static int ftrace_match_module_records(char *buff, char *mod, int enable) 2378static int
2379ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
1834{ 2380{
1835 unsigned search_len = 0;
1836 struct ftrace_page *pg;
1837 struct dyn_ftrace *rec;
1838 int type = MATCH_FULL;
1839 char *search = buff;
1840 unsigned long flag;
1841 int not = 0; 2381 int not = 0;
1842 int found = 0;
1843
1844 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1845 2382
1846 /* blank or '*' mean the same */ 2383 /* blank or '*' mean the same */
1847 if (strcmp(buff, "*") == 0) 2384 if (strcmp(buff, "*") == 0)
@@ -1853,32 +2390,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
1853 not = 1; 2390 not = 1;
1854 } 2391 }
1855 2392
1856 if (strlen(buff)) { 2393 return match_records(hash, buff, strlen(buff), mod, not);
1857 type = filter_parse_regex(buff, strlen(buff), &search, &not);
1858 search_len = strlen(search);
1859 }
1860
1861 mutex_lock(&ftrace_lock);
1862 do_for_each_ftrace_rec(pg, rec) {
1863
1864 if (rec->flags & FTRACE_FL_FAILED)
1865 continue;
1866
1867 if (ftrace_match_module_record(rec, mod,
1868 search, search_len, type)) {
1869 if (not)
1870 rec->flags &= ~flag;
1871 else
1872 rec->flags |= flag;
1873 found = 1;
1874 }
1875 if (enable && (rec->flags & FTRACE_FL_FILTER))
1876 ftrace_filtered = 1;
1877
1878 } while_for_each_ftrace_rec();
1879 mutex_unlock(&ftrace_lock);
1880
1881 return found;
1882} 2394}
1883 2395
1884/* 2396/*
@@ -1889,7 +2401,10 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
1889static int 2401static int
1890ftrace_mod_callback(char *func, char *cmd, char *param, int enable) 2402ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1891{ 2403{
2404 struct ftrace_ops *ops = &global_ops;
2405 struct ftrace_hash *hash;
1892 char *mod; 2406 char *mod;
2407 int ret = -EINVAL;
1893 2408
1894 /* 2409 /*
1895 * cmd == 'mod' because we only registered this func 2410 * cmd == 'mod' because we only registered this func
@@ -1901,15 +2416,24 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1901 2416
1902 /* we must have a module name */ 2417 /* we must have a module name */
1903 if (!param) 2418 if (!param)
1904 return -EINVAL; 2419 return ret;
1905 2420
1906 mod = strsep(&param, ":"); 2421 mod = strsep(&param, ":");
1907 if (!strlen(mod)) 2422 if (!strlen(mod))
1908 return -EINVAL; 2423 return ret;
1909 2424
1910 if (ftrace_match_module_records(func, mod, enable)) 2425 if (enable)
1911 return 0; 2426 hash = ops->filter_hash;
1912 return -EINVAL; 2427 else
2428 hash = ops->notrace_hash;
2429
2430 ret = ftrace_match_module_records(hash, func, mod);
2431 if (!ret)
2432 ret = -EINVAL;
2433 if (ret < 0)
2434 return ret;
2435
2436 return 0;
1913} 2437}
1914 2438
1915static struct ftrace_func_command ftrace_mod_cmd = { 2439static struct ftrace_func_command ftrace_mod_cmd = {
@@ -1960,6 +2484,7 @@ static int ftrace_probe_registered;
1960 2484
1961static void __enable_ftrace_function_probe(void) 2485static void __enable_ftrace_function_probe(void)
1962{ 2486{
2487 int ret;
1963 int i; 2488 int i;
1964 2489
1965 if (ftrace_probe_registered) 2490 if (ftrace_probe_registered)
@@ -1974,13 +2499,16 @@ static void __enable_ftrace_function_probe(void)
1974 if (i == FTRACE_FUNC_HASHSIZE) 2499 if (i == FTRACE_FUNC_HASHSIZE)
1975 return; 2500 return;
1976 2501
1977 __register_ftrace_function(&trace_probe_ops); 2502 ret = __register_ftrace_function(&trace_probe_ops);
1978 ftrace_startup(0); 2503 if (!ret)
2504 ftrace_startup(&trace_probe_ops, 0);
2505
1979 ftrace_probe_registered = 1; 2506 ftrace_probe_registered = 1;
1980} 2507}
1981 2508
1982static void __disable_ftrace_function_probe(void) 2509static void __disable_ftrace_function_probe(void)
1983{ 2510{
2511 int ret;
1984 int i; 2512 int i;
1985 2513
1986 if (!ftrace_probe_registered) 2514 if (!ftrace_probe_registered)
@@ -1993,8 +2521,10 @@ static void __disable_ftrace_function_probe(void)
1993 } 2521 }
1994 2522
1995 /* no more funcs left */ 2523 /* no more funcs left */
1996 __unregister_ftrace_function(&trace_probe_ops); 2524 ret = __unregister_ftrace_function(&trace_probe_ops);
1997 ftrace_shutdown(0); 2525 if (!ret)
2526 ftrace_shutdown(&trace_probe_ops, 0);
2527
1998 ftrace_probe_registered = 0; 2528 ftrace_probe_registered = 0;
1999} 2529}
2000 2530
@@ -2030,12 +2560,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2030 return -EINVAL; 2560 return -EINVAL;
2031 2561
2032 mutex_lock(&ftrace_lock); 2562 mutex_lock(&ftrace_lock);
2033 do_for_each_ftrace_rec(pg, rec) {
2034 2563
2035 if (rec->flags & FTRACE_FL_FAILED) 2564 if (unlikely(ftrace_disabled))
2036 continue; 2565 goto out_unlock;
2566
2567 do_for_each_ftrace_rec(pg, rec) {
2037 2568
2038 if (!ftrace_match_record(rec, search, len, type)) 2569 if (!ftrace_match_record(rec, NULL, search, len, type))
2039 continue; 2570 continue;
2040 2571
2041 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 2572 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
@@ -2196,18 +2727,22 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd)
2196 return ret; 2727 return ret;
2197} 2728}
2198 2729
2199static int ftrace_process_regex(char *buff, int len, int enable) 2730static int ftrace_process_regex(struct ftrace_hash *hash,
2731 char *buff, int len, int enable)
2200{ 2732{
2201 char *func, *command, *next = buff; 2733 char *func, *command, *next = buff;
2202 struct ftrace_func_command *p; 2734 struct ftrace_func_command *p;
2203 int ret = -EINVAL; 2735 int ret;
2204 2736
2205 func = strsep(&next, ":"); 2737 func = strsep(&next, ":");
2206 2738
2207 if (!next) { 2739 if (!next) {
2208 if (ftrace_match_records(func, len, enable)) 2740 ret = ftrace_match_records(hash, func, len);
2209 return 0; 2741 if (!ret)
2210 return ret; 2742 ret = -EINVAL;
2743 if (ret < 0)
2744 return ret;
2745 return 0;
2211 } 2746 }
2212 2747
2213 /* command found */ 2748 /* command found */
@@ -2240,6 +2775,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2240 2775
2241 mutex_lock(&ftrace_regex_lock); 2776 mutex_lock(&ftrace_regex_lock);
2242 2777
2778 ret = -ENODEV;
2779 if (unlikely(ftrace_disabled))
2780 goto out_unlock;
2781
2243 if (file->f_mode & FMODE_READ) { 2782 if (file->f_mode & FMODE_READ) {
2244 struct seq_file *m = file->private_data; 2783 struct seq_file *m = file->private_data;
2245 iter = m->private; 2784 iter = m->private;
@@ -2251,7 +2790,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2251 2790
2252 if (read >= 0 && trace_parser_loaded(parser) && 2791 if (read >= 0 && trace_parser_loaded(parser) &&
2253 !trace_parser_cont(parser)) { 2792 !trace_parser_cont(parser)) {
2254 ret = ftrace_process_regex(parser->buffer, 2793 ret = ftrace_process_regex(iter->hash, parser->buffer,
2255 parser->idx, enable); 2794 parser->idx, enable);
2256 trace_parser_clear(parser); 2795 trace_parser_clear(parser);
2257 if (ret) 2796 if (ret)
@@ -2279,22 +2818,83 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
2279 return ftrace_regex_write(file, ubuf, cnt, ppos, 0); 2818 return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
2280} 2819}
2281 2820
2282static void 2821static int
2283ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) 2822ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
2823 int reset, int enable)
2284{ 2824{
2825 struct ftrace_hash **orig_hash;
2826 struct ftrace_hash *hash;
2827 int ret;
2828
2829 /* All global ops uses the global ops filters */
2830 if (ops->flags & FTRACE_OPS_FL_GLOBAL)
2831 ops = &global_ops;
2832
2285 if (unlikely(ftrace_disabled)) 2833 if (unlikely(ftrace_disabled))
2286 return; 2834 return -ENODEV;
2835
2836 if (enable)
2837 orig_hash = &ops->filter_hash;
2838 else
2839 orig_hash = &ops->notrace_hash;
2840
2841 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
2842 if (!hash)
2843 return -ENOMEM;
2287 2844
2288 mutex_lock(&ftrace_regex_lock); 2845 mutex_lock(&ftrace_regex_lock);
2289 if (reset) 2846 if (reset)
2290 ftrace_filter_reset(enable); 2847 ftrace_filter_reset(hash);
2291 if (buf) 2848 if (buf)
2292 ftrace_match_records(buf, len, enable); 2849 ftrace_match_records(hash, buf, len);
2850
2851 mutex_lock(&ftrace_lock);
2852 ret = ftrace_hash_move(orig_hash, hash);
2853 mutex_unlock(&ftrace_lock);
2854
2293 mutex_unlock(&ftrace_regex_lock); 2855 mutex_unlock(&ftrace_regex_lock);
2856
2857 free_ftrace_hash(hash);
2858 return ret;
2859}
2860
2861/**
2862 * ftrace_set_filter - set a function to filter on in ftrace
2863 * @ops - the ops to set the filter with
2864 * @buf - the string that holds the function filter text.
2865 * @len - the length of the string.
2866 * @reset - non zero to reset all filters before applying this filter.
2867 *
2868 * Filters denote which functions should be enabled when tracing is enabled.
2869 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
2870 */
2871void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
2872 int len, int reset)
2873{
2874 ftrace_set_regex(ops, buf, len, reset, 1);
2294} 2875}
2876EXPORT_SYMBOL_GPL(ftrace_set_filter);
2295 2877
2296/** 2878/**
2879 * ftrace_set_notrace - set a function to not trace in ftrace
2880 * @ops - the ops to set the notrace filter with
2881 * @buf - the string that holds the function notrace text.
2882 * @len - the length of the string.
2883 * @reset - non zero to reset all filters before applying this filter.
2884 *
2885 * Notrace Filters denote which functions should not be enabled when tracing
2886 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
2887 * for tracing.
2888 */
2889void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
2890 int len, int reset)
2891{
2892 ftrace_set_regex(ops, buf, len, reset, 0);
2893}
2894EXPORT_SYMBOL_GPL(ftrace_set_notrace);
2895/**
2297 * ftrace_set_filter - set a function to filter on in ftrace 2896 * ftrace_set_filter - set a function to filter on in ftrace
2897 * @ops - the ops to set the filter with
2298 * @buf - the string that holds the function filter text. 2898 * @buf - the string that holds the function filter text.
2299 * @len - the length of the string. 2899 * @len - the length of the string.
2300 * @reset - non zero to reset all filters before applying this filter. 2900 * @reset - non zero to reset all filters before applying this filter.
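Editor's note: a hypothetical module-style user of the new per-ops API exported above; the names my_ops, my_callback and the "schedule" filter string are illustrative only. The filter is installed on the ops' own hash before the ops is registered.

        #include <linux/ftrace.h>
        #include <linux/module.h>
        #include <linux/string.h>

        static void my_callback(unsigned long ip, unsigned long parent_ip)
        {
                /* runs only for locations left in my_ops' filter hash */
        }

        static struct ftrace_ops my_ops = {
                .func = my_callback,
        };

        static unsigned char my_filter[] = "schedule";

        static int __init my_tracer_init(void)
        {
                /* reset any previous filter, then trace only schedule() */
                ftrace_set_filter(&my_ops, my_filter, strlen((char *)my_filter), 1);
                return register_ftrace_function(&my_ops);
        }

        static void __exit my_tracer_exit(void)
        {
                unregister_ftrace_function(&my_ops);
        }

        module_init(my_tracer_init);
        module_exit(my_tracer_exit);
        MODULE_LICENSE("GPL");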
@@ -2302,13 +2902,15 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
2302 * Filters denote which functions should be enabled when tracing is enabled. 2902 * Filters denote which functions should be enabled when tracing is enabled.
2303 * If @buf is NULL and reset is set, all functions will be enabled for tracing. 2903 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
2304 */ 2904 */
2305void ftrace_set_filter(unsigned char *buf, int len, int reset) 2905void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
2306{ 2906{
2307 ftrace_set_regex(buf, len, reset, 1); 2907 ftrace_set_regex(&global_ops, buf, len, reset, 1);
2308} 2908}
2909EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
2309 2910
2310/** 2911/**
2311 * ftrace_set_notrace - set a function to not trace in ftrace 2912 * ftrace_set_notrace - set a function to not trace in ftrace
2913 * @ops - the ops to set the notrace filter with
2312 * @buf - the string that holds the function notrace text. 2914 * @buf - the string that holds the function notrace text.
2313 * @len - the length of the string. 2915 * @len - the length of the string.
2314 * @reset - non zero to reset all filters before applying this filter. 2916 * @reset - non zero to reset all filters before applying this filter.
@@ -2317,10 +2919,11 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset)
2317 * is enabled. If @buf is NULL and reset is set, all functions will be enabled 2919 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
2318 * for tracing. 2920 * for tracing.
2319 */ 2921 */
2320void ftrace_set_notrace(unsigned char *buf, int len, int reset) 2922void ftrace_set_global_notrace(unsigned char *buf, int len, int reset)
2321{ 2923{
2322 ftrace_set_regex(buf, len, reset, 0); 2924 ftrace_set_regex(&global_ops, buf, len, reset, 0);
2323} 2925}
2926EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
2324 2927
2325/* 2928/*
2326 * command line interface to allow users to set filters on boot up. 2929 * command line interface to allow users to set filters on boot up.
@@ -2371,22 +2974,23 @@ static void __init set_ftrace_early_graph(char *buf)
2371} 2974}
2372#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2975#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2373 2976
2374static void __init set_ftrace_early_filter(char *buf, int enable) 2977static void __init
2978set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
2375{ 2979{
2376 char *func; 2980 char *func;
2377 2981
2378 while (buf) { 2982 while (buf) {
2379 func = strsep(&buf, ","); 2983 func = strsep(&buf, ",");
2380 ftrace_set_regex(func, strlen(func), 0, enable); 2984 ftrace_set_regex(ops, func, strlen(func), 0, enable);
2381 } 2985 }
2382} 2986}
2383 2987
2384static void __init set_ftrace_early_filters(void) 2988static void __init set_ftrace_early_filters(void)
2385{ 2989{
2386 if (ftrace_filter_buf[0]) 2990 if (ftrace_filter_buf[0])
2387 set_ftrace_early_filter(ftrace_filter_buf, 1); 2991 set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1);
2388 if (ftrace_notrace_buf[0]) 2992 if (ftrace_notrace_buf[0])
2389 set_ftrace_early_filter(ftrace_notrace_buf, 0); 2993 set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0);
2390#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2994#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2391 if (ftrace_graph_buf[0]) 2995 if (ftrace_graph_buf[0])
2392 set_ftrace_early_graph(ftrace_graph_buf); 2996 set_ftrace_early_graph(ftrace_graph_buf);
@@ -2394,11 +2998,14 @@ static void __init set_ftrace_early_filters(void)
2394} 2998}
2395 2999
2396static int 3000static int
2397ftrace_regex_release(struct inode *inode, struct file *file, int enable) 3001ftrace_regex_release(struct inode *inode, struct file *file)
2398{ 3002{
2399 struct seq_file *m = (struct seq_file *)file->private_data; 3003 struct seq_file *m = (struct seq_file *)file->private_data;
2400 struct ftrace_iterator *iter; 3004 struct ftrace_iterator *iter;
3005 struct ftrace_hash **orig_hash;
2401 struct trace_parser *parser; 3006 struct trace_parser *parser;
3007 int filter_hash;
3008 int ret;
2402 3009
2403 mutex_lock(&ftrace_regex_lock); 3010 mutex_lock(&ftrace_regex_lock);
2404 if (file->f_mode & FMODE_READ) { 3011 if (file->f_mode & FMODE_READ) {
@@ -2411,33 +3018,41 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2411 parser = &iter->parser; 3018 parser = &iter->parser;
2412 if (trace_parser_loaded(parser)) { 3019 if (trace_parser_loaded(parser)) {
2413 parser->buffer[parser->idx] = 0; 3020 parser->buffer[parser->idx] = 0;
2414 ftrace_match_records(parser->buffer, parser->idx, enable); 3021 ftrace_match_records(iter->hash, parser->buffer, parser->idx);
2415 } 3022 }
2416 3023
2417 mutex_lock(&ftrace_lock);
2418 if (ftrace_start_up && ftrace_enabled)
2419 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2420 mutex_unlock(&ftrace_lock);
2421
2422 trace_parser_put(parser); 3024 trace_parser_put(parser);
3025
3026 if (file->f_mode & FMODE_WRITE) {
3027 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
3028
3029 if (filter_hash)
3030 orig_hash = &iter->ops->filter_hash;
3031 else
3032 orig_hash = &iter->ops->notrace_hash;
3033
3034 mutex_lock(&ftrace_lock);
3035 /*
3036 * Remove the current set, update the hash and add
3037 * them back.
3038 */
3039 ftrace_hash_rec_disable(iter->ops, filter_hash);
3040 ret = ftrace_hash_move(orig_hash, iter->hash);
3041 if (!ret) {
3042 ftrace_hash_rec_enable(iter->ops, filter_hash);
3043 if (iter->ops->flags & FTRACE_OPS_FL_ENABLED
3044 && ftrace_enabled)
3045 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
3046 }
3047 mutex_unlock(&ftrace_lock);
3048 }
3049 free_ftrace_hash(iter->hash);
2423 kfree(iter); 3050 kfree(iter);
2424 3051
2425 mutex_unlock(&ftrace_regex_lock); 3052 mutex_unlock(&ftrace_regex_lock);
2426 return 0; 3053 return 0;
2427} 3054}
2428 3055
2429static int
2430ftrace_filter_release(struct inode *inode, struct file *file)
2431{
2432 return ftrace_regex_release(inode, file, 1);
2433}
2434
2435static int
2436ftrace_notrace_release(struct inode *inode, struct file *file)
2437{
2438 return ftrace_regex_release(inode, file, 0);
2439}
2440
2441static const struct file_operations ftrace_avail_fops = { 3056static const struct file_operations ftrace_avail_fops = {
2442 .open = ftrace_avail_open, 3057 .open = ftrace_avail_open,
2443 .read = seq_read, 3058 .read = seq_read,
@@ -2445,8 +3060,8 @@ static const struct file_operations ftrace_avail_fops = {
2445 .release = seq_release_private, 3060 .release = seq_release_private,
2446}; 3061};
2447 3062
2448static const struct file_operations ftrace_failures_fops = { 3063static const struct file_operations ftrace_enabled_fops = {
2449 .open = ftrace_failures_open, 3064 .open = ftrace_enabled_open,
2450 .read = seq_read, 3065 .read = seq_read,
2451 .llseek = seq_lseek, 3066 .llseek = seq_lseek,
2452 .release = seq_release_private, 3067 .release = seq_release_private,
@@ -2457,7 +3072,7 @@ static const struct file_operations ftrace_filter_fops = {
2457 .read = seq_read, 3072 .read = seq_read,
2458 .write = ftrace_filter_write, 3073 .write = ftrace_filter_write,
2459 .llseek = ftrace_regex_lseek, 3074 .llseek = ftrace_regex_lseek,
2460 .release = ftrace_filter_release, 3075 .release = ftrace_regex_release,
2461}; 3076};
2462 3077
2463static const struct file_operations ftrace_notrace_fops = { 3078static const struct file_operations ftrace_notrace_fops = {
@@ -2465,7 +3080,7 @@ static const struct file_operations ftrace_notrace_fops = {
2465 .read = seq_read, 3080 .read = seq_read,
2466 .write = ftrace_notrace_write, 3081 .write = ftrace_notrace_write,
2467 .llseek = ftrace_regex_lseek, 3082 .llseek = ftrace_regex_lseek,
2468 .release = ftrace_notrace_release, 3083 .release = ftrace_regex_release,
2469}; 3084};
2470 3085
2471#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3086#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -2574,9 +3189,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2574 bool exists; 3189 bool exists;
2575 int i; 3190 int i;
2576 3191
2577 if (ftrace_disabled)
2578 return -ENODEV;
2579
2580 /* decode regex */ 3192 /* decode regex */
2581 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 3193 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2582 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) 3194 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
@@ -2585,12 +3197,18 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2585 search_len = strlen(search); 3197 search_len = strlen(search);
2586 3198
2587 mutex_lock(&ftrace_lock); 3199 mutex_lock(&ftrace_lock);
3200
3201 if (unlikely(ftrace_disabled)) {
3202 mutex_unlock(&ftrace_lock);
3203 return -ENODEV;
3204 }
3205
2588 do_for_each_ftrace_rec(pg, rec) { 3206 do_for_each_ftrace_rec(pg, rec) {
2589 3207
2590 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 3208 if (rec->flags & FTRACE_FL_FREE)
2591 continue; 3209 continue;
2592 3210
2593 if (ftrace_match_record(rec, search, search_len, type)) { 3211 if (ftrace_match_record(rec, NULL, search, search_len, type)) {
2594 /* if it is in the array */ 3212 /* if it is in the array */
2595 exists = false; 3213 exists = false;
2596 for (i = 0; i < *idx; i++) { 3214 for (i = 0; i < *idx; i++) {
@@ -2680,8 +3298,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2680 trace_create_file("available_filter_functions", 0444, 3298 trace_create_file("available_filter_functions", 0444,
2681 d_tracer, NULL, &ftrace_avail_fops); 3299 d_tracer, NULL, &ftrace_avail_fops);
2682 3300
2683 trace_create_file("failures", 0444, 3301 trace_create_file("enabled_functions", 0444,
2684 d_tracer, NULL, &ftrace_failures_fops); 3302 d_tracer, NULL, &ftrace_enabled_fops);
2685 3303
2686 trace_create_file("set_ftrace_filter", 0644, d_tracer, 3304 trace_create_file("set_ftrace_filter", 0644, d_tracer,
2687 NULL, &ftrace_filter_fops); 3305 NULL, &ftrace_filter_fops);
@@ -2704,7 +3322,6 @@ static int ftrace_process_locs(struct module *mod,
2704{ 3322{
2705 unsigned long *p; 3323 unsigned long *p;
2706 unsigned long addr; 3324 unsigned long addr;
2707 unsigned long flags;
2708 3325
2709 mutex_lock(&ftrace_lock); 3326 mutex_lock(&ftrace_lock);
2710 p = start; 3327 p = start;
@@ -2721,10 +3338,7 @@ static int ftrace_process_locs(struct module *mod,
2721 ftrace_record_ip(addr); 3338 ftrace_record_ip(addr);
2722 } 3339 }
2723 3340
2724 /* disable interrupts to prevent kstop machine */
2725 local_irq_save(flags);
2726 ftrace_update_code(mod); 3341 ftrace_update_code(mod);
2727 local_irq_restore(flags);
2728 mutex_unlock(&ftrace_lock); 3342 mutex_unlock(&ftrace_lock);
2729 3343
2730 return 0; 3344 return 0;
@@ -2736,10 +3350,11 @@ void ftrace_release_mod(struct module *mod)
2736 struct dyn_ftrace *rec; 3350 struct dyn_ftrace *rec;
2737 struct ftrace_page *pg; 3351 struct ftrace_page *pg;
2738 3352
3353 mutex_lock(&ftrace_lock);
3354
2739 if (ftrace_disabled) 3355 if (ftrace_disabled)
2740 return; 3356 goto out_unlock;
2741 3357
2742 mutex_lock(&ftrace_lock);
2743 do_for_each_ftrace_rec(pg, rec) { 3358 do_for_each_ftrace_rec(pg, rec) {
2744 if (within_module_core(rec->ip, mod)) { 3359 if (within_module_core(rec->ip, mod)) {
2745 /* 3360 /*
@@ -2750,6 +3365,7 @@ void ftrace_release_mod(struct module *mod)
2750 ftrace_free_rec(rec); 3365 ftrace_free_rec(rec);
2751 } 3366 }
2752 } while_for_each_ftrace_rec(); 3367 } while_for_each_ftrace_rec();
3368 out_unlock:
2753 mutex_unlock(&ftrace_lock); 3369 mutex_unlock(&ftrace_lock);
2754} 3370}
2755 3371
@@ -2836,6 +3452,10 @@ void __init ftrace_init(void)
2836 3452
2837#else 3453#else
2838 3454
3455static struct ftrace_ops global_ops = {
3456 .func = ftrace_stub,
3457};
3458
2839static int __init ftrace_nodyn_init(void) 3459static int __init ftrace_nodyn_init(void)
2840{ 3460{
2841 ftrace_enabled = 1; 3461 ftrace_enabled = 1;
@@ -2846,12 +3466,38 @@ device_initcall(ftrace_nodyn_init);
2846static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 3466static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
2847static inline void ftrace_startup_enable(int command) { } 3467static inline void ftrace_startup_enable(int command) { }
2848/* Keep as macros so we do not need to define the commands */ 3468/* Keep as macros so we do not need to define the commands */
2849# define ftrace_startup(command) do { } while (0) 3469# define ftrace_startup(ops, command) do { } while (0)
2850# define ftrace_shutdown(command) do { } while (0) 3470# define ftrace_shutdown(ops, command) do { } while (0)
2851# define ftrace_startup_sysctl() do { } while (0) 3471# define ftrace_startup_sysctl() do { } while (0)
2852# define ftrace_shutdown_sysctl() do { } while (0) 3472# define ftrace_shutdown_sysctl() do { } while (0)
3473
3474static inline int
3475ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
3476{
3477 return 1;
3478}
3479
2853#endif /* CONFIG_DYNAMIC_FTRACE */ 3480#endif /* CONFIG_DYNAMIC_FTRACE */
2854 3481
3482static void
3483ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
3484{
3485 struct ftrace_ops *op;
3486
3487 /*
3488 * Some of the ops may be dynamically allocated,
3489 * they must be freed after a synchronize_sched().
3490 */
3491 preempt_disable_notrace();
3492 op = rcu_dereference_raw(ftrace_ops_list);
3493 while (op != &ftrace_list_end) {
3494 if (ftrace_ops_test(op, ip))
3495 op->func(ip, parent_ip);
3496 op = rcu_dereference_raw(op->next);
3497 };
3498 preempt_enable_notrace();
3499}
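The comment in ftrace_ops_list_func() implies the usual RCU rule for any dynamically allocated ops: once unlinked from the list, it may only be freed after the walkers are done. A conservative sketch of the writer side (the explicit synchronize_sched() is an assumption made here to stay on the safe side; this hunk does not show whether the core already waits for you):

	#include <linux/ftrace.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	static void example_free_dynamic_ops(struct ftrace_ops *ops)
	{
		unregister_ftrace_function(ops);	/* unlink from ftrace_ops_list */
		synchronize_sched();			/* wait out walkers that run with
							 * only preemption disabled */
		kfree(ops);				/* now safe to free */
	}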
3500
2855static void clear_ftrace_swapper(void) 3501static void clear_ftrace_swapper(void)
2856{ 3502{
2857 struct task_struct *p; 3503 struct task_struct *p;
@@ -3144,19 +3790,23 @@ void ftrace_kill(void)
3144 */ 3790 */
3145int register_ftrace_function(struct ftrace_ops *ops) 3791int register_ftrace_function(struct ftrace_ops *ops)
3146{ 3792{
3147 int ret; 3793 int ret = -1;
3148
3149 if (unlikely(ftrace_disabled))
3150 return -1;
3151 3794
3152 mutex_lock(&ftrace_lock); 3795 mutex_lock(&ftrace_lock);
3153 3796
3797 if (unlikely(ftrace_disabled))
3798 goto out_unlock;
3799
3154 ret = __register_ftrace_function(ops); 3800 ret = __register_ftrace_function(ops);
3155 ftrace_startup(0); 3801 if (!ret)
3802 ftrace_startup(ops, 0);
3156 3803
3804
3805 out_unlock:
3157 mutex_unlock(&ftrace_lock); 3806 mutex_unlock(&ftrace_lock);
3158 return ret; 3807 return ret;
3159} 3808}
3809EXPORT_SYMBOL_GPL(register_ftrace_function);
3160 3810
3161/** 3811/**
3162 * unregister_ftrace_function - unregister a function for profiling. 3812 * unregister_ftrace_function - unregister a function for profiling.
@@ -3170,25 +3820,27 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
3170 3820
3171 mutex_lock(&ftrace_lock); 3821 mutex_lock(&ftrace_lock);
3172 ret = __unregister_ftrace_function(ops); 3822 ret = __unregister_ftrace_function(ops);
3173 ftrace_shutdown(0); 3823 if (!ret)
3824 ftrace_shutdown(ops, 0);
3174 mutex_unlock(&ftrace_lock); 3825 mutex_unlock(&ftrace_lock);
3175 3826
3176 return ret; 3827 return ret;
3177} 3828}
3829EXPORT_SYMBOL_GPL(unregister_ftrace_function);
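Both entry points are now exported GPL, so the init sketch shown after ftrace_set_notrace() above can be paired with an equally small teardown; my_ops is the same hypothetical static ops:

	static void __exit my_exit(void)
	{
		unregister_ftrace_function(&my_ops);
		/*
		 * my_ops is static here, so nothing else to free; for a
		 * dynamically allocated ops see the grace-period sketch
		 * after ftrace_ops_list_func() above.
		 */
	}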
3178 3830
3179int 3831int
3180ftrace_enable_sysctl(struct ctl_table *table, int write, 3832ftrace_enable_sysctl(struct ctl_table *table, int write,
3181 void __user *buffer, size_t *lenp, 3833 void __user *buffer, size_t *lenp,
3182 loff_t *ppos) 3834 loff_t *ppos)
3183{ 3835{
3184 int ret; 3836 int ret = -ENODEV;
3185
3186 if (unlikely(ftrace_disabled))
3187 return -ENODEV;
3188 3837
3189 mutex_lock(&ftrace_lock); 3838 mutex_lock(&ftrace_lock);
3190 3839
3191 ret = proc_dointvec(table, write, buffer, lenp, ppos); 3840 if (unlikely(ftrace_disabled))
3841 goto out;
3842
3843 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3192 3844
3193 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) 3845 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3194 goto out; 3846 goto out;
@@ -3200,11 +3852,11 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3200 ftrace_startup_sysctl(); 3852 ftrace_startup_sysctl();
3201 3853
3202 /* we are starting ftrace again */ 3854 /* we are starting ftrace again */
3203 if (ftrace_list != &ftrace_list_end) { 3855 if (ftrace_ops_list != &ftrace_list_end) {
3204 if (ftrace_list->next == &ftrace_list_end) 3856 if (ftrace_ops_list->next == &ftrace_list_end)
3205 ftrace_trace_function = ftrace_list->func; 3857 ftrace_trace_function = ftrace_ops_list->func;
3206 else 3858 else
3207 ftrace_trace_function = ftrace_list_func; 3859 ftrace_trace_function = ftrace_ops_list_func;
3208 } 3860 }
3209 3861
3210 } else { 3862 } else {
@@ -3328,7 +3980,7 @@ static int start_graph_tracing(void)
3328 /* The cpu_boot init_task->ret_stack will never be freed */ 3980 /* The cpu_boot init_task->ret_stack will never be freed */
3329 for_each_online_cpu(cpu) { 3981 for_each_online_cpu(cpu) {
3330 if (!idle_task(cpu)->ret_stack) 3982 if (!idle_task(cpu)->ret_stack)
3331 ftrace_graph_init_task(idle_task(cpu)); 3983 ftrace_graph_init_idle_task(idle_task(cpu), cpu);
3332 } 3984 }
3333 3985
3334 do { 3986 do {
@@ -3393,7 +4045,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
3393 ftrace_graph_return = retfunc; 4045 ftrace_graph_return = retfunc;
3394 ftrace_graph_entry = entryfunc; 4046 ftrace_graph_entry = entryfunc;
3395 4047
3396 ftrace_startup(FTRACE_START_FUNC_RET); 4048 ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
3397 4049
3398out: 4050out:
3399 mutex_unlock(&ftrace_lock); 4051 mutex_unlock(&ftrace_lock);
@@ -3410,7 +4062,7 @@ void unregister_ftrace_graph(void)
3410 ftrace_graph_active--; 4062 ftrace_graph_active--;
3411 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 4063 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3412 ftrace_graph_entry = ftrace_graph_entry_stub; 4064 ftrace_graph_entry = ftrace_graph_entry_stub;
3413 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 4065 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
3414 unregister_pm_notifier(&ftrace_suspend_notifier); 4066 unregister_pm_notifier(&ftrace_suspend_notifier);
3415 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 4067 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3416 4068
@@ -3418,6 +4070,49 @@ void unregister_ftrace_graph(void)
3418 mutex_unlock(&ftrace_lock); 4070 mutex_unlock(&ftrace_lock);
3419} 4071}
3420 4072
4073static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
4074
4075static void
4076graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
4077{
4078 atomic_set(&t->tracing_graph_pause, 0);
4079 atomic_set(&t->trace_overrun, 0);
4080 t->ftrace_timestamp = 0;
4081 /* make curr_ret_stack visible before we add the ret_stack */
4082 smp_wmb();
4083 t->ret_stack = ret_stack;
4084}
4085
4086/*
4087 * Allocate a return stack for the idle task. May be the first
4088 * time through, or it may be done by CPU hotplug online.
4089 */
4090void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
4091{
4092 t->curr_ret_stack = -1;
4093 /*
4094 * The idle task has no parent, it either has its own
4095 * stack or no stack at all.
4096 */
4097 if (t->ret_stack)
4098 WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
4099
4100 if (ftrace_graph_active) {
4101 struct ftrace_ret_stack *ret_stack;
4102
4103 ret_stack = per_cpu(idle_ret_stack, cpu);
4104 if (!ret_stack) {
4105 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
4106 * sizeof(struct ftrace_ret_stack),
4107 GFP_KERNEL);
4108 if (!ret_stack)
4109 return;
4110 per_cpu(idle_ret_stack, cpu) = ret_stack;
4111 }
4112 graph_init_task(t, ret_stack);
4113 }
4114}
4115
3421/* Allocate a return stack for newly created task */ 4116/* Allocate a return stack for newly created task */
3422void ftrace_graph_init_task(struct task_struct *t) 4117void ftrace_graph_init_task(struct task_struct *t)
3423{ 4118{
@@ -3433,12 +4128,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3433 GFP_KERNEL); 4128 GFP_KERNEL);
3434 if (!ret_stack) 4129 if (!ret_stack)
3435 return; 4130 return;
3436 atomic_set(&t->tracing_graph_pause, 0); 4131 graph_init_task(t, ret_stack);
3437 atomic_set(&t->trace_overrun, 0);
3438 t->ftrace_timestamp = 0;
3439 /* make curr_ret_stack visable before we add the ret_stack */
3440 smp_wmb();
3441 t->ret_stack = ret_stack;
3442 } 4132 }
3443} 4133}
3444 4134
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bd1c35a4fbcc..0ef7b4b2a1f7 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5,7 +5,6 @@
5 */ 5 */
6#include <linux/ring_buffer.h> 6#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h> 7#include <linux/trace_clock.h>
8#include <linux/ftrace_irq.h>
9#include <linux/spinlock.h> 8#include <linux/spinlock.h>
10#include <linux/debugfs.h> 9#include <linux/debugfs.h>
11#include <linux/uaccess.h> 10#include <linux/uaccess.h>
@@ -669,7 +668,7 @@ static struct list_head *rb_list_head(struct list_head *list)
669 * the reader page). But if the next page is a header page, 668 * the reader page). But if the next page is a header page,
670 * its flags will be non zero. 669 * its flags will be non zero.
671 */ 670 */
672static int inline 671static inline int
673rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, 672rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
674 struct buffer_page *page, struct list_head *list) 673 struct buffer_page *page, struct list_head *list)
675{ 674{
@@ -1429,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1429} 1428}
1430EXPORT_SYMBOL_GPL(ring_buffer_resize); 1429EXPORT_SYMBOL_GPL(ring_buffer_resize);
1431 1430
1431void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
1432{
1433 mutex_lock(&buffer->mutex);
1434 if (val)
1435 buffer->flags |= RB_FL_OVERWRITE;
1436 else
1437 buffer->flags &= ~RB_FL_OVERWRITE;
1438 mutex_unlock(&buffer->mutex);
1439}
1440EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
1441
1432static inline void * 1442static inline void *
1433__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) 1443__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
1434{ 1444{
@@ -1468,7 +1478,7 @@ static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1468 return local_read(&bpage->entries) & RB_WRITE_MASK; 1478 return local_read(&bpage->entries) & RB_WRITE_MASK;
1469} 1479}
1470 1480
1471/* Size is determined by what has been commited */ 1481/* Size is determined by what has been committed */
1472static inline unsigned rb_page_size(struct buffer_page *bpage) 1482static inline unsigned rb_page_size(struct buffer_page *bpage)
1473{ 1483{
1474 return rb_page_commit(bpage); 1484 return rb_page_commit(bpage);
@@ -2162,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2162 if (likely(ts >= cpu_buffer->write_stamp)) { 2172 if (likely(ts >= cpu_buffer->write_stamp)) {
2163 delta = diff; 2173 delta = diff;
2164 if (unlikely(test_time_stamp(delta))) { 2174 if (unlikely(test_time_stamp(delta))) {
2175 int local_clock_stable = 1;
2176#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2177 local_clock_stable = sched_clock_stable;
2178#endif
2165 WARN_ONCE(delta > (1ULL << 59), 2179 WARN_ONCE(delta > (1ULL << 59),
2166 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", 2180 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
2167 (unsigned long long)delta, 2181 (unsigned long long)delta,
2168 (unsigned long long)ts, 2182 (unsigned long long)ts,
2169 (unsigned long long)cpu_buffer->write_stamp); 2183 (unsigned long long)cpu_buffer->write_stamp,
2184 local_clock_stable ? "" :
2185 "If you just came from a suspend/resume,\n"
2186 "please switch to the trace global clock:\n"
2187 " echo global > /sys/kernel/debug/tracing/trace_clock\n");
2170 add_timestamp = 1; 2188 add_timestamp = 1;
2171 } 2189 }
2172 } 2190 }
@@ -2914,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2914 /* 2932 /*
2915 * cpu_buffer->pages just needs to point to the buffer, it 2933 * cpu_buffer->pages just needs to point to the buffer, it
2916 * has no specific buffer page to point to. Lets move it out 2934 * has no specific buffer page to point to. Lets move it out
2917 * of our way so we don't accidently swap it. 2935 * of our way so we don't accidentally swap it.
2918 */ 2936 */
2919 cpu_buffer->pages = reader->list.prev; 2937 cpu_buffer->pages = reader->list.prev;
2920 2938
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dc53ecb80589..ee9c921d7f21 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,8 +41,6 @@
41#include "trace.h" 41#include "trace.h"
42#include "trace_output.h" 42#include "trace_output.h"
43 43
44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
45
46/* 44/*
47 * On boot up, the ring buffer is set to the minimum size, so that 45 * On boot up, the ring buffer is set to the minimum size, so that
48 * we do not waste memory on systems that are not using tracing. 46 * we do not waste memory on systems that are not using tracing.
@@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
340/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
341unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
342 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
343 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
344 342
345static int trace_stop_count; 343static int trace_stop_count;
346static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
@@ -425,6 +423,7 @@ static const char *trace_options[] = {
425 "sleep-time", 423 "sleep-time",
426 "graph-time", 424 "graph-time",
427 "record-cmd", 425 "record-cmd",
426 "overwrite",
428 NULL 427 NULL
429}; 428};
430 429
@@ -780,6 +779,11 @@ __acquires(kernel_lock)
780 tracing_reset_online_cpus(tr); 779 tracing_reset_online_cpus(tr);
781 780
782 current_trace = type; 781 current_trace = type;
782
783 /* If we expanded the buffers, make sure the max is expanded too */
784 if (ring_buffer_expanded && type->use_max_tr)
785 ring_buffer_resize(max_tr.buffer, trace_buf_size);
786
783 /* the test is responsible for initializing and enabling */ 787 /* the test is responsible for initializing and enabling */
784 pr_info("Testing tracer %s: ", type->name); 788 pr_info("Testing tracer %s: ", type->name);
785 ret = type->selftest(type, tr); 789 ret = type->selftest(type, tr);
@@ -792,6 +796,10 @@ __acquires(kernel_lock)
792 /* Only reset on passing, to avoid touching corrupted buffers */ 796 /* Only reset on passing, to avoid touching corrupted buffers */
793 tracing_reset_online_cpus(tr); 797 tracing_reset_online_cpus(tr);
794 798
799 /* Shrink the max buffer again */
800 if (ring_buffer_expanded && type->use_max_tr)
801 ring_buffer_resize(max_tr.buffer, 1);
802
795 printk(KERN_CONT "PASSED\n"); 803 printk(KERN_CONT "PASSED\n");
796 } 804 }
797#endif 805#endif
@@ -1102,7 +1110,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1102 1110
1103 entry->preempt_count = pc & 0xff; 1111 entry->preempt_count = pc & 0xff;
1104 entry->pid = (tsk) ? tsk->pid : 0; 1112 entry->pid = (tsk) ? tsk->pid : 0;
1105 entry->lock_depth = (tsk) ? tsk->lock_depth : 0; 1113 entry->padding = 0;
1106 entry->flags = 1114 entry->flags =
1107#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1115#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1108 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1116 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1749,10 +1757,9 @@ static void print_lat_help_header(struct seq_file *m)
1749 seq_puts(m, "# | / _----=> need-resched \n"); 1757 seq_puts(m, "# | / _----=> need-resched \n");
1750 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1758 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1751 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1759 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1752 seq_puts(m, "# |||| /_--=> lock-depth \n"); 1760 seq_puts(m, "# |||| / delay \n");
1753 seq_puts(m, "# |||||/ delay \n"); 1761 seq_puts(m, "# cmd pid ||||| time | caller \n");
1754 seq_puts(m, "# cmd pid |||||| time | caller \n"); 1762 seq_puts(m, "# \\ / ||||| \\ | / \n");
1755 seq_puts(m, "# \\ / |||||| \\ | / \n");
1756} 1763}
1757 1764
1758static void print_func_help_header(struct seq_file *m) 1765static void print_func_help_header(struct seq_file *m)
@@ -2007,9 +2014,10 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2007{ 2014{
2008 enum print_line_t ret; 2015 enum print_line_t ret;
2009 2016
2010 if (iter->lost_events) 2017 if (iter->lost_events &&
2011 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", 2018 !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2012 iter->cpu, iter->lost_events); 2019 iter->cpu, iter->lost_events))
2020 return TRACE_TYPE_PARTIAL_LINE;
2013 2021
2014 if (iter->trace && iter->trace->print_line) { 2022 if (iter->trace && iter->trace->print_line) {
2015 ret = iter->trace->print_line(iter); 2023 ret = iter->trace->print_line(iter);
@@ -2529,6 +2537,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2529 2537
2530 if (mask == TRACE_ITER_RECORD_CMD) 2538 if (mask == TRACE_ITER_RECORD_CMD)
2531 trace_event_enable_cmd_record(enabled); 2539 trace_event_enable_cmd_record(enabled);
2540
2541 if (mask == TRACE_ITER_OVERWRITE)
2542 ring_buffer_change_overwrite(global_trace.buffer, enabled);
2532} 2543}
2533 2544
2534static ssize_t 2545static ssize_t
@@ -2710,6 +2721,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2710 2721
2711 mutex_lock(&trace_types_lock); 2722 mutex_lock(&trace_types_lock);
2712 if (tracer_enabled ^ val) { 2723 if (tracer_enabled ^ val) {
2724
2725 /* Only need to warn if this is used to change the state */
2726 WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
2727
2713 if (val) { 2728 if (val) {
2714 tracer_enabled = 1; 2729 tracer_enabled = 1;
2715 if (current_trace->start) 2730 if (current_trace->start)
@@ -3216,6 +3231,14 @@ waitagain:
3216 3231
3217 if (iter->seq.len >= cnt) 3232 if (iter->seq.len >= cnt)
3218 break; 3233 break;
3234
3235 /*
3236 * Setting the full flag means we reached the trace_seq buffer
 3237 * size and we should have left via the partial output condition above.
 3238 * If we did not, one of the trace_seq_* functions is not being used properly.
3239 */
3240 WARN_ONCE(iter->seq.full, "full flag set for trace type %d",
3241 iter->ent->type);
3219 } 3242 }
3220 trace_access_unlock(iter->cpu_file); 3243 trace_access_unlock(iter->cpu_file);
3221 trace_event_read_unlock(); 3244 trace_event_read_unlock();
@@ -3226,7 +3249,7 @@ waitagain:
3226 trace_seq_init(&iter->seq); 3249 trace_seq_init(&iter->seq);
3227 3250
3228 /* 3251 /*
3229 * If there was nothing to send to user, inspite of consuming trace 3252 * If there was nothing to send to user, in spite of consuming trace
3230 * entries, go back to wait for more entries. 3253 * entries, go back to wait for more entries.
3231 */ 3254 */
3232 if (sret == -EBUSY) 3255 if (sret == -EBUSY)
@@ -4551,9 +4574,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4551__init static int tracer_alloc_buffers(void) 4574__init static int tracer_alloc_buffers(void)
4552{ 4575{
4553 int ring_buf_size; 4576 int ring_buf_size;
4577 enum ring_buffer_flags rb_flags;
4554 int i; 4578 int i;
4555 int ret = -ENOMEM; 4579 int ret = -ENOMEM;
4556 4580
4581
4557 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) 4582 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
4558 goto out; 4583 goto out;
4559 4584
@@ -4566,12 +4591,13 @@ __init static int tracer_alloc_buffers(void)
4566 else 4591 else
4567 ring_buf_size = 1; 4592 ring_buf_size = 1;
4568 4593
4594 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
4595
4569 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4596 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
4570 cpumask_copy(tracing_cpumask, cpu_all_mask); 4597 cpumask_copy(tracing_cpumask, cpu_all_mask);
4571 4598
4572 /* TODO: make the number of buffers hot pluggable with CPUS */ 4599 /* TODO: make the number of buffers hot pluggable with CPUS */
4573 global_trace.buffer = ring_buffer_alloc(ring_buf_size, 4600 global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
4574 TRACE_BUFFER_FLAGS);
4575 if (!global_trace.buffer) { 4601 if (!global_trace.buffer) {
4576 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 4602 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
4577 WARN_ON(1); 4603 WARN_ON(1);
@@ -4581,7 +4607,7 @@ __init static int tracer_alloc_buffers(void)
4581 4607
4582 4608
4583#ifdef CONFIG_TRACER_MAX_TRACE 4609#ifdef CONFIG_TRACER_MAX_TRACE
4584 max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); 4610 max_tr.buffer = ring_buffer_alloc(1, rb_flags);
4585 if (!max_tr.buffer) { 4611 if (!max_tr.buffer) {
4586 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4612 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
4587 WARN_ON(1); 4613 WARN_ON(1);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9021f8c0c0c3..6b69c4bd306f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,8 +272,8 @@ struct tracer {
272 /* If you handled the flag setting, return 0 */ 272 /* If you handled the flag setting, return 0 */
273 int (*set_flag)(u32 old_flags, u32 bit, int set); 273 int (*set_flag)(u32 old_flags, u32 bit, int set);
274 struct tracer *next; 274 struct tracer *next;
275 int print_max;
276 struct tracer_flags *flags; 275 struct tracer_flags *flags;
276 int print_max;
277 int use_max_tr; 277 int use_max_tr;
278}; 278};
279 279
@@ -419,6 +419,8 @@ extern void trace_find_cmdline(int pid, char comm[]);
419extern unsigned long ftrace_update_tot_cnt; 419extern unsigned long ftrace_update_tot_cnt;
420#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func 420#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
421extern int DYN_FTRACE_TEST_NAME(void); 421extern int DYN_FTRACE_TEST_NAME(void);
422#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
423extern int DYN_FTRACE_TEST_NAME2(void);
422#endif 424#endif
423 425
424extern int ring_buffer_expanded; 426extern int ring_buffer_expanded;
@@ -606,6 +608,7 @@ enum trace_iterator_flags {
606 TRACE_ITER_SLEEP_TIME = 0x40000, 608 TRACE_ITER_SLEEP_TIME = 0x40000,
607 TRACE_ITER_GRAPH_TIME = 0x80000, 609 TRACE_ITER_GRAPH_TIME = 0x80000,
608 TRACE_ITER_RECORD_CMD = 0x100000, 610 TRACE_ITER_RECORD_CMD = 0x100000,
611 TRACE_ITER_OVERWRITE = 0x200000,
609}; 612};
610 613
611/* 614/*
@@ -661,8 +664,10 @@ struct ftrace_event_field {
661}; 664};
662 665
663struct event_filter { 666struct event_filter {
664 int n_preds; 667 int n_preds; /* Number assigned */
665 struct filter_pred **preds; 668 int a_preds; /* allocated */
669 struct filter_pred *preds;
670 struct filter_pred *root;
666 char *filter_string; 671 char *filter_string;
667}; 672};
668 673
@@ -674,11 +679,23 @@ struct event_subsystem {
674 int nr_events; 679 int nr_events;
675}; 680};
676 681
682#define FILTER_PRED_INVALID ((unsigned short)-1)
683#define FILTER_PRED_IS_RIGHT (1 << 15)
684#define FILTER_PRED_FOLD (1 << 15)
685
686/*
 687 * The maximum number of preds is limited by the size of an unsigned short, with
688 * two flags at the MSBs. One bit is used for both the IS_RIGHT
689 * and FOLD flags. The other is reserved.
690 *
691 * 2^14 preds is way more than enough.
692 */
693#define MAX_FILTER_PRED 16384
694
677struct filter_pred; 695struct filter_pred;
678struct regex; 696struct regex;
679 697
680typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, 698typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
681 int val1, int val2);
682 699
683typedef int (*regex_match_func)(char *str, struct regex *r, int len); 700typedef int (*regex_match_func)(char *str, struct regex *r, int len);
684 701
@@ -700,11 +717,23 @@ struct filter_pred {
700 filter_pred_fn_t fn; 717 filter_pred_fn_t fn;
701 u64 val; 718 u64 val;
702 struct regex regex; 719 struct regex regex;
703 char *field_name; 720 /*
721 * Leaf nodes use field_name, ops is used by AND and OR
722 * nodes. The field_name is always freed when freeing a pred.
723 * We can overload field_name for ops and have it freed
724 * as well.
725 */
726 union {
727 char *field_name;
728 unsigned short *ops;
729 };
704 int offset; 730 int offset;
705 int not; 731 int not;
706 int op; 732 int op;
707 int pop_n; 733 unsigned short index;
734 unsigned short parent;
735 unsigned short left;
736 unsigned short right;
708}; 737};
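With AND and OR handled by the tree walk in trace_events_filter.c, filter_pred_fn_t above loses its val1/val2 stack operands. An illustrative predicate under the new signature, written as if it sat inside trace_events_filter.c next to the DEFINE_EQUALITY_PRED() helpers (a sketch in their spirit, not code from this patch):

	static int example_pred_eq_u32(struct filter_pred *pred, void *event)
	{
		u32 *addr = (u32 *)(event + pred->offset);
		u32 val = (u32)pred->val;
		int match = (val == *addr);

		return match ^ pred->not;
	}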
709 738
710extern struct list_head ftrace_common_fields; 739extern struct list_head ftrace_common_fields;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 685a67d55db0..6302747a1398 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -46,7 +46,7 @@ u64 notrace trace_clock_local(void)
46} 46}
47 47
48/* 48/*
49 * trace_clock(): 'inbetween' trace clock. Not completely serialized, 49 * trace_clock(): 'between' trace clock. Not completely serialized,
50 * but not completely incorrect when crossing CPUs either. 50 * but not completely incorrect when crossing CPUs either.
51 * 51 *
52 * This is based on cpu_clock(), which will allow at most ~1 jiffy of 52 * This is based on cpu_clock(), which will allow at most ~1 jiffy of
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 6cf223764be8..e32744c84d94 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -27,7 +27,7 @@
27 * in the structure. 27 * in the structure.
28 * 28 *
29 * * for structures within structures, the format of the internal 29 * * for structures within structures, the format of the internal
30 * structure is layed out. This allows the internal structure 30 * structure is laid out. This allows the internal structure
31 * to be deciphered for the format file. Although these macros 31 * to be deciphered for the format file. Although these macros
32 * may become out of sync with the internal structure, they 32 * may become out of sync with the internal structure, they
33 * will create a compile error if it happens. Since the 33 * will create a compile error if it happens. Since the
@@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
109 */ 109 */
110#define FTRACE_CTX_FIELDS \ 110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \ 111 __field( unsigned int, prev_pid ) \
112 __field( unsigned int, next_pid ) \
113 __field( unsigned int, next_cpu ) \
112 __field( unsigned char, prev_prio ) \ 114 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \ 115 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \ 116 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \ 117 __field( unsigned char, next_state )
117 __field( unsigned int, next_cpu )
118 118
119FTRACE_ENTRY(context_switch, ctx_switch_entry, 119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120 120
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5f499e0438a4..2fe110341359 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,7 @@ static int trace_define_common_fields(void)
116 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
117 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
118 __common_field(int, pid); 118 __common_field(int, pid);
119 __common_field(int, lock_depth); 119 __common_field(int, padding);
120 120
121 return ret; 121 return ret;
122} 122}
@@ -326,6 +326,7 @@ int trace_set_clr_event(const char *system, const char *event, int set)
326{ 326{
327 return __ftrace_set_clr_event(NULL, system, event, set); 327 return __ftrace_set_clr_event(NULL, system, event, set);
328} 328}
329EXPORT_SYMBOL_GPL(trace_set_clr_event);
329 330
330/* 128 should be much more than enough */ 331/* 128 should be much more than enough */
331#define EVENT_BUF_SIZE 127 332#define EVENT_BUF_SIZE 127
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 36d40104b17f..8008ddcfbf20 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -123,9 +123,13 @@ struct filter_parse_state {
123 } operand; 123 } operand;
124}; 124};
125 125
126struct pred_stack {
127 struct filter_pred **preds;
128 int index;
129};
130
126#define DEFINE_COMPARISON_PRED(type) \ 131#define DEFINE_COMPARISON_PRED(type) \
127static int filter_pred_##type(struct filter_pred *pred, void *event, \ 132static int filter_pred_##type(struct filter_pred *pred, void *event) \
128 int val1, int val2) \
129{ \ 133{ \
130 type *addr = (type *)(event + pred->offset); \ 134 type *addr = (type *)(event + pred->offset); \
131 type val = (type)pred->val; \ 135 type val = (type)pred->val; \
@@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \
152} 156}
153 157
154#define DEFINE_EQUALITY_PRED(size) \ 158#define DEFINE_EQUALITY_PRED(size) \
155static int filter_pred_##size(struct filter_pred *pred, void *event, \ 159static int filter_pred_##size(struct filter_pred *pred, void *event) \
156 int val1, int val2) \
157{ \ 160{ \
158 u##size *addr = (u##size *)(event + pred->offset); \ 161 u##size *addr = (u##size *)(event + pred->offset); \
159 u##size val = (u##size)pred->val; \ 162 u##size val = (u##size)pred->val; \
@@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32);
178DEFINE_EQUALITY_PRED(16); 181DEFINE_EQUALITY_PRED(16);
179DEFINE_EQUALITY_PRED(8); 182DEFINE_EQUALITY_PRED(8);
180 183
181static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
182 void *event __attribute((unused)),
183 int val1, int val2)
184{
185 return val1 && val2;
186}
187
188static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
189 void *event __attribute((unused)),
190 int val1, int val2)
191{
192 return val1 || val2;
193}
194
195/* Filter predicate for fixed sized arrays of characters */ 184/* Filter predicate for fixed sized arrays of characters */
196static int filter_pred_string(struct filter_pred *pred, void *event, 185static int filter_pred_string(struct filter_pred *pred, void *event)
197 int val1, int val2)
198{ 186{
199 char *addr = (char *)(event + pred->offset); 187 char *addr = (char *)(event + pred->offset);
200 int cmp, match; 188 int cmp, match;
@@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
207} 195}
208 196
209/* Filter predicate for char * pointers */ 197/* Filter predicate for char * pointers */
210static int filter_pred_pchar(struct filter_pred *pred, void *event, 198static int filter_pred_pchar(struct filter_pred *pred, void *event)
211 int val1, int val2)
212{ 199{
213 char **addr = (char **)(event + pred->offset); 200 char **addr = (char **)(event + pred->offset);
214 int cmp, match; 201 int cmp, match;
@@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
231 * and add it to the address of the entry, and at last we have 218 * and add it to the address of the entry, and at last we have
232 * the address of the string. 219 * the address of the string.
233 */ 220 */
234static int filter_pred_strloc(struct filter_pred *pred, void *event, 221static int filter_pred_strloc(struct filter_pred *pred, void *event)
235 int val1, int val2)
236{ 222{
237 u32 str_item = *(u32 *)(event + pred->offset); 223 u32 str_item = *(u32 *)(event + pred->offset);
238 int str_loc = str_item & 0xffff; 224 int str_loc = str_item & 0xffff;
@@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
247 return match; 233 return match;
248} 234}
249 235
250static int filter_pred_none(struct filter_pred *pred, void *event, 236static int filter_pred_none(struct filter_pred *pred, void *event)
251 int val1, int val2)
252{ 237{
253 return 0; 238 return 0;
254} 239}
@@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred)
377 pred->not ^= not; 362 pred->not ^= not;
378} 363}
379 364
365enum move_type {
366 MOVE_DOWN,
367 MOVE_UP_FROM_LEFT,
368 MOVE_UP_FROM_RIGHT
369};
370
371static struct filter_pred *
372get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
373 int index, enum move_type *move)
374{
375 if (pred->parent & FILTER_PRED_IS_RIGHT)
376 *move = MOVE_UP_FROM_RIGHT;
377 else
378 *move = MOVE_UP_FROM_LEFT;
379 pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
380
381 return pred;
382}
383
384/*
 385 * A series of ANDs or ORs were found together. Instead of
 386 * climbing up and down the tree branches, an array of the
 387 * ops was made in the order of the checks. We can just move across
388 * the array and short circuit if needed.
389 */
390static int process_ops(struct filter_pred *preds,
391 struct filter_pred *op, void *rec)
392{
393 struct filter_pred *pred;
394 int match = 0;
395 int type;
396 int i;
397
398 /*
399 * Micro-optimization: We set type to true if op
400 * is an OR and false otherwise (AND). Then we
401 * just need to test if the match is equal to
402 * the type, and if it is, we can short circuit the
403 * rest of the checks:
404 *
405 * if ((match && op->op == OP_OR) ||
406 * (!match && op->op == OP_AND))
407 * return match;
408 */
409 type = op->op == OP_OR;
410
411 for (i = 0; i < op->val; i++) {
412 pred = &preds[op->ops[i]];
413 match = pred->fn(pred, rec);
414 if (!!match == type)
415 return match;
416 }
417 return match;
418}
419
380/* return 1 if event matches, 0 otherwise (discard) */ 420/* return 1 if event matches, 0 otherwise (discard) */
381int filter_match_preds(struct event_filter *filter, void *rec) 421int filter_match_preds(struct event_filter *filter, void *rec)
382{ 422{
383 int match, top = 0, val1 = 0, val2 = 0; 423 int match = -1;
384 int stack[MAX_FILTER_PRED]; 424 enum move_type move = MOVE_DOWN;
425 struct filter_pred *preds;
385 struct filter_pred *pred; 426 struct filter_pred *pred;
386 int i; 427 struct filter_pred *root;
428 int n_preds;
429 int done = 0;
430
431 /* no filter is considered a match */
432 if (!filter)
433 return 1;
434
435 n_preds = filter->n_preds;
436
437 if (!n_preds)
438 return 1;
439
440 /*
 441 * n_preds, root and filter->preds are protected by preemption being disabled.
442 */
443 preds = rcu_dereference_sched(filter->preds);
444 root = rcu_dereference_sched(filter->root);
445 if (!root)
446 return 1;
447
448 pred = root;
387 449
388 for (i = 0; i < filter->n_preds; i++) { 450 /* match is currently meaningless */
389 pred = filter->preds[i]; 451 match = -1;
390 if (!pred->pop_n) { 452
391 match = pred->fn(pred, rec, val1, val2); 453 do {
392 stack[top++] = match; 454 switch (move) {
455 case MOVE_DOWN:
456 /* only AND and OR have children */
457 if (pred->left != FILTER_PRED_INVALID) {
458 /* If ops is set, then it was folded. */
459 if (!pred->ops) {
460 /* keep going to down the left side */
461 pred = &preds[pred->left];
462 continue;
463 }
464 /* We can treat folded ops as a leaf node */
465 match = process_ops(preds, pred, rec);
466 } else
467 match = pred->fn(pred, rec);
468 /* If this pred is the only pred */
469 if (pred == root)
470 break;
471 pred = get_pred_parent(pred, preds,
472 pred->parent, &move);
473 continue;
474 case MOVE_UP_FROM_LEFT:
475 /*
476 * Check for short circuits.
477 *
478 * Optimization: !!match == (pred->op == OP_OR)
479 * is the same as:
480 * if ((match && pred->op == OP_OR) ||
481 * (!match && pred->op == OP_AND))
482 */
483 if (!!match == (pred->op == OP_OR)) {
484 if (pred == root)
485 break;
486 pred = get_pred_parent(pred, preds,
487 pred->parent, &move);
488 continue;
489 }
490 /* now go down the right side of the tree. */
491 pred = &preds[pred->right];
492 move = MOVE_DOWN;
493 continue;
494 case MOVE_UP_FROM_RIGHT:
495 /* We finished this equation. */
496 if (pred == root)
497 break;
498 pred = get_pred_parent(pred, preds,
499 pred->parent, &move);
393 continue; 500 continue;
394 } 501 }
395 if (pred->pop_n > top) { 502 done = 1;
396 WARN_ON_ONCE(1); 503 } while (!done);
397 return 0;
398 }
399 val1 = stack[--top];
400 val2 = stack[--top];
401 match = pred->fn(pred, rec, val1, val2);
402 stack[top++] = match;
403 }
404 504
405 return stack[--top]; 505 return match;
406} 506}
407EXPORT_SYMBOL_GPL(filter_match_preds); 507EXPORT_SYMBOL_GPL(filter_match_preds);
408 508
@@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos)
414 514
415static void remove_filter_string(struct event_filter *filter) 515static void remove_filter_string(struct event_filter *filter)
416{ 516{
517 if (!filter)
518 return;
519
417 kfree(filter->filter_string); 520 kfree(filter->filter_string);
418 filter->filter_string = NULL; 521 filter->filter_string = NULL;
419} 522}
@@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps,
473 576
474void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) 577void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
475{ 578{
476 struct event_filter *filter = call->filter; 579 struct event_filter *filter;
477 580
478 mutex_lock(&event_mutex); 581 mutex_lock(&event_mutex);
582 filter = call->filter;
479 if (filter && filter->filter_string) 583 if (filter && filter->filter_string)
480 trace_seq_printf(s, "%s\n", filter->filter_string); 584 trace_seq_printf(s, "%s\n", filter->filter_string);
481 else 585 else
@@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
486void print_subsystem_event_filter(struct event_subsystem *system, 590void print_subsystem_event_filter(struct event_subsystem *system,
487 struct trace_seq *s) 591 struct trace_seq *s)
488{ 592{
489 struct event_filter *filter = system->filter; 593 struct event_filter *filter;
490 594
491 mutex_lock(&event_mutex); 595 mutex_lock(&event_mutex);
596 filter = system->filter;
492 if (filter && filter->filter_string) 597 if (filter && filter->filter_string)
493 trace_seq_printf(s, "%s\n", filter->filter_string); 598 trace_seq_printf(s, "%s\n", filter->filter_string);
494 else 599 else
@@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred)
539 pred->regex.len = 0; 644 pred->regex.len = 0;
540} 645}
541 646
542static int filter_set_pred(struct filter_pred *dest, 647static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
648{
649 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
650 if (!stack->preds)
651 return -ENOMEM;
652 stack->index = n_preds;
653 return 0;
654}
655
656static void __free_pred_stack(struct pred_stack *stack)
657{
658 kfree(stack->preds);
659 stack->index = 0;
660}
661
662static int __push_pred_stack(struct pred_stack *stack,
663 struct filter_pred *pred)
664{
665 int index = stack->index;
666
667 if (WARN_ON(index == 0))
668 return -ENOSPC;
669
670 stack->preds[--index] = pred;
671 stack->index = index;
672 return 0;
673}
674
675static struct filter_pred *
676__pop_pred_stack(struct pred_stack *stack)
677{
678 struct filter_pred *pred;
679 int index = stack->index;
680
681 pred = stack->preds[index++];
682 if (!pred)
683 return NULL;
684
685 stack->index = index;
686 return pred;
687}
688
689static int filter_set_pred(struct event_filter *filter,
690 int idx,
691 struct pred_stack *stack,
543 struct filter_pred *src, 692 struct filter_pred *src,
544 filter_pred_fn_t fn) 693 filter_pred_fn_t fn)
545{ 694{
695 struct filter_pred *dest = &filter->preds[idx];
696 struct filter_pred *left;
697 struct filter_pred *right;
698
546 *dest = *src; 699 *dest = *src;
547 if (src->field_name) { 700 if (src->field_name) {
548 dest->field_name = kstrdup(src->field_name, GFP_KERNEL); 701 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
@@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest,
550 return -ENOMEM; 703 return -ENOMEM;
551 } 704 }
552 dest->fn = fn; 705 dest->fn = fn;
706 dest->index = idx;
553 707
554 return 0; 708 if (dest->op == OP_OR || dest->op == OP_AND) {
709 right = __pop_pred_stack(stack);
710 left = __pop_pred_stack(stack);
711 if (!left || !right)
712 return -EINVAL;
713 /*
714 * If both children can be folded
715 * and they are the same op as this op or a leaf,
716 * then this op can be folded.
717 */
718 if (left->index & FILTER_PRED_FOLD &&
719 (left->op == dest->op ||
720 left->left == FILTER_PRED_INVALID) &&
721 right->index & FILTER_PRED_FOLD &&
722 (right->op == dest->op ||
723 right->left == FILTER_PRED_INVALID))
724 dest->index |= FILTER_PRED_FOLD;
725
726 dest->left = left->index & ~FILTER_PRED_FOLD;
727 dest->right = right->index & ~FILTER_PRED_FOLD;
728 left->parent = dest->index & ~FILTER_PRED_FOLD;
729 right->parent = dest->index | FILTER_PRED_IS_RIGHT;
730 } else {
731 /*
732 * Make dest->left invalid to be used as a quick
733 * way to know this is a leaf node.
734 */
735 dest->left = FILTER_PRED_INVALID;
736
 737 /* All leaves allow folding the parent ops. */
738 dest->index |= FILTER_PRED_FOLD;
739 }
740
741 return __push_pred_stack(stack, dest);
555} 742}
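To make the folding test above concrete, a worked example as a comment block (the filter strings are hypothetical, and the ops[] array that process_ops() walks is filled in by a later folding pass that is not part of this hunk):

	/*
	 * "a == 1 && b == 2 && c == 3"
	 *	Every leaf sets FILTER_PRED_FOLD, and each AND sees children
	 *	that are leaves or the same op, so the whole tree folds:
	 *	evaluation becomes one flat walk over the three leaves.
	 *
	 * "(a == 1 || b == 2) && c == 3"
	 *	The OR subtree folds, but the top AND does not: its left child
	 *	is an OR (a different op and not a leaf), so the walk descends
	 *	into the folded OR first and then tests c == 3 separately.
	 */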
556 743
557static void filter_disable_preds(struct ftrace_event_call *call) 744static void __free_preds(struct event_filter *filter)
558{ 745{
559 struct event_filter *filter = call->filter;
560 int i; 746 int i;
561 747
562 call->flags &= ~TRACE_EVENT_FL_FILTERED; 748 if (filter->preds) {
749 for (i = 0; i < filter->a_preds; i++)
750 kfree(filter->preds[i].field_name);
751 kfree(filter->preds);
752 filter->preds = NULL;
753 }
754 filter->a_preds = 0;
563 filter->n_preds = 0; 755 filter->n_preds = 0;
564
565 for (i = 0; i < MAX_FILTER_PRED; i++)
566 filter->preds[i]->fn = filter_pred_none;
567} 756}
568 757
569static void __free_preds(struct event_filter *filter) 758static void filter_disable(struct ftrace_event_call *call)
570{ 759{
571 int i; 760 call->flags &= ~TRACE_EVENT_FL_FILTERED;
761}
572 762
763static void __free_filter(struct event_filter *filter)
764{
573 if (!filter) 765 if (!filter)
574 return; 766 return;
575 767
576 for (i = 0; i < MAX_FILTER_PRED; i++) { 768 __free_preds(filter);
577 if (filter->preds[i])
578 filter_free_pred(filter->preds[i]);
579 }
580 kfree(filter->preds);
581 kfree(filter->filter_string); 769 kfree(filter->filter_string);
582 kfree(filter); 770 kfree(filter);
583} 771}
584 772
773/*
774 * Called when destroying the ftrace_event_call.
775 * The call is being freed, so we do not need to worry about
776 * the call being currently used. This is for module code removing
777 * the tracepoints from within it.
778 */
585void destroy_preds(struct ftrace_event_call *call) 779void destroy_preds(struct ftrace_event_call *call)
586{ 780{
587 __free_preds(call->filter); 781 __free_filter(call->filter);
588 call->filter = NULL; 782 call->filter = NULL;
589 call->flags &= ~TRACE_EVENT_FL_FILTERED;
590} 783}
591 784
592static struct event_filter *__alloc_preds(void) 785static struct event_filter *__alloc_filter(void)
593{ 786{
594 struct event_filter *filter; 787 struct event_filter *filter;
788
789 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
790 return filter;
791}
792
793static int __alloc_preds(struct event_filter *filter, int n_preds)
794{
595 struct filter_pred *pred; 795 struct filter_pred *pred;
596 int i; 796 int i;
597 797
598 filter = kzalloc(sizeof(*filter), GFP_KERNEL); 798 if (filter->preds)
599 if (!filter) 799 __free_preds(filter);
600 return ERR_PTR(-ENOMEM);
601 800
602 filter->n_preds = 0; 801 filter->preds =
802 kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
603 803
604 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
605 if (!filter->preds) 804 if (!filter->preds)
606 goto oom; 805 return -ENOMEM;
607 806
608 for (i = 0; i < MAX_FILTER_PRED; i++) { 807 filter->a_preds = n_preds;
609 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 808 filter->n_preds = 0;
610 if (!pred) 809
611 goto oom; 810 for (i = 0; i < n_preds; i++) {
811 pred = &filter->preds[i];
612 pred->fn = filter_pred_none; 812 pred->fn = filter_pred_none;
613 filter->preds[i] = pred;
614 } 813 }
615 814
616 return filter;
617
618oom:
619 __free_preds(filter);
620 return ERR_PTR(-ENOMEM);
621}
622
623static int init_preds(struct ftrace_event_call *call)
624{
625 if (call->filter)
626 return 0;
627
628 call->flags &= ~TRACE_EVENT_FL_FILTERED;
629 call->filter = __alloc_preds();
630 if (IS_ERR(call->filter))
631 return PTR_ERR(call->filter);
632
633 return 0; 815 return 0;
634} 816}
635 817
636static int init_subsystem_preds(struct event_subsystem *system) 818static void filter_free_subsystem_preds(struct event_subsystem *system)
637{ 819{
638 struct ftrace_event_call *call; 820 struct ftrace_event_call *call;
639 int err;
640 821
641 list_for_each_entry(call, &ftrace_events, list) { 822 list_for_each_entry(call, &ftrace_events, list) {
642 if (strcmp(call->class->system, system->name) != 0) 823 if (strcmp(call->class->system, system->name) != 0)
643 continue; 824 continue;
644 825
645 err = init_preds(call); 826 filter_disable(call);
646 if (err) 827 remove_filter_string(call->filter);
647 return err;
648 } 828 }
649
650 return 0;
651} 829}
652 830
653static void filter_free_subsystem_preds(struct event_subsystem *system) 831static void filter_free_subsystem_filters(struct event_subsystem *system)
654{ 832{
655 struct ftrace_event_call *call; 833 struct ftrace_event_call *call;
656 834
657 list_for_each_entry(call, &ftrace_events, list) { 835 list_for_each_entry(call, &ftrace_events, list) {
658 if (strcmp(call->class->system, system->name) != 0) 836 if (strcmp(call->class->system, system->name) != 0)
659 continue; 837 continue;
660 838 __free_filter(call->filter);
661 filter_disable_preds(call); 839 call->filter = NULL;
662 remove_filter_string(call->filter);
663 } 840 }
664} 841}
665 842
@@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
667 struct ftrace_event_call *call, 844 struct ftrace_event_call *call,
668 struct event_filter *filter, 845 struct event_filter *filter,
669 struct filter_pred *pred, 846 struct filter_pred *pred,
847 struct pred_stack *stack,
670 filter_pred_fn_t fn) 848 filter_pred_fn_t fn)
671{ 849{
672 int idx, err; 850 int idx, err;
673 851
674 if (filter->n_preds == MAX_FILTER_PRED) { 852 if (WARN_ON(filter->n_preds == filter->a_preds)) {
675 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 853 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
676 return -ENOSPC; 854 return -ENOSPC;
677 } 855 }
678 856
679 idx = filter->n_preds; 857 idx = filter->n_preds;
680 filter_clear_pred(filter->preds[idx]); 858 filter_clear_pred(&filter->preds[idx]);
681 err = filter_set_pred(filter->preds[idx], pred, fn); 859 err = filter_set_pred(filter, idx, stack, pred, fn);
682 if (err) 860 if (err)
683 return err; 861 return err;
684 862
@@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
763 struct ftrace_event_call *call, 941 struct ftrace_event_call *call,
764 struct event_filter *filter, 942 struct event_filter *filter,
765 struct filter_pred *pred, 943 struct filter_pred *pred,
944 struct pred_stack *stack,
766 bool dry_run) 945 bool dry_run)
767{ 946{
768 struct ftrace_event_field *field; 947 struct ftrace_event_field *field;
@@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
770 unsigned long long val; 949 unsigned long long val;
771 int ret; 950 int ret;
772 951
773 pred->fn = filter_pred_none; 952 fn = pred->fn = filter_pred_none;
774 953
775 if (pred->op == OP_AND) { 954 if (pred->op == OP_AND)
776 pred->pop_n = 2;
777 fn = filter_pred_and;
778 goto add_pred_fn; 955 goto add_pred_fn;
779 } else if (pred->op == OP_OR) { 956 else if (pred->op == OP_OR)
780 pred->pop_n = 2;
781 fn = filter_pred_or;
782 goto add_pred_fn; 957 goto add_pred_fn;
783 }
784 958
785 field = find_event_field(call, pred->field_name); 959 field = find_event_field(call, pred->field_name);
786 if (!field) { 960 if (!field) {
@@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
829 1003
830add_pred_fn: 1004add_pred_fn:
831 if (!dry_run) 1005 if (!dry_run)
832 return filter_add_pred_fn(ps, call, filter, pred, fn); 1006 return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
833 return 0; 1007 return 0;
834} 1008}
835 1009
@@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps)
1187 return 0; 1361 return 0;
1188} 1362}
1189 1363
1364static int count_preds(struct filter_parse_state *ps)
1365{
1366 struct postfix_elt *elt;
1367 int n_preds = 0;
1368
1369 list_for_each_entry(elt, &ps->postfix, list) {
1370 if (elt->op == OP_NONE)
1371 continue;
1372 n_preds++;
1373 }
1374
1375 return n_preds;
1376}
1377
1378/*
 1379 * The tree is walked when an event is filtered. If the tree is not correctly
1380 * built, it may cause an infinite loop. Check here that the tree does
1381 * indeed terminate.
1382 */
1383static int check_pred_tree(struct event_filter *filter,
1384 struct filter_pred *root)
1385{
1386 struct filter_pred *preds;
1387 struct filter_pred *pred;
1388 enum move_type move = MOVE_DOWN;
1389 int count = 0;
1390 int done = 0;
1391 int max;
1392
1393 /*
 1394 * A node can be hit at most three times:
 1395 * once going down, once coming up from the left, and
 1396 * once coming up from the right. This is more than enough
 1397 * since leaves are only hit a single time.
1398 */
1399 max = 3 * filter->n_preds;
1400
1401 preds = filter->preds;
1402 if (!preds)
1403 return -EINVAL;
1404 pred = root;
1405
1406 do {
1407 if (WARN_ON(count++ > max))
1408 return -EINVAL;
1409
1410 switch (move) {
1411 case MOVE_DOWN:
1412 if (pred->left != FILTER_PRED_INVALID) {
1413 pred = &preds[pred->left];
1414 continue;
1415 }
1416 /* A leaf at the root is just a leaf in the tree */
1417 if (pred == root)
1418 break;
1419 pred = get_pred_parent(pred, preds,
1420 pred->parent, &move);
1421 continue;
1422 case MOVE_UP_FROM_LEFT:
1423 pred = &preds[pred->right];
1424 move = MOVE_DOWN;
1425 continue;
1426 case MOVE_UP_FROM_RIGHT:
1427 if (pred == root)
1428 break;
1429 pred = get_pred_parent(pred, preds,
1430 pred->parent, &move);
1431 continue;
1432 }
1433 done = 1;
1434 } while (!done);
1435
1436 /* We are fine. */
1437 return 0;
1438}
1439
1440static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1441{
1442 struct filter_pred *pred;
1443 enum move_type move = MOVE_DOWN;
1444 int count = 0;
1445 int done = 0;
1446
1447 pred = root;
1448
1449 do {
1450 switch (move) {
1451 case MOVE_DOWN:
1452 if (pred->left != FILTER_PRED_INVALID) {
1453 pred = &preds[pred->left];
1454 continue;
1455 }
1456 /* A leaf at the root is just a leaf in the tree */
1457 if (pred == root)
1458 return 1;
1459 count++;
1460 pred = get_pred_parent(pred, preds,
1461 pred->parent, &move);
1462 continue;
1463 case MOVE_UP_FROM_LEFT:
1464 pred = &preds[pred->right];
1465 move = MOVE_DOWN;
1466 continue;
1467 case MOVE_UP_FROM_RIGHT:
1468 if (pred == root)
1469 break;
1470 pred = get_pred_parent(pred, preds,
1471 pred->parent, &move);
1472 continue;
1473 }
1474 done = 1;
1475 } while (!done);
1476
1477 return count;
1478}
1479
1480static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1481{
1482 struct filter_pred *pred;
1483 enum move_type move = MOVE_DOWN;
1484 int count = 0;
1485 int children;
1486 int done = 0;
1487
1488 /* No need to keep the fold flag */
1489 root->index &= ~FILTER_PRED_FOLD;
1490
1491 /* If the root is a leaf then do nothing */
1492 if (root->left == FILTER_PRED_INVALID)
1493 return 0;
1494
1495 /* count the children */
1496 children = count_leafs(preds, &preds[root->left]);
1497 children += count_leafs(preds, &preds[root->right]);
1498
1499 root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL);
1500 if (!root->ops)
1501 return -ENOMEM;
1502
1503 root->val = children;
1504
1505 pred = root;
1506 do {
1507 switch (move) {
1508 case MOVE_DOWN:
1509 if (pred->left != FILTER_PRED_INVALID) {
1510 pred = &preds[pred->left];
1511 continue;
1512 }
1513 if (WARN_ON(count == children))
1514 return -EINVAL;
1515 pred->index &= ~FILTER_PRED_FOLD;
1516 root->ops[count++] = pred->index;
1517 pred = get_pred_parent(pred, preds,
1518 pred->parent, &move);
1519 continue;
1520 case MOVE_UP_FROM_LEFT:
1521 pred = &preds[pred->right];
1522 move = MOVE_DOWN;
1523 continue;
1524 case MOVE_UP_FROM_RIGHT:
1525 if (pred == root)
1526 break;
1527 pred = get_pred_parent(pred, preds,
1528 pred->parent, &move);
1529 continue;
1530 }
1531 done = 1;
1532 } while (!done);
1533
1534 return 0;
1535}
1536
1537/*
1538 * To optimize the processing of the ops, if we have several "ors" or
1539 * "ands" together, we can put them in an array and process them all
 1540 * together, speeding up the filter logic.
1541 */
1542static int fold_pred_tree(struct event_filter *filter,
1543 struct filter_pred *root)
1544{
1545 struct filter_pred *preds;
1546 struct filter_pred *pred;
1547 enum move_type move = MOVE_DOWN;
1548 int done = 0;
1549 int err;
1550
1551 preds = filter->preds;
1552 if (!preds)
1553 return -EINVAL;
1554 pred = root;
1555
1556 do {
1557 switch (move) {
1558 case MOVE_DOWN:
1559 if (pred->index & FILTER_PRED_FOLD) {
1560 err = fold_pred(preds, pred);
1561 if (err)
1562 return err;
 1563 /* Folded nodes are like leaves */
1564 } else if (pred->left != FILTER_PRED_INVALID) {
1565 pred = &preds[pred->left];
1566 continue;
1567 }
1568
1569 /* A leaf at the root is just a leaf in the tree */
1570 if (pred == root)
1571 break;
1572 pred = get_pred_parent(pred, preds,
1573 pred->parent, &move);
1574 continue;
1575 case MOVE_UP_FROM_LEFT:
1576 pred = &preds[pred->right];
1577 move = MOVE_DOWN;
1578 continue;
1579 case MOVE_UP_FROM_RIGHT:
1580 if (pred == root)
1581 break;
1582 pred = get_pred_parent(pred, preds,
1583 pred->parent, &move);
1584 continue;
1585 }
1586 done = 1;
1587 } while (!done);
1588
1589 return 0;
1590}
1591
1190static int replace_preds(struct ftrace_event_call *call, 1592static int replace_preds(struct ftrace_event_call *call,
1191 struct event_filter *filter, 1593 struct event_filter *filter,
1192 struct filter_parse_state *ps, 1594 struct filter_parse_state *ps,
@@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call,
1195{ 1597{
1196 char *operand1 = NULL, *operand2 = NULL; 1598 char *operand1 = NULL, *operand2 = NULL;
1197 struct filter_pred *pred; 1599 struct filter_pred *pred;
1600 struct filter_pred *root;
1198 struct postfix_elt *elt; 1601 struct postfix_elt *elt;
1602 struct pred_stack stack = { }; /* init to NULL */
1199 int err; 1603 int err;
1200 int n_preds = 0; 1604 int n_preds = 0;
1201 1605
1606 n_preds = count_preds(ps);
1607 if (n_preds >= MAX_FILTER_PRED) {
1608 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1609 return -ENOSPC;
1610 }
1611
1202 err = check_preds(ps); 1612 err = check_preds(ps);
1203 if (err) 1613 if (err)
1204 return err; 1614 return err;
1205 1615
1616 if (!dry_run) {
1617 err = __alloc_pred_stack(&stack, n_preds);
1618 if (err)
1619 return err;
1620 err = __alloc_preds(filter, n_preds);
1621 if (err)
1622 goto fail;
1623 }
1624
1625 n_preds = 0;
1206 list_for_each_entry(elt, &ps->postfix, list) { 1626 list_for_each_entry(elt, &ps->postfix, list) {
1207 if (elt->op == OP_NONE) { 1627 if (elt->op == OP_NONE) {
1208 if (!operand1) 1628 if (!operand1)
@@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call,
1211 operand2 = elt->operand; 1631 operand2 = elt->operand;
1212 else { 1632 else {
1213 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); 1633 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
1214 return -EINVAL; 1634 err = -EINVAL;
1635 goto fail;
1215 } 1636 }
1216 continue; 1637 continue;
1217 } 1638 }
1218 1639
1219 if (n_preds++ == MAX_FILTER_PRED) { 1640 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
1220 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 1641 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1221 return -ENOSPC; 1642 err = -ENOSPC;
1643 goto fail;
1222 } 1644 }
1223 1645
1224 if (elt->op == OP_AND || elt->op == OP_OR) { 1646 if (elt->op == OP_AND || elt->op == OP_OR) {
@@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call,
1228 1650
1229 if (!operand1 || !operand2) { 1651 if (!operand1 || !operand2) {
1230 parse_error(ps, FILT_ERR_MISSING_FIELD, 0); 1652 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1231 return -EINVAL; 1653 err = -EINVAL;
1654 goto fail;
1232 } 1655 }
1233 1656
1234 pred = create_pred(elt->op, operand1, operand2); 1657 pred = create_pred(elt->op, operand1, operand2);
1235add_pred: 1658add_pred:
1236 if (!pred) 1659 if (!pred) {
1237 return -ENOMEM; 1660 err = -ENOMEM;
1238 err = filter_add_pred(ps, call, filter, pred, dry_run); 1661 goto fail;
1662 }
1663 err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
1239 filter_free_pred(pred); 1664 filter_free_pred(pred);
1240 if (err) 1665 if (err)
1241 return err; 1666 goto fail;
1242 1667
1243 operand1 = operand2 = NULL; 1668 operand1 = operand2 = NULL;
1244 } 1669 }
1245 1670
1246 return 0; 1671 if (!dry_run) {
1672 /* We should have one item left on the stack */
1673 pred = __pop_pred_stack(&stack);
1674 if (!pred)
1675 return -EINVAL;
1676 /* This item is where we start from in matching */
1677 root = pred;
1678 /* Make sure the stack is empty */
1679 pred = __pop_pred_stack(&stack);
1680 if (WARN_ON(pred)) {
1681 err = -EINVAL;
1682 filter->root = NULL;
1683 goto fail;
1684 }
1685 err = check_pred_tree(filter, root);
1686 if (err)
1687 goto fail;
1688
1689 /* Optimize the tree */
1690 err = fold_pred_tree(filter, root);
1691 if (err)
1692 goto fail;
1693
1694 /* We don't set root until we know it works */
1695 barrier();
1696 filter->root = root;
1697 }
1698
1699 err = 0;
1700fail:
1701 __free_pred_stack(&stack);
1702 return err;
1247} 1703}
1248 1704
1705struct filter_list {
1706 struct list_head list;
1707 struct event_filter *filter;
1708};
1709
1249static int replace_system_preds(struct event_subsystem *system, 1710static int replace_system_preds(struct event_subsystem *system,
1250 struct filter_parse_state *ps, 1711 struct filter_parse_state *ps,
1251 char *filter_string) 1712 char *filter_string)
1252{ 1713{
1253 struct ftrace_event_call *call; 1714 struct ftrace_event_call *call;
1715 struct filter_list *filter_item;
1716 struct filter_list *tmp;
1717 LIST_HEAD(filter_list);
1254 bool fail = true; 1718 bool fail = true;
1255 int err; 1719 int err;
1256 1720
1257 list_for_each_entry(call, &ftrace_events, list) { 1721 list_for_each_entry(call, &ftrace_events, list) {
1258 struct event_filter *filter = call->filter;
1259 1722
1260 if (strcmp(call->class->system, system->name) != 0) 1723 if (strcmp(call->class->system, system->name) != 0)
1261 continue; 1724 continue;
1262 1725
1263 /* try to see if the filter can be applied */ 1726 /*
1264 err = replace_preds(call, filter, ps, filter_string, true); 1727 * Try to see if the filter can be applied
1728 * (filter arg is ignored on dry_run)
1729 */
1730 err = replace_preds(call, NULL, ps, filter_string, true);
1265 if (err) 1731 if (err)
1732 goto fail;
1733 }
1734
1735 list_for_each_entry(call, &ftrace_events, list) {
1736 struct event_filter *filter;
1737
1738 if (strcmp(call->class->system, system->name) != 0)
1266 continue; 1739 continue;
1267 1740
1268 /* really apply the filter */ 1741 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
1269 filter_disable_preds(call); 1742 if (!filter_item)
1270 err = replace_preds(call, filter, ps, filter_string, false); 1743 goto fail_mem;
1744
1745 list_add_tail(&filter_item->list, &filter_list);
1746
1747 filter_item->filter = __alloc_filter();
1748 if (!filter_item->filter)
1749 goto fail_mem;
1750 filter = filter_item->filter;
1751
1752 /* Can only fail on no memory */
1753 err = replace_filter_string(filter, filter_string);
1271 if (err) 1754 if (err)
1272 filter_disable_preds(call); 1755 goto fail_mem;
1273 else { 1756
1757 err = replace_preds(call, filter, ps, filter_string, false);
1758 if (err) {
1759 filter_disable(call);
1760 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1761 append_filter_err(ps, filter);
1762 } else
1274 call->flags |= TRACE_EVENT_FL_FILTERED; 1763 call->flags |= TRACE_EVENT_FL_FILTERED;
1275 replace_filter_string(filter, filter_string); 1764 /*
 1276 } 1765 * Regardless of whether this returned an error, we still
1766 * replace the filter for the call.
1767 */
1768 filter = call->filter;
1769 call->filter = filter_item->filter;
1770 filter_item->filter = filter;
1771
1277 fail = false; 1772 fail = false;
1278 } 1773 }
1279 1774
1280 if (fail) { 1775 if (fail)
1281 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1776 goto fail;
1282 return -EINVAL; 1777
1778 /*
1779 * The calls can still be using the old filters.
1780 * Do a synchronize_sched() to ensure all calls are
1781 * done with them before we free them.
1782 */
1783 synchronize_sched();
1784 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1785 __free_filter(filter_item->filter);
1786 list_del(&filter_item->list);
1787 kfree(filter_item);
1283 } 1788 }
1284 return 0; 1789 return 0;
1790 fail:
1791 /* No call succeeded */
1792 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1793 list_del(&filter_item->list);
1794 kfree(filter_item);
1795 }
1796 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1797 return -EINVAL;
1798 fail_mem:
1799 /* If any call succeeded, we still need to sync */
1800 if (!fail)
1801 synchronize_sched();
1802 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1803 __free_filter(filter_item->filter);
1804 list_del(&filter_item->list);
1805 kfree(filter_item);
1806 }
1807 return -ENOMEM;
1285} 1808}
1286 1809
1287int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1810int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1288{ 1811{
1289 int err;
1290 struct filter_parse_state *ps; 1812 struct filter_parse_state *ps;
1813 struct event_filter *filter;
1814 struct event_filter *tmp;
1815 int err = 0;
1291 1816
1292 mutex_lock(&event_mutex); 1817 mutex_lock(&event_mutex);
1293 1818
1294 err = init_preds(call);
1295 if (err)
1296 goto out_unlock;
1297
1298 if (!strcmp(strstrip(filter_string), "0")) { 1819 if (!strcmp(strstrip(filter_string), "0")) {
1299 filter_disable_preds(call); 1820 filter_disable(call);
1300 remove_filter_string(call->filter); 1821 filter = call->filter;
1822 if (!filter)
1823 goto out_unlock;
1824 call->filter = NULL;
1825 /* Make sure the filter is not being used */
1826 synchronize_sched();
1827 __free_filter(filter);
1301 goto out_unlock; 1828 goto out_unlock;
1302 } 1829 }
1303 1830
@@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1306 if (!ps) 1833 if (!ps)
1307 goto out_unlock; 1834 goto out_unlock;
1308 1835
1309 filter_disable_preds(call); 1836 filter = __alloc_filter();
1310 replace_filter_string(call->filter, filter_string); 1837 if (!filter) {
1838 kfree(ps);
1839 goto out_unlock;
1840 }
1841
1842 replace_filter_string(filter, filter_string);
1311 1843
1312 parse_init(ps, filter_ops, filter_string); 1844 parse_init(ps, filter_ops, filter_string);
1313 err = filter_parse(ps); 1845 err = filter_parse(ps);
1314 if (err) { 1846 if (err) {
1315 append_filter_err(ps, call->filter); 1847 append_filter_err(ps, filter);
1316 goto out; 1848 goto out;
1317 } 1849 }
1318 1850
1319 err = replace_preds(call, call->filter, ps, filter_string, false); 1851 err = replace_preds(call, filter, ps, filter_string, false);
1320 if (err) 1852 if (err) {
1321 append_filter_err(ps, call->filter); 1853 filter_disable(call);
1322 else 1854 append_filter_err(ps, filter);
1855 } else
1323 call->flags |= TRACE_EVENT_FL_FILTERED; 1856 call->flags |= TRACE_EVENT_FL_FILTERED;
1324out: 1857out:
1858 /*
1859 * Always swap the call filter with the new filter
1860 * even if there was an error. If there was an error
1861 * in the filter, we disable the filter and show the error
1862 * string
1863 */
1864 tmp = call->filter;
1865 call->filter = filter;
1866 if (tmp) {
1867 /* Make sure the call is done with the filter */
1868 synchronize_sched();
1869 __free_filter(tmp);
1870 }
1325 filter_opstack_clear(ps); 1871 filter_opstack_clear(ps);
1326 postfix_clear(ps); 1872 postfix_clear(ps);
1327 kfree(ps); 1873 kfree(ps);
@@ -1334,18 +1880,21 @@ out_unlock:
1334int apply_subsystem_event_filter(struct event_subsystem *system, 1880int apply_subsystem_event_filter(struct event_subsystem *system,
1335 char *filter_string) 1881 char *filter_string)
1336{ 1882{
1337 int err;
1338 struct filter_parse_state *ps; 1883 struct filter_parse_state *ps;
1884 struct event_filter *filter;
1885 int err = 0;
1339 1886
1340 mutex_lock(&event_mutex); 1887 mutex_lock(&event_mutex);
1341 1888
1342 err = init_subsystem_preds(system);
1343 if (err)
1344 goto out_unlock;
1345
1346 if (!strcmp(strstrip(filter_string), "0")) { 1889 if (!strcmp(strstrip(filter_string), "0")) {
1347 filter_free_subsystem_preds(system); 1890 filter_free_subsystem_preds(system);
1348 remove_filter_string(system->filter); 1891 remove_filter_string(system->filter);
1892 filter = system->filter;
1893 system->filter = NULL;
1894 /* Ensure all filters are no longer used */
1895 synchronize_sched();
1896 filter_free_subsystem_filters(system);
1897 __free_filter(filter);
1349 goto out_unlock; 1898 goto out_unlock;
1350 } 1899 }
1351 1900
@@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1354 if (!ps) 1903 if (!ps)
1355 goto out_unlock; 1904 goto out_unlock;
1356 1905
1357 replace_filter_string(system->filter, filter_string); 1906 filter = __alloc_filter();
1907 if (!filter)
1908 goto out;
1909
1910 replace_filter_string(filter, filter_string);
1911 /*
 1912 * No event actually uses the system filter;
1913 * we can free it without synchronize_sched().
1914 */
1915 __free_filter(system->filter);
1916 system->filter = filter;
1358 1917
1359 parse_init(ps, filter_ops, filter_string); 1918 parse_init(ps, filter_ops, filter_string);
1360 err = filter_parse(ps); 1919 err = filter_parse(ps);
@@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event)
1384 struct event_filter *filter = event->filter; 1943 struct event_filter *filter = event->filter;
1385 1944
1386 event->filter = NULL; 1945 event->filter = NULL;
1387 __free_preds(filter); 1946 __free_filter(filter);
1388} 1947}
1389 1948
1390int ftrace_profile_set_filter(struct perf_event *event, int event_id, 1949int ftrace_profile_set_filter(struct perf_event *event, int event_id,
@@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1410 if (event->filter) 1969 if (event->filter)
1411 goto out_unlock; 1970 goto out_unlock;
1412 1971
1413 filter = __alloc_preds(); 1972 filter = __alloc_filter();
1414 if (IS_ERR(filter)) { 1973 if (!filter) {
1415 err = PTR_ERR(filter); 1974 err = PTR_ERR(filter);
1416 goto out_unlock; 1975 goto out_unlock;
1417 } 1976 }
@@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1419 err = -ENOMEM; 1978 err = -ENOMEM;
1420 ps = kzalloc(sizeof(*ps), GFP_KERNEL); 1979 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1421 if (!ps) 1980 if (!ps)
1422 goto free_preds; 1981 goto free_filter;
1423 1982
1424 parse_init(ps, filter_ops, filter_str); 1983 parse_init(ps, filter_ops, filter_str);
1425 err = filter_parse(ps); 1984 err = filter_parse(ps);
@@ -1435,9 +1994,9 @@ free_ps:
1435 postfix_clear(ps); 1994 postfix_clear(ps);
1436 kfree(ps); 1995 kfree(ps);
1437 1996
1438free_preds: 1997free_filter:
1439 if (err) 1998 if (err)
1440 __free_preds(filter); 1999 __free_filter(filter);
1441 2000
1442out_unlock: 2001out_unlock:
1443 mutex_unlock(&event_mutex); 2002 mutex_unlock(&event_mutex);
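Editor's note on the predicate rework above: the fold_pred()/fold_pred_tree() additions gather runs of ORed (or ANDed) leaves into a per-node ops[] array so they can be scanned linearly instead of walked branch by branch. Below is a minimal, standalone sketch of that evaluation idea; the names (my_pred, my_process_ops) and the pre-computed child_results array are invented for illustration and are not the structures used in kernel/trace/trace_events_filter.c.

/*
 * Minimal sketch only -- not kernel code.  A "folded" node keeps its
 * children's results in an array; OR scans until the first hit, AND
 * scans until the first miss.
 */
#include <stdio.h>

struct my_pred {
	int op_is_or;              /* 1 = folded OR node, 0 = folded AND node */
	int nr_children;
	const int *child_results;  /* pretend leaf matches, already computed */
};

static int my_process_ops(const struct my_pred *pred)
{
	int match = !pred->op_is_or;    /* AND starts true, OR starts false */
	int i;

	for (i = 0; i < pred->nr_children; i++) {
		match = pred->child_results[i];
		/* OR: first true child wins; AND: first false child loses. */
		if (!!match == pred->op_is_or)
			break;
	}
	return match;
}

int main(void)
{
	const int leaves[] = { 0, 0, 1, 0 };
	struct my_pred p = { 1, 4, leaves };

	printf("folded OR matches: %d\n", my_process_ops(&p));  /* prints 1 */
	return 0;
}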
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 16aee4d44e8f..8d0e1cc4e974 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -149,11 +149,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
149static struct ftrace_ops trace_ops __read_mostly = 149static struct ftrace_ops trace_ops __read_mostly =
150{ 150{
151 .func = function_trace_call, 151 .func = function_trace_call,
152 .flags = FTRACE_OPS_FL_GLOBAL,
152}; 153};
153 154
154static struct ftrace_ops trace_stack_ops __read_mostly = 155static struct ftrace_ops trace_stack_ops __read_mostly =
155{ 156{
156 .func = function_stack_trace_call, 157 .func = function_stack_trace_call,
158 .flags = FTRACE_OPS_FL_GLOBAL,
157}; 159};
158 160
159/* Our two options */ 161/* Our two options */
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 76b05980225c..962cdb24ed81 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -905,7 +905,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
905 * 905 *
906 * returns 1 if 906 * returns 1 if
907 * - we are inside irq code 907 * - we are inside irq code
908 * - we just extered irq code 908 * - we just entered irq code
909 * 909 *
 910 * returns 0 if 910 * returns 0 if
911 * - funcgraph-interrupts option is set 911 * - funcgraph-interrupts option is set
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 92b6e1e12d98..c77424be284d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -80,7 +80,7 @@ static struct tracer_flags tracer_flags = {
80 * skip the latency if the sequence has changed - some other section 80 * skip the latency if the sequence has changed - some other section
81 * did a maximum and could disturb our measurement with serial console 81 * did a maximum and could disturb our measurement with serial console
82 * printouts, etc. Truly coinciding maximum latencies should be rare 82 * printouts, etc. Truly coinciding maximum latencies should be rare
83 * and what happens together happens separately as well, so this doesnt 83 * and what happens together happens separately as well, so this doesn't
84 * decrease the validity of the maximum found: 84 * decrease the validity of the maximum found:
85 */ 85 */
86static __cacheline_aligned_in_smp unsigned long max_sequence; 86static __cacheline_aligned_in_smp unsigned long max_sequence;
@@ -153,6 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
153static struct ftrace_ops trace_ops __read_mostly = 153static struct ftrace_ops trace_ops __read_mostly =
154{ 154{
155 .func = irqsoff_tracer_call, 155 .func = irqsoff_tracer_call,
156 .flags = FTRACE_OPS_FL_GLOBAL,
156}; 157};
157#endif /* CONFIG_FUNCTION_TRACER */ 158#endif /* CONFIG_FUNCTION_TRACER */
158 159
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2dec9bcde8b4..f925c45f0afa 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -53,7 +53,6 @@ const char *reserved_field_names[] = {
53 "common_preempt_count", 53 "common_preempt_count",
54 "common_pid", 54 "common_pid",
55 "common_tgid", 55 "common_tgid",
56 "common_lock_depth",
57 FIELD_STRING_IP, 56 FIELD_STRING_IP,
58 FIELD_STRING_RETIP, 57 FIELD_STRING_RETIP,
59 FIELD_STRING_FUNC, 58 FIELD_STRING_FUNC,
@@ -353,6 +352,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
353 kfree(data); 352 kfree(data);
354} 353}
355 354
355/* Bitfield fetch function */
356struct bitfield_fetch_param {
357 struct fetch_param orig;
358 unsigned char hi_shift;
359 unsigned char low_shift;
360};
361
362#define DEFINE_FETCH_bitfield(type) \
363static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
364 void *data, void *dest) \
365{ \
366 struct bitfield_fetch_param *bprm = data; \
367 type buf = 0; \
368 call_fetch(&bprm->orig, regs, &buf); \
369 if (buf) { \
370 buf <<= bprm->hi_shift; \
371 buf >>= bprm->low_shift; \
372 } \
373 *(type *)dest = buf; \
374}
375DEFINE_BASIC_FETCH_FUNCS(bitfield)
376#define fetch_bitfield_string NULL
377#define fetch_bitfield_string_size NULL
378
379static __kprobes void
380free_bitfield_fetch_param(struct bitfield_fetch_param *data)
381{
382 /*
383 * Don't check the bitfield itself, because this must be the
384 * last fetch function.
385 */
386 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
387 free_deref_fetch_param(data->orig.data);
388 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
389 free_symbol_cache(data->orig.data);
390 kfree(data);
391}
356/* Default (unsigned long) fetch type */ 392/* Default (unsigned long) fetch type */
357#define __DEFAULT_FETCH_TYPE(t) u##t 393#define __DEFAULT_FETCH_TYPE(t) u##t
358#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 394#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
@@ -367,6 +403,7 @@ enum {
367 FETCH_MTD_memory, 403 FETCH_MTD_memory,
368 FETCH_MTD_symbol, 404 FETCH_MTD_symbol,
369 FETCH_MTD_deref, 405 FETCH_MTD_deref,
406 FETCH_MTD_bitfield,
370 FETCH_MTD_END, 407 FETCH_MTD_END,
371}; 408};
372 409
@@ -387,6 +424,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \
387ASSIGN_FETCH_FUNC(memory, ftype), \ 424ASSIGN_FETCH_FUNC(memory, ftype), \
388ASSIGN_FETCH_FUNC(symbol, ftype), \ 425ASSIGN_FETCH_FUNC(symbol, ftype), \
389ASSIGN_FETCH_FUNC(deref, ftype), \ 426ASSIGN_FETCH_FUNC(deref, ftype), \
427ASSIGN_FETCH_FUNC(bitfield, ftype), \
390 } \ 428 } \
391 } 429 }
392 430
@@ -430,9 +468,33 @@ static const struct fetch_type *find_fetch_type(const char *type)
430 if (!type) 468 if (!type)
431 type = DEFAULT_FETCH_TYPE_STR; 469 type = DEFAULT_FETCH_TYPE_STR;
432 470
471 /* Special case: bitfield */
472 if (*type == 'b') {
473 unsigned long bs;
474 type = strchr(type, '/');
475 if (!type)
476 goto fail;
477 type++;
478 if (strict_strtoul(type, 0, &bs))
479 goto fail;
480 switch (bs) {
481 case 8:
482 return find_fetch_type("u8");
483 case 16:
484 return find_fetch_type("u16");
485 case 32:
486 return find_fetch_type("u32");
487 case 64:
488 return find_fetch_type("u64");
489 default:
490 goto fail;
491 }
492 }
493
433 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) 494 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
434 if (strcmp(type, fetch_type_table[i].name) == 0) 495 if (strcmp(type, fetch_type_table[i].name) == 0)
435 return &fetch_type_table[i]; 496 return &fetch_type_table[i];
497fail:
436 return NULL; 498 return NULL;
437} 499}
438 500
@@ -586,7 +648,9 @@ error:
586 648
587static void free_probe_arg(struct probe_arg *arg) 649static void free_probe_arg(struct probe_arg *arg)
588{ 650{
589 if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) 651 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
652 free_bitfield_fetch_param(arg->fetch.data);
653 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
590 free_deref_fetch_param(arg->fetch.data); 654 free_deref_fetch_param(arg->fetch.data);
591 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) 655 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
592 free_symbol_cache(arg->fetch.data); 656 free_symbol_cache(arg->fetch.data);
@@ -767,16 +831,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
767 } 831 }
768 break; 832 break;
769 case '+': /* deref memory */ 833 case '+': /* deref memory */
834 arg++; /* Skip '+', because strict_strtol() rejects it. */
770 case '-': 835 case '-':
771 tmp = strchr(arg, '('); 836 tmp = strchr(arg, '(');
772 if (!tmp) 837 if (!tmp)
773 break; 838 break;
774 *tmp = '\0'; 839 *tmp = '\0';
775 ret = strict_strtol(arg + 1, 0, &offset); 840 ret = strict_strtol(arg, 0, &offset);
776 if (ret) 841 if (ret)
777 break; 842 break;
778 if (arg[0] == '-')
779 offset = -offset;
780 arg = tmp + 1; 843 arg = tmp + 1;
781 tmp = strrchr(arg, ')'); 844 tmp = strrchr(arg, ')');
782 if (tmp) { 845 if (tmp) {
@@ -807,6 +870,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
807 return ret; 870 return ret;
808} 871}
809 872
873#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
874
875/* Bitfield type needs to be parsed into a fetch function */
876static int __parse_bitfield_probe_arg(const char *bf,
877 const struct fetch_type *t,
878 struct fetch_param *f)
879{
880 struct bitfield_fetch_param *bprm;
881 unsigned long bw, bo;
882 char *tail;
883
884 if (*bf != 'b')
885 return 0;
886
887 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
888 if (!bprm)
889 return -ENOMEM;
890 bprm->orig = *f;
891 f->fn = t->fetch[FETCH_MTD_bitfield];
892 f->data = (void *)bprm;
893
894 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
895 if (bw == 0 || *tail != '@')
896 return -EINVAL;
897
898 bf = tail + 1;
899 bo = simple_strtoul(bf, &tail, 0);
900 if (tail == bf || *tail != '/')
901 return -EINVAL;
902
903 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
904 bprm->low_shift = bprm->hi_shift + bo;
905 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
906}
907
810/* String length checking wrapper */ 908/* String length checking wrapper */
811static int parse_probe_arg(char *arg, struct trace_probe *tp, 909static int parse_probe_arg(char *arg, struct trace_probe *tp,
812 struct probe_arg *parg, int is_return) 910 struct probe_arg *parg, int is_return)
@@ -836,6 +934,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
836 parg->offset = tp->size; 934 parg->offset = tp->size;
837 tp->size += parg->type->size; 935 tp->size += parg->type->size;
838 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); 936 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
937 if (ret >= 0 && t != NULL)
938 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
839 if (ret >= 0) { 939 if (ret >= 0) {
840 parg->fetch_size.fn = get_fetch_size_function(parg->type, 940 parg->fetch_size.fn = get_fetch_size_function(parg->type,
841 parg->fetch.fn); 941 parg->fetch.fn);
@@ -1130,7 +1230,7 @@ static int command_trace_probe(const char *buf)
1130 return ret; 1230 return ret;
1131} 1231}
1132 1232
1133#define WRITE_BUFSIZE 128 1233#define WRITE_BUFSIZE 4096
1134 1234
1135static ssize_t probes_write(struct file *file, const char __user *buffer, 1235static ssize_t probes_write(struct file *file, const char __user *buffer,
1136 size_t count, loff_t *ppos) 1236 size_t count, loff_t *ppos)
@@ -1738,7 +1838,7 @@ static void unregister_probe_event(struct trace_probe *tp)
1738 kfree(tp->call.print_fmt); 1838 kfree(tp->call.print_fmt);
1739} 1839}
1740 1840
1741/* Make a debugfs interface for controling probe points */ 1841/* Make a debugfs interface for controlling probe points */
1742static __init int init_kprobe_trace(void) 1842static __init int init_kprobe_trace(void)
1743{ 1843{
1744 struct dentry *d_tracer; 1844 struct dentry *d_tracer;
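Editor's note on the kprobe bitfield support above: a "b<width>@<offset>/<container-size>" argument is reduced to a pair of shifts (hi_shift drops the bits above the field, low_shift then drops the bits below it). A standalone sketch of that arithmetic follows; the helper name and the assumption that the offset counts from the least significant bit of a 32-bit container are mine, for illustration only.

/* Sketch only: the shift pair computed by __parse_bitfield_probe_arg(),
 * applied to a plain 32-bit value.  Not kernel code. */
#include <stdint.h>
#include <stdio.h>

static uint32_t fetch_bitfield32(uint32_t raw, unsigned int bw, unsigned int bo)
{
	unsigned int hi_shift  = 32 - (bw + bo); /* discard bits above the field */
	unsigned int low_shift = hi_shift + bo;  /* then discard bits below it   */

	return (raw << hi_shift) >> low_shift;
}

int main(void)
{
	/* 0xf4 = 1111 0100b; the 3-bit field at bit offset 4 is 111b = 7 */
	printf("field = %u\n", fetch_bitfield32(0xf4, 3, 4));
	return 0;
}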
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 02272baa2206..cf535ccedc86 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -529,24 +529,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
529 * @entry: The trace entry field from the ring buffer 529 * @entry: The trace entry field from the ring buffer
530 * 530 *
531 * Prints the generic fields of irqs off, in hard or softirq, preempt 531 * Prints the generic fields of irqs off, in hard or softirq, preempt
532 * count and lock depth. 532 * count.
533 */ 533 */
534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) 534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
535{ 535{
536 int hardirq, softirq; 536 char hardsoft_irq;
537 char need_resched;
538 char irqs_off;
539 int hardirq;
540 int softirq;
537 int ret; 541 int ret;
538 542
539 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 543 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
540 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 544 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
541 545
546 irqs_off =
547 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
548 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
549 '.';
550 need_resched =
551 (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
552 hardsoft_irq =
553 (hardirq && softirq) ? 'H' :
554 hardirq ? 'h' :
555 softirq ? 's' :
556 '.';
557
542 if (!trace_seq_printf(s, "%c%c%c", 558 if (!trace_seq_printf(s, "%c%c%c",
543 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 559 irqs_off, need_resched, hardsoft_irq))
544 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
545 'X' : '.',
546 (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
547 'N' : '.',
548 (hardirq && softirq) ? 'H' :
549 hardirq ? 'h' : softirq ? 's' : '.'))
550 return 0; 560 return 0;
551 561
552 if (entry->preempt_count) 562 if (entry->preempt_count)
@@ -554,13 +564,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
554 else 564 else
555 ret = trace_seq_putc(s, '.'); 565 ret = trace_seq_putc(s, '.');
556 566
557 if (!ret) 567 return ret;
558 return 0;
559
560 if (entry->lock_depth < 0)
561 return trace_seq_putc(s, '.');
562
563 return trace_seq_printf(s, "%d", entry->lock_depth);
564} 568}
565 569
566static int 570static int
@@ -826,6 +830,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
826enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, 830enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
827 struct trace_event *event) 831 struct trace_event *event)
828{ 832{
833 if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type))
834 return TRACE_TYPE_PARTIAL_LINE;
835
829 return TRACE_TYPE_HANDLED; 836 return TRACE_TYPE_HANDLED;
830} 837}
831 838
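Editor's note on the trace_print_lat_fmt() cleanup above: the three latency columns are now assembled into named character variables before printing, and the lock-depth column is gone. Below is a tiny standalone rendering of the same mapping; the flag constants are invented for the example and the 'X' (IRQS_NOSUPPORT) case is omitted for brevity.

/* Illustrative only -- simplified from the hunk above. */
#include <stdio.h>

#define FLAG_IRQS_OFF      0x01
#define FLAG_NEED_RESCHED  0x02
#define FLAG_HARDIRQ       0x04
#define FLAG_SOFTIRQ       0x08

static void lat_chars(unsigned int flags, char out[4])
{
	int hardirq = flags & FLAG_HARDIRQ;
	int softirq = flags & FLAG_SOFTIRQ;

	out[0] = (flags & FLAG_IRQS_OFF) ? 'd' : '.';
	out[1] = (flags & FLAG_NEED_RESCHED) ? 'N' : '.';
	out[2] = (hardirq && softirq) ? 'H' : hardirq ? 'h' : softirq ? 's' : '.';
	out[3] = '\0';
}

int main(void)
{
	char buf[4];

	lat_chars(FLAG_IRQS_OFF | FLAG_SOFTIRQ, buf);
	printf("%s\n", buf);   /* prints "d.s" */
	return 0;
}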
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2547d8813cf0..dff763b7baf1 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -32,7 +32,7 @@ static DEFINE_MUTEX(btrace_mutex);
32 32
33struct trace_bprintk_fmt { 33struct trace_bprintk_fmt {
34 struct list_head list; 34 struct list_head list;
35 char fmt[0]; 35 const char *fmt;
36}; 36};
37 37
38static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) 38static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
@@ -49,6 +49,7 @@ static
49void hold_module_trace_bprintk_format(const char **start, const char **end) 49void hold_module_trace_bprintk_format(const char **start, const char **end)
50{ 50{
51 const char **iter; 51 const char **iter;
52 char *fmt;
52 53
53 mutex_lock(&btrace_mutex); 54 mutex_lock(&btrace_mutex);
54 for (iter = start; iter < end; iter++) { 55 for (iter = start; iter < end; iter++) {
@@ -58,14 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
58 continue; 59 continue;
59 } 60 }
60 61
61 tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) 62 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
62 + strlen(*iter) + 1, GFP_KERNEL); 63 if (tb_fmt)
63 if (tb_fmt) { 64 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
65 if (tb_fmt && fmt) {
64 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); 66 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
65 strcpy(tb_fmt->fmt, *iter); 67 strcpy(fmt, *iter);
68 tb_fmt->fmt = fmt;
66 *iter = tb_fmt->fmt; 69 *iter = tb_fmt->fmt;
67 } else 70 } else {
71 kfree(tb_fmt);
68 *iter = NULL; 72 *iter = NULL;
73 }
69 } 74 }
70 mutex_unlock(&btrace_mutex); 75 mutex_unlock(&btrace_mutex);
71} 76}
@@ -84,6 +89,76 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self,
84 return 0; 89 return 0;
85} 90}
86 91
92/*
93 * The debugfs/tracing/printk_formats file maps the addresses with
94 * the ASCII formats that are used in the bprintk events in the
95 * buffer. For userspace tools to be able to decode the events from
96 * the buffer, they need to be able to map the address with the format.
97 *
98 * The addresses of the bprintk formats are in their own section
 99 * __trace_printk_fmt. But for modules we copy them into a linked list.
100 * The code to print the formats and their addresses passes around the
101 * address of the fmt string. If the fmt address passed into the seq
102 * functions is within the kernel core __trace_printk_fmt section, then
103 * it simply uses the next pointer in the list.
104 *
105 * When the fmt pointer is outside the kernel core __trace_printk_fmt
 106 * section, then we need to read the linked list pointers. The trick is
107 * we pass the address of the string to the seq function just like
108 * we do for the kernel core formats. To get back the structure that
 110 * holds the format, we simply use container_of() and then go to the
110 * next format in the list.
111 */
112static const char **
113find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
114{
115 struct trace_bprintk_fmt *mod_fmt;
116
117 if (list_empty(&trace_bprintk_fmt_list))
118 return NULL;
119
120 /*
121 * v will point to the address of the fmt record from t_next
122 * v will be NULL from t_start.
123 * If this is the first pointer or called from start
124 * then we need to walk the list.
125 */
126 if (!v || start_index == *pos) {
127 struct trace_bprintk_fmt *p;
128
129 /* search the module list */
130 list_for_each_entry(p, &trace_bprintk_fmt_list, list) {
131 if (start_index == *pos)
132 return &p->fmt;
133 start_index++;
134 }
135 /* pos > index */
136 return NULL;
137 }
138
139 /*
140 * v points to the address of the fmt field in the mod list
141 * structure that holds the module print format.
142 */
143 mod_fmt = container_of(v, typeof(*mod_fmt), fmt);
144 if (mod_fmt->list.next == &trace_bprintk_fmt_list)
145 return NULL;
146
147 mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list);
148
149 return &mod_fmt->fmt;
150}
151
152static void format_mod_start(void)
153{
154 mutex_lock(&btrace_mutex);
155}
156
157static void format_mod_stop(void)
158{
159 mutex_unlock(&btrace_mutex);
160}
161
87#else /* !CONFIG_MODULES */ 162#else /* !CONFIG_MODULES */
88__init static int 163__init static int
89module_trace_bprintk_format_notify(struct notifier_block *self, 164module_trace_bprintk_format_notify(struct notifier_block *self,
@@ -91,6 +166,13 @@ module_trace_bprintk_format_notify(struct notifier_block *self,
91{ 166{
92 return 0; 167 return 0;
93} 168}
169static inline const char **
170find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
171{
172 return NULL;
173}
174static inline void format_mod_start(void) { }
175static inline void format_mod_stop(void) { }
94#endif /* CONFIG_MODULES */ 176#endif /* CONFIG_MODULES */
95 177
96 178
@@ -153,20 +235,33 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
153} 235}
154EXPORT_SYMBOL_GPL(__ftrace_vprintk); 236EXPORT_SYMBOL_GPL(__ftrace_vprintk);
155 237
238static const char **find_next(void *v, loff_t *pos)
239{
240 const char **fmt = v;
241 int start_index;
242
243 if (!fmt)
244 fmt = __start___trace_bprintk_fmt + *pos;
245
246 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
247
248 if (*pos < start_index)
249 return fmt;
250
251 return find_next_mod_format(start_index, v, fmt, pos);
252}
253
156static void * 254static void *
157t_start(struct seq_file *m, loff_t *pos) 255t_start(struct seq_file *m, loff_t *pos)
158{ 256{
159 const char **fmt = __start___trace_bprintk_fmt + *pos; 257 format_mod_start();
160 258 return find_next(NULL, pos);
161 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
162 return NULL;
163 return fmt;
164} 259}
165 260
166static void *t_next(struct seq_file *m, void * v, loff_t *pos) 261static void *t_next(struct seq_file *m, void * v, loff_t *pos)
167{ 262{
168 (*pos)++; 263 (*pos)++;
169 return t_start(m, pos); 264 return find_next(v, pos);
170} 265}
171 266
172static int t_show(struct seq_file *m, void *v) 267static int t_show(struct seq_file *m, void *v)
@@ -205,6 +300,7 @@ static int t_show(struct seq_file *m, void *v)
205 300
206static void t_stop(struct seq_file *m, void *p) 301static void t_stop(struct seq_file *m, void *p)
207{ 302{
303 format_mod_stop();
208} 304}
209 305
210static const struct seq_operations show_format_seq_ops = { 306static const struct seq_operations show_format_seq_ops = {
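Editor's note on the printk_formats change above: the seq_file iterator now walks the core __trace_printk_fmt section first and then the module list, always handing back the address of a format string so t_show() treats both sources identically. Here is a reduced, standalone sketch of that pointer trick; core_fmts[], struct mod_fmt and next_fmt() below are invented stand-ins, not the kernel symbols.

/* Sketch only -- simplified from the hunk above, with invented names. */
#include <stddef.h>
#include <stdio.h>

struct mod_fmt {
	struct mod_fmt *next;
	const char *fmt;
};

static const char *core_fmts[] = { "core fmt A\n", "core fmt B\n" };
static struct mod_fmt mod_b = { NULL,   "module fmt B\n" };
static struct mod_fmt mod_a = { &mod_b, "module fmt A\n" };
static struct mod_fmt *mod_list = &mod_a;

/* Return the address of the next format string: first the "section",
 * then the module list, so the caller handles both the same way. */
static const char **next_fmt(const char **v, size_t pos)
{
	size_t ncore = sizeof(core_fmts) / sizeof(core_fmts[0]);
	struct mod_fmt *m;

	if (pos < ncore)
		return &core_fmts[pos];
	if (!v || pos == ncore)         /* first entry from the module list */
		return mod_list ? &mod_list->fmt : NULL;

	/* Recover the record that holds this fmt field, then step forward. */
	m = (struct mod_fmt *)((char *)v - offsetof(struct mod_fmt, fmt));
	return m->next ? &m->next->fmt : NULL;
}

int main(void)
{
	const char **fmt = NULL;
	size_t pos;

	for (pos = 0; (fmt = next_fmt(fmt, pos)) != NULL; pos++)
		printf("%s", *fmt);
	return 0;
}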
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8f758d070c43..7e62c0a18456 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
247 ctx_trace = tr; 247 ctx_trace = tr;
248} 248}
249 249
250static void stop_sched_trace(struct trace_array *tr)
251{
252 tracing_stop_sched_switch_record();
253}
254
255static int sched_switch_trace_init(struct trace_array *tr)
256{
257 ctx_trace = tr;
258 tracing_reset_online_cpus(tr);
259 tracing_start_sched_switch_record();
260 return 0;
261}
262
263static void sched_switch_trace_reset(struct trace_array *tr)
264{
265 if (sched_ref)
266 stop_sched_trace(tr);
267}
268
269static void sched_switch_trace_start(struct trace_array *tr)
270{
271 sched_stopped = 0;
272}
273
274static void sched_switch_trace_stop(struct trace_array *tr)
275{
276 sched_stopped = 1;
277}
278
279static struct tracer sched_switch_trace __read_mostly =
280{
281 .name = "sched_switch",
282 .init = sched_switch_trace_init,
283 .reset = sched_switch_trace_reset,
284 .start = sched_switch_trace_start,
285 .stop = sched_switch_trace_stop,
286 .wait_pipe = poll_wait_pipe,
287#ifdef CONFIG_FTRACE_SELFTEST
288 .selftest = trace_selftest_startup_sched_switch,
289#endif
290};
291
292__init static int init_sched_switch_trace(void)
293{
294 return register_tracer(&sched_switch_trace);
295}
296device_initcall(init_sched_switch_trace);
297
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 7319559ed59f..f029dd4fd2ca 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -129,6 +129,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
129static struct ftrace_ops trace_ops __read_mostly = 129static struct ftrace_ops trace_ops __read_mostly =
130{ 130{
131 .func = wakeup_tracer_call, 131 .func = wakeup_tracer_call,
132 .flags = FTRACE_OPS_FL_GLOBAL,
132}; 133};
133#endif /* CONFIG_FUNCTION_TRACER */ 134#endif /* CONFIG_FUNCTION_TRACER */
134 135
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 659732eba07c..288541f977fb 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -101,6 +101,206 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
101 101
102#ifdef CONFIG_DYNAMIC_FTRACE 102#ifdef CONFIG_DYNAMIC_FTRACE
103 103
104static int trace_selftest_test_probe1_cnt;
105static void trace_selftest_test_probe1_func(unsigned long ip,
106 unsigned long pip)
107{
108 trace_selftest_test_probe1_cnt++;
109}
110
111static int trace_selftest_test_probe2_cnt;
112static void trace_selftest_test_probe2_func(unsigned long ip,
113 unsigned long pip)
114{
115 trace_selftest_test_probe2_cnt++;
116}
117
118static int trace_selftest_test_probe3_cnt;
119static void trace_selftest_test_probe3_func(unsigned long ip,
120 unsigned long pip)
121{
122 trace_selftest_test_probe3_cnt++;
123}
124
125static int trace_selftest_test_global_cnt;
126static void trace_selftest_test_global_func(unsigned long ip,
127 unsigned long pip)
128{
129 trace_selftest_test_global_cnt++;
130}
131
132static int trace_selftest_test_dyn_cnt;
133static void trace_selftest_test_dyn_func(unsigned long ip,
134 unsigned long pip)
135{
136 trace_selftest_test_dyn_cnt++;
137}
138
139static struct ftrace_ops test_probe1 = {
140 .func = trace_selftest_test_probe1_func,
141};
142
143static struct ftrace_ops test_probe2 = {
144 .func = trace_selftest_test_probe2_func,
145};
146
147static struct ftrace_ops test_probe3 = {
148 .func = trace_selftest_test_probe3_func,
149};
150
151static struct ftrace_ops test_global = {
152 .func = trace_selftest_test_global_func,
153 .flags = FTRACE_OPS_FL_GLOBAL,
154};
155
156static void print_counts(void)
157{
158 printk("(%d %d %d %d %d) ",
159 trace_selftest_test_probe1_cnt,
160 trace_selftest_test_probe2_cnt,
161 trace_selftest_test_probe3_cnt,
162 trace_selftest_test_global_cnt,
163 trace_selftest_test_dyn_cnt);
164}
165
166static void reset_counts(void)
167{
168 trace_selftest_test_probe1_cnt = 0;
169 trace_selftest_test_probe2_cnt = 0;
170 trace_selftest_test_probe3_cnt = 0;
171 trace_selftest_test_global_cnt = 0;
172 trace_selftest_test_dyn_cnt = 0;
173}
174
175static int trace_selftest_ops(int cnt)
176{
177 int save_ftrace_enabled = ftrace_enabled;
178 struct ftrace_ops *dyn_ops;
179 char *func1_name;
180 char *func2_name;
181 int len1;
182 int len2;
183 int ret = -1;
184
185 printk(KERN_CONT "PASSED\n");
186 pr_info("Testing dynamic ftrace ops #%d: ", cnt);
187
188 ftrace_enabled = 1;
189 reset_counts();
190
191 /* Handle PPC64 '.' name */
192 func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
193 func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2);
194 len1 = strlen(func1_name);
195 len2 = strlen(func2_name);
196
197 /*
198 * Probe 1 will trace function 1.
199 * Probe 2 will trace function 2.
200 * Probe 3 will trace functions 1 and 2.
201 */
202 ftrace_set_filter(&test_probe1, func1_name, len1, 1);
203 ftrace_set_filter(&test_probe2, func2_name, len2, 1);
204 ftrace_set_filter(&test_probe3, func1_name, len1, 1);
205 ftrace_set_filter(&test_probe3, func2_name, len2, 0);
206
207 register_ftrace_function(&test_probe1);
208 register_ftrace_function(&test_probe2);
209 register_ftrace_function(&test_probe3);
210 register_ftrace_function(&test_global);
211
212 DYN_FTRACE_TEST_NAME();
213
214 print_counts();
215
216 if (trace_selftest_test_probe1_cnt != 1)
217 goto out;
218 if (trace_selftest_test_probe2_cnt != 0)
219 goto out;
220 if (trace_selftest_test_probe3_cnt != 1)
221 goto out;
222 if (trace_selftest_test_global_cnt == 0)
223 goto out;
224
225 DYN_FTRACE_TEST_NAME2();
226
227 print_counts();
228
229 if (trace_selftest_test_probe1_cnt != 1)
230 goto out;
231 if (trace_selftest_test_probe2_cnt != 1)
232 goto out;
233 if (trace_selftest_test_probe3_cnt != 2)
234 goto out;
235
236 /* Add a dynamic probe */
237 dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL);
238 if (!dyn_ops) {
239 printk("MEMORY ERROR ");
240 goto out;
241 }
242
243 dyn_ops->func = trace_selftest_test_dyn_func;
244
245 register_ftrace_function(dyn_ops);
246
247 trace_selftest_test_global_cnt = 0;
248
249 DYN_FTRACE_TEST_NAME();
250
251 print_counts();
252
253 if (trace_selftest_test_probe1_cnt != 2)
254 goto out_free;
255 if (trace_selftest_test_probe2_cnt != 1)
256 goto out_free;
257 if (trace_selftest_test_probe3_cnt != 3)
258 goto out_free;
259 if (trace_selftest_test_global_cnt == 0)
260 goto out;
261 if (trace_selftest_test_dyn_cnt == 0)
262 goto out_free;
263
264 DYN_FTRACE_TEST_NAME2();
265
266 print_counts();
267
268 if (trace_selftest_test_probe1_cnt != 2)
269 goto out_free;
270 if (trace_selftest_test_probe2_cnt != 2)
271 goto out_free;
272 if (trace_selftest_test_probe3_cnt != 4)
273 goto out_free;
274
275 ret = 0;
276 out_free:
277 unregister_ftrace_function(dyn_ops);
278 kfree(dyn_ops);
279
280 out:
281 /* Purposely unregister in the same order */
282 unregister_ftrace_function(&test_probe1);
283 unregister_ftrace_function(&test_probe2);
284 unregister_ftrace_function(&test_probe3);
285 unregister_ftrace_function(&test_global);
286
287 /* Make sure everything is off */
288 reset_counts();
289 DYN_FTRACE_TEST_NAME();
290 DYN_FTRACE_TEST_NAME();
291
292 if (trace_selftest_test_probe1_cnt ||
293 trace_selftest_test_probe2_cnt ||
294 trace_selftest_test_probe3_cnt ||
295 trace_selftest_test_global_cnt ||
296 trace_selftest_test_dyn_cnt)
297 ret = -1;
298
299 ftrace_enabled = save_ftrace_enabled;
300
301 return ret;
302}
303
104/* Test dynamic code modification and ftrace filters */ 304/* Test dynamic code modification and ftrace filters */
105int trace_selftest_startup_dynamic_tracing(struct tracer *trace, 305int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
106 struct trace_array *tr, 306 struct trace_array *tr,
@@ -131,7 +331,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
131 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); 331 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
132 332
133 /* filter only on our function */ 333 /* filter only on our function */
134 ftrace_set_filter(func_name, strlen(func_name), 1); 334 ftrace_set_global_filter(func_name, strlen(func_name), 1);
135 335
136 /* enable tracing */ 336 /* enable tracing */
137 ret = tracer_init(trace, tr); 337 ret = tracer_init(trace, tr);
@@ -166,22 +366,30 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
166 366
167 /* check the trace buffer */ 367 /* check the trace buffer */
168 ret = trace_test_buffer(tr, &count); 368 ret = trace_test_buffer(tr, &count);
169 trace->reset(tr);
170 tracing_start(); 369 tracing_start();
171 370
172 /* we should only have one item */ 371 /* we should only have one item */
173 if (!ret && count != 1) { 372 if (!ret && count != 1) {
373 trace->reset(tr);
174 printk(KERN_CONT ".. filter failed count=%ld ..", count); 374 printk(KERN_CONT ".. filter failed count=%ld ..", count);
175 ret = -1; 375 ret = -1;
176 goto out; 376 goto out;
177 } 377 }
178 378
379 /* Test the ops with global tracing running */
380 ret = trace_selftest_ops(1);
381 trace->reset(tr);
382
179 out: 383 out:
180 ftrace_enabled = save_ftrace_enabled; 384 ftrace_enabled = save_ftrace_enabled;
181 tracer_enabled = save_tracer_enabled; 385 tracer_enabled = save_tracer_enabled;
182 386
183 /* Enable tracing on all functions again */ 387 /* Enable tracing on all functions again */
184 ftrace_set_filter(NULL, 0, 1); 388 ftrace_set_global_filter(NULL, 0, 1);
389
390 /* Test the ops with global tracing off */
391 if (!ret)
392 ret = trace_selftest_ops(2);
185 393
186 return ret; 394 return ret;
187} 395}
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
index 54dd77cce5bf..b4c475a0a48b 100644
--- a/kernel/trace/trace_selftest_dynamic.c
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -5,3 +5,9 @@ int DYN_FTRACE_TEST_NAME(void)
5 /* used to call mcount */ 5 /* used to call mcount */
6 return 0; 6 return 0;
7} 7}
8
9int DYN_FTRACE_TEST_NAME2(void)
10{
11 /* used to call mcount */
12 return 0;
13}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 4c5dead0c239..b0b53b8e4c25 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -133,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
133static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
134{ 134{
135 .func = stack_trace_call, 135 .func = stack_trace_call,
136 .flags = FTRACE_OPS_FL_GLOBAL,
136}; 137};
137 138
138static ssize_t 139static ssize_t
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5c9fe08d2093..ee7b5a0bb9f8 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[];
60 60
61static struct syscall_metadata **syscalls_metadata; 61static struct syscall_metadata **syscalls_metadata;
62 62
63#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
64static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
65{
66 /*
67 * Only compare after the "sys" prefix. Archs that use
 68 * syscall wrappers may have syscall symbol aliases prefixed
69 * with "SyS" instead of "sys", leading to an unwanted
70 * mismatch.
71 */
72 return !strcmp(sym + 3, name + 3);
73}
74#endif
75
63static __init struct syscall_metadata * 76static __init struct syscall_metadata *
64find_syscall_meta(unsigned long syscall) 77find_syscall_meta(unsigned long syscall)
65{ 78{
@@ -72,14 +85,11 @@ find_syscall_meta(unsigned long syscall)
72 stop = __stop_syscalls_metadata; 85 stop = __stop_syscalls_metadata;
73 kallsyms_lookup(syscall, NULL, NULL, NULL, str); 86 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
74 87
88 if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
89 return NULL;
90
75 for ( ; start < stop; start++) { 91 for ( ; start < stop; start++) {
76 /* 92 if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
77 * Only compare after the "sys" prefix. Archs that use
78 * syscall wrappers may have syscalls symbols aliases prefixed
79 * with "SyS" instead of "sys", leading to an unwanted
80 * mismatch.
81 */
82 if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
83 return *start; 93 return *start;
84 } 94 }
85 return NULL; 95 return NULL;
@@ -359,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
359 int num; 369 int num;
360 370
361 num = ((struct syscall_metadata *)call->data)->syscall_nr; 371 num = ((struct syscall_metadata *)call->data)->syscall_nr;
362 if (num < 0 || num >= NR_syscalls) 372 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
363 return -ENOSYS; 373 return -ENOSYS;
364 mutex_lock(&syscall_trace_lock); 374 mutex_lock(&syscall_trace_lock);
365 if (!sys_refcount_enter) 375 if (!sys_refcount_enter)
@@ -377,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
377 int num; 387 int num;
378 388
379 num = ((struct syscall_metadata *)call->data)->syscall_nr; 389 num = ((struct syscall_metadata *)call->data)->syscall_nr;
380 if (num < 0 || num >= NR_syscalls) 390 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
381 return; 391 return;
382 mutex_lock(&syscall_trace_lock); 392 mutex_lock(&syscall_trace_lock);
383 sys_refcount_enter--; 393 sys_refcount_enter--;
@@ -393,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
393 int num; 403 int num;
394 404
395 num = ((struct syscall_metadata *)call->data)->syscall_nr; 405 num = ((struct syscall_metadata *)call->data)->syscall_nr;
396 if (num < 0 || num >= NR_syscalls) 406 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
397 return -ENOSYS; 407 return -ENOSYS;
398 mutex_lock(&syscall_trace_lock); 408 mutex_lock(&syscall_trace_lock);
399 if (!sys_refcount_exit) 409 if (!sys_refcount_exit)
@@ -411,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
411 int num; 421 int num;
412 422
413 num = ((struct syscall_metadata *)call->data)->syscall_nr; 423 num = ((struct syscall_metadata *)call->data)->syscall_nr;
414 if (num < 0 || num >= NR_syscalls) 424 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
415 return; 425 return;
416 mutex_lock(&syscall_trace_lock); 426 mutex_lock(&syscall_trace_lock);
417 sys_refcount_exit--; 427 sys_refcount_exit--;
@@ -424,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
424int init_syscall_trace(struct ftrace_event_call *call) 434int init_syscall_trace(struct ftrace_event_call *call)
425{ 435{
426 int id; 436 int id;
437 int num;
438
439 num = ((struct syscall_metadata *)call->data)->syscall_nr;
440 if (num < 0 || num >= NR_syscalls) {
441 pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
442 ((struct syscall_metadata *)call->data)->name);
443 return -ENOSYS;
444 }
427 445
428 if (set_syscall_print_fmt(call) < 0) 446 if (set_syscall_print_fmt(call) < 0)
429 return -ENOMEM; 447 return -ENOMEM;
@@ -438,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call)
438 return id; 456 return id;
439} 457}
440 458
441unsigned long __init arch_syscall_addr(int nr) 459unsigned long __init __weak arch_syscall_addr(int nr)
442{ 460{
443 return (unsigned long)sys_call_table[nr]; 461 return (unsigned long)sys_call_table[nr];
444} 462}
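
A hedged two-file sketch (shown as one listing, names illustrative) of the __weak pattern applied to arch_syscall_addr() above: the generic weak definition is used unless an architecture links in a strong one:

/* generic.c: weak fallback, compiled everywhere */
unsigned long __attribute__((weak)) demo_syscall_addr(int nr)
{
	(void)nr;
	return 0;			/* generic table lookup would go here */
}

/* main.c: the caller does not care which definition it got */
#include <stdio.h>
unsigned long demo_syscall_addr(int nr);

int main(void)
{
	/* prints 0 unless an arch file links in a strong demo_syscall_addr() */
	printf("%lu\n", demo_syscall_addr(1));
	return 0;
}
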
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 68187af4889e..b219f1449c54 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -251,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
251{ 251{
252 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 252 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
253 253
254 if (elem->regfunc && !elem->state && active) 254 if (elem->regfunc && !jump_label_enabled(&elem->key) && active)
255 elem->regfunc(); 255 elem->regfunc();
256 else if (elem->unregfunc && elem->state && !active) 256 else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active)
257 elem->unregfunc(); 257 elem->unregfunc();
258 258
259 /* 259 /*
@@ -264,13 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
264 * is used. 264 * is used.
265 */ 265 */
266 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 266 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
267 if (!elem->state && active) { 267 if (active && !jump_label_enabled(&elem->key))
268 jump_label_enable(&elem->state); 268 jump_label_inc(&elem->key);
269 elem->state = active; 269 else if (!active && jump_label_enabled(&elem->key))
270 } else if (elem->state && !active) { 270 jump_label_dec(&elem->key);
271 jump_label_disable(&elem->state);
272 elem->state = active;
273 }
274} 271}
275 272
276/* 273/*
@@ -281,13 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
281 */ 278 */
282static void disable_tracepoint(struct tracepoint *elem) 279static void disable_tracepoint(struct tracepoint *elem)
283{ 280{
284 if (elem->unregfunc && elem->state) 281 if (elem->unregfunc && jump_label_enabled(&elem->key))
285 elem->unregfunc(); 282 elem->unregfunc();
286 283
287 if (elem->state) { 284 if (jump_label_enabled(&elem->key))
288 jump_label_disable(&elem->state); 285 jump_label_dec(&elem->key);
289 elem->state = 0;
290 }
291 rcu_assign_pointer(elem->funcs, NULL); 286 rcu_assign_pointer(elem->funcs, NULL);
292} 287}
293 288
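
A hedged user-space stand-in for the conversion above: the boolean elem->state becomes a counted jump-label key, so activation maps to jump_label_inc(), deactivation to jump_label_dec(), and the "is it on?" test to jump_label_enabled(); a plain counter mimics those semantics here (the real jump label patches branch sites at run time):

#include <stdio.h>

struct demo_key { int refcount; };

static int  demo_enabled(struct demo_key *k) { return k->refcount > 0; }
static void demo_inc(struct demo_key *k)     { k->refcount++; }
static void demo_dec(struct demo_key *k)     { if (k->refcount) k->refcount--; }

int main(void)
{
	struct demo_key key = { 0 };

	demo_inc(&key);					/* tracepoint activated   */
	printf("enabled=%d\n", demo_enabled(&key));	/* enabled=1              */
	demo_dec(&key);					/* tracepoint deactivated */
	printf("enabled=%d\n", demo_enabled(&key));	/* enabled=0              */
	return 0;
}
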
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 419209893d87..51c6e89e8619 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
189 struct group_info *group_info; 189 struct group_info *group_info;
190 int retval; 190 int retval;
191 191
192 if (!capable(CAP_SETGID)) 192 if (!nsown_capable(CAP_SETGID))
193 return -EPERM; 193 return -EPERM;
194 if ((unsigned)gidsetsize > NGROUPS_MAX) 194 if ((unsigned)gidsetsize > NGROUPS_MAX)
195 return -EINVAL; 195 return -EINVAL;
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index eb27fd3430a2..92cb706c7fc8 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -20,7 +20,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
20 20
21/* 21/*
22 * Removes a registered user return notifier. Must be called from atomic 22 * Removes a registered user return notifier. Must be called from atomic
23 * context, and from the same cpu registration occured in. 23 * context, and from the same cpu registration occurred in.
24 */ 24 */
25void user_return_notifier_unregister(struct user_return_notifier *urn) 25void user_return_notifier_unregister(struct user_return_notifier *urn)
26{ 26{
diff --git a/kernel/user.c b/kernel/user.c
index 5c598ca781df..9e03e9c1df8d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,9 +17,13 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19 19
20/*
21 * userns count is 1 for root user, 1 for init_uts_ns,
22 * and 1 for... ?
23 */
20struct user_namespace init_user_ns = { 24struct user_namespace init_user_ns = {
21 .kref = { 25 .kref = {
22 .refcount = ATOMIC_INIT(2), 26 .refcount = ATOMIC_INIT(3),
23 }, 27 },
24 .creator = &root_user, 28 .creator = &root_user,
25}; 29};
@@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep;
47 */ 51 */
48static DEFINE_SPINLOCK(uidhash_lock); 52static DEFINE_SPINLOCK(uidhash_lock);
49 53
50/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ 54/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
51struct user_struct root_user = { 55struct user_struct root_user = {
52 .__count = ATOMIC_INIT(2), 56 .__count = ATOMIC_INIT(2),
53 .processes = ATOMIC_INIT(1), 57 .processes = ATOMIC_INIT(1),
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 8a82b4b8ea52..bff131b9510a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,8 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/user_namespace.h>
18#include <linux/proc_fs.h>
17 19
18static struct uts_namespace *create_uts_ns(void) 20static struct uts_namespace *create_uts_ns(void)
19{ 21{
@@ -30,7 +32,8 @@ static struct uts_namespace *create_uts_ns(void)
30 * @old_ns: namespace to clone 32 * @old_ns: namespace to clone
31 * Return NULL on error (failure to kmalloc), new ns otherwise 33 * Return NULL on error (failure to kmalloc), new ns otherwise
32 */ 34 */
33static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) 35static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
36 struct uts_namespace *old_ns)
34{ 37{
35 struct uts_namespace *ns; 38 struct uts_namespace *ns;
36 39
@@ -40,6 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
40 43
41 down_read(&uts_sem); 44 down_read(&uts_sem);
42 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 45 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
46 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
43 up_read(&uts_sem); 47 up_read(&uts_sem);
44 return ns; 48 return ns;
45} 49}
@@ -50,8 +54,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
50 * utsname of this process won't be seen by parent, and vice 54 * utsname of this process won't be seen by parent, and vice
51 * versa. 55 * versa.
52 */ 56 */
53struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) 57struct uts_namespace *copy_utsname(unsigned long flags,
58 struct task_struct *tsk)
54{ 59{
60 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
55 struct uts_namespace *new_ns; 61 struct uts_namespace *new_ns;
56 62
57 BUG_ON(!old_ns); 63 BUG_ON(!old_ns);
@@ -60,7 +66,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol
60 if (!(flags & CLONE_NEWUTS)) 66 if (!(flags & CLONE_NEWUTS))
61 return old_ns; 67 return old_ns;
62 68
63 new_ns = clone_uts_ns(old_ns); 69 new_ns = clone_uts_ns(tsk, old_ns);
64 70
65 put_uts_ns(old_ns); 71 put_uts_ns(old_ns);
66 return new_ns; 72 return new_ns;
@@ -71,5 +77,44 @@ void free_uts_ns(struct kref *kref)
71 struct uts_namespace *ns; 77 struct uts_namespace *ns;
72 78
73 ns = container_of(kref, struct uts_namespace, kref); 79 ns = container_of(kref, struct uts_namespace, kref);
80 put_user_ns(ns->user_ns);
74 kfree(ns); 81 kfree(ns);
75} 82}
83
84static void *utsns_get(struct task_struct *task)
85{
86 struct uts_namespace *ns = NULL;
87 struct nsproxy *nsproxy;
88
89 rcu_read_lock();
90 nsproxy = task_nsproxy(task);
91 if (nsproxy) {
92 ns = nsproxy->uts_ns;
93 get_uts_ns(ns);
94 }
95 rcu_read_unlock();
96
97 return ns;
98}
99
100static void utsns_put(void *ns)
101{
102 put_uts_ns(ns);
103}
104
105static int utsns_install(struct nsproxy *nsproxy, void *ns)
106{
107 get_uts_ns(ns);
108 put_uts_ns(nsproxy->uts_ns);
109 nsproxy->uts_ns = ns;
110 return 0;
111}
112
113const struct proc_ns_operations utsns_operations = {
114 .name = "uts",
115 .type = CLONE_NEWUTS,
116 .get = utsns_get,
117 .put = utsns_put,
118 .install = utsns_install,
119};
120
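
A hedged sketch of the get/put/install shape behind the new utsns_operations above, reduced to a generic reference-counted object in user space; the names and malloc-based lifetime are illustrative only:

#include <stdio.h>
#include <stdlib.h>

struct demo_ns { int refcount; };

static struct demo_ns *demo_get(struct demo_ns *ns) { ns->refcount++; return ns; }

static void demo_put(struct demo_ns *ns)
{
	if (--ns->refcount == 0)
		free(ns);
}

/* install: take a reference on the new object, drop the old one */
static void demo_install(struct demo_ns **slot, struct demo_ns *ns)
{
	demo_get(ns);
	demo_put(*slot);
	*slot = ns;
}

static struct demo_ns *demo_alloc(void)
{
	struct demo_ns *ns = calloc(1, sizeof(*ns));
	ns->refcount = 1;
	return ns;
}

int main(void)
{
	struct demo_ns *current_ns = demo_alloc();	/* the installed namespace */
	struct demo_ns *other = demo_alloc();		/* caller's reference      */

	demo_install(&current_ns, other);	/* old ns freed, other now has 2 refs */
	printf("refcount=%d\n", other->refcount);
	demo_put(other);			/* drop the caller's reference         */
	demo_put(current_ns);			/* drop the installed one, frees it    */
	return 0;
}
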
diff --git a/kernel/wait.c b/kernel/wait.c
index b0310eb6cc1e..f45ea8d2a1ce 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -142,7 +142,7 @@ EXPORT_SYMBOL(finish_wait);
142 * woken up through the queue. 142 * woken up through the queue.
143 * 143 *
144 * This prevents waiter starvation where an exclusive waiter 144 * This prevents waiter starvation where an exclusive waiter
145 * aborts and is woken up concurrently and noone wakes up 145 * aborts and is woken up concurrently and no one wakes up
146 * the next waiter. 146 * the next waiter.
147 */ 147 */
148void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, 148void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index f37f974aa81b..7daa4b072e9f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -28,7 +28,7 @@
28#include <linux/perf_event.h> 28#include <linux/perf_event.h>
29 29
30int watchdog_enabled = 1; 30int watchdog_enabled = 1;
31int __read_mostly softlockup_thresh = 60; 31int __read_mostly watchdog_thresh = 10;
32 32
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
48 * Should we panic when a soft-lockup or hard-lockup occurs: 48 * Should we panic when a soft-lockup or hard-lockup occurs:
49 */ 49 */
50#ifdef CONFIG_HARDLOCKUP_DETECTOR 50#ifdef CONFIG_HARDLOCKUP_DETECTOR
51static int hardlockup_panic; 51static int hardlockup_panic =
52 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
52 53
53static int __init hardlockup_panic_setup(char *str) 54static int __init hardlockup_panic_setup(char *str)
54{ 55{
55 if (!strncmp(str, "panic", 5)) 56 if (!strncmp(str, "panic", 5))
56 hardlockup_panic = 1; 57 hardlockup_panic = 1;
58 else if (!strncmp(str, "nopanic", 7))
59 hardlockup_panic = 0;
57 else if (!strncmp(str, "0", 1)) 60 else if (!strncmp(str, "0", 1))
58 watchdog_enabled = 0; 61 watchdog_enabled = 0;
59 return 1; 62 return 1;
@@ -88,6 +91,17 @@ static int __init nosoftlockup_setup(char *str)
88__setup("nosoftlockup", nosoftlockup_setup); 91__setup("nosoftlockup", nosoftlockup_setup);
89/* */ 92/* */
90 93
94/*
95 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
96 * lockups can have false positives under extreme conditions. So we generally
97 * want a higher threshold for soft lockups than for hard lockups. So we couple
98 * the thresholds with a factor: we make the soft threshold twice the amount of
99 * time the hard threshold is.
100 */
101static int get_softlockup_thresh(void)
102{
103 return watchdog_thresh * 2;
104}
91 105
92/* 106/*
93 * Returns seconds, approximately. We don't need nanosecond 107 * Returns seconds, approximately. We don't need nanosecond
@@ -102,12 +116,12 @@ static unsigned long get_timestamp(int this_cpu)
102static unsigned long get_sample_period(void) 116static unsigned long get_sample_period(void)
103{ 117{
104 /* 118 /*
105 * convert softlockup_thresh from seconds to ns 119 * convert watchdog_thresh from seconds to ns
106 * the divide by 5 is to give hrtimer 5 chances to 120 * the divide by 5 is to give hrtimer 5 chances to
107 * increment before the hardlockup detector generates 121 * increment before the hardlockup detector generates
108 * a warning 122 * a warning
109 */ 123 */
110 return softlockup_thresh / 5 * NSEC_PER_SEC; 124 return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
111} 125}
112 126
113/* Commands for resetting the watchdog */ 127/* Commands for resetting the watchdog */
@@ -179,7 +193,7 @@ static int is_softlockup(unsigned long touch_ts)
179 unsigned long now = get_timestamp(smp_processor_id()); 193 unsigned long now = get_timestamp(smp_processor_id());
180 194
181 /* Warn about unreasonable delays: */ 195 /* Warn about unreasonable delays: */
182 if (time_after(now, touch_ts + softlockup_thresh)) 196 if (time_after(now, touch_ts + get_softlockup_thresh()))
183 return now - touch_ts; 197 return now - touch_ts;
184 198
185 return 0; 199 return 0;
@@ -356,15 +370,21 @@ static int watchdog_nmi_enable(int cpu)
356 370
357 /* Try to register using hardware perf events */ 371 /* Try to register using hardware perf events */
358 wd_attr = &wd_hw_attr; 372 wd_attr = &wd_hw_attr;
359 wd_attr->sample_period = hw_nmi_get_sample_period(); 373 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
360 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); 374 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
361 if (!IS_ERR(event)) { 375 if (!IS_ERR(event)) {
362 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 376 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
363 goto out_save; 377 goto out_save;
364 } 378 }
365 379
366 printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n", 380
367 cpu, PTR_ERR(event)); 381 /* vary the KERN level based on the returned errno */
382 if (PTR_ERR(event) == -EOPNOTSUPP)
383 printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
384 else if (PTR_ERR(event) == -ENOENT)
385 printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu);
386 else
387 printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event));
368 return PTR_ERR(event); 388 return PTR_ERR(event);
369 389
370 /* success path */ 390 /* success path */
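
A hedged sketch of the severity selection above as a small user-space helper; the function name is hypothetical and the strings merely mirror the three printk levels in the patch:

#include <stdio.h>
#include <errno.h>

static const char *nmi_watchdog_msg(long err)
{
	if (err == -EOPNOTSUPP)
		return "INFO: not supported (no LAPIC?)";
	if (err == -ENOENT)
		return "WARNING: hardware events not enabled";
	return "ERR: unable to create perf event";
}

int main(void)
{
	printf("%s\n", nmi_watchdog_msg(-EOPNOTSUPP));
	printf("%s\n", nmi_watchdog_msg(-ENOENT));
	printf("%s\n", nmi_watchdog_msg(-EINVAL));
	return 0;
}
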
@@ -409,19 +429,25 @@ static int watchdog_prepare_cpu(int cpu)
409static int watchdog_enable(int cpu) 429static int watchdog_enable(int cpu)
410{ 430{
411 struct task_struct *p = per_cpu(softlockup_watchdog, cpu); 431 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
412 int err; 432 int err = 0;
413 433
414 /* enable the perf event */ 434 /* enable the perf event */
415 err = watchdog_nmi_enable(cpu); 435 err = watchdog_nmi_enable(cpu);
416 if (err) 436
417 return err; 437 /* Regardless of err above, fall through and start softlockup */
418 438
419 /* create the watchdog thread */ 439 /* create the watchdog thread */
420 if (!p) { 440 if (!p) {
421 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 441 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
422 if (IS_ERR(p)) { 442 if (IS_ERR(p)) {
423 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 443 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
424 return PTR_ERR(p); 444 if (!err) {
445 /* if hardlockup hasn't already set this */
446 err = PTR_ERR(p);
447 /* and disable the perf event */
448 watchdog_nmi_disable(cpu);
449 }
450 goto out;
425 } 451 }
426 kthread_bind(p, cpu); 452 kthread_bind(p, cpu);
427 per_cpu(watchdog_touch_ts, cpu) = 0; 453 per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -429,7 +455,8 @@ static int watchdog_enable(int cpu)
429 wake_up_process(p); 455 wake_up_process(p);
430 } 456 }
431 457
432 return 0; 458out:
459 return err;
433} 460}
434 461
435static void watchdog_disable(int cpu) 462static void watchdog_disable(int cpu)
@@ -485,28 +512,25 @@ static void watchdog_disable_all_cpus(void)
485/* sysctl functions */ 512/* sysctl functions */
486#ifdef CONFIG_SYSCTL 513#ifdef CONFIG_SYSCTL
487/* 514/*
488 * proc handler for /proc/sys/kernel/nmi_watchdog 515 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
489 */ 516 */
490 517
491int proc_dowatchdog_enabled(struct ctl_table *table, int write, 518int proc_dowatchdog(struct ctl_table *table, int write,
492 void __user *buffer, size_t *length, loff_t *ppos) 519 void __user *buffer, size_t *lenp, loff_t *ppos)
493{ 520{
494 proc_dointvec(table, write, buffer, length, ppos); 521 int ret;
495 522
496 if (write) { 523 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
497 if (watchdog_enabled) 524 if (ret || !write)
498 watchdog_enable_all_cpus(); 525 goto out;
499 else
500 watchdog_disable_all_cpus();
501 }
502 return 0;
503}
504 526
505int proc_dowatchdog_thresh(struct ctl_table *table, int write, 527 if (watchdog_enabled && watchdog_thresh)
506 void __user *buffer, 528 watchdog_enable_all_cpus();
507 size_t *lenp, loff_t *ppos) 529 else
508{ 530 watchdog_disable_all_cpus();
509 return proc_dointvec_minmax(table, write, buffer, lenp, ppos); 531
532out:
533 return ret;
510} 534}
511#endif /* CONFIG_SYSCTL */ 535#endif /* CONFIG_SYSCTL */
512 536
@@ -541,7 +565,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
541 break; 565 break;
542#endif /* CONFIG_HOTPLUG_CPU */ 566#endif /* CONFIG_HOTPLUG_CPU */
543 } 567 }
544 return notifier_from_errno(err); 568
569 /*
570 * hardlockup and softlockup are not important enough
571 * to block cpu bring up. Just always succeed and
572 * rely on printk output to flag problems.
573 */
574 return NOTIFY_OK;
545} 575}
546 576
547static struct notifier_block __cpuinitdata cpu_nfb = { 577static struct notifier_block __cpuinitdata cpu_nfb = {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 11869faa6819..0400553f0d04 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -79,7 +79,9 @@ enum {
79 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ 79 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
80 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ 80 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
81 81
82 MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ 82 MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2,
83 /* call for help after 10ms
84 (min two ticks) */
83 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ 85 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
84 CREATE_COOLDOWN = HZ, /* time to breath after fail */ 86 CREATE_COOLDOWN = HZ, /* time to breath after fail */
85 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ 87 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
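
The clamp above evaluated for two common HZ values, as a hedged stand-alone re-statement of the expression:

#include <stdio.h>

static int mayday_initial_timeout(int hz)
{
	return hz / 100 >= 2 ? hz / 100 : 2;	/* at least two ticks */
}

int main(void)
{
	printf("HZ=100  -> %d ticks\n", mayday_initial_timeout(100));	/* 2, not 1 */
	printf("HZ=1000 -> %d ticks\n", mayday_initial_timeout(1000));	/* 10       */
	return 0;
}
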
@@ -249,10 +251,12 @@ struct workqueue_struct *system_wq __read_mostly;
249struct workqueue_struct *system_long_wq __read_mostly; 251struct workqueue_struct *system_long_wq __read_mostly;
250struct workqueue_struct *system_nrt_wq __read_mostly; 252struct workqueue_struct *system_nrt_wq __read_mostly;
251struct workqueue_struct *system_unbound_wq __read_mostly; 253struct workqueue_struct *system_unbound_wq __read_mostly;
254struct workqueue_struct *system_freezable_wq __read_mostly;
252EXPORT_SYMBOL_GPL(system_wq); 255EXPORT_SYMBOL_GPL(system_wq);
253EXPORT_SYMBOL_GPL(system_long_wq); 256EXPORT_SYMBOL_GPL(system_long_wq);
254EXPORT_SYMBOL_GPL(system_nrt_wq); 257EXPORT_SYMBOL_GPL(system_nrt_wq);
255EXPORT_SYMBOL_GPL(system_unbound_wq); 258EXPORT_SYMBOL_GPL(system_unbound_wq);
259EXPORT_SYMBOL_GPL(system_freezable_wq);
256 260
257#define CREATE_TRACE_POINTS 261#define CREATE_TRACE_POINTS
258#include <trace/events/workqueue.h> 262#include <trace/events/workqueue.h>
@@ -314,6 +318,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
314 318
315static struct debug_obj_descr work_debug_descr; 319static struct debug_obj_descr work_debug_descr;
316 320
321static void *work_debug_hint(void *addr)
322{
323 return ((struct work_struct *) addr)->func;
324}
325
317/* 326/*
318 * fixup_init is called when: 327 * fixup_init is called when:
319 * - an active object is initialized 328 * - an active object is initialized
@@ -385,6 +394,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
385 394
386static struct debug_obj_descr work_debug_descr = { 395static struct debug_obj_descr work_debug_descr = {
387 .name = "work_struct", 396 .name = "work_struct",
397 .debug_hint = work_debug_hint,
388 .fixup_init = work_fixup_init, 398 .fixup_init = work_fixup_init,
389 .fixup_activate = work_fixup_activate, 399 .fixup_activate = work_fixup_activate,
390 .fixup_free = work_fixup_free, 400 .fixup_free = work_fixup_free,
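
A loose user-space sketch of the debug-object hint idea introduced above, reporting the work function rather than the opaque object address; types and names are illustrative and the function-pointer cast is only for printing:

#include <stdio.h>

struct demo_work { void (*func)(struct demo_work *); };

/* report the handler, which identifies the work better than its address */
static void *demo_debug_hint(void *addr)
{
	return (void *)((struct demo_work *)addr)->func;
}

static void demo_fn(struct demo_work *w) { (void)w; }

int main(void)
{
	struct demo_work w = { .func = demo_fn };

	printf("hint=%p, demo_fn=%p\n", demo_debug_hint(&w), (void *)demo_fn);
	return 0;
}
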
@@ -1281,8 +1291,14 @@ __acquires(&gcwq->lock)
1281 return true; 1291 return true;
1282 spin_unlock_irq(&gcwq->lock); 1292 spin_unlock_irq(&gcwq->lock);
1283 1293
1284 /* CPU has come up inbetween, retry migration */ 1294 /*
1295 * We've raced with CPU hot[un]plug. Give it a breather
1296 * and retry migration. cond_resched() is required here;
1297 * otherwise, we might deadlock against cpu_stop trying to
1298 * bring down the CPU on non-preemptive kernel.
1299 */
1285 cpu_relax(); 1300 cpu_relax();
1301 cond_resched();
1286 } 1302 }
1287} 1303}
1288 1304
@@ -1356,8 +1372,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1356 worker->id = id; 1372 worker->id = id;
1357 1373
1358 if (!on_unbound_cpu) 1374 if (!on_unbound_cpu)
1359 worker->task = kthread_create(worker_thread, worker, 1375 worker->task = kthread_create_on_node(worker_thread,
1360 "kworker/%u:%d", gcwq->cpu, id); 1376 worker,
1377 cpu_to_node(gcwq->cpu),
1378 "kworker/%u:%d", gcwq->cpu, id);
1361 else 1379 else
1362 worker->task = kthread_create(worker_thread, worker, 1380 worker->task = kthread_create(worker_thread, worker,
1363 "kworker/u:%d", id); 1381 "kworker/u:%d", id);
@@ -2047,6 +2065,15 @@ repeat:
2047 move_linked_works(work, scheduled, &n); 2065 move_linked_works(work, scheduled, &n);
2048 2066
2049 process_scheduled_works(rescuer); 2067 process_scheduled_works(rescuer);
2068
2069 /*
2070 * Leave this gcwq. If keep_working() is %true, notify a
2071 * regular worker; otherwise, we end up with 0 concurrency
2072 * and stalling the execution.
2073 */
2074 if (keep_working(gcwq))
2075 wake_up_worker(gcwq);
2076
2050 spin_unlock_irq(&gcwq->lock); 2077 spin_unlock_irq(&gcwq->lock);
2051 } 2078 }
2052 2079
@@ -2839,9 +2866,7 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2839 } 2866 }
2840 } 2867 }
2841 2868
2842 /* just in case, make sure it's actually aligned 2869 /* just in case, make sure it's actually aligned */
2843 * - this is affected by PERCPU() alignment in vmlinux.lds.S
2844 */
2845 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); 2870 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2846 return wq->cpu_wq.v ? 0 : -ENOMEM; 2871 return wq->cpu_wq.v ? 0 : -ENOMEM;
2847} 2872}
@@ -2956,7 +2981,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2956 */ 2981 */
2957 spin_lock(&workqueue_lock); 2982 spin_lock(&workqueue_lock);
2958 2983
2959 if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) 2984 if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
2960 for_each_cwq_cpu(cpu, wq) 2985 for_each_cwq_cpu(cpu, wq)
2961 get_cwq(cpu, wq)->max_active = 0; 2986 get_cwq(cpu, wq)->max_active = 0;
2962 2987
@@ -3068,7 +3093,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3068 3093
3069 spin_lock_irq(&gcwq->lock); 3094 spin_lock_irq(&gcwq->lock);
3070 3095
3071 if (!(wq->flags & WQ_FREEZEABLE) || 3096 if (!(wq->flags & WQ_FREEZABLE) ||
3072 !(gcwq->flags & GCWQ_FREEZING)) 3097 !(gcwq->flags & GCWQ_FREEZING))
3073 get_cwq(gcwq->cpu, wq)->max_active = max_active; 3098 get_cwq(gcwq->cpu, wq)->max_active = max_active;
3074 3099
@@ -3318,7 +3343,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
3318 * want to get it over with ASAP - spam rescuers, wake up as 3343 * want to get it over with ASAP - spam rescuers, wake up as
3319 * many idlers as necessary and create new ones till the 3344 * many idlers as necessary and create new ones till the
3320 * worklist is empty. Note that if the gcwq is frozen, there 3345 * worklist is empty. Note that if the gcwq is frozen, there
3321 * may be frozen works in freezeable cwqs. Don't declare 3346 * may be frozen works in freezable cwqs. Don't declare
3322 * completion while frozen. 3347 * completion while frozen.
3323 */ 3348 */
3324 while (gcwq->nr_workers != gcwq->nr_idle || 3349 while (gcwq->nr_workers != gcwq->nr_idle ||
@@ -3576,9 +3601,9 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
3576/** 3601/**
3577 * freeze_workqueues_begin - begin freezing workqueues 3602 * freeze_workqueues_begin - begin freezing workqueues
3578 * 3603 *
3579 * Start freezing workqueues. After this function returns, all 3604 * Start freezing workqueues. After this function returns, all freezable
3580 * freezeable workqueues will queue new works to their frozen_works 3605 * workqueues will queue new works to their frozen_works list instead of
3581 * list instead of gcwq->worklist. 3606 * gcwq->worklist.
3582 * 3607 *
3583 * CONTEXT: 3608 * CONTEXT:
3584 * Grabs and releases workqueue_lock and gcwq->lock's. 3609 * Grabs and releases workqueue_lock and gcwq->lock's.
@@ -3604,7 +3629,7 @@ void freeze_workqueues_begin(void)
3604 list_for_each_entry(wq, &workqueues, list) { 3629 list_for_each_entry(wq, &workqueues, list) {
3605 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3630 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3606 3631
3607 if (cwq && wq->flags & WQ_FREEZEABLE) 3632 if (cwq && wq->flags & WQ_FREEZABLE)
3608 cwq->max_active = 0; 3633 cwq->max_active = 0;
3609 } 3634 }
3610 3635
@@ -3615,7 +3640,7 @@ void freeze_workqueues_begin(void)
3615} 3640}
3616 3641
3617/** 3642/**
3618 * freeze_workqueues_busy - are freezeable workqueues still busy? 3643 * freeze_workqueues_busy - are freezable workqueues still busy?
3619 * 3644 *
3620 * Check whether freezing is complete. This function must be called 3645 * Check whether freezing is complete. This function must be called
3621 * between freeze_workqueues_begin() and thaw_workqueues(). 3646 * between freeze_workqueues_begin() and thaw_workqueues().
@@ -3624,8 +3649,8 @@ void freeze_workqueues_begin(void)
3624 * Grabs and releases workqueue_lock. 3649 * Grabs and releases workqueue_lock.
3625 * 3650 *
3626 * RETURNS: 3651 * RETURNS:
3627 * %true if some freezeable workqueues are still busy. %false if 3652 * %true if some freezable workqueues are still busy. %false if freezing
3628 * freezing is complete. 3653 * is complete.
3629 */ 3654 */
3630bool freeze_workqueues_busy(void) 3655bool freeze_workqueues_busy(void)
3631{ 3656{
@@ -3645,7 +3670,7 @@ bool freeze_workqueues_busy(void)
3645 list_for_each_entry(wq, &workqueues, list) { 3670 list_for_each_entry(wq, &workqueues, list) {
3646 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3671 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3647 3672
3648 if (!cwq || !(wq->flags & WQ_FREEZEABLE)) 3673 if (!cwq || !(wq->flags & WQ_FREEZABLE))
3649 continue; 3674 continue;
3650 3675
3651 BUG_ON(cwq->nr_active < 0); 3676 BUG_ON(cwq->nr_active < 0);
@@ -3690,7 +3715,7 @@ void thaw_workqueues(void)
3690 list_for_each_entry(wq, &workqueues, list) { 3715 list_for_each_entry(wq, &workqueues, list) {
3691 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3716 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3692 3717
3693 if (!cwq || !(wq->flags & WQ_FREEZEABLE)) 3718 if (!cwq || !(wq->flags & WQ_FREEZABLE))
3694 continue; 3719 continue;
3695 3720
3696 /* restore max_active and repopulate worklist */ 3721 /* restore max_active and repopulate worklist */
@@ -3764,8 +3789,10 @@ static int __init init_workqueues(void)
3764 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); 3789 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3765 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, 3790 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3766 WQ_UNBOUND_MAX_ACTIVE); 3791 WQ_UNBOUND_MAX_ACTIVE);
3792 system_freezable_wq = alloc_workqueue("events_freezable",
3793 WQ_FREEZABLE, 0);
3767 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || 3794 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3768 !system_unbound_wq); 3795 !system_unbound_wq || !system_freezable_wq);
3769 return 0; 3796 return 0;
3770} 3797}
3771early_initcall(init_workqueues); 3798early_initcall(init_workqueues);
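
A hedged kernel-style usage sketch, not part of this patch, of the new system_freezable_wq: work queued on it is held back while the freezer is active and runs after thaw (demo_fn, demo_work and demo_queue are hypothetical names):

#include <linux/kernel.h>
#include <linux/workqueue.h>

static void demo_fn(struct work_struct *work)
{
	pr_info("ran on the freezable system workqueue\n");
}
static DECLARE_WORK(demo_work, demo_fn);

static void demo_queue(void)
{
	/* held back while the freezer is active, runs after thaw */
	queue_work(system_freezable_wq, &demo_work);
}
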