author    Thomas Gleixner <tglx@linutronix.de>    2011-05-14 06:06:36 -0400
committer Thomas Gleixner <tglx@linutronix.de>    2011-05-14 06:06:36 -0400
commit    a18f22a968de17b29f2310cdb7ba69163e65ec15 (patch)
tree      a7d56d88fad5e444d7661484109758a2f436129e /kernel
parent    a1c57e0fec53defe745e64417eacdbd3618c3e66 (diff)
parent    798778b8653f64b7b2162ac70eca10367cff6ce8 (diff)

Merge branch 'consolidate-clksrc-i8253' of master.kernel.org:~rmk/linux-2.6-arm into timers/clocksource

Conflicts:
	arch/ia64/kernel/cyclone.c
	arch/mips/kernel/i8253.c
	arch/x86/kernel/i8253.c

Reason: Resolve conflicts so further cleanups do not conflict further

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit.c | 8
-rw-r--r--  kernel/audit_tree.c | 2
-rw-r--r--  kernel/audit_watch.c | 85
-rw-r--r--  kernel/auditfilter.c | 10
-rw-r--r--  kernel/auditsc.c | 2
-rw-r--r--  kernel/bounds.c | 2
-rw-r--r--  kernel/capability.c | 96
-rw-r--r--  kernel/cgroup.c | 70
-rw-r--r--  kernel/compat.c | 136
-rw-r--r--  kernel/cpu.c | 13
-rw-r--r--  kernel/cpuset.c | 87
-rw-r--r--  kernel/crash_dump.c | 34
-rw-r--r--  kernel/cred.c | 8
-rw-r--r--  kernel/debug/debug_core.c | 2
-rw-r--r--  kernel/debug/gdbstub.c | 30
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 10
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 2
-rw-r--r--  kernel/exit.c | 5
-rw-r--r--  kernel/fork.c | 155
-rw-r--r--  kernel/futex.c | 160
-rw-r--r--  kernel/futex_compat.c | 11
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/gcov/Makefile | 2
-rw-r--r--  kernel/groups.c | 2
-rw-r--r--  kernel/hrtimer.c | 90
-rw-r--r--  kernel/irq/Kconfig | 46
-rw-r--r--  kernel/irq/autoprobe.c | 52
-rw-r--r--  kernel/irq/chip.c | 696
-rw-r--r--  kernel/irq/debug.h | 44
-rw-r--r--  kernel/irq/dummychip.c | 9
-rw-r--r--  kernel/irq/handle.c | 129
-rw-r--r--  kernel/irq/internals.h | 161
-rw-r--r--  kernel/irq/irqdesc.c | 86
-rw-r--r--  kernel/irq/manage.c | 633
-rw-r--r--  kernel/irq/migration.c | 29
-rw-r--r--  kernel/irq/pm.c | 30
-rw-r--r--  kernel/irq/proc.c | 88
-rw-r--r--  kernel/irq/resend.c | 18
-rw-r--r--  kernel/irq/settings.h | 125
-rw-r--r--  kernel/irq/spurious.c | 162
-rw-r--r--  kernel/kallsyms.c | 58
-rw-r--r--  kernel/kexec.c | 18
-rw-r--r--  kernel/kthread.c | 33
-rw-r--r--  kernel/latencytop.c | 2
-rw-r--r--  kernel/lockdep.c | 4
-rw-r--r--  kernel/lockdep_proc.c | 9
-rw-r--r--  kernel/module.c | 10
-rw-r--r--  kernel/mutex.c | 2
-rw-r--r--  kernel/nsproxy.c | 4
-rw-r--r--  kernel/padata.c | 8
-rw-r--r--  kernel/panic.c | 10
-rw-r--r--  kernel/params.c | 2
-rw-r--r--  kernel/perf_event.c | 1070
-rw-r--r--  kernel/pid.c | 7
-rw-r--r--  kernel/pid_namespace.c | 11
-rw-r--r--  kernel/pm_qos_params.c | 24
-rw-r--r--  kernel/posix-cpu-timers.c | 112
-rw-r--r--  kernel/posix-timers.c | 344
-rw-r--r--  kernel/power/Kconfig | 241
-rw-r--r--  kernel/power/Makefile | 3
-rw-r--r--  kernel/power/block_io.c | 2
-rw-r--r--  kernel/power/hibernate.c | 15
-rw-r--r--  kernel/power/main.c | 5
-rw-r--r--  kernel/power/snapshot.c | 8
-rw-r--r--  kernel/power/suspend.c | 7
-rw-r--r--  kernel/printk.c | 174
-rw-r--r--  kernel/ptrace.c | 50
-rw-r--r--  kernel/rcupdate.c | 10
-rw-r--r--  kernel/rcutiny_plugin.h | 2
-rw-r--r--  kernel/rcutorture.c | 1
-rw-r--r--  kernel/res_counter.c | 14
-rw-r--r--  kernel/rtmutex-debug.c | 1
-rw-r--r--  kernel/rtmutex-tester.c | 40
-rw-r--r--  kernel/rtmutex.c | 318
-rw-r--r--  kernel/rtmutex_common.h | 16
-rw-r--r--  kernel/sched.c | 397
-rw-r--r--  kernel/sched_autogroup.c | 17
-rw-r--r--  kernel/sched_autogroup.h | 5
-rw-r--r--  kernel/sched_debug.c | 2
-rw-r--r--  kernel/sched_fair.c | 430
-rw-r--r--  kernel/sched_idletask.c | 28
-rw-r--r--  kernel/sched_rt.c | 37
-rw-r--r--  kernel/sched_stoptask.c | 9
-rw-r--r--  kernel/signal.c | 201
-rw-r--r--  kernel/smp.c | 152
-rw-r--r--  kernel/softirq.c | 31
-rw-r--r--  kernel/stop_machine.c | 6
-rw-r--r--  kernel/sys.c | 81
-rw-r--r--  kernel/sys_ni.c | 5
-rw-r--r--  kernel/sysctl.c | 66
-rw-r--r--  kernel/sysctl_binary.c | 19
-rw-r--r--  kernel/sysctl_check.c | 10
-rw-r--r--  kernel/taskstats.c | 2
-rw-r--r--  kernel/time.c | 35
-rw-r--r--  kernel/time/Makefile | 3
-rw-r--r--  kernel/time/clockevents.c | 1
-rw-r--r--  kernel/time/jiffies.c | 22
-rw-r--r--  kernel/time/ntp.c | 15
-rw-r--r--  kernel/time/posix-clock.c | 445
-rw-r--r--  kernel/time/tick-broadcast.c | 11
-rw-r--r--  kernel/time/tick-common.c | 7
-rw-r--r--  kernel/time/tick-internal.h | 12
-rw-r--r--  kernel/time/tick-oneshot.c | 1
-rw-r--r--  kernel/time/tick-sched.c | 1
-rw-r--r--  kernel/time/timekeeping.c | 168
-rw-r--r--  kernel/time/timer_stats.c | 2
-rw-r--r--  kernel/timer.c | 42
-rw-r--r--  kernel/trace/Kconfig | 6
-rw-r--r--  kernel/trace/blktrace.c | 64
-rw-r--r--  kernel/trace/ftrace.c | 57
-rw-r--r--  kernel/trace/ring_buffer.c | 30
-rw-r--r--  kernel/trace/trace.c | 41
-rw-r--r--  kernel/trace/trace.h | 41
-rw-r--r--  kernel/trace/trace_clock.c | 2
-rw-r--r--  kernel/trace/trace_entries.h | 8
-rw-r--r--  kernel/trace/trace_events.c | 3
-rw-r--r--  kernel/trace/trace_events_filter.c | 885
-rw-r--r--  kernel/trace/trace_functions_graph.c | 2
-rw-r--r--  kernel/trace/trace_irqsoff.c | 2
-rw-r--r--  kernel/trace/trace_kprobe.c | 113
-rw-r--r--  kernel/trace/trace_output.c | 36
-rw-r--r--  kernel/trace/trace_sched_switch.c | 48
-rw-r--r--  kernel/trace/trace_syscalls.c | 42
-rw-r--r--  kernel/uid16.c | 2
-rw-r--r--  kernel/user-return-notifier.c | 2
-rw-r--r--  kernel/user.c | 8
-rw-r--r--  kernel/utsname.c | 12
-rw-r--r--  kernel/wait.c | 2
-rw-r--r--  kernel/watchdog.c | 30
-rw-r--r--  kernel/workqueue.c | 26
131 files changed, 6586 insertions, 3107 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 353d3fe8ba33..85cbfb31e73e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
107obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 107obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
108obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 108obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
109obj-$(CONFIG_PADATA) += padata.o 109obj-$(CONFIG_PADATA) += padata.o
110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
110 111
111ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 112ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
112# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 113# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit.c b/kernel/audit.c
index e4956244ae50..939500317066 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -74,6 +74,8 @@ static int audit_initialized;
74int audit_enabled; 74int audit_enabled;
75int audit_ever_enabled; 75int audit_ever_enabled;
76 76
77EXPORT_SYMBOL_GPL(audit_enabled);
78
77/* Default state when kernel boots without any parameters. */ 79/* Default state when kernel boots without any parameters. */
78static int audit_default; 80static int audit_default;
79 81
@@ -671,9 +673,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
671 673
672 pid = NETLINK_CREDS(skb)->pid; 674 pid = NETLINK_CREDS(skb)->pid;
673 uid = NETLINK_CREDS(skb)->uid; 675 uid = NETLINK_CREDS(skb)->uid;
674 loginuid = NETLINK_CB(skb).loginuid; 676 loginuid = audit_get_loginuid(current);
675 sessionid = NETLINK_CB(skb).sessionid; 677 sessionid = audit_get_sessionid(current);
676 sid = NETLINK_CB(skb).sid; 678 security_task_getsecid(current, &sid);
677 seq = nlh->nlmsg_seq; 679 seq = nlh->nlmsg_seq;
678 data = NLMSG_DATA(nlh); 680 data = NLMSG_DATA(nlh);
679 681
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 37b2bea170c8..e99dda04b126 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -607,7 +607,7 @@ void audit_trim_trees(void)
607 spin_lock(&hash_lock); 607 spin_lock(&hash_lock);
608 list_for_each_entry(node, &tree->chunks, list) { 608 list_for_each_entry(node, &tree->chunks, list) {
609 struct audit_chunk *chunk = find_chunk(node); 609 struct audit_chunk *chunk = find_chunk(node);
610 /* this could be NULL if the watch is dieing else where... */ 610 /* this could be NULL if the watch is dying else where... */
611 struct inode *inode = chunk->mark.i.inode; 611 struct inode *inode = chunk->mark.i.inode;
612 node->index |= 1U<<31; 612 node->index |= 1U<<31;
613 if (iterate_mounts(compare_root, inode, root_mnt)) 613 if (iterate_mounts(compare_root, inode, root_mnt))
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index d2e3c7866460..e683869365d9 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
144} 144}
145 145
146/* Initialize a parent watch entry. */ 146/* Initialize a parent watch entry. */
147static struct audit_parent *audit_init_parent(struct nameidata *ndp) 147static struct audit_parent *audit_init_parent(struct path *path)
148{ 148{
149 struct inode *inode = ndp->path.dentry->d_inode; 149 struct inode *inode = path->dentry->d_inode;
150 struct audit_parent *parent; 150 struct audit_parent *parent;
151 int ret; 151 int ret;
152 152
@@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
353} 353}
354 354
355/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
356static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) 356static int audit_get_nd(struct audit_watch *watch, struct path *parent)
357{ 357{
358 struct nameidata *ndparent, *ndwatch; 358 struct nameidata nd;
359 struct dentry *d;
359 int err; 360 int err;
360 361
361 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); 362 err = kern_path_parent(watch->path, &nd);
362 if (unlikely(!ndparent)) 363 if (err)
363 return -ENOMEM; 364 return err;
364 365
365 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); 366 if (nd.last_type != LAST_NORM) {
366 if (unlikely(!ndwatch)) { 367 path_put(&nd.path);
367 kfree(ndparent); 368 return -EINVAL;
368 return -ENOMEM;
369 } 369 }
370 370
371 err = path_lookup(path, LOOKUP_PARENT, ndparent); 371 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
372 if (err) { 372 d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
373 kfree(ndparent); 373 if (IS_ERR(d)) {
374 kfree(ndwatch); 374 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
375 return err; 375 path_put(&nd.path);
376 return PTR_ERR(d);
376 } 377 }
377 378 if (d->d_inode) {
378 err = path_lookup(path, 0, ndwatch); 379 /* update watch filter fields */
379 if (err) { 380 watch->dev = d->d_inode->i_sb->s_dev;
380 kfree(ndwatch); 381 watch->ino = d->d_inode->i_ino;
381 ndwatch = NULL;
382 } 382 }
383 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
383 384
384 *ndp = ndparent; 385 *parent = nd.path;
385 *ndw = ndwatch; 386 dput(d);
386
387 return 0; 387 return 0;
388} 388}
389 389
390/* Release resources used for watch path information. */
391static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
392{
393 if (ndp) {
394 path_put(&ndp->path);
395 kfree(ndp);
396 }
397 if (ndw) {
398 path_put(&ndw->path);
399 kfree(ndw);
400 }
401}
402
403/* Associate the given rule with an existing parent. 390/* Associate the given rule with an existing parent.
404 * Caller must hold audit_filter_mutex. */ 391 * Caller must hold audit_filter_mutex. */
405static void audit_add_to_parent(struct audit_krule *krule, 392static void audit_add_to_parent(struct audit_krule *krule,
@@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
440{ 427{
441 struct audit_watch *watch = krule->watch; 428 struct audit_watch *watch = krule->watch;
442 struct audit_parent *parent; 429 struct audit_parent *parent;
443 struct nameidata *ndp = NULL, *ndw = NULL; 430 struct path parent_path;
444 int h, ret = 0; 431 int h, ret = 0;
445 432
446 mutex_unlock(&audit_filter_mutex); 433 mutex_unlock(&audit_filter_mutex);
447 434
448 /* Avoid calling path_lookup under audit_filter_mutex. */ 435 /* Avoid calling path_lookup under audit_filter_mutex. */
449 ret = audit_get_nd(watch->path, &ndp, &ndw); 436 ret = audit_get_nd(watch, &parent_path);
450 if (ret) {
451 /* caller expects mutex locked */
452 mutex_lock(&audit_filter_mutex);
453 goto error;
454 }
455 437
438 /* caller expects mutex locked */
456 mutex_lock(&audit_filter_mutex); 439 mutex_lock(&audit_filter_mutex);
457 440
458 /* update watch filter fields */ 441 if (ret)
459 if (ndw) { 442 return ret;
460 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
461 watch->ino = ndw->path.dentry->d_inode->i_ino;
462 }
463 443
464 /* either find an old parent or attach a new one */ 444 /* either find an old parent or attach a new one */
465 parent = audit_find_parent(ndp->path.dentry->d_inode); 445 parent = audit_find_parent(parent_path.dentry->d_inode);
466 if (!parent) { 446 if (!parent) {
467 parent = audit_init_parent(ndp); 447 parent = audit_init_parent(&parent_path);
468 if (IS_ERR(parent)) { 448 if (IS_ERR(parent)) {
469 ret = PTR_ERR(parent); 449 ret = PTR_ERR(parent);
470 goto error; 450 goto error;
@@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
479 h = audit_hash_ino((u32)watch->ino); 459 h = audit_hash_ino((u32)watch->ino);
480 *list = &audit_inode_hash[h]; 460 *list = &audit_inode_hash[h];
481error: 461error:
482 audit_put_nd(ndp, ndw); /* NULL args OK */ 462 path_put(&parent_path);
483 return ret; 463 return ret;
484
485} 464}
486 465
487void audit_remove_watch_rule(struct audit_krule *krule) 466void audit_remove_watch_rule(struct audit_krule *krule)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index add2819af71b..f8277c80d678 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1238,6 +1238,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1238 for (i = 0; i < rule->field_count; i++) { 1238 for (i = 0; i < rule->field_count; i++) {
1239 struct audit_field *f = &rule->fields[i]; 1239 struct audit_field *f = &rule->fields[i];
1240 int result = 0; 1240 int result = 0;
1241 u32 sid;
1241 1242
1242 switch (f->type) { 1243 switch (f->type) {
1243 case AUDIT_PID: 1244 case AUDIT_PID:
@@ -1250,19 +1251,22 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1250 result = audit_comparator(cb->creds.gid, f->op, f->val); 1251 result = audit_comparator(cb->creds.gid, f->op, f->val);
1251 break; 1252 break;
1252 case AUDIT_LOGINUID: 1253 case AUDIT_LOGINUID:
1253 result = audit_comparator(cb->loginuid, f->op, f->val); 1254 result = audit_comparator(audit_get_loginuid(current),
1255 f->op, f->val);
1254 break; 1256 break;
1255 case AUDIT_SUBJ_USER: 1257 case AUDIT_SUBJ_USER:
1256 case AUDIT_SUBJ_ROLE: 1258 case AUDIT_SUBJ_ROLE:
1257 case AUDIT_SUBJ_TYPE: 1259 case AUDIT_SUBJ_TYPE:
1258 case AUDIT_SUBJ_SEN: 1260 case AUDIT_SUBJ_SEN:
1259 case AUDIT_SUBJ_CLR: 1261 case AUDIT_SUBJ_CLR:
1260 if (f->lsm_rule) 1262 if (f->lsm_rule) {
1261 result = security_audit_rule_match(cb->sid, 1263 security_task_getsecid(current, &sid);
1264 result = security_audit_rule_match(sid,
1262 f->type, 1265 f->type,
1263 f->op, 1266 f->op,
1264 f->lsm_rule, 1267 f->lsm_rule,
1265 NULL); 1268 NULL);
1269 }
1266 break; 1270 break;
1267 } 1271 }
1268 1272
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f49a0318c2ed..b33513a08beb 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1011,7 +1011,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1011/* 1011/*
1012 * to_send and len_sent accounting are very loose estimates. We aren't 1012 * to_send and len_sent accounting are very loose estimates. We aren't
1013 * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being 1013 * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
1014 * within about 500 bytes (next page boundry) 1014 * within about 500 bytes (next page boundary)
1015 * 1015 *
1016 * why snprintf? an int is up to 12 digits long. if we just assumed when 1016 * why snprintf? an int is up to 12 digits long. if we just assumed when
1017 * logging that a[%d]= was going to be 16 characters long we would be wasting 1017 * logging that a[%d]= was going to be 16 characters long we would be wasting
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 98a51f26c136..0c9b862292b2 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,11 +9,13 @@
9#include <linux/page-flags.h> 9#include <linux/page-flags.h>
10#include <linux/mmzone.h> 10#include <linux/mmzone.h>
11#include <linux/kbuild.h> 11#include <linux/kbuild.h>
12#include <linux/page_cgroup.h>
12 13
13void foo(void) 14void foo(void)
14{ 15{
15 /* The enum constants to put into include/generated/bounds.h */ 16 /* The enum constants to put into include/generated/bounds.h */
16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 17 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 18 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
19 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
18 /* End of constants */ 20 /* End of constants */
19} 21}
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e9385f132c8..bf0c734d0c12 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@
14#include <linux/security.h> 14#include <linux/security.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
17#include <linux/user_namespace.h>
17#include <asm/uaccess.h> 18#include <asm/uaccess.h>
18 19
19/* 20/*
@@ -290,6 +291,60 @@ error:
290} 291}
291 292
292/** 293/**
294 * has_capability - Does a task have a capability in init_user_ns
295 * @t: The task in question
296 * @cap: The capability to be tested for
297 *
298 * Return true if the specified task has the given superior capability
299 * currently in effect to the initial user namespace, false if not.
300 *
301 * Note that this does not set PF_SUPERPRIV on the task.
302 */
303bool has_capability(struct task_struct *t, int cap)
304{
305 int ret = security_real_capable(t, &init_user_ns, cap);
306
307 return (ret == 0);
308}
309
310/**
311 * has_capability - Does a task have a capability in a specific user ns
312 * @t: The task in question
313 * @ns: target user namespace
314 * @cap: The capability to be tested for
315 *
316 * Return true if the specified task has the given superior capability
317 * currently in effect to the specified user namespace, false if not.
318 *
319 * Note that this does not set PF_SUPERPRIV on the task.
320 */
321bool has_ns_capability(struct task_struct *t,
322 struct user_namespace *ns, int cap)
323{
324 int ret = security_real_capable(t, ns, cap);
325
326 return (ret == 0);
327}
328
329/**
330 * has_capability_noaudit - Does a task have a capability (unaudited)
331 * @t: The task in question
332 * @cap: The capability to be tested for
333 *
334 * Return true if the specified task has the given superior capability
335 * currently in effect to init_user_ns, false if not. Don't write an
336 * audit message for the check.
337 *
338 * Note that this does not set PF_SUPERPRIV on the task.
339 */
340bool has_capability_noaudit(struct task_struct *t, int cap)
341{
342 int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
343
344 return (ret == 0);
345}
346
347/**
293 * capable - Determine if the current task has a superior capability in effect 348 * capable - Determine if the current task has a superior capability in effect
294 * @cap: The capability to be tested for 349 * @cap: The capability to be tested for
295 * 350 *
@@ -299,17 +354,48 @@ error:
299 * This sets PF_SUPERPRIV on the task if the capability is available on the 354 * This sets PF_SUPERPRIV on the task if the capability is available on the
300 * assumption that it's about to be used. 355 * assumption that it's about to be used.
301 */ 356 */
302int capable(int cap) 357bool capable(int cap)
358{
359 return ns_capable(&init_user_ns, cap);
360}
361EXPORT_SYMBOL(capable);
362
363/**
364 * ns_capable - Determine if the current task has a superior capability in effect
365 * @ns: The usernamespace we want the capability in
366 * @cap: The capability to be tested for
367 *
368 * Return true if the current task has the given superior capability currently
369 * available for use, false if not.
370 *
371 * This sets PF_SUPERPRIV on the task if the capability is available on the
372 * assumption that it's about to be used.
373 */
374bool ns_capable(struct user_namespace *ns, int cap)
303{ 375{
304 if (unlikely(!cap_valid(cap))) { 376 if (unlikely(!cap_valid(cap))) {
305 printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); 377 printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
306 BUG(); 378 BUG();
307 } 379 }
308 380
309 if (security_capable(current_cred(), cap) == 0) { 381 if (security_capable(ns, current_cred(), cap) == 0) {
310 current->flags |= PF_SUPERPRIV; 382 current->flags |= PF_SUPERPRIV;
311 return 1; 383 return true;
312 } 384 }
313 return 0; 385 return false;
314} 386}
315EXPORT_SYMBOL(capable); 387EXPORT_SYMBOL(ns_capable);
388
389/**
390 * task_ns_capable - Determine whether current task has a superior
391 * capability targeted at a specific task's user namespace.
392 * @t: The task whose user namespace is targeted.
393 * @cap: The capability in question.
394 *
395 * Return true if it does, false otherwise.
396 */
397bool task_ns_capable(struct task_struct *t, int cap)
398{
399 return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
400}
401EXPORT_SYMBOL(task_ns_capable);
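
The capability.c hunk above converts capable() to return bool and adds the namespace-aware helpers has_capability(), has_ns_capability(), ns_capable() and task_ns_capable(). As a minimal illustration only (not part of the patch; example_may_signal is a hypothetical name), a caller acting on another task would typically combine the global and namespace-aware checks like this:

#include <linux/capability.h>
#include <linux/sched.h>

/* Hypothetical helper, for illustration only. */
static bool example_may_signal(struct task_struct *target)
{
	/* Global check against init_user_ns, same spelling as before. */
	if (capable(CAP_KILL))
		return true;

	/*
	 * Namespace-aware check: is current privileged with respect to
	 * the user namespace owning @target's credentials?
	 */
	return task_ns_capable(target, CAP_KILL);
}
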
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b24d7027b83c..25c7eb52de1a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -157,7 +157,7 @@ struct css_id {
157}; 157};
158 158
159/* 159/*
160 * cgroup_event represents events which userspace want to recieve. 160 * cgroup_event represents events which userspace want to receive.
161 */ 161 */
162struct cgroup_event { 162struct cgroup_event {
163 /* 163 /*
@@ -1813,10 +1813,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1813 1813
1814 /* Update the css_set linked lists if we're using them */ 1814 /* Update the css_set linked lists if we're using them */
1815 write_lock(&css_set_lock); 1815 write_lock(&css_set_lock);
1816 if (!list_empty(&tsk->cg_list)) { 1816 if (!list_empty(&tsk->cg_list))
1817 list_del(&tsk->cg_list); 1817 list_move(&tsk->cg_list, &newcg->tasks);
1818 list_add(&tsk->cg_list, &newcg->tasks);
1819 }
1820 write_unlock(&css_set_lock); 1818 write_unlock(&css_set_lock);
1821 1819
1822 for_each_subsys(root, ss) { 1820 for_each_subsys(root, ss) {
@@ -3655,12 +3653,12 @@ again:
3655 spin_lock(&release_list_lock); 3653 spin_lock(&release_list_lock);
3656 set_bit(CGRP_REMOVED, &cgrp->flags); 3654 set_bit(CGRP_REMOVED, &cgrp->flags);
3657 if (!list_empty(&cgrp->release_list)) 3655 if (!list_empty(&cgrp->release_list))
3658 list_del(&cgrp->release_list); 3656 list_del_init(&cgrp->release_list);
3659 spin_unlock(&release_list_lock); 3657 spin_unlock(&release_list_lock);
3660 3658
3661 cgroup_lock_hierarchy(cgrp->root); 3659 cgroup_lock_hierarchy(cgrp->root);
3662 /* delete this cgroup from parent->children */ 3660 /* delete this cgroup from parent->children */
3663 list_del(&cgrp->sibling); 3661 list_del_init(&cgrp->sibling);
3664 cgroup_unlock_hierarchy(cgrp->root); 3662 cgroup_unlock_hierarchy(cgrp->root);
3665 3663
3666 d = dget(cgrp->dentry); 3664 d = dget(cgrp->dentry);
@@ -3879,7 +3877,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
3879 subsys[ss->subsys_id] = NULL; 3877 subsys[ss->subsys_id] = NULL;
3880 3878
3881 /* remove subsystem from rootnode's list of subsystems */ 3879 /* remove subsystem from rootnode's list of subsystems */
3882 list_del(&ss->sibling); 3880 list_del_init(&ss->sibling);
3883 3881
3884 /* 3882 /*
3885 * disentangle the css from all css_sets attached to the dummytop. as 3883 * disentangle the css from all css_sets attached to the dummytop. as
@@ -4230,20 +4228,8 @@ void cgroup_post_fork(struct task_struct *child)
4230 */ 4228 */
4231void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4229void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4232{ 4230{
4233 int i;
4234 struct css_set *cg; 4231 struct css_set *cg;
4235 4232 int i;
4236 if (run_callbacks && need_forkexit_callback) {
4237 /*
4238 * modular subsystems can't use callbacks, so no need to lock
4239 * the subsys array
4240 */
4241 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4242 struct cgroup_subsys *ss = subsys[i];
4243 if (ss->exit)
4244 ss->exit(ss, tsk);
4245 }
4246 }
4247 4233
4248 /* 4234 /*
4249 * Unlink from the css_set task list if necessary. 4235 * Unlink from the css_set task list if necessary.
@@ -4253,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4253 if (!list_empty(&tsk->cg_list)) { 4239 if (!list_empty(&tsk->cg_list)) {
4254 write_lock(&css_set_lock); 4240 write_lock(&css_set_lock);
4255 if (!list_empty(&tsk->cg_list)) 4241 if (!list_empty(&tsk->cg_list))
4256 list_del(&tsk->cg_list); 4242 list_del_init(&tsk->cg_list);
4257 write_unlock(&css_set_lock); 4243 write_unlock(&css_set_lock);
4258 } 4244 }
4259 4245
@@ -4261,7 +4247,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4261 task_lock(tsk); 4247 task_lock(tsk);
4262 cg = tsk->cgroups; 4248 cg = tsk->cgroups;
4263 tsk->cgroups = &init_css_set; 4249 tsk->cgroups = &init_css_set;
4250
4251 if (run_callbacks && need_forkexit_callback) {
4252 /*
4253 * modular subsystems can't use callbacks, so no need to lock
4254 * the subsys array
4255 */
4256 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4257 struct cgroup_subsys *ss = subsys[i];
4258 if (ss->exit) {
4259 struct cgroup *old_cgrp =
4260 rcu_dereference_raw(cg->subsys[i])->cgroup;
4261 struct cgroup *cgrp = task_cgroup(tsk, i);
4262 ss->exit(ss, cgrp, old_cgrp, tsk);
4263 }
4264 }
4265 }
4264 task_unlock(tsk); 4266 task_unlock(tsk);
4267
4265 if (cg) 4268 if (cg)
4266 put_css_set_taskexit(cg); 4269 put_css_set_taskexit(cg);
4267} 4270}
@@ -4813,6 +4816,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
4813 return ret; 4816 return ret;
4814} 4817}
4815 4818
4819/*
4820 * get corresponding css from file open on cgroupfs directory
4821 */
4822struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
4823{
4824 struct cgroup *cgrp;
4825 struct inode *inode;
4826 struct cgroup_subsys_state *css;
4827
4828 inode = f->f_dentry->d_inode;
4829 /* check in cgroup filesystem dir */
4830 if (inode->i_op != &cgroup_dir_inode_operations)
4831 return ERR_PTR(-EBADF);
4832
4833 if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
4834 return ERR_PTR(-EINVAL);
4835
4836 /* get cgroup */
4837 cgrp = __d_cgrp(f->f_dentry);
4838 css = cgrp->subsys[id];
4839 return css ? css : ERR_PTR(-ENOENT);
4840}
4841
4816#ifdef CONFIG_CGROUP_DEBUG 4842#ifdef CONFIG_CGROUP_DEBUG
4817static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 4843static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
4818 struct cgroup *cont) 4844 struct cgroup *cont)
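
cgroup_css_from_dir() above resolves a subsystem state from a file opened on a cgroupfs directory. A hedged sketch of how a caller might use it with a file descriptor handed in from userspace (example_css_from_fd is a hypothetical wrapper; a real caller must keep the file, or a css reference, pinned for as long as it uses the returned state):

#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/file.h>

/* Hypothetical wrapper, for illustration only. */
static struct cgroup_subsys_state *example_css_from_fd(int fd, int subsys_id)
{
	struct file *file = fget(fd);
	struct cgroup_subsys_state *css;

	if (!file)
		return ERR_PTR(-EBADF);

	css = cgroup_css_from_dir(file, subsys_id);

	/* NOTE: real callers hold the file (or a css ref) while using css. */
	fput(file);
	return css;
}
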
diff --git a/kernel/compat.c b/kernel/compat.c
index c9e2ec0b34a8..38b1d2c1cbe8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o,
52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; 52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
53} 53}
54 54
55static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
56{
57 memset(txc, 0, sizeof(struct timex));
58
59 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
60 __get_user(txc->modes, &utp->modes) ||
61 __get_user(txc->offset, &utp->offset) ||
62 __get_user(txc->freq, &utp->freq) ||
63 __get_user(txc->maxerror, &utp->maxerror) ||
64 __get_user(txc->esterror, &utp->esterror) ||
65 __get_user(txc->status, &utp->status) ||
66 __get_user(txc->constant, &utp->constant) ||
67 __get_user(txc->precision, &utp->precision) ||
68 __get_user(txc->tolerance, &utp->tolerance) ||
69 __get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
70 __get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
71 __get_user(txc->tick, &utp->tick) ||
72 __get_user(txc->ppsfreq, &utp->ppsfreq) ||
73 __get_user(txc->jitter, &utp->jitter) ||
74 __get_user(txc->shift, &utp->shift) ||
75 __get_user(txc->stabil, &utp->stabil) ||
76 __get_user(txc->jitcnt, &utp->jitcnt) ||
77 __get_user(txc->calcnt, &utp->calcnt) ||
78 __get_user(txc->errcnt, &utp->errcnt) ||
79 __get_user(txc->stbcnt, &utp->stbcnt))
80 return -EFAULT;
81
82 return 0;
83}
84
85static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
86{
87 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
88 __put_user(txc->modes, &utp->modes) ||
89 __put_user(txc->offset, &utp->offset) ||
90 __put_user(txc->freq, &utp->freq) ||
91 __put_user(txc->maxerror, &utp->maxerror) ||
92 __put_user(txc->esterror, &utp->esterror) ||
93 __put_user(txc->status, &utp->status) ||
94 __put_user(txc->constant, &utp->constant) ||
95 __put_user(txc->precision, &utp->precision) ||
96 __put_user(txc->tolerance, &utp->tolerance) ||
97 __put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
98 __put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
99 __put_user(txc->tick, &utp->tick) ||
100 __put_user(txc->ppsfreq, &utp->ppsfreq) ||
101 __put_user(txc->jitter, &utp->jitter) ||
102 __put_user(txc->shift, &utp->shift) ||
103 __put_user(txc->stabil, &utp->stabil) ||
104 __put_user(txc->jitcnt, &utp->jitcnt) ||
105 __put_user(txc->calcnt, &utp->calcnt) ||
106 __put_user(txc->errcnt, &utp->errcnt) ||
107 __put_user(txc->stbcnt, &utp->stbcnt) ||
108 __put_user(txc->tai, &utp->tai))
109 return -EFAULT;
110 return 0;
111}
112
55asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, 113asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
56 struct timezone __user *tz) 114 struct timezone __user *tz)
57{ 115{
@@ -617,6 +675,29 @@ long compat_sys_clock_gettime(clockid_t which_clock,
617 return err; 675 return err;
618} 676}
619 677
678long compat_sys_clock_adjtime(clockid_t which_clock,
679 struct compat_timex __user *utp)
680{
681 struct timex txc;
682 mm_segment_t oldfs;
683 int err, ret;
684
685 err = compat_get_timex(&txc, utp);
686 if (err)
687 return err;
688
689 oldfs = get_fs();
690 set_fs(KERNEL_DS);
691 ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
692 set_fs(oldfs);
693
694 err = compat_put_timex(utp, &txc);
695 if (err)
696 return err;
697
698 return ret;
699}
700
620long compat_sys_clock_getres(clockid_t which_clock, 701long compat_sys_clock_getres(clockid_t which_clock,
621 struct compat_timespec __user *tp) 702 struct compat_timespec __user *tp)
622{ 703{
@@ -951,58 +1032,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
951asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) 1032asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
952{ 1033{
953 struct timex txc; 1034 struct timex txc;
954 int ret; 1035 int err, ret;
955
956 memset(&txc, 0, sizeof(struct timex));
957 1036
958 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || 1037 err = compat_get_timex(&txc, utp);
959 __get_user(txc.modes, &utp->modes) || 1038 if (err)
960 __get_user(txc.offset, &utp->offset) || 1039 return err;
961 __get_user(txc.freq, &utp->freq) ||
962 __get_user(txc.maxerror, &utp->maxerror) ||
963 __get_user(txc.esterror, &utp->esterror) ||
964 __get_user(txc.status, &utp->status) ||
965 __get_user(txc.constant, &utp->constant) ||
966 __get_user(txc.precision, &utp->precision) ||
967 __get_user(txc.tolerance, &utp->tolerance) ||
968 __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
969 __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
970 __get_user(txc.tick, &utp->tick) ||
971 __get_user(txc.ppsfreq, &utp->ppsfreq) ||
972 __get_user(txc.jitter, &utp->jitter) ||
973 __get_user(txc.shift, &utp->shift) ||
974 __get_user(txc.stabil, &utp->stabil) ||
975 __get_user(txc.jitcnt, &utp->jitcnt) ||
976 __get_user(txc.calcnt, &utp->calcnt) ||
977 __get_user(txc.errcnt, &utp->errcnt) ||
978 __get_user(txc.stbcnt, &utp->stbcnt))
979 return -EFAULT;
980 1040
981 ret = do_adjtimex(&txc); 1041 ret = do_adjtimex(&txc);
982 1042
983 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || 1043 err = compat_put_timex(utp, &txc);
984 __put_user(txc.modes, &utp->modes) || 1044 if (err)
985 __put_user(txc.offset, &utp->offset) || 1045 return err;
986 __put_user(txc.freq, &utp->freq) ||
987 __put_user(txc.maxerror, &utp->maxerror) ||
988 __put_user(txc.esterror, &utp->esterror) ||
989 __put_user(txc.status, &utp->status) ||
990 __put_user(txc.constant, &utp->constant) ||
991 __put_user(txc.precision, &utp->precision) ||
992 __put_user(txc.tolerance, &utp->tolerance) ||
993 __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
994 __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
995 __put_user(txc.tick, &utp->tick) ||
996 __put_user(txc.ppsfreq, &utp->ppsfreq) ||
997 __put_user(txc.jitter, &utp->jitter) ||
998 __put_user(txc.shift, &utp->shift) ||
999 __put_user(txc.stabil, &utp->stabil) ||
1000 __put_user(txc.jitcnt, &utp->jitcnt) ||
1001 __put_user(txc.calcnt, &utp->calcnt) ||
1002 __put_user(txc.errcnt, &utp->errcnt) ||
1003 __put_user(txc.stbcnt, &utp->stbcnt) ||
1004 __put_user(txc.tai, &utp->tai))
1005 ret = -EFAULT;
1006 1046
1007 return ret; 1047 return ret;
1008} 1048}
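
For context on what the refactored compat path above services: a 32-bit binary calling adjtimex(2), or the new clock_adjtime(2), has its struct converted by compat_get_timex()/compat_put_timex(). A minimal userspace query, shown with plain adjtimex() and modes == 0 so nothing is actually changed:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { .modes = 0 };	/* read-only query */
	int state = adjtimex(&tx);

	if (state == -1) {
		perror("adjtimex");
		return 1;
	}
	/* offset is in microseconds (nanoseconds if STA_NANO is set) */
	printf("clock state %d, offset %ld, freq %ld\n",
	       state, tx.offset, tx.freq);
	return 0;
}
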
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 156cc5556140..12b7458f23b1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -126,7 +126,7 @@ static void cpu_hotplug_done(void)
126#else /* #if CONFIG_HOTPLUG_CPU */ 126#else /* #if CONFIG_HOTPLUG_CPU */
127static void cpu_hotplug_begin(void) {} 127static void cpu_hotplug_begin(void) {}
128static void cpu_hotplug_done(void) {} 128static void cpu_hotplug_done(void) {}
129#endif /* #esle #if CONFIG_HOTPLUG_CPU */ 129#endif /* #else #if CONFIG_HOTPLUG_CPU */
130 130
131/* Need to know about CPUs going up/down? */ 131/* Need to know about CPUs going up/down? */
132int __ref register_cpu_notifier(struct notifier_block *nb) 132int __ref register_cpu_notifier(struct notifier_block *nb)
@@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v)
160{ 160{
161 BUG_ON(cpu_notify(val, v)); 161 BUG_ON(cpu_notify(val, v));
162} 162}
163
164EXPORT_SYMBOL(register_cpu_notifier); 163EXPORT_SYMBOL(register_cpu_notifier);
165 164
166void __ref unregister_cpu_notifier(struct notifier_block *nb) 165void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param)
205 return err; 204 return err;
206 205
207 cpu_notify(CPU_DYING | param->mod, param->hcpu); 206 cpu_notify(CPU_DYING | param->mod, param->hcpu);
208
209 return 0; 207 return 0;
210} 208}
211 209
@@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
227 return -EINVAL; 225 return -EINVAL;
228 226
229 cpu_hotplug_begin(); 227 cpu_hotplug_begin();
228
230 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); 229 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
231 if (err) { 230 if (err) {
232 nr_calls--; 231 nr_calls--;
@@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
304 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 303 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
305 if (ret) { 304 if (ret) {
306 nr_calls--; 305 nr_calls--;
307 printk("%s: attempt to bring up CPU %u failed\n", 306 printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
308 __func__, cpu); 307 __func__, cpu);
309 goto out_notify; 308 goto out_notify;
310 } 309 }
@@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void)
450 if (cpumask_empty(frozen_cpus)) 449 if (cpumask_empty(frozen_cpus))
451 goto out; 450 goto out;
452 451
453 printk("Enabling non-boot CPUs ...\n"); 452 printk(KERN_INFO "Enabling non-boot CPUs ...\n");
454 453
455 arch_enable_nonboot_cpus_begin(); 454 arch_enable_nonboot_cpus_begin();
456 455
457 for_each_cpu(cpu, frozen_cpus) { 456 for_each_cpu(cpu, frozen_cpus) {
458 error = _cpu_up(cpu, 1); 457 error = _cpu_up(cpu, 1);
459 if (!error) { 458 if (!error) {
460 printk("CPU%d is up\n", cpu); 459 printk(KERN_INFO "CPU%d is up\n", cpu);
461 continue; 460 continue;
462 } 461 }
463 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 462 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
@@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
509 */ 508 */
510 509
511/* cpu_bit_bitmap[0] is empty - so we can back into it */ 510/* cpu_bit_bitmap[0] is empty - so we can back into it */
512#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) 511#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x))
513#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) 512#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
514#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) 513#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
515#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) 514#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4349935c2ad8..33eee16addb8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
1015 struct cpuset *cs; 1015 struct cpuset *cs;
1016 int migrate; 1016 int migrate;
1017 const nodemask_t *oldmem = scan->data; 1017 const nodemask_t *oldmem = scan->data;
1018 NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); 1018 static nodemask_t newmems; /* protected by cgroup_mutex */
1019
1020 if (!newmems)
1021 return;
1022 1019
1023 cs = cgroup_cs(scan->cg); 1020 cs = cgroup_cs(scan->cg);
1024 guarantee_online_mems(cs, newmems); 1021 guarantee_online_mems(cs, &newmems);
1025
1026 cpuset_change_task_nodemask(p, newmems);
1027 1022
1028 NODEMASK_FREE(newmems); 1023 cpuset_change_task_nodemask(p, &newmems);
1029 1024
1030 mm = get_task_mm(p); 1025 mm = get_task_mm(p);
1031 if (!mm) 1026 if (!mm)
@@ -1438,44 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1438 struct mm_struct *mm; 1433 struct mm_struct *mm;
1439 struct cpuset *cs = cgroup_cs(cont); 1434 struct cpuset *cs = cgroup_cs(cont);
1440 struct cpuset *oldcs = cgroup_cs(oldcont); 1435 struct cpuset *oldcs = cgroup_cs(oldcont);
1441 NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); 1436 static nodemask_t to; /* protected by cgroup_mutex */
1442 NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
1443
1444 if (from == NULL || to == NULL)
1445 goto alloc_fail;
1446 1437
1447 if (cs == &top_cpuset) { 1438 if (cs == &top_cpuset) {
1448 cpumask_copy(cpus_attach, cpu_possible_mask); 1439 cpumask_copy(cpus_attach, cpu_possible_mask);
1449 } else { 1440 } else {
1450 guarantee_online_cpus(cs, cpus_attach); 1441 guarantee_online_cpus(cs, cpus_attach);
1451 } 1442 }
1452 guarantee_online_mems(cs, to); 1443 guarantee_online_mems(cs, &to);
1453 1444
1454 /* do per-task migration stuff possibly for each in the threadgroup */ 1445 /* do per-task migration stuff possibly for each in the threadgroup */
1455 cpuset_attach_task(tsk, to, cs); 1446 cpuset_attach_task(tsk, &to, cs);
1456 if (threadgroup) { 1447 if (threadgroup) {
1457 struct task_struct *c; 1448 struct task_struct *c;
1458 rcu_read_lock(); 1449 rcu_read_lock();
1459 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1450 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1460 cpuset_attach_task(c, to, cs); 1451 cpuset_attach_task(c, &to, cs);
1461 } 1452 }
1462 rcu_read_unlock(); 1453 rcu_read_unlock();
1463 } 1454 }
1464 1455
1465 /* change mm; only needs to be done once even if threadgroup */ 1456 /* change mm; only needs to be done once even if threadgroup */
1466 *from = oldcs->mems_allowed; 1457 to = cs->mems_allowed;
1467 *to = cs->mems_allowed;
1468 mm = get_task_mm(tsk); 1458 mm = get_task_mm(tsk);
1469 if (mm) { 1459 if (mm) {
1470 mpol_rebind_mm(mm, to); 1460 mpol_rebind_mm(mm, &to);
1471 if (is_memory_migrate(cs)) 1461 if (is_memory_migrate(cs))
1472 cpuset_migrate_mm(mm, from, to); 1462 cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
1473 mmput(mm); 1463 mmput(mm);
1474 } 1464 }
1475
1476alloc_fail:
1477 NODEMASK_FREE(from);
1478 NODEMASK_FREE(to);
1479} 1465}
1480 1466
1481/* The various types of files and directories in a cpuset file system */ 1467/* The various types of files and directories in a cpuset file system */
@@ -1575,8 +1561,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1575 return -ENODEV; 1561 return -ENODEV;
1576 1562
1577 trialcs = alloc_trial_cpuset(cs); 1563 trialcs = alloc_trial_cpuset(cs);
1578 if (!trialcs) 1564 if (!trialcs) {
1579 return -ENOMEM; 1565 retval = -ENOMEM;
1566 goto out;
1567 }
1580 1568
1581 switch (cft->private) { 1569 switch (cft->private) {
1582 case FILE_CPULIST: 1570 case FILE_CPULIST:
@@ -1591,6 +1579,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1591 } 1579 }
1592 1580
1593 free_trial_cpuset(trialcs); 1581 free_trial_cpuset(trialcs);
1582out:
1594 cgroup_unlock(); 1583 cgroup_unlock();
1595 return retval; 1584 return retval;
1596} 1585}
@@ -1607,34 +1596,26 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1607 * across a page fault. 1596 * across a page fault.
1608 */ 1597 */
1609 1598
1610static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) 1599static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1611{ 1600{
1612 int ret; 1601 size_t count;
1613 1602
1614 mutex_lock(&callback_mutex); 1603 mutex_lock(&callback_mutex);
1615 ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); 1604 count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
1616 mutex_unlock(&callback_mutex); 1605 mutex_unlock(&callback_mutex);
1617 1606
1618 return ret; 1607 return count;
1619} 1608}
1620 1609
1621static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1610static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1622{ 1611{
1623 NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); 1612 size_t count;
1624 int retval;
1625
1626 if (mask == NULL)
1627 return -ENOMEM;
1628 1613
1629 mutex_lock(&callback_mutex); 1614 mutex_lock(&callback_mutex);
1630 *mask = cs->mems_allowed; 1615 count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
1631 mutex_unlock(&callback_mutex); 1616 mutex_unlock(&callback_mutex);
1632 1617
1633 retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); 1618 return count;
1634
1635 NODEMASK_FREE(mask);
1636
1637 return retval;
1638} 1619}
1639 1620
1640static ssize_t cpuset_common_file_read(struct cgroup *cont, 1621static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1859,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1859 cs = cgroup_cs(cgroup); 1840 cs = cgroup_cs(cgroup);
1860 parent_cs = cgroup_cs(parent); 1841 parent_cs = cgroup_cs(parent);
1861 1842
1843 mutex_lock(&callback_mutex);
1862 cs->mems_allowed = parent_cs->mems_allowed; 1844 cs->mems_allowed = parent_cs->mems_allowed;
1863 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); 1845 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1846 mutex_unlock(&callback_mutex);
1864 return; 1847 return;
1865} 1848}
1866 1849
@@ -2063,10 +2046,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2063 struct cpuset *cp; /* scans cpusets being updated */ 2046 struct cpuset *cp; /* scans cpusets being updated */
2064 struct cpuset *child; /* scans child cpusets of cp */ 2047 struct cpuset *child; /* scans child cpusets of cp */
2065 struct cgroup *cont; 2048 struct cgroup *cont;
2066 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); 2049 static nodemask_t oldmems; /* protected by cgroup_mutex */
2067
2068 if (oldmems == NULL)
2069 return;
2070 2050
2071 list_add_tail((struct list_head *)&root->stack_list, &queue); 2051 list_add_tail((struct list_head *)&root->stack_list, &queue);
2072 2052
@@ -2083,7 +2063,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2083 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2063 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2084 continue; 2064 continue;
2085 2065
2086 *oldmems = cp->mems_allowed; 2066 oldmems = cp->mems_allowed;
2087 2067
2088 /* Remove offline cpus and mems from this cpuset. */ 2068 /* Remove offline cpus and mems from this cpuset. */
2089 mutex_lock(&callback_mutex); 2069 mutex_lock(&callback_mutex);
@@ -2099,10 +2079,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2099 remove_tasks_in_empty_cpuset(cp); 2079 remove_tasks_in_empty_cpuset(cp);
2100 else { 2080 else {
2101 update_tasks_cpumask(cp, NULL); 2081 update_tasks_cpumask(cp, NULL);
2102 update_tasks_nodemask(cp, oldmems, NULL); 2082 update_tasks_nodemask(cp, &oldmems, NULL);
2103 } 2083 }
2104 } 2084 }
2105 NODEMASK_FREE(oldmems);
2106} 2085}
2107 2086
2108/* 2087/*
@@ -2144,19 +2123,16 @@ void cpuset_update_active_cpus(void)
2144static int cpuset_track_online_nodes(struct notifier_block *self, 2123static int cpuset_track_online_nodes(struct notifier_block *self,
2145 unsigned long action, void *arg) 2124 unsigned long action, void *arg)
2146{ 2125{
2147 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); 2126 static nodemask_t oldmems; /* protected by cgroup_mutex */
2148
2149 if (oldmems == NULL)
2150 return NOTIFY_DONE;
2151 2127
2152 cgroup_lock(); 2128 cgroup_lock();
2153 switch (action) { 2129 switch (action) {
2154 case MEM_ONLINE: 2130 case MEM_ONLINE:
2155 *oldmems = top_cpuset.mems_allowed; 2131 oldmems = top_cpuset.mems_allowed;
2156 mutex_lock(&callback_mutex); 2132 mutex_lock(&callback_mutex);
2157 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2133 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2158 mutex_unlock(&callback_mutex); 2134 mutex_unlock(&callback_mutex);
2159 update_tasks_nodemask(&top_cpuset, oldmems, NULL); 2135 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2160 break; 2136 break;
2161 case MEM_OFFLINE: 2137 case MEM_OFFLINE:
2162 /* 2138 /*
@@ -2170,7 +2146,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2170 } 2146 }
2171 cgroup_unlock(); 2147 cgroup_unlock();
2172 2148
2173 NODEMASK_FREE(oldmems);
2174 return NOTIFY_OK; 2149 return NOTIFY_OK;
2175} 2150}
2176#endif 2151#endif
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..5f85690285d4
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,34 @@
1#include <linux/kernel.h>
2#include <linux/crash_dump.h>
3#include <linux/init.h>
4#include <linux/errno.h>
5#include <linux/module.h>
6
7/*
8 * If we have booted due to a crash, max_pfn will be a very low value. We need
9 * to know the amount of memory that the previous kernel used.
10 */
11unsigned long saved_max_pfn;
12
13/*
14 * stores the physical address of elf header of crash image
15 *
16 * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
17 * is_kdump_kernel() to determine if we are booting after a panic. Hence put
18 * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
19 */
20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
21
22/*
23 * elfcorehdr= specifies the location of elf core header stored by the crashed
24 * kernel. This option will be passed by kexec loader to the capture kernel.
25 */
26static int __init setup_elfcorehdr(char *arg)
27{
28 char *end;
29 if (!arg)
30 return -EINVAL;
31 elfcorehdr_addr = memparse(arg, &end);
32 return end > arg ? 0 : -EINVAL;
33}
34early_param("elfcorehdr", setup_elfcorehdr);
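
The new kernel/crash_dump.c only parses elfcorehdr= and exports elfcorehdr_addr/saved_max_pfn; consumers key off is_kdump_kernel(), which becomes true once a valid address has been parsed. On the capture kernel's command line the kexec loader passes something like elfcorehdr=<address>, which setup_elfcorehdr() reads via memparse(). A minimal sketch of a consumer (example_detect_kdump is a hypothetical initcall, shown only to illustrate the API):

#include <linux/crash_dump.h>
#include <linux/init.h>
#include <linux/kernel.h>

/* Hypothetical initcall, for illustration only. */
static int __init example_detect_kdump(void)
{
	if (is_kdump_kernel())
		pr_info("capture kernel: ELF core headers at 0x%llx\n",
			elfcorehdr_addr);
	return 0;
}
late_initcall(example_detect_kdump);
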
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a9d6dd53a6c..5557b55048df 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar;
35static struct thread_group_cred init_tgcred = { 35static struct thread_group_cred init_tgcred = {
36 .usage = ATOMIC_INIT(2), 36 .usage = ATOMIC_INIT(2),
37 .tgid = 0, 37 .tgid = 0,
38 .lock = SPIN_LOCK_UNLOCKED, 38 .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
39}; 39};
40#endif 40#endif
41 41
@@ -741,6 +741,12 @@ int set_create_files_as(struct cred *new, struct inode *inode)
741} 741}
742EXPORT_SYMBOL(set_create_files_as); 742EXPORT_SYMBOL(set_create_files_as);
743 743
744struct user_namespace *current_user_ns(void)
745{
746 return _current_user_ns();
747}
748EXPORT_SYMBOL(current_user_ns);
749
744#ifdef CONFIG_DEBUG_CREDENTIALS 750#ifdef CONFIG_DEBUG_CREDENTIALS
745 751
746bool creds_are_invalid(const struct cred *cred) 752bool creds_are_invalid(const struct cred *cred)
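
current_user_ns() is now out of line and exported, so modules can pair it with the ns_capable() checks added in kernel/capability.c. A one-line illustration (example_may_admin is a hypothetical name):

#include <linux/capability.h>
#include <linux/cred.h>

/* Hypothetical helper, for illustration only. */
static bool example_may_admin(void)
{
	return ns_capable(current_user_ns(), CAP_SYS_ADMIN);
}
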
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index cefd4a11f6d9..bad6786dee88 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -538,7 +538,7 @@ return_normal:
538 538
539 /* 539 /*
540 * For single stepping, try to only enter on the processor 540 * For single stepping, try to only enter on the processor
541 * that was single stepping. To gaurd against a deadlock, the 541 * that was single stepping. To guard against a deadlock, the
542 * kernel will only try for the value of sstep_tries before 542 * kernel will only try for the value of sstep_tries before
543 * giving up and continuing on. 543 * giving up and continuing on.
544 */ 544 */
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 481a7bd2dfe7..a11db956dd62 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
1093 put_packet(remcom_out_buffer); 1093 put_packet(remcom_out_buffer);
1094 return 0; 1094 return 0;
1095} 1095}
1096
1097/**
1098 * gdbstub_exit - Send an exit message to GDB
1099 * @status: The exit code to report.
1100 */
1101void gdbstub_exit(int status)
1102{
1103 unsigned char checksum, ch, buffer[3];
1104 int loop;
1105
1106 buffer[0] = 'W';
1107 buffer[1] = hex_asc_hi(status);
1108 buffer[2] = hex_asc_lo(status);
1109
1110 dbg_io_ops->write_char('$');
1111 checksum = 0;
1112
1113 for (loop = 0; loop < 3; loop++) {
1114 ch = buffer[loop];
1115 checksum += ch;
1116 dbg_io_ops->write_char(ch);
1117 }
1118
1119 dbg_io_ops->write_char('#');
1120 dbg_io_ops->write_char(hex_asc_hi(checksum));
1121 dbg_io_ops->write_char(hex_asc_lo(checksum));
1122
1123 /* make sure the output is flushed, lest the bootloader clobber it */
1124 dbg_io_ops->flush();
1125}
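
gdbstub_exit() above emits a GDB remote serial protocol 'W' (process exited) packet: '$', the payload, '#', then two hex checksum digits, where the checksum is the modulo-256 sum of the payload bytes. A small host-side sketch of the same framing, for reference (for exit status 0 the bytes on the wire are "$W00#b7"):

#include <stdio.h>

int main(void)
{
	const char payload[] = "W00";	/* 'W' plus two hex digits of status */
	unsigned char csum = 0;
	const char *p;

	for (p = payload; *p; p++)
		csum += (unsigned char)*p;

	printf("$%s#%02x\n", payload, csum);	/* prints: $W00#b7 */
	return 0;
}
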
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index bd3e8e29caa3..be14779bcef6 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -78,7 +78,7 @@ static unsigned int kdb_continue_catastrophic;
78static kdbtab_t *kdb_commands; 78static kdbtab_t *kdb_commands;
79#define KDB_BASE_CMD_MAX 50 79#define KDB_BASE_CMD_MAX 50
80static int kdb_max_commands = KDB_BASE_CMD_MAX; 80static int kdb_max_commands = KDB_BASE_CMD_MAX;
81static kdbtab_t kdb_base_commands[50]; 81static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX];
82#define for_each_kdbcmd(cmd, num) \ 82#define for_each_kdbcmd(cmd, num) \
83 for ((cmd) = kdb_base_commands, (num) = 0; \ 83 for ((cmd) = kdb_base_commands, (num) = 0; \
84 num < kdb_max_commands; \ 84 num < kdb_max_commands; \
@@ -441,9 +441,9 @@ static int kdb_check_regs(void)
441 * symbol name, and offset to the caller. 441 * symbol name, and offset to the caller.
442 * 442 *
443 * The argument may consist of a numeric value (decimal or 443 * The argument may consist of a numeric value (decimal or
444 * hexidecimal), a symbol name, a register name (preceeded by the 444 * hexidecimal), a symbol name, a register name (preceded by the
445 * percent sign), an environment variable with a numeric value 445 * percent sign), an environment variable with a numeric value
446 * (preceeded by a dollar sign) or a simple arithmetic expression 446 * (preceded by a dollar sign) or a simple arithmetic expression
447 * consisting of a symbol name, +/-, and a numeric constant value 447 * consisting of a symbol name, +/-, and a numeric constant value
448 * (offset). 448 * (offset).
449 * Parameters: 449 * Parameters:
@@ -1335,7 +1335,7 @@ void kdb_print_state(const char *text, int value)
1335 * error The hardware-defined error code 1335 * error The hardware-defined error code
1336 * reason2 kdb's current reason code. 1336 * reason2 kdb's current reason code.
1337 * Initially error but can change 1337 * Initially error but can change
1338 * acording to kdb state. 1338 * according to kdb state.
1339 * db_result Result code from break or debug point. 1339 * db_result Result code from break or debug point.
1340 * regs The exception frame at time of fault/breakpoint. 1340 * regs The exception frame at time of fault/breakpoint.
1341 * should always be valid. 1341 * should always be valid.
@@ -2892,7 +2892,7 @@ static void __init kdb_inittab(void)
2892 "Send a signal to a process", 0, KDB_REPEAT_NONE); 2892 "Send a signal to a process", 0, KDB_REPEAT_NONE);
2893 kdb_register_repeat("summary", kdb_summary, "", 2893 kdb_register_repeat("summary", kdb_summary, "",
2894 "Summarize the system", 4, KDB_REPEAT_NONE); 2894 "Summarize the system", 4, KDB_REPEAT_NONE);
2895 kdb_register_repeat("per_cpu", kdb_per_cpu, "", 2895 kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
2896 "Display per_cpu variables", 3, KDB_REPEAT_NONE); 2896 "Display per_cpu variables", 3, KDB_REPEAT_NONE);
2897 kdb_register_repeat("grephelp", kdb_grep_help, "", 2897 kdb_register_repeat("grephelp", kdb_grep_help, "",
2898 "Display help on | grep", 0, KDB_REPEAT_NONE); 2898 "Display help on | grep", 0, KDB_REPEAT_NONE);
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 6b2485dcb050..5532dd37aa86 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -545,7 +545,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
545 * Mask for process state. 545 * Mask for process state.
546 * Notes: 546 * Notes:
547 * The mask folds data from several sources into a single long value, so 547 * The mask folds data from several sources into a single long value, so
548 * be carefull not to overlap the bits. TASK_* bits are in the LSB, 548 * be careful not to overlap the bits. TASK_* bits are in the LSB,
549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there 549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
550 * is no overlap between TASK_* and EXIT_* but that may not always be 550 * is no overlap between TASK_* and EXIT_* but that may not always be
551 * true, so EXIT_* bits are shifted left 16 bits before being stored in 551 * true, so EXIT_* bits are shifted left 16 bits before being stored in
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45ebcc7b1..8dd874181542 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -841,7 +841,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
841 /* Let father know we died 841 /* Let father know we died
842 * 842 *
843 * Thread signals are configurable, but you aren't going to use 843 * Thread signals are configurable, but you aren't going to use
844 * that to send signals to arbitary processes. 844 * that to send signals to arbitrary processes.
845 * That stops right now. 845 * That stops right now.
846 * 846 *
847 * If the parent exec id doesn't match the exec id we saved 847 * If the parent exec id doesn't match the exec id we saved
@@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code)
908 profile_task_exit(tsk); 908 profile_task_exit(tsk);
909 909
910 WARN_ON(atomic_read(&tsk->fs_excl)); 910 WARN_ON(atomic_read(&tsk->fs_excl));
911 WARN_ON(blk_needs_flush_plug(tsk));
911 912
912 if (unlikely(in_interrupt())) 913 if (unlikely(in_interrupt()))
913 panic("Aiee, killing interrupt handler!"); 914 panic("Aiee, killing interrupt handler!");
@@ -1015,7 +1016,7 @@ NORET_TYPE void do_exit(long code)
1015 /* 1016 /*
1016 * FIXME: do that only when needed, using sched_exit tracepoint 1017 * FIXME: do that only when needed, using sched_exit tracepoint
1017 */ 1018 */
1018 flush_ptrace_hw_breakpoint(tsk); 1019 ptrace_put_breakpoints(tsk);
1019 1020
1020 exit_notify(tsk, group_dead); 1021 exit_notify(tsk, group_dead);
1021#ifdef CONFIG_NUMA 1022#ifdef CONFIG_NUMA
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e429152ddc..e7548dee636b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
40#include <linux/tracehook.h> 40#include <linux/tracehook.h>
41#include <linux/futex.h> 41#include <linux/futex.h>
42#include <linux/compat.h> 42#include <linux/compat.h>
43#include <linux/kthread.h>
43#include <linux/task_io_accounting_ops.h> 44#include <linux/task_io_accounting_ops.h>
44#include <linux/rcupdate.h> 45#include <linux/rcupdate.h>
45#include <linux/ptrace.h> 46#include <linux/ptrace.h>
@@ -109,20 +110,25 @@ int nr_processes(void)
109} 110}
110 111
111#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 112#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
112# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) 113# define alloc_task_struct_node(node) \
113# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) 114 kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
115# define free_task_struct(tsk) \
116 kmem_cache_free(task_struct_cachep, (tsk))
114static struct kmem_cache *task_struct_cachep; 117static struct kmem_cache *task_struct_cachep;
115#endif 118#endif
116 119
117#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR 120#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
118static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) 121static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
122 int node)
119{ 123{
120#ifdef CONFIG_DEBUG_STACK_USAGE 124#ifdef CONFIG_DEBUG_STACK_USAGE
121 gfp_t mask = GFP_KERNEL | __GFP_ZERO; 125 gfp_t mask = GFP_KERNEL | __GFP_ZERO;
122#else 126#else
123 gfp_t mask = GFP_KERNEL; 127 gfp_t mask = GFP_KERNEL;
124#endif 128#endif
125 return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); 129 struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
130
131 return page ? page_address(page) : NULL;
126} 132}
127 133
128static inline void free_thread_info(struct thread_info *ti) 134static inline void free_thread_info(struct thread_info *ti)
@@ -193,6 +199,7 @@ void __put_task_struct(struct task_struct *tsk)
193 if (!profile_handoff_task(tsk)) 199 if (!profile_handoff_task(tsk))
194 free_task(tsk); 200 free_task(tsk);
195} 201}
202EXPORT_SYMBOL_GPL(__put_task_struct);
196 203
197/* 204/*
198 * macro override instead of weak attribute alias, to workaround 205 * macro override instead of weak attribute alias, to workaround
@@ -248,16 +255,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
248 struct task_struct *tsk; 255 struct task_struct *tsk;
249 struct thread_info *ti; 256 struct thread_info *ti;
250 unsigned long *stackend; 257 unsigned long *stackend;
251 258 int node = tsk_fork_get_node(orig);
252 int err; 259 int err;
253 260
254 prepare_to_copy(orig); 261 prepare_to_copy(orig);
255 262
256 tsk = alloc_task_struct(); 263 tsk = alloc_task_struct_node(node);
257 if (!tsk) 264 if (!tsk)
258 return NULL; 265 return NULL;
259 266
260 ti = alloc_thread_info(tsk); 267 ti = alloc_thread_info_node(tsk, node);
261 if (!ti) { 268 if (!ti) {
262 free_task_struct(tsk); 269 free_task_struct(tsk);
263 return NULL; 270 return NULL;
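
The dup_task_struct() hunk above now threads the NUMA node of the forking task (tsk_fork_get_node()) through both allocations and still unwinds cleanly if the second one fails. A minimal user-space sketch of that shape, with node awareness reduced to a plain parameter; all names (alloc_task_on_node, alloc_stack_on_node, dup_task) are invented for illustration and are not kernel APIs:

#include <stdio.h>
#include <stdlib.h>

struct task  { int node; };
struct stack { int node; };

/* Stand-ins for alloc_task_struct_node()/alloc_thread_info_node(). */
static struct task *alloc_task_on_node(int node)
{
	struct task *t = malloc(sizeof(*t));
	if (t)
		t->node = node;
	return t;
}

static struct stack *alloc_stack_on_node(int node)
{
	struct stack *s = malloc(sizeof(*s));
	if (s)
		s->node = node;
	return s;
}

/* Mirrors dup_task_struct(): both allocations use the same node, and a
 * failure of the second frees the first before returning NULL. */
static struct task *dup_task(int node, struct stack **stackp)
{
	struct task *t = alloc_task_on_node(node);
	if (!t)
		return NULL;

	*stackp = alloc_stack_on_node(node);
	if (!*stackp) {
		free(t);
		return NULL;
	}
	return t;
}

int main(void)
{
	struct stack *st = NULL;
	struct task *t = dup_task(0, &st);

	if (t)
		printf("task and stack allocated on node %d\n", t->node);
	free(st);
	free(t);
	return 0;
}
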
@@ -1180,12 +1187,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1180 pid = alloc_pid(p->nsproxy->pid_ns); 1187 pid = alloc_pid(p->nsproxy->pid_ns);
1181 if (!pid) 1188 if (!pid)
1182 goto bad_fork_cleanup_io; 1189 goto bad_fork_cleanup_io;
1183
1184 if (clone_flags & CLONE_NEWPID) {
1185 retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
1186 if (retval < 0)
1187 goto bad_fork_free_pid;
1188 }
1189 } 1190 }
1190 1191
1191 p->pid = pid_nr(pid); 1192 p->pid = pid_nr(pid);
@@ -1204,6 +1205,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1204 * Clear TID on mm_release()? 1205 * Clear TID on mm_release()?
1205 */ 1206 */
1206 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1207 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1208#ifdef CONFIG_BLOCK
1209 p->plug = NULL;
1210#endif
1207#ifdef CONFIG_FUTEX 1211#ifdef CONFIG_FUTEX
1208 p->robust_list = NULL; 1212 p->robust_list = NULL;
1209#ifdef CONFIG_COMPAT 1213#ifdef CONFIG_COMPAT
@@ -1289,7 +1293,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1289 tracehook_finish_clone(p, clone_flags, trace); 1293 tracehook_finish_clone(p, clone_flags, trace);
1290 1294
1291 if (thread_group_leader(p)) { 1295 if (thread_group_leader(p)) {
1292 if (clone_flags & CLONE_NEWPID) 1296 if (is_child_reaper(pid))
1293 p->nsproxy->pid_ns->child_reaper = p; 1297 p->nsproxy->pid_ns->child_reaper = p;
1294 1298
1295 p->signal->leader_pid = pid; 1299 p->signal->leader_pid = pid;
@@ -1512,38 +1516,24 @@ void __init proc_caches_init(void)
1512} 1516}
1513 1517
1514/* 1518/*
1515 * Check constraints on flags passed to the unshare system call and 1519 * Check constraints on flags passed to the unshare system call.
1516 * force unsharing of additional process context as appropriate.
1517 */ 1520 */
1518static void check_unshare_flags(unsigned long *flags_ptr) 1521static int check_unshare_flags(unsigned long unshare_flags)
1519{ 1522{
1523 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1524 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1525 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1526 return -EINVAL;
1520 /* 1527 /*
1521 * If unsharing a thread from a thread group, must also 1528 * Not implemented, but pretend it works if there is nothing to
1522 * unshare vm. 1529 * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
1523 */ 1530 * needs to unshare vm.
1524 if (*flags_ptr & CLONE_THREAD)
1525 *flags_ptr |= CLONE_VM;
1526
1527 /*
1528 * If unsharing vm, must also unshare signal handlers.
1529 */
1530 if (*flags_ptr & CLONE_VM)
1531 *flags_ptr |= CLONE_SIGHAND;
1532
1533 /*
1534 * If unsharing namespace, must also unshare filesystem information.
1535 */ 1531 */
1536 if (*flags_ptr & CLONE_NEWNS) 1532 if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
1537 *flags_ptr |= CLONE_FS; 1533 /* FIXME: get_task_mm() increments ->mm_users */
1538} 1534 if (atomic_read(&current->mm->mm_users) > 1)
1539 1535 return -EINVAL;
1540/* 1536 }
1541 * Unsharing of tasks created with CLONE_THREAD is not supported yet
1542 */
1543static int unshare_thread(unsigned long unshare_flags)
1544{
1545 if (unshare_flags & CLONE_THREAD)
1546 return -EINVAL;
1547 1537
1548 return 0; 1538 return 0;
1549} 1539}
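
The rewritten check_unshare_flags() above is pure bit-mask validation: unknown flags are rejected outright, and the thread/sighand/vm flags only "succeed" when the address space is not actually shared. A compilable user-space sketch of those two checks; the XCLONE_* constants are made-up values, not the kernel's CLONE_* definitions, and mm_users stands in for current->mm->mm_users:

#include <stdio.h>
#include <errno.h>

/* Illustrative flag values only; the real CLONE_* constants differ. */
#define XCLONE_FS      0x001
#define XCLONE_FILES   0x002
#define XCLONE_SIGHAND 0x004
#define XCLONE_VM      0x008
#define XCLONE_THREAD  0x010
#define XCLONE_NEWNS   0x020

#define SUPPORTED (XCLONE_FS | XCLONE_FILES | XCLONE_SIGHAND | \
		   XCLONE_VM | XCLONE_THREAD | XCLONE_NEWNS)

static int check_unshare_flags(unsigned long flags, int mm_users)
{
	/* Reject any bit outside the supported set. */
	if (flags & ~SUPPORTED)
		return -EINVAL;
	/* Pretend thread/sighand/vm unsharing works only when there is
	 * nothing to unshare, i.e. the mm is not shared with anyone. */
	if (flags & (XCLONE_THREAD | XCLONE_SIGHAND | XCLONE_VM)) {
		if (mm_users > 1)
			return -EINVAL;
	}
	return 0;
}

int main(void)
{
	printf("%d\n", check_unshare_flags(XCLONE_VM, 1)); /* 0 */
	printf("%d\n", check_unshare_flags(XCLONE_VM, 3)); /* -EINVAL */
	printf("%d\n", check_unshare_flags(0x8000, 1));    /* -EINVAL */
	return 0;
}
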
@@ -1570,34 +1560,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1570} 1560}
1571 1561
1572/* 1562/*
1573 * Unsharing of sighand is not supported yet
1574 */
1575static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
1576{
1577 struct sighand_struct *sigh = current->sighand;
1578
1579 if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
1580 return -EINVAL;
1581 else
1582 return 0;
1583}
1584
1585/*
1586 * Unshare vm if it is being shared
1587 */
1588static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
1589{
1590 struct mm_struct *mm = current->mm;
1591
1592 if ((unshare_flags & CLONE_VM) &&
1593 (mm && atomic_read(&mm->mm_users) > 1)) {
1594 return -EINVAL;
1595 }
1596
1597 return 0;
1598}
1599
1600/*
1601 * Unshare file descriptor table if it is being shared 1563 * Unshare file descriptor table if it is being shared
1602 */ 1564 */
1603static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) 1565static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
@@ -1625,45 +1587,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
1625 */ 1587 */
1626SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) 1588SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1627{ 1589{
1628 int err = 0;
1629 struct fs_struct *fs, *new_fs = NULL; 1590 struct fs_struct *fs, *new_fs = NULL;
1630 struct sighand_struct *new_sigh = NULL;
1631 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1632 struct files_struct *fd, *new_fd = NULL; 1591 struct files_struct *fd, *new_fd = NULL;
1633 struct nsproxy *new_nsproxy = NULL; 1592 struct nsproxy *new_nsproxy = NULL;
1634 int do_sysvsem = 0; 1593 int do_sysvsem = 0;
1594 int err;
1635 1595
1636 check_unshare_flags(&unshare_flags); 1596 err = check_unshare_flags(unshare_flags);
1637 1597 if (err)
1638 /* Return -EINVAL for all unsupported flags */
1639 err = -EINVAL;
1640 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1641 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1642 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1643 goto bad_unshare_out; 1598 goto bad_unshare_out;
1644 1599
1645 /* 1600 /*
1601 * If unsharing namespace, must also unshare filesystem information.
1602 */
1603 if (unshare_flags & CLONE_NEWNS)
1604 unshare_flags |= CLONE_FS;
1605 /*
1646 * CLONE_NEWIPC must also detach from the undolist: after switching 1606 * CLONE_NEWIPC must also detach from the undolist: after switching
1647 * to a new ipc namespace, the semaphore arrays from the old 1607 * to a new ipc namespace, the semaphore arrays from the old
1648 * namespace are unreachable. 1608 * namespace are unreachable.
1649 */ 1609 */
1650 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) 1610 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1651 do_sysvsem = 1; 1611 do_sysvsem = 1;
1652 if ((err = unshare_thread(unshare_flags)))
1653 goto bad_unshare_out;
1654 if ((err = unshare_fs(unshare_flags, &new_fs))) 1612 if ((err = unshare_fs(unshare_flags, &new_fs)))
1655 goto bad_unshare_cleanup_thread; 1613 goto bad_unshare_out;
1656 if ((err = unshare_sighand(unshare_flags, &new_sigh)))
1657 goto bad_unshare_cleanup_fs;
1658 if ((err = unshare_vm(unshare_flags, &new_mm)))
1659 goto bad_unshare_cleanup_sigh;
1660 if ((err = unshare_fd(unshare_flags, &new_fd))) 1614 if ((err = unshare_fd(unshare_flags, &new_fd)))
1661 goto bad_unshare_cleanup_vm; 1615 goto bad_unshare_cleanup_fs;
1662 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1616 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1663 new_fs))) 1617 new_fs)))
1664 goto bad_unshare_cleanup_fd; 1618 goto bad_unshare_cleanup_fd;
1665 1619
1666 if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { 1620 if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
1667 if (do_sysvsem) { 1621 if (do_sysvsem) {
1668 /* 1622 /*
1669 * CLONE_SYSVSEM is equivalent to sys_exit(). 1623 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1689,19 +1643,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1689 spin_unlock(&fs->lock); 1643 spin_unlock(&fs->lock);
1690 } 1644 }
1691 1645
1692 if (new_mm) {
1693 mm = current->mm;
1694 active_mm = current->active_mm;
1695 current->mm = new_mm;
1696 current->active_mm = new_mm;
1697 if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
1698 atomic_dec(&mm->oom_disable_count);
1699 atomic_inc(&new_mm->oom_disable_count);
1700 }
1701 activate_mm(active_mm, new_mm);
1702 new_mm = mm;
1703 }
1704
1705 if (new_fd) { 1646 if (new_fd) {
1706 fd = current->files; 1647 fd = current->files;
1707 current->files = new_fd; 1648 current->files = new_fd;
@@ -1718,20 +1659,10 @@ bad_unshare_cleanup_fd:
1718 if (new_fd) 1659 if (new_fd)
1719 put_files_struct(new_fd); 1660 put_files_struct(new_fd);
1720 1661
1721bad_unshare_cleanup_vm:
1722 if (new_mm)
1723 mmput(new_mm);
1724
1725bad_unshare_cleanup_sigh:
1726 if (new_sigh)
1727 if (atomic_dec_and_test(&new_sigh->count))
1728 kmem_cache_free(sighand_cachep, new_sigh);
1729
1730bad_unshare_cleanup_fs: 1662bad_unshare_cleanup_fs:
1731 if (new_fs) 1663 if (new_fs)
1732 free_fs_struct(new_fs); 1664 free_fs_struct(new_fs);
1733 1665
1734bad_unshare_cleanup_thread:
1735bad_unshare_out: 1666bad_unshare_out:
1736 return err; 1667 return err;
1737} 1668}
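
With the vm and sighand stages removed, the unwind path of sys_unshare() collapses to two cleanup labels. A compact user-space model of that goto-ladder style of error handling; unshare_fs/unshare_fd here are placeholders that just allocate, not the kernel helpers:

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

/* Placeholder "unshare one resource" helpers: return 0 and hand back a
 * new object, or an -errno without allocating anything. */
static int unshare_fs(void **out) { *out = malloc(1); return *out ? 0 : -ENOMEM; }
static int unshare_fd(void **out) { *out = malloc(1); return *out ? 0 : -ENOMEM; }

static int do_unshare(void)
{
	void *new_fs = NULL, *new_fd = NULL;
	int err;

	err = unshare_fs(&new_fs);
	if (err)
		goto bad_unshare_out;
	err = unshare_fd(&new_fd);
	if (err)
		goto bad_unshare_cleanup_fs;

	/* ... install new_fs/new_fd into the current task here ... */
	printf("unshare succeeded\n");
	free(new_fd);
	free(new_fs);
	return 0;

bad_unshare_cleanup_fs:
	free(new_fs);
bad_unshare_out:
	return err;
}

int main(void) { return do_unshare() ? 1 : 0; }
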
diff --git a/kernel/futex.c b/kernel/futex.c
index b766d28accd6..fe28dc282eae 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
381 return NULL; 381 return NULL;
382} 382}
383 383
384static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 384static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
385 u32 uval, u32 newval)
385{ 386{
386 u32 curval; 387 int ret;
387 388
388 pagefault_disable(); 389 pagefault_disable();
389 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 390 ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
390 pagefault_enable(); 391 pagefault_enable();
391 392
392 return curval; 393 return ret;
393} 394}
394 395
395static int get_futex_value_locked(u32 *dest, u32 __user *from) 396static int get_futex_value_locked(u32 *dest, u32 __user *from)
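
The hunk above changes the calling convention of cmpxchg_futex_value_locked(): the return value is now an error code and the previous futex word comes back through a pointer, so callers no longer overload the value with -EFAULT. A user-space sketch of the same convention built on GCC's __atomic_compare_exchange_n(); plain memory cannot fault, so the error path is only indicated in a comment:

#include <stdio.h>
#include <stdint.h>

/* Returns 0 on success (*curval holds the old word); a real futex
 * implementation would return -EFAULT when the user access faults. */
static int cmpxchg_futex_value(uint32_t *curval, uint32_t *uaddr,
			       uint32_t uval, uint32_t newval)
{
	uint32_t expected = uval;

	__atomic_compare_exchange_n(uaddr, &expected, newval, 0,
				    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
	*curval = expected;	/* old value, whether or not we swapped */
	return 0;
}

int main(void)
{
	uint32_t futex = 0, curval;

	if (cmpxchg_futex_value(&curval, &futex, 0, 1234))
		return 1;	/* fault path */
	if (curval != 0)
		return 1;	/* lost the race; a caller would retry */
	printf("acquired, futex=%u\n", futex);
	return 0;
}
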
@@ -674,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
674 struct task_struct *task, int set_waiters) 675 struct task_struct *task, int set_waiters)
675{ 676{
676 int lock_taken, ret, ownerdied = 0; 677 int lock_taken, ret, ownerdied = 0;
677 u32 uval, newval, curval; 678 u32 uval, newval, curval, vpid = task_pid_vnr(task);
678 679
679retry: 680retry:
680 ret = lock_taken = 0; 681 ret = lock_taken = 0;
@@ -684,19 +685,17 @@ retry:
684 * (by doing a 0 -> TID atomic cmpxchg), while holding all 685 * (by doing a 0 -> TID atomic cmpxchg), while holding all
685 * the locks. It will most likely not succeed. 686 * the locks. It will most likely not succeed.
686 */ 687 */
687 newval = task_pid_vnr(task); 688 newval = vpid;
688 if (set_waiters) 689 if (set_waiters)
689 newval |= FUTEX_WAITERS; 690 newval |= FUTEX_WAITERS;
690 691
691 curval = cmpxchg_futex_value_locked(uaddr, 0, newval); 692 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
692
693 if (unlikely(curval == -EFAULT))
694 return -EFAULT; 693 return -EFAULT;
695 694
696 /* 695 /*
697 * Detect deadlocks. 696 * Detect deadlocks.
698 */ 697 */
699 if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) 698 if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
700 return -EDEADLK; 699 return -EDEADLK;
701 700
702 /* 701 /*
@@ -723,14 +722,12 @@ retry:
723 */ 722 */
724 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { 723 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
725 /* Keep the OWNER_DIED bit */ 724 /* Keep the OWNER_DIED bit */
726 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); 725 newval = (curval & ~FUTEX_TID_MASK) | vpid;
727 ownerdied = 0; 726 ownerdied = 0;
728 lock_taken = 1; 727 lock_taken = 1;
729 } 728 }
730 729
731 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 730 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
732
733 if (unlikely(curval == -EFAULT))
734 return -EFAULT; 731 return -EFAULT;
735 if (unlikely(curval != uval)) 732 if (unlikely(curval != uval))
736 goto retry; 733 goto retry;
@@ -775,6 +772,24 @@ retry:
775 return ret; 772 return ret;
776} 773}
777 774
775/**
776 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
777 * @q: The futex_q to unqueue
778 *
779 * The q->lock_ptr must not be NULL and must be held by the caller.
780 */
781static void __unqueue_futex(struct futex_q *q)
782{
783 struct futex_hash_bucket *hb;
784
785 if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
786 || WARN_ON(plist_node_empty(&q->list)))
787 return;
788
789 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
790 plist_del(&q->list, &hb->chain);
791}
792
778/* 793/*
779 * The hash bucket lock must be held when this is called. 794 * The hash bucket lock must be held when this is called.
780 * Afterwards, the futex_q must not be accessed. 795 * Afterwards, the futex_q must not be accessed.
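
The new __unqueue_futex() above recovers the hash bucket from q->lock_ptr with container_of() instead of keeping a separate back-pointer. A small self-contained illustration of that trick; struct bucket/struct waiter and the chain_len counter are invented stand-ins for futex_hash_bucket and its plist chain:

#include <stdio.h>
#include <stddef.h>
#include <pthread.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct bucket {
	int chain_len;			/* stands in for the plist chain */
	pthread_mutex_t lock;
};

struct waiter {
	pthread_mutex_t *lock_ptr;	/* points at the owning bucket's lock */
};

/* Same idea as __unqueue_futex(): the waiter only stores a pointer to
 * the bucket's lock, and container_of() recovers the bucket itself. */
static void unqueue(struct waiter *w)
{
	struct bucket *hb = container_of(w->lock_ptr, struct bucket, lock);

	hb->chain_len--;	/* plist_del(&q->list, &hb->chain) in the kernel */
}

int main(void)
{
	struct bucket hb = { .chain_len = 1, .lock = PTHREAD_MUTEX_INITIALIZER };
	struct waiter w = { .lock_ptr = &hb.lock };

	unqueue(&w);
	printf("chain length now %d\n", hb.chain_len);
	return 0;
}
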
@@ -792,7 +807,7 @@ static void wake_futex(struct futex_q *q)
792 */ 807 */
793 get_task_struct(p); 808 get_task_struct(p);
794 809
795 plist_del(&q->list, &q->list.plist); 810 __unqueue_futex(q);
796 /* 811 /*
797 * The waiting task can free the futex_q as soon as 812 * The waiting task can free the futex_q as soon as
798 * q->lock_ptr = NULL is written, without taking any locks. A 813 * q->lock_ptr = NULL is written, without taking any locks. A
@@ -843,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
843 858
844 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 859 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
845 860
846 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 861 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
847
848 if (curval == -EFAULT)
849 ret = -EFAULT; 862 ret = -EFAULT;
850 else if (curval != uval) 863 else if (curval != uval)
851 ret = -EINVAL; 864 ret = -EINVAL;
@@ -880,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
880 * There is no waiter, so we unlock the futex. The owner died 893 * There is no waiter, so we unlock the futex. The owner died
881 * bit has not to be preserved here. We are the owner: 894 * bit has not to be preserved here. We are the owner:
882 */ 895 */
883 oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); 896 if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
884 897 return -EFAULT;
885 if (oldval == -EFAULT)
886 return oldval;
887 if (oldval != uval) 898 if (oldval != uval)
888 return -EAGAIN; 899 return -EAGAIN;
889 900
@@ -1071,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1071 plist_del(&q->list, &hb1->chain); 1082 plist_del(&q->list, &hb1->chain);
1072 plist_add(&q->list, &hb2->chain); 1083 plist_add(&q->list, &hb2->chain);
1073 q->lock_ptr = &hb2->lock; 1084 q->lock_ptr = &hb2->lock;
1074#ifdef CONFIG_DEBUG_PI_LIST
1075 q->list.plist.spinlock = &hb2->lock;
1076#endif
1077 } 1085 }
1078 get_futex_key_refs(key2); 1086 get_futex_key_refs(key2);
1079 q->key = *key2; 1087 q->key = *key2;
@@ -1100,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1100 get_futex_key_refs(key); 1108 get_futex_key_refs(key);
1101 q->key = *key; 1109 q->key = *key;
1102 1110
1103 WARN_ON(plist_node_empty(&q->list)); 1111 __unqueue_futex(q);
1104 plist_del(&q->list, &q->list.plist);
1105 1112
1106 WARN_ON(!q->rt_waiter); 1113 WARN_ON(!q->rt_waiter);
1107 q->rt_waiter = NULL; 1114 q->rt_waiter = NULL;
1108 1115
1109 q->lock_ptr = &hb->lock; 1116 q->lock_ptr = &hb->lock;
1110#ifdef CONFIG_DEBUG_PI_LIST
1111 q->list.plist.spinlock = &hb->lock;
1112#endif
1113 1117
1114 wake_up_state(q->task, TASK_NORMAL); 1118 wake_up_state(q->task, TASK_NORMAL);
1115} 1119}
@@ -1457,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1457 prio = min(current->normal_prio, MAX_RT_PRIO); 1461 prio = min(current->normal_prio, MAX_RT_PRIO);
1458 1462
1459 plist_node_init(&q->list, prio); 1463 plist_node_init(&q->list, prio);
1460#ifdef CONFIG_DEBUG_PI_LIST
1461 q->list.plist.spinlock = &hb->lock;
1462#endif
1463 plist_add(&q->list, &hb->chain); 1464 plist_add(&q->list, &hb->chain);
1464 q->task = current; 1465 q->task = current;
1465 spin_unlock(&hb->lock); 1466 spin_unlock(&hb->lock);
@@ -1504,8 +1505,7 @@ retry:
1504 spin_unlock(lock_ptr); 1505 spin_unlock(lock_ptr);
1505 goto retry; 1506 goto retry;
1506 } 1507 }
1507 WARN_ON(plist_node_empty(&q->list)); 1508 __unqueue_futex(q);
1508 plist_del(&q->list, &q->list.plist);
1509 1509
1510 BUG_ON(q->pi_state); 1510 BUG_ON(q->pi_state);
1511 1511
@@ -1525,8 +1525,7 @@ retry:
1525static void unqueue_me_pi(struct futex_q *q) 1525static void unqueue_me_pi(struct futex_q *q)
1526 __releases(q->lock_ptr) 1526 __releases(q->lock_ptr)
1527{ 1527{
1528 WARN_ON(plist_node_empty(&q->list)); 1528 __unqueue_futex(q);
1529 plist_del(&q->list, &q->list.plist);
1530 1529
1531 BUG_ON(!q->pi_state); 1530 BUG_ON(!q->pi_state);
1532 free_pi_state(q->pi_state); 1531 free_pi_state(q->pi_state);
@@ -1556,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1556 1555
1557 /* 1556 /*
1558 * We are here either because we stole the rtmutex from the 1557 * We are here either because we stole the rtmutex from the
1559 * pending owner or we are the pending owner which failed to 1558 * previous highest priority waiter or we are the highest priority
1560 * get the rtmutex. We have to replace the pending owner TID 1559 * waiter but failed to get the rtmutex the first time.
1561 * in the user space variable. This must be atomic as we have 1560 * We have to replace the newowner TID in the user space variable.
1562 * to preserve the owner died bit here. 1561 * This must be atomic as we have to preserve the owner died bit here.
1563 * 1562 *
1564 * Note: We write the user space value _before_ changing the pi_state 1563 * Note: We write the user space value _before_ changing the pi_state
1565 * because we can fault here. Imagine swapped out pages or a fork 1564 * because we can fault here. Imagine swapped out pages or a fork
@@ -1578,9 +1577,7 @@ retry:
1578 while (1) { 1577 while (1) {
1579 newval = (uval & FUTEX_OWNER_DIED) | newtid; 1578 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1580 1579
1581 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 1580 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1582
1583 if (curval == -EFAULT)
1584 goto handle_fault; 1581 goto handle_fault;
1585 if (curval == uval) 1582 if (curval == uval)
1586 break; 1583 break;
@@ -1608,8 +1605,8 @@ retry:
1608 1605
1609 /* 1606 /*
1610 * To handle the page fault we need to drop the hash bucket 1607 * To handle the page fault we need to drop the hash bucket
1611 * lock here. That gives the other task (either the pending 1608 * lock here. That gives the other task (either the highest priority
1612 * owner itself or the task which stole the rtmutex) the 1609 * waiter itself or the task which stole the rtmutex) the
1613 * chance to try the fixup of the pi_state. So once we are 1610 * chance to try the fixup of the pi_state. So once we are
1614 * back from handling the fault we need to check the pi_state 1611 * back from handling the fault we need to check the pi_state
1615 * after reacquiring the hash bucket lock and before trying to 1612 * after reacquiring the hash bucket lock and before trying to
@@ -1685,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1685 /* 1682 /*
1686 * pi_state is incorrect, some other task did a lock steal and 1683 * pi_state is incorrect, some other task did a lock steal and
1687 * we returned due to timeout or signal without taking the 1684 * we returned due to timeout or signal without taking the
1688 * rt_mutex. Too late. We can access the rt_mutex_owner without 1685 * rt_mutex. Too late.
1689 * locking, as the other task is now blocked on the hash bucket
1690 * lock. Fix the state up.
1691 */ 1686 */
1687 raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1692 owner = rt_mutex_owner(&q->pi_state->pi_mutex); 1688 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1689 if (!owner)
1690 owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1691 raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1693 ret = fixup_pi_state_owner(uaddr, q, owner); 1692 ret = fixup_pi_state_owner(uaddr, q, owner);
1694 goto out; 1693 goto out;
1695 } 1694 }
1696 1695
1697 /* 1696 /*
1698 * Paranoia check. If we did not take the lock, then we should not be 1697 * Paranoia check. If we did not take the lock, then we should not be
1699 * the owner, nor the pending owner, of the rt_mutex. 1698 * the owner of the rt_mutex.
1700 */ 1699 */
1701 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) 1700 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1702 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " 1701 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
@@ -1781,13 +1780,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1781 * 1780 *
1782 * The basic logical guarantee of a futex is that it blocks ONLY 1781 * The basic logical guarantee of a futex is that it blocks ONLY
1783 * if cond(var) is known to be true at the time of blocking, for 1782 * if cond(var) is known to be true at the time of blocking, for
1784 * any cond. If we queued after testing *uaddr, that would open 1783 * any cond. If we locked the hash-bucket after testing *uaddr, that
1785 * a race condition where we could block indefinitely with 1784 * would open a race condition where we could block indefinitely with
1786 * cond(var) false, which would violate the guarantee. 1785 * cond(var) false, which would violate the guarantee.
1787 * 1786 *
1788 * A consequence is that futex_wait() can return zero and absorb 1787 * On the other hand, we insert q and release the hash-bucket only
1789 * a wakeup when *uaddr != val on entry to the syscall. This is 1788 * after testing *uaddr. This guarantees that futex_wait() will NOT
1790 * rare, but normal. 1789 * absorb a wakeup if *uaddr does not match the desired values
1790 * while the syscall executes.
1791 */ 1791 */
1792retry: 1792retry:
1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); 1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
@@ -1886,7 +1886,7 @@ retry:
1886 restart->futex.val = val; 1886 restart->futex.val = val;
1887 restart->futex.time = abs_time->tv64; 1887 restart->futex.time = abs_time->tv64;
1888 restart->futex.bitset = bitset; 1888 restart->futex.bitset = bitset;
1889 restart->futex.flags = flags; 1889 restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
1890 1890
1891 ret = -ERESTART_RESTARTBLOCK; 1891 ret = -ERESTART_RESTARTBLOCK;
1892 1892
@@ -2046,9 +2046,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2046{ 2046{
2047 struct futex_hash_bucket *hb; 2047 struct futex_hash_bucket *hb;
2048 struct futex_q *this, *next; 2048 struct futex_q *this, *next;
2049 u32 uval;
2050 struct plist_head *head; 2049 struct plist_head *head;
2051 union futex_key key = FUTEX_KEY_INIT; 2050 union futex_key key = FUTEX_KEY_INIT;
2051 u32 uval, vpid = task_pid_vnr(current);
2052 int ret; 2052 int ret;
2053 2053
2054retry: 2054retry:
@@ -2057,7 +2057,7 @@ retry:
2057 /* 2057 /*
2058 * We release only a lock we actually own: 2058 * We release only a lock we actually own:
2059 */ 2059 */
2060 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2060 if ((uval & FUTEX_TID_MASK) != vpid)
2061 return -EPERM; 2061 return -EPERM;
2062 2062
2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); 2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
@@ -2072,17 +2072,14 @@ retry:
2072 * again. If it succeeds then we can return without waking 2072 * again. If it succeeds then we can return without waking
2073 * anyone else up: 2073 * anyone else up:
2074 */ 2074 */
2075 if (!(uval & FUTEX_OWNER_DIED)) 2075 if (!(uval & FUTEX_OWNER_DIED) &&
2076 uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); 2076 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2077
2078
2079 if (unlikely(uval == -EFAULT))
2080 goto pi_faulted; 2077 goto pi_faulted;
2081 /* 2078 /*
2082 * Rare case: we managed to release the lock atomically, 2079 * Rare case: we managed to release the lock atomically,
2083 * no need to wake anyone else up: 2080 * no need to wake anyone else up:
2084 */ 2081 */
2085 if (unlikely(uval == task_pid_vnr(current))) 2082 if (unlikely(uval == vpid))
2086 goto out_unlock; 2083 goto out_unlock;
2087 2084
2088 /* 2085 /*
@@ -2167,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2167 * We were woken prior to requeue by a timeout or a signal. 2164 * We were woken prior to requeue by a timeout or a signal.
2168 * Unqueue the futex_q and determine which it was. 2165 * Unqueue the futex_q and determine which it was.
2169 */ 2166 */
2170 plist_del(&q->list, &q->list.plist); 2167 plist_del(&q->list, &hb->chain);
2171 2168
2172 /* Handle spurious wakeups gracefully */ 2169 /* Handle spurious wakeups gracefully */
2173 ret = -EWOULDBLOCK; 2170 ret = -EWOULDBLOCK;
@@ -2421,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
2421 goto err_unlock; 2418 goto err_unlock;
2422 ret = -EPERM; 2419 ret = -EPERM;
2423 pcred = __task_cred(p); 2420 pcred = __task_cred(p);
2421 /* If victim is in different user_ns, then uids are not
2422 comparable, so we must have CAP_SYS_PTRACE */
2423 if (cred->user->user_ns != pcred->user->user_ns) {
2424 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2425 goto err_unlock;
2426 goto ok;
2427 }
2428 /* If victim is in same user_ns, then uids are comparable */
2424 if (cred->euid != pcred->euid && 2429 if (cred->euid != pcred->euid &&
2425 cred->euid != pcred->uid && 2430 cred->euid != pcred->uid &&
2426 !capable(CAP_SYS_PTRACE)) 2431 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2427 goto err_unlock; 2432 goto err_unlock;
2433ok:
2428 head = p->robust_list; 2434 head = p->robust_list;
2429 rcu_read_unlock(); 2435 rcu_read_unlock();
2430 } 2436 }
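
The permission change above makes the uid comparison conditional on both tasks living in the same user namespace; across namespaces only CAP_SYS_PTRACE in the target's namespace helps. A user-space model of that decision, with simplified cred/user_ns structs and an ns_capable_ptrace() stub standing in for ns_capable(..., CAP_SYS_PTRACE):

#include <stdio.h>
#include <stdbool.h>

struct user_ns { int id; };

struct cred {
	int uid, euid;
	struct user_ns *ns;
};

/* Stub: a real check consults the caller's capability set in the
 * target namespace; here we just assume an unprivileged caller. */
static bool ns_capable_ptrace(const struct cred *caller,
			      const struct user_ns *target_ns)
{
	(void)caller; (void)target_ns;
	return false;
}

static bool may_read_robust_list(const struct cred *caller,
				 const struct cred *victim)
{
	/* Different user namespaces: uids are not comparable. */
	if (caller->ns != victim->ns)
		return ns_capable_ptrace(caller, victim->ns);

	/* Same namespace: the usual euid/uid comparison applies first. */
	if (caller->euid == victim->euid || caller->euid == victim->uid)
		return true;

	return ns_capable_ptrace(caller, victim->ns);
}

int main(void)
{
	struct user_ns ns0 = { 0 }, ns1 = { 1 };
	struct cred self  = { .uid = 1000, .euid = 1000, .ns = &ns0 };
	struct cred other = { .uid = 1000, .euid = 1000, .ns = &ns1 };

	printf("same creds, different ns: %s\n",
	       may_read_robust_list(&self, &other) ? "allowed" : "denied");
	return 0;
}
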
@@ -2463,11 +2469,20 @@ retry:
2463 * userspace. 2469 * userspace.
2464 */ 2470 */
2465 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 2471 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2466 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); 2472 /*
2467 2473 * We are not holding a lock here, but we want to have
2468 if (nval == -EFAULT) 2474 * the pagefault_disable/enable() protection because
2469 return -1; 2475 * we want to handle the fault gracefully. If the
2470 2476 * access fails we try to fault in the futex with R/W
2477 * verification via get_user_pages. get_user() above
2478 * does not guarantee R/W access. If that fails we
2479 * give up and leave the futex locked.
2480 */
2481 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2482 if (fault_in_user_writeable(uaddr))
2483 return -1;
2484 goto retry;
2485 }
2471 if (nval != uval) 2486 if (nval != uval)
2472 goto retry; 2487 goto retry;
2473 2488
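
The handle_futex_death() change above stops treating a fault as fatal: it attempts to make the page writable and retries the cmpxchg. A schematic of that control flow in which the fault is simulated by a counter and the fix-up always "succeeds"; nothing here talks to real futexes or page tables:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define OWNER_DIED 0x40000000u

static int fake_faults = 1;	/* pretend the first access faults */

/* Returns 0 on success (*nval holds the old word) or -1 on "fault". */
static int cmpxchg_user(uint32_t *nval, uint32_t *uaddr,
			uint32_t uval, uint32_t mval)
{
	if (fake_faults > 0) {
		fake_faults--;
		return -1;
	}
	*nval = *uaddr;
	if (*uaddr == uval)
		*uaddr = mval;
	return 0;
}

static bool fault_in_writeable(uint32_t *uaddr)
{
	(void)uaddr;
	return true;	/* pretend get_user_pages() fixed the mapping */
}

static int handle_death(uint32_t *uaddr, uint32_t tid)
{
	uint32_t uval = tid, nval, mval = tid | OWNER_DIED;

retry:
	if (cmpxchg_user(&nval, uaddr, uval, mval)) {
		if (!fault_in_writeable(uaddr))
			return -1;	/* give up, leave the futex locked */
		goto retry;
	}
	if (nval != uval) {		/* value changed under us: retry */
		uval = nval;
		mval = nval | OWNER_DIED;
		goto retry;
	}
	return 0;
}

int main(void)
{
	uint32_t futex = 42;

	handle_death(&futex, 42);
	printf("futex word now %#x\n", futex);
	return 0;
}
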
@@ -2678,8 +2693,7 @@ static int __init futex_init(void)
2678 * implementation, the non-functional ones will return 2693 * implementation, the non-functional ones will return
2679 * -ENOSYS. 2694 * -ENOSYS.
2680 */ 2695 */
2681 curval = cmpxchg_futex_value_locked(NULL, 0, 0); 2696 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2682 if (curval == -EFAULT)
2683 futex_cmpxchg_enabled = 1; 2697 futex_cmpxchg_enabled = 1;
2684 2698
2685 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2699 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index a7934ac75e5b..5f9e689dc8f0 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
153 goto err_unlock; 153 goto err_unlock;
154 ret = -EPERM; 154 ret = -EPERM;
155 pcred = __task_cred(p); 155 pcred = __task_cred(p);
156 /* If victim is in different user_ns, then uids are not
157 comparable, so we must have CAP_SYS_PTRACE */
158 if (cred->user->user_ns != pcred->user->user_ns) {
159 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
160 goto err_unlock;
161 goto ok;
162 }
163 /* If victim is in same user_ns, then uids are comparable */
156 if (cred->euid != pcred->euid && 164 if (cred->euid != pcred->euid &&
157 cred->euid != pcred->uid && 165 cred->euid != pcred->uid &&
158 !capable(CAP_SYS_PTRACE)) 166 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
159 goto err_unlock; 167 goto err_unlock;
168ok:
160 head = p->compat_robust_list; 169 head = p->compat_robust_list;
161 rcu_read_unlock(); 170 rcu_read_unlock();
162 } 171 }
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 70a298d6da71..b8cadf70b1fb 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
34config GCOV_PROFILE_ALL 34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel" 35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL 36 depends on GCOV_KERNEL
37 depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE 37 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
38 default n 38 default n
39 ---help--- 39 ---help---
40 This options activates profiling for the entire kernel. 40 This options activates profiling for the entire kernel.
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 3f761001d517..e97ca59e2520 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,3 @@
1EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' 1ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
2 2
3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o 3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/groups.c b/kernel/groups.c
index 253dc0f35cf4..1cc476d52dd3 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
233 struct group_info *group_info; 233 struct group_info *group_info;
234 int retval; 234 int retval;
235 235
236 if (!capable(CAP_SETGID)) 236 if (!nsown_capable(CAP_SETGID))
237 return -EPERM; 237 return -EPERM;
238 if ((unsigned)gidsetsize > NGROUPS_MAX) 238 if ((unsigned)gidsetsize > NGROUPS_MAX)
239 return -EINVAL; 239 return -EINVAL;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0c8d7c048615..87fdb3f8db14 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -53,11 +53,10 @@
53/* 53/*
54 * The timer bases: 54 * The timer bases:
55 * 55 *
56 * Note: If we want to add new timer bases, we have to skip the two 56 * There are more clockids then hrtimer bases. Thus, we index
57 * clock ids captured by the cpu-timers. We do this by holding empty 57 * into the timer bases by the hrtimer_base_type enum. When trying
58 * entries rather than doing math adjustment of the clock ids. 58 * to reach a base using a clockid, hrtimer_clockid_to_base()
59 * This ensures that we capture erroneous accesses to these clock ids 59 * is used to convert from clockid to the proper hrtimer_base_type.
60 * rather than moving them into the range of valid clock id's.
61 */ 60 */
62DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = 61DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
63{ 62{
@@ -74,30 +73,43 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
74 .get_time = &ktime_get, 73 .get_time = &ktime_get,
75 .resolution = KTIME_LOW_RES, 74 .resolution = KTIME_LOW_RES,
76 }, 75 },
76 {
77 .index = CLOCK_BOOTTIME,
78 .get_time = &ktime_get_boottime,
79 .resolution = KTIME_LOW_RES,
80 },
77 } 81 }
78}; 82};
79 83
84static int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
85 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
86 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
87 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
88};
89
90static inline int hrtimer_clockid_to_base(clockid_t clock_id)
91{
92 return hrtimer_clock_to_base_table[clock_id];
93}
94
95
80/* 96/*
81 * Get the coarse grained time at the softirq based on xtime and 97 * Get the coarse grained time at the softirq based on xtime and
82 * wall_to_monotonic. 98 * wall_to_monotonic.
83 */ 99 */
84static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) 100static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
85{ 101{
86 ktime_t xtim, tomono; 102 ktime_t xtim, mono, boot;
87 struct timespec xts, tom; 103 struct timespec xts, tom, slp;
88 unsigned long seq;
89 104
90 do { 105 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
91 seq = read_seqbegin(&xtime_lock);
92 xts = __current_kernel_time();
93 tom = __get_wall_to_monotonic();
94 } while (read_seqretry(&xtime_lock, seq));
95 106
96 xtim = timespec_to_ktime(xts); 107 xtim = timespec_to_ktime(xts);
97 tomono = timespec_to_ktime(tom); 108 mono = ktime_add(xtim, timespec_to_ktime(tom));
98 base->clock_base[CLOCK_REALTIME].softirq_time = xtim; 109 boot = ktime_add(mono, timespec_to_ktime(slp));
99 base->clock_base[CLOCK_MONOTONIC].softirq_time = 110 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
100 ktime_add(xtim, tomono); 111 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
112 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
101} 113}
102 114
103/* 115/*
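
The hunk above introduces a translation table because clock ids and hrtimer base indices are no longer interchangeable, and it derives the boot-time clock from the monotonic one plus the time spent asleep. A self-contained sketch of both; the XCLOCK_*/XBASE_* constants and the sample offsets are illustrative, not the kernel's values:

#include <stdio.h>

enum { XCLOCK_REALTIME, XCLOCK_MONOTONIC, XCLOCK_BOOTTIME, XMAX_CLOCKS };
enum { XBASE_MONOTONIC, XBASE_REALTIME, XBASE_BOOTTIME, XBASE_MAX };

/* Same idea as hrtimer_clock_to_base_table[]: a small array keyed by
 * clock id yields the base index used inside the per-cpu bases. */
static const int clock_to_base[XMAX_CLOCKS] = {
	[XCLOCK_REALTIME]  = XBASE_REALTIME,
	[XCLOCK_MONOTONIC] = XBASE_MONOTONIC,
	[XCLOCK_BOOTTIME]  = XBASE_BOOTTIME,
};

static int clockid_to_base(int clock_id)
{
	return clock_to_base[clock_id];
}

int main(void)
{
	/* The boot-time clock in the hunk above is built the same way:
	 * mono = xtime + wall_to_monotonic, boot = mono + time slept. */
	long long xtim = 1000, wall_to_mono = -400, slept = 50;
	long long mono = xtim + wall_to_mono;
	long long boot = mono + slept;

	printf("CLOCK_BOOTTIME maps to base %d\n",
	       clockid_to_base(XCLOCK_BOOTTIME));
	printf("mono=%lld boot=%lld\n", mono, boot);
	return 0;
}
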
@@ -184,10 +196,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
184 struct hrtimer_cpu_base *new_cpu_base; 196 struct hrtimer_cpu_base *new_cpu_base;
185 int this_cpu = smp_processor_id(); 197 int this_cpu = smp_processor_id();
186 int cpu = hrtimer_get_target(this_cpu, pinned); 198 int cpu = hrtimer_get_target(this_cpu, pinned);
199 int basenum = hrtimer_clockid_to_base(base->index);
187 200
188again: 201again:
189 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 202 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
190 new_base = &new_cpu_base->clock_base[base->index]; 203 new_base = &new_cpu_base->clock_base[basenum];
191 204
192 if (base != new_base) { 205 if (base != new_base) {
193 /* 206 /*
@@ -334,6 +347,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe);
334 347
335static struct debug_obj_descr hrtimer_debug_descr; 348static struct debug_obj_descr hrtimer_debug_descr;
336 349
350static void *hrtimer_debug_hint(void *addr)
351{
352 return ((struct hrtimer *) addr)->function;
353}
354
337/* 355/*
338 * fixup_init is called when: 356 * fixup_init is called when:
339 * - an active object is initialized 357 * - an active object is initialized
@@ -393,6 +411,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
393 411
394static struct debug_obj_descr hrtimer_debug_descr = { 412static struct debug_obj_descr hrtimer_debug_descr = {
395 .name = "hrtimer", 413 .name = "hrtimer",
414 .debug_hint = hrtimer_debug_hint,
396 .fixup_init = hrtimer_fixup_init, 415 .fixup_init = hrtimer_fixup_init,
397 .fixup_activate = hrtimer_fixup_activate, 416 .fixup_activate = hrtimer_fixup_activate,
398 .fixup_free = hrtimer_fixup_free, 417 .fixup_free = hrtimer_fixup_free,
@@ -611,24 +630,23 @@ static int hrtimer_reprogram(struct hrtimer *timer,
611static void retrigger_next_event(void *arg) 630static void retrigger_next_event(void *arg)
612{ 631{
613 struct hrtimer_cpu_base *base; 632 struct hrtimer_cpu_base *base;
614 struct timespec realtime_offset, wtm; 633 struct timespec realtime_offset, wtm, sleep;
615 unsigned long seq;
616 634
617 if (!hrtimer_hres_active()) 635 if (!hrtimer_hres_active())
618 return; 636 return;
619 637
620 do { 638 get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm,
621 seq = read_seqbegin(&xtime_lock); 639 &sleep);
622 wtm = __get_wall_to_monotonic();
623 } while (read_seqretry(&xtime_lock, seq));
624 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); 640 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
625 641
626 base = &__get_cpu_var(hrtimer_bases); 642 base = &__get_cpu_var(hrtimer_bases);
627 643
628 /* Adjust CLOCK_REALTIME offset */ 644 /* Adjust CLOCK_REALTIME offset */
629 raw_spin_lock(&base->lock); 645 raw_spin_lock(&base->lock);
630 base->clock_base[CLOCK_REALTIME].offset = 646 base->clock_base[HRTIMER_BASE_REALTIME].offset =
631 timespec_to_ktime(realtime_offset); 647 timespec_to_ktime(realtime_offset);
648 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
649 timespec_to_ktime(sleep);
632 650
633 hrtimer_force_reprogram(base, 0); 651 hrtimer_force_reprogram(base, 0);
634 raw_spin_unlock(&base->lock); 652 raw_spin_unlock(&base->lock);
@@ -673,14 +691,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
673} 691}
674 692
675/* 693/*
676 * Initialize the high resolution related parts of a hrtimer
677 */
678static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
679{
680}
681
682
683/*
684 * When High resolution timers are active, try to reprogram. Note, that in case 694 * When High resolution timers are active, try to reprogram. Note, that in case
685 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry 695 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
686 * check happens. The timer gets enqueued into the rbtree. The reprogramming 696 * check happens. The timer gets enqueued into the rbtree. The reprogramming
@@ -725,8 +735,9 @@ static int hrtimer_switch_to_hres(void)
725 return 0; 735 return 0;
726 } 736 }
727 base->hres_active = 1; 737 base->hres_active = 1;
728 base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; 738 base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES;
729 base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; 739 base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES;
740 base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES;
730 741
731 tick_setup_sched_timer(); 742 tick_setup_sched_timer();
732 743
@@ -750,7 +761,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
750 return 0; 761 return 0;
751} 762}
752static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 763static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
753static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
754 764
755#endif /* CONFIG_HIGH_RES_TIMERS */ 765#endif /* CONFIG_HIGH_RES_TIMERS */
756 766
@@ -1121,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1121 enum hrtimer_mode mode) 1131 enum hrtimer_mode mode)
1122{ 1132{
1123 struct hrtimer_cpu_base *cpu_base; 1133 struct hrtimer_cpu_base *cpu_base;
1134 int base;
1124 1135
1125 memset(timer, 0, sizeof(struct hrtimer)); 1136 memset(timer, 0, sizeof(struct hrtimer));
1126 1137
@@ -1129,8 +1140,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1129 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) 1140 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
1130 clock_id = CLOCK_MONOTONIC; 1141 clock_id = CLOCK_MONOTONIC;
1131 1142
1132 timer->base = &cpu_base->clock_base[clock_id]; 1143 base = hrtimer_clockid_to_base(clock_id);
1133 hrtimer_init_timer_hres(timer); 1144 timer->base = &cpu_base->clock_base[base];
1134 timerqueue_init(&timer->node); 1145 timerqueue_init(&timer->node);
1135 1146
1136#ifdef CONFIG_TIMER_STATS 1147#ifdef CONFIG_TIMER_STATS
@@ -1165,9 +1176,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
1165int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) 1176int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1166{ 1177{
1167 struct hrtimer_cpu_base *cpu_base; 1178 struct hrtimer_cpu_base *cpu_base;
1179 int base = hrtimer_clockid_to_base(which_clock);
1168 1180
1169 cpu_base = &__raw_get_cpu_var(hrtimer_bases); 1181 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
1170 *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); 1182 *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
1171 1183
1172 return 0; 1184 return 0;
1173} 1185}
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 8e42fec7686d..c574f9a12c48 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,5 +1,6 @@
1# Select this to activate the generic irq options below
1config HAVE_GENERIC_HARDIRQS 2config HAVE_GENERIC_HARDIRQS
2 def_bool n 3 bool
3 4
4if HAVE_GENERIC_HARDIRQS 5if HAVE_GENERIC_HARDIRQS
5menu "IRQ subsystem" 6menu "IRQ subsystem"
@@ -9,28 +10,47 @@ menu "IRQ subsystem"
9config GENERIC_HARDIRQS 10config GENERIC_HARDIRQS
10 def_bool y 11 def_bool y
11 12
12# Select this to disable the deprecated stuff
13config GENERIC_HARDIRQS_NO_DEPRECATED
14 def_bool n
15
16# Options selectable by the architecture code 13# Options selectable by the architecture code
14
15# Make sparse irq Kconfig switch below available
17config HAVE_SPARSE_IRQ 16config HAVE_SPARSE_IRQ
18 def_bool n 17 bool
19 18
19# Enable the generic irq autoprobe mechanism
20config GENERIC_IRQ_PROBE 20config GENERIC_IRQ_PROBE
21 def_bool n 21 bool
22
23# Use the generic /proc/interrupts implementation
24config GENERIC_IRQ_SHOW
25 bool
26
27# Print level/edge extra information
28config GENERIC_IRQ_SHOW_LEVEL
29 bool
22 30
31# Support for delayed migration from interrupt context
23config GENERIC_PENDING_IRQ 32config GENERIC_PENDING_IRQ
24 def_bool n 33 bool
25 34
35# Alpha specific irq affinity mechanism
26config AUTO_IRQ_AFFINITY 36config AUTO_IRQ_AFFINITY
27 def_bool n 37 bool
28
29config IRQ_PER_CPU
30 def_bool n
31 38
39# Tasklet based software resend for pending interrupts on enable_irq()
32config HARDIRQS_SW_RESEND 40config HARDIRQS_SW_RESEND
33 def_bool n 41 bool
42
43# Preflow handler support for fasteoi (sparc64)
44config IRQ_PREFLOW_FASTEOI
45 bool
46
47# Edge style eoi based handler (cell)
48config IRQ_EDGE_EOI_HANDLER
49 bool
50
51# Support forced irq threading
52config IRQ_FORCED_THREADING
53 bool
34 54
35config SPARSE_IRQ 55config SPARSE_IRQ
36 bool "Support sparse irq numbering" 56 bool "Support sparse irq numbering"
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 505798f86c36..342d8f44e401 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -17,7 +17,7 @@
17/* 17/*
18 * Autodetection depends on the fact that any interrupt that 18 * Autodetection depends on the fact that any interrupt that
19 * comes in on to an unassigned handler will get stuck with 19 * comes in on to an unassigned handler will get stuck with
20 * "IRQ_WAITING" cleared and the interrupt disabled. 20 * "IRQS_WAITING" cleared and the interrupt disabled.
21 */ 21 */
22static DEFINE_MUTEX(probing_active); 22static DEFINE_MUTEX(probing_active);
23 23
@@ -32,7 +32,6 @@ unsigned long probe_irq_on(void)
32{ 32{
33 struct irq_desc *desc; 33 struct irq_desc *desc;
34 unsigned long mask = 0; 34 unsigned long mask = 0;
35 unsigned int status;
36 int i; 35 int i;
37 36
38 /* 37 /*
@@ -46,13 +45,7 @@ unsigned long probe_irq_on(void)
46 */ 45 */
47 for_each_irq_desc_reverse(i, desc) { 46 for_each_irq_desc_reverse(i, desc) {
48 raw_spin_lock_irq(&desc->lock); 47 raw_spin_lock_irq(&desc->lock);
49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 48 if (!desc->action && irq_settings_can_probe(desc)) {
50 /*
51 * An old-style architecture might still have
52 * the handle_bad_irq handler there:
53 */
54 compat_irq_chip_set_default_handler(desc);
55
56 /* 49 /*
57 * Some chips need to know about probing in 50 * Some chips need to know about probing in
58 * progress: 51 * progress:
@@ -60,7 +53,7 @@ unsigned long probe_irq_on(void)
60 if (desc->irq_data.chip->irq_set_type) 53 if (desc->irq_data.chip->irq_set_type)
61 desc->irq_data.chip->irq_set_type(&desc->irq_data, 54 desc->irq_data.chip->irq_set_type(&desc->irq_data,
62 IRQ_TYPE_PROBE); 55 IRQ_TYPE_PROBE);
63 desc->irq_data.chip->irq_startup(&desc->irq_data); 56 irq_startup(desc);
64 } 57 }
65 raw_spin_unlock_irq(&desc->lock); 58 raw_spin_unlock_irq(&desc->lock);
66 } 59 }
@@ -75,10 +68,10 @@ unsigned long probe_irq_on(void)
75 */ 68 */
76 for_each_irq_desc_reverse(i, desc) { 69 for_each_irq_desc_reverse(i, desc) {
77 raw_spin_lock_irq(&desc->lock); 70 raw_spin_lock_irq(&desc->lock);
78 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 71 if (!desc->action && irq_settings_can_probe(desc)) {
79 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
80 if (desc->irq_data.chip->irq_startup(&desc->irq_data)) 73 if (irq_startup(desc))
81 desc->status |= IRQ_PENDING; 74 desc->istate |= IRQS_PENDING;
82 } 75 }
83 raw_spin_unlock_irq(&desc->lock); 76 raw_spin_unlock_irq(&desc->lock);
84 } 77 }
@@ -93,13 +86,12 @@ unsigned long probe_irq_on(void)
93 */ 86 */
94 for_each_irq_desc(i, desc) { 87 for_each_irq_desc(i, desc) {
95 raw_spin_lock_irq(&desc->lock); 88 raw_spin_lock_irq(&desc->lock);
96 status = desc->status;
97 89
98 if (status & IRQ_AUTODETECT) { 90 if (desc->istate & IRQS_AUTODETECT) {
99 /* It triggered already - consider it spurious. */ 91 /* It triggered already - consider it spurious. */
100 if (!(status & IRQ_WAITING)) { 92 if (!(desc->istate & IRQS_WAITING)) {
101 desc->status = status & ~IRQ_AUTODETECT; 93 desc->istate &= ~IRQS_AUTODETECT;
102 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 94 irq_shutdown(desc);
103 } else 95 } else
104 if (i < 32) 96 if (i < 32)
105 mask |= 1 << i; 97 mask |= 1 << i;
@@ -125,20 +117,18 @@ EXPORT_SYMBOL(probe_irq_on);
125 */ 117 */
126unsigned int probe_irq_mask(unsigned long val) 118unsigned int probe_irq_mask(unsigned long val)
127{ 119{
128 unsigned int status, mask = 0; 120 unsigned int mask = 0;
129 struct irq_desc *desc; 121 struct irq_desc *desc;
130 int i; 122 int i;
131 123
132 for_each_irq_desc(i, desc) { 124 for_each_irq_desc(i, desc) {
133 raw_spin_lock_irq(&desc->lock); 125 raw_spin_lock_irq(&desc->lock);
134 status = desc->status; 126 if (desc->istate & IRQS_AUTODETECT) {
135 127 if (i < 16 && !(desc->istate & IRQS_WAITING))
136 if (status & IRQ_AUTODETECT) {
137 if (i < 16 && !(status & IRQ_WAITING))
138 mask |= 1 << i; 128 mask |= 1 << i;
139 129
140 desc->status = status & ~IRQ_AUTODETECT; 130 desc->istate &= ~IRQS_AUTODETECT;
141 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 131 irq_shutdown(desc);
142 } 132 }
143 raw_spin_unlock_irq(&desc->lock); 133 raw_spin_unlock_irq(&desc->lock);
144 } 134 }
@@ -169,20 +159,18 @@ int probe_irq_off(unsigned long val)
169{ 159{
170 int i, irq_found = 0, nr_of_irqs = 0; 160 int i, irq_found = 0, nr_of_irqs = 0;
171 struct irq_desc *desc; 161 struct irq_desc *desc;
172 unsigned int status;
173 162
174 for_each_irq_desc(i, desc) { 163 for_each_irq_desc(i, desc) {
175 raw_spin_lock_irq(&desc->lock); 164 raw_spin_lock_irq(&desc->lock);
176 status = desc->status;
177 165
178 if (status & IRQ_AUTODETECT) { 166 if (desc->istate & IRQS_AUTODETECT) {
179 if (!(status & IRQ_WAITING)) { 167 if (!(desc->istate & IRQS_WAITING)) {
180 if (!nr_of_irqs) 168 if (!nr_of_irqs)
181 irq_found = i; 169 irq_found = i;
182 nr_of_irqs++; 170 nr_of_irqs++;
183 } 171 }
184 desc->status = status & ~IRQ_AUTODETECT; 172 desc->istate &= ~IRQS_AUTODETECT;
185 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 173 irq_shutdown(desc);
186 } 174 }
187 raw_spin_unlock_irq(&desc->lock); 175 raw_spin_unlock_irq(&desc->lock);
188 } 176 }
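
The autoprobe changes above move the bookkeeping from desc->status into desc->istate, using IRQS_AUTODETECT and IRQS_WAITING bits. A condensed user-space model of the probe idea: arm every candidate, let any interrupt that fires on its own clear WAITING (it is spurious), then collect the lines that are still waiting and disarm everything. Flag values, array size and the simulate_interrupt() helper are invented for the sketch:

#include <stdio.h>

#define IRQS_AUTODETECT 0x01
#define IRQS_WAITING    0x02

#define NR_IRQS 8

static unsigned int istate[NR_IRQS];

/* Phase 1: arm every probe candidate. */
static void probe_on(void)
{
	for (int i = 0; i < NR_IRQS; i++)
		istate[i] |= IRQS_AUTODETECT | IRQS_WAITING;
}

/* An interrupt that fires during probing clears WAITING for its line. */
static void simulate_interrupt(int irq)
{
	istate[irq] &= ~IRQS_WAITING;
}

/* Phase 2: lines still WAITING never fired on their own, so they stay
 * in the candidate mask; the autodetect state is cleared for all. */
static unsigned long probe_mask(void)
{
	unsigned long mask = 0;

	for (int i = 0; i < NR_IRQS; i++) {
		if ((istate[i] & IRQS_AUTODETECT) &&
		    (istate[i] & IRQS_WAITING))
			mask |= 1UL << i;
		istate[i] &= ~IRQS_AUTODETECT;
	}
	return mask;
}

int main(void)
{
	probe_on();
	simulate_interrupt(3);	/* irq 3 is spurious, drops out */
	printf("candidate mask: %#lx\n", probe_mask());
	return 0;
}
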
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index baa5c4acad83..4af1e2b244cb 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -19,140 +19,115 @@
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21/**
22 * set_irq_chip - set the irq chip for an irq 22 * irq_set_chip - set the irq chip for an irq
23 * @irq: irq number 23 * @irq: irq number
24 * @chip: pointer to irq chip description structure 24 * @chip: pointer to irq chip description structure
25 */ 25 */
26int set_irq_chip(unsigned int irq, struct irq_chip *chip) 26int irq_set_chip(unsigned int irq, struct irq_chip *chip)
27{ 27{
28 struct irq_desc *desc = irq_to_desc(irq);
29 unsigned long flags; 28 unsigned long flags;
29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
30 30
31 if (!desc) { 31 if (!desc)
32 WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
33 return -EINVAL; 32 return -EINVAL;
34 }
35 33
36 if (!chip) 34 if (!chip)
37 chip = &no_irq_chip; 35 chip = &no_irq_chip;
38 36
39 raw_spin_lock_irqsave(&desc->lock, flags);
40 irq_chip_set_defaults(chip);
41 desc->irq_data.chip = chip; 37 desc->irq_data.chip = chip;
42 raw_spin_unlock_irqrestore(&desc->lock, flags); 38 irq_put_desc_unlock(desc, flags);
43 39 /*
40 * For !CONFIG_SPARSE_IRQ make the irq show up in
41 * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is
42 * already marked, and this call is harmless.
43 */
44 irq_reserve_irq(irq);
44 return 0; 45 return 0;
45} 46}
46EXPORT_SYMBOL(set_irq_chip); 47EXPORT_SYMBOL(irq_set_chip);
47 48
48/** 49/**
49 * set_irq_type - set the irq trigger type for an irq 50 * irq_set_type - set the irq trigger type for an irq
50 * @irq: irq number 51 * @irq: irq number
51 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h 52 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
52 */ 53 */
53int set_irq_type(unsigned int irq, unsigned int type) 54int irq_set_irq_type(unsigned int irq, unsigned int type)
54{ 55{
55 struct irq_desc *desc = irq_to_desc(irq);
56 unsigned long flags; 56 unsigned long flags;
57 int ret = -ENXIO; 57 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
58 int ret = 0;
58 59
59 if (!desc) { 60 if (!desc)
60 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); 61 return -EINVAL;
61 return -ENODEV;
62 }
63 62
64 type &= IRQ_TYPE_SENSE_MASK; 63 type &= IRQ_TYPE_SENSE_MASK;
65 if (type == IRQ_TYPE_NONE) 64 if (type != IRQ_TYPE_NONE)
66 return 0; 65 ret = __irq_set_trigger(desc, irq, type);
67 66 irq_put_desc_busunlock(desc, flags);
68 raw_spin_lock_irqsave(&desc->lock, flags);
69 ret = __irq_set_trigger(desc, irq, type);
70 raw_spin_unlock_irqrestore(&desc->lock, flags);
71 return ret; 67 return ret;
72} 68}
73EXPORT_SYMBOL(set_irq_type); 69EXPORT_SYMBOL(irq_set_irq_type);
74 70
75/** 71/**
76 * set_irq_data - set irq type data for an irq 72 * irq_set_handler_data - set irq handler data for an irq
77 * @irq: Interrupt number 73 * @irq: Interrupt number
78 * @data: Pointer to interrupt specific data 74 * @data: Pointer to interrupt specific data
79 * 75 *
80 * Set the hardware irq controller data for an irq 76 * Set the hardware irq controller data for an irq
81 */ 77 */
82int set_irq_data(unsigned int irq, void *data) 78int irq_set_handler_data(unsigned int irq, void *data)
83{ 79{
84 struct irq_desc *desc = irq_to_desc(irq);
85 unsigned long flags; 80 unsigned long flags;
81 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
86 82
87 if (!desc) { 83 if (!desc)
88 printk(KERN_ERR
89 "Trying to install controller data for IRQ%d\n", irq);
90 return -EINVAL; 84 return -EINVAL;
91 }
92
93 raw_spin_lock_irqsave(&desc->lock, flags);
94 desc->irq_data.handler_data = data; 85 desc->irq_data.handler_data = data;
95 raw_spin_unlock_irqrestore(&desc->lock, flags); 86 irq_put_desc_unlock(desc, flags);
96 return 0; 87 return 0;
97} 88}
98EXPORT_SYMBOL(set_irq_data); 89EXPORT_SYMBOL(irq_set_handler_data);
99 90
100/** 91/**
101 * set_irq_msi - set MSI descriptor data for an irq 92 * irq_set_msi_desc - set MSI descriptor data for an irq
102 * @irq: Interrupt number 93 * @irq: Interrupt number
103 * @entry: Pointer to MSI descriptor data 94 * @entry: Pointer to MSI descriptor data
104 * 95 *
105 * Set the MSI descriptor entry for an irq 96 * Set the MSI descriptor entry for an irq
106 */ 97 */
107int set_irq_msi(unsigned int irq, struct msi_desc *entry) 98int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
108{ 99{
109 struct irq_desc *desc = irq_to_desc(irq);
110 unsigned long flags; 100 unsigned long flags;
101 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
111 102
112 if (!desc) { 103 if (!desc)
113 printk(KERN_ERR
114 "Trying to install msi data for IRQ%d\n", irq);
115 return -EINVAL; 104 return -EINVAL;
116 }
117
118 raw_spin_lock_irqsave(&desc->lock, flags);
119 desc->irq_data.msi_desc = entry; 105 desc->irq_data.msi_desc = entry;
120 if (entry) 106 if (entry)
121 entry->irq = irq; 107 entry->irq = irq;
122 raw_spin_unlock_irqrestore(&desc->lock, flags); 108 irq_put_desc_unlock(desc, flags);
123 return 0; 109 return 0;
124} 110}
125 111
126/** 112/**
127 * set_irq_chip_data - set irq chip data for an irq 113 * irq_set_chip_data - set irq chip data for an irq
128 * @irq: Interrupt number 114 * @irq: Interrupt number
129 * @data: Pointer to chip specific data 115 * @data: Pointer to chip specific data
130 * 116 *
131 * Set the hardware irq chip data for an irq 117 * Set the hardware irq chip data for an irq
132 */ 118 */
133int set_irq_chip_data(unsigned int irq, void *data) 119int irq_set_chip_data(unsigned int irq, void *data)
134{ 120{
135 struct irq_desc *desc = irq_to_desc(irq);
136 unsigned long flags; 121 unsigned long flags;
122 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
137 123
138 if (!desc) { 124 if (!desc)
139 printk(KERN_ERR
140 "Trying to install chip data for IRQ%d\n", irq);
141 return -EINVAL;
142 }
143
144 if (!desc->irq_data.chip) {
145 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
146 return -EINVAL; 125 return -EINVAL;
147 }
148
149 raw_spin_lock_irqsave(&desc->lock, flags);
150 desc->irq_data.chip_data = data; 126 desc->irq_data.chip_data = data;
151 raw_spin_unlock_irqrestore(&desc->lock, flags); 127 irq_put_desc_unlock(desc, flags);
152
153 return 0; 128 return 0;
154} 129}
155EXPORT_SYMBOL(set_irq_chip_data); 130EXPORT_SYMBOL(irq_set_chip_data);
156 131
157struct irq_data *irq_get_irq_data(unsigned int irq) 132struct irq_data *irq_get_irq_data(unsigned int irq)
158{ 133{
@@ -162,221 +137,71 @@ struct irq_data *irq_get_irq_data(unsigned int irq)
162} 137}
163EXPORT_SYMBOL_GPL(irq_get_irq_data); 138EXPORT_SYMBOL_GPL(irq_get_irq_data);
164 139
165/** 140static void irq_state_clr_disabled(struct irq_desc *desc)
166 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
167 *
168 * @irq: Interrupt number
169 * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
170 *
171 * The IRQ_NESTED_THREAD flag indicates that on
172 * request_threaded_irq() no separate interrupt thread should be
173 * created for the irq as the handler are called nested in the
174 * context of a demultiplexing interrupt handler thread.
175 */
176void set_irq_nested_thread(unsigned int irq, int nest)
177{
178 struct irq_desc *desc = irq_to_desc(irq);
179 unsigned long flags;
180
181 if (!desc)
182 return;
183
184 raw_spin_lock_irqsave(&desc->lock, flags);
185 if (nest)
186 desc->status |= IRQ_NESTED_THREAD;
187 else
188 desc->status &= ~IRQ_NESTED_THREAD;
189 raw_spin_unlock_irqrestore(&desc->lock, flags);
190}
191EXPORT_SYMBOL_GPL(set_irq_nested_thread);
192
193/*
194 * default enable function
195 */
196static void default_enable(struct irq_data *data)
197{ 141{
198 struct irq_desc *desc = irq_data_to_desc(data); 142 irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED);
199
200 desc->irq_data.chip->irq_unmask(&desc->irq_data);
201 desc->status &= ~IRQ_MASKED;
202} 143}
203 144
204/* 145static void irq_state_set_disabled(struct irq_desc *desc)
205 * default disable function
206 */
207static void default_disable(struct irq_data *data)
208{
209}
210
211/*
212 * default startup function
213 */
214static unsigned int default_startup(struct irq_data *data)
215{ 146{
216 struct irq_desc *desc = irq_data_to_desc(data); 147 irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
217
218 desc->irq_data.chip->irq_enable(data);
219 return 0;
220} 148}
221 149
222/* 150static void irq_state_clr_masked(struct irq_desc *desc)
223 * default shutdown function
224 */
225static void default_shutdown(struct irq_data *data)
226{ 151{
227 struct irq_desc *desc = irq_data_to_desc(data); 152 irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED);
228
229 desc->irq_data.chip->irq_mask(&desc->irq_data);
230 desc->status |= IRQ_MASKED;
231} 153}
232 154
233#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED 155static void irq_state_set_masked(struct irq_desc *desc)
234/* Temporary migration helpers */
235static void compat_irq_mask(struct irq_data *data)
236{ 156{
237 data->chip->mask(data->irq); 157 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
238} 158}
239 159
240static void compat_irq_unmask(struct irq_data *data) 160int irq_startup(struct irq_desc *desc)
241{ 161{
242 data->chip->unmask(data->irq); 162 irq_state_clr_disabled(desc);
243} 163 desc->depth = 0;
244 164
245static void compat_irq_ack(struct irq_data *data) 165 if (desc->irq_data.chip->irq_startup) {
246{ 166 int ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
247 data->chip->ack(data->irq); 167 irq_state_clr_masked(desc);
248} 168 return ret;
249 169 }
250static void compat_irq_mask_ack(struct irq_data *data)
251{
252 data->chip->mask_ack(data->irq);
253}
254
255static void compat_irq_eoi(struct irq_data *data)
256{
257 data->chip->eoi(data->irq);
258}
259
260static void compat_irq_enable(struct irq_data *data)
261{
262 data->chip->enable(data->irq);
263}
264
265static void compat_irq_disable(struct irq_data *data)
266{
267 data->chip->disable(data->irq);
268}
269
270static void compat_irq_shutdown(struct irq_data *data)
271{
272 data->chip->shutdown(data->irq);
273}
274
275static unsigned int compat_irq_startup(struct irq_data *data)
276{
277 return data->chip->startup(data->irq);
278}
279
280static int compat_irq_set_affinity(struct irq_data *data,
281 const struct cpumask *dest, bool force)
282{
283 return data->chip->set_affinity(data->irq, dest);
284}
285
286static int compat_irq_set_type(struct irq_data *data, unsigned int type)
287{
288 return data->chip->set_type(data->irq, type);
289}
290
291static int compat_irq_set_wake(struct irq_data *data, unsigned int on)
292{
293 return data->chip->set_wake(data->irq, on);
294}
295 170
296static int compat_irq_retrigger(struct irq_data *data) 171 irq_enable(desc);
297{ 172 return 0;
298 return data->chip->retrigger(data->irq);
299} 173}
300 174
301static void compat_bus_lock(struct irq_data *data) 175void irq_shutdown(struct irq_desc *desc)
302{ 176{
303 data->chip->bus_lock(data->irq); 177 irq_state_set_disabled(desc);
178 desc->depth = 1;
179 if (desc->irq_data.chip->irq_shutdown)
180 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
181 if (desc->irq_data.chip->irq_disable)
182 desc->irq_data.chip->irq_disable(&desc->irq_data);
183 else
184 desc->irq_data.chip->irq_mask(&desc->irq_data);
185 irq_state_set_masked(desc);
304} 186}
305 187
306static void compat_bus_sync_unlock(struct irq_data *data) 188void irq_enable(struct irq_desc *desc)
307{ 189{
308 data->chip->bus_sync_unlock(data->irq); 190 irq_state_clr_disabled(desc);
191 if (desc->irq_data.chip->irq_enable)
192 desc->irq_data.chip->irq_enable(&desc->irq_data);
193 else
194 desc->irq_data.chip->irq_unmask(&desc->irq_data);
195 irq_state_clr_masked(desc);
309} 196}
310#endif
311 197
312/* 198void irq_disable(struct irq_desc *desc)
313 * Fixup enable/disable function pointers
314 */
315void irq_chip_set_defaults(struct irq_chip *chip)
316{ 199{
317#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED 200 irq_state_set_disabled(desc);
318 /* 201 if (desc->irq_data.chip->irq_disable) {
319 * Compat fixup functions need to be before we set the 202 desc->irq_data.chip->irq_disable(&desc->irq_data);
320 * defaults for enable/disable/startup/shutdown 203 irq_state_set_masked(desc);
321 */ 204 }
322 if (chip->enable)
323 chip->irq_enable = compat_irq_enable;
324 if (chip->disable)
325 chip->irq_disable = compat_irq_disable;
326 if (chip->shutdown)
327 chip->irq_shutdown = compat_irq_shutdown;
328 if (chip->startup)
329 chip->irq_startup = compat_irq_startup;
330#endif
331 /*
332 * The real defaults
333 */
334 if (!chip->irq_enable)
335 chip->irq_enable = default_enable;
336 if (!chip->irq_disable)
337 chip->irq_disable = default_disable;
338 if (!chip->irq_startup)
339 chip->irq_startup = default_startup;
340 /*
341 * We use chip->irq_disable, when the user provided its own. When
342 * we have default_disable set for chip->irq_disable, then we need
343 * to use default_shutdown, otherwise the irq line is not
344 * disabled on free_irq():
345 */
346 if (!chip->irq_shutdown)
347 chip->irq_shutdown = chip->irq_disable != default_disable ?
348 chip->irq_disable : default_shutdown;
349
350#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
351 if (!chip->end)
352 chip->end = dummy_irq_chip.end;
353
354 /*
355 * Now fix up the remaining compat handlers
356 */
357 if (chip->bus_lock)
358 chip->irq_bus_lock = compat_bus_lock;
359 if (chip->bus_sync_unlock)
360 chip->irq_bus_sync_unlock = compat_bus_sync_unlock;
361 if (chip->mask)
362 chip->irq_mask = compat_irq_mask;
363 if (chip->unmask)
364 chip->irq_unmask = compat_irq_unmask;
365 if (chip->ack)
366 chip->irq_ack = compat_irq_ack;
367 if (chip->mask_ack)
368 chip->irq_mask_ack = compat_irq_mask_ack;
369 if (chip->eoi)
370 chip->irq_eoi = compat_irq_eoi;
371 if (chip->set_affinity)
372 chip->irq_set_affinity = compat_irq_set_affinity;
373 if (chip->set_type)
374 chip->irq_set_type = compat_irq_set_type;
375 if (chip->set_wake)
376 chip->irq_set_wake = compat_irq_set_wake;
377 if (chip->retrigger)
378 chip->irq_retrigger = compat_irq_retrigger;
379#endif
380} 205}
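For illustration, a minimal user-space sketch of the accessor pattern introduced above: irqd_set()/irqd_clear() on the descriptor state plus the depth handling done by irq_startup() and irq_shutdown(). The structures and bit values here are invented stand-ins, not the kernel's definitions.

/* Toy model of the IRQD_* state accessors; bit values are placeholders. */
#include <stdio.h>

#define IRQD_IRQ_DISABLED	(1u << 16)	/* invented value */
#define IRQD_IRQ_MASKED		(1u << 17)	/* invented value */

struct irq_data { unsigned int state_use_accessors; };
struct irq_desc { struct irq_data irq_data; unsigned int depth; };

static void irqd_set(struct irq_data *d, unsigned int mask)
{
	d->state_use_accessors |= mask;
}

static void irqd_clear(struct irq_data *d, unsigned int mask)
{
	d->state_use_accessors &= ~mask;
}

static int irqd_irq_disabled(struct irq_data *d)
{
	return (d->state_use_accessors & IRQD_IRQ_DISABLED) != 0;
}

int main(void)
{
	struct irq_desc desc = { { IRQD_IRQ_DISABLED | IRQD_IRQ_MASKED }, 1 };

	/* "startup": clear DISABLED and MASKED, reset the disable depth */
	irqd_clear(&desc.irq_data, IRQD_IRQ_DISABLED | IRQD_IRQ_MASKED);
	desc.depth = 0;
	printf("after startup: disabled=%d depth=%u\n",
	       irqd_irq_disabled(&desc.irq_data), desc.depth);

	/* "shutdown": set DISABLED and MASKED, depth back to 1 */
	irqd_set(&desc.irq_data, IRQD_IRQ_DISABLED | IRQD_IRQ_MASKED);
	desc.depth = 1;
	printf("after shutdown: disabled=%d depth=%u\n",
	       irqd_irq_disabled(&desc.irq_data), desc.depth);
	return 0;
}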
381 206
382static inline void mask_ack_irq(struct irq_desc *desc) 207static inline void mask_ack_irq(struct irq_desc *desc)
@@ -388,22 +213,22 @@ static inline void mask_ack_irq(struct irq_desc *desc)
388 if (desc->irq_data.chip->irq_ack) 213 if (desc->irq_data.chip->irq_ack)
389 desc->irq_data.chip->irq_ack(&desc->irq_data); 214 desc->irq_data.chip->irq_ack(&desc->irq_data);
390 } 215 }
391 desc->status |= IRQ_MASKED; 216 irq_state_set_masked(desc);
392} 217}
393 218
394static inline void mask_irq(struct irq_desc *desc) 219void mask_irq(struct irq_desc *desc)
395{ 220{
396 if (desc->irq_data.chip->irq_mask) { 221 if (desc->irq_data.chip->irq_mask) {
397 desc->irq_data.chip->irq_mask(&desc->irq_data); 222 desc->irq_data.chip->irq_mask(&desc->irq_data);
398 desc->status |= IRQ_MASKED; 223 irq_state_set_masked(desc);
399 } 224 }
400} 225}
401 226
402static inline void unmask_irq(struct irq_desc *desc) 227void unmask_irq(struct irq_desc *desc)
403{ 228{
404 if (desc->irq_data.chip->irq_unmask) { 229 if (desc->irq_data.chip->irq_unmask) {
405 desc->irq_data.chip->irq_unmask(&desc->irq_data); 230 desc->irq_data.chip->irq_unmask(&desc->irq_data);
406 desc->status &= ~IRQ_MASKED; 231 irq_state_clr_masked(desc);
407 } 232 }
408} 233}
409 234
@@ -428,10 +253,10 @@ void handle_nested_irq(unsigned int irq)
428 kstat_incr_irqs_this_cpu(irq, desc); 253 kstat_incr_irqs_this_cpu(irq, desc);
429 254
430 action = desc->action; 255 action = desc->action;
431 if (unlikely(!action || (desc->status & IRQ_DISABLED))) 256 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data)))
432 goto out_unlock; 257 goto out_unlock;
433 258
434 desc->status |= IRQ_INPROGRESS; 259 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
435 raw_spin_unlock_irq(&desc->lock); 260 raw_spin_unlock_irq(&desc->lock);
436 261
437 action_ret = action->thread_fn(action->irq, action->dev_id); 262 action_ret = action->thread_fn(action->irq, action->dev_id);
@@ -439,13 +264,20 @@ void handle_nested_irq(unsigned int irq)
439 note_interrupt(irq, desc, action_ret); 264 note_interrupt(irq, desc, action_ret);
440 265
441 raw_spin_lock_irq(&desc->lock); 266 raw_spin_lock_irq(&desc->lock);
442 desc->status &= ~IRQ_INPROGRESS; 267 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
443 268
444out_unlock: 269out_unlock:
445 raw_spin_unlock_irq(&desc->lock); 270 raw_spin_unlock_irq(&desc->lock);
446} 271}
447EXPORT_SYMBOL_GPL(handle_nested_irq); 272EXPORT_SYMBOL_GPL(handle_nested_irq);
448 273
274static bool irq_check_poll(struct irq_desc *desc)
275{
276 if (!(desc->istate & IRQS_POLL_INPROGRESS))
277 return false;
278 return irq_wait_for_poll(desc);
279}
280
449/** 281/**
450 * handle_simple_irq - Simple and software-decoded IRQs. 282 * handle_simple_irq - Simple and software-decoded IRQs.
451 * @irq: the interrupt number 283 * @irq: the interrupt number
@@ -461,29 +293,20 @@ EXPORT_SYMBOL_GPL(handle_nested_irq);
461void 293void
462handle_simple_irq(unsigned int irq, struct irq_desc *desc) 294handle_simple_irq(unsigned int irq, struct irq_desc *desc)
463{ 295{
464 struct irqaction *action;
465 irqreturn_t action_ret;
466
467 raw_spin_lock(&desc->lock); 296 raw_spin_lock(&desc->lock);
468 297
469 if (unlikely(desc->status & IRQ_INPROGRESS)) 298 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
470 goto out_unlock; 299 if (!irq_check_poll(desc))
471 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 300 goto out_unlock;
301
302 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
472 kstat_incr_irqs_this_cpu(irq, desc); 303 kstat_incr_irqs_this_cpu(irq, desc);
473 304
474 action = desc->action; 305 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
475 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
476 goto out_unlock; 306 goto out_unlock;
477 307
478 desc->status |= IRQ_INPROGRESS; 308 handle_irq_event(desc);
479 raw_spin_unlock(&desc->lock);
480 309
481 action_ret = handle_IRQ_event(irq, action);
482 if (!noirqdebug)
483 note_interrupt(irq, desc, action_ret);
484
485 raw_spin_lock(&desc->lock);
486 desc->status &= ~IRQ_INPROGRESS;
487out_unlock: 310out_unlock:
488 raw_spin_unlock(&desc->lock); 311 raw_spin_unlock(&desc->lock);
489} 312}
@@ -501,42 +324,42 @@ out_unlock:
501void 324void
502handle_level_irq(unsigned int irq, struct irq_desc *desc) 325handle_level_irq(unsigned int irq, struct irq_desc *desc)
503{ 326{
504 struct irqaction *action;
505 irqreturn_t action_ret;
506
507 raw_spin_lock(&desc->lock); 327 raw_spin_lock(&desc->lock);
508 mask_ack_irq(desc); 328 mask_ack_irq(desc);
509 329
510 if (unlikely(desc->status & IRQ_INPROGRESS)) 330 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
511 goto out_unlock; 331 if (!irq_check_poll(desc))
512 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 332 goto out_unlock;
333
334 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
513 kstat_incr_irqs_this_cpu(irq, desc); 335 kstat_incr_irqs_this_cpu(irq, desc);
514 336
515 /* 337 /*
516 * If it's disabled or no action available 338 * If it's disabled or no action available
517 * keep it masked and get out of here 339 * keep it masked and get out of here
518 */ 340 */
519 action = desc->action; 341 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
520 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
521 goto out_unlock; 342 goto out_unlock;
522 343
523 desc->status |= IRQ_INPROGRESS; 344 handle_irq_event(desc);
524 raw_spin_unlock(&desc->lock);
525
526 action_ret = handle_IRQ_event(irq, action);
527 if (!noirqdebug)
528 note_interrupt(irq, desc, action_ret);
529
530 raw_spin_lock(&desc->lock);
531 desc->status &= ~IRQ_INPROGRESS;
532 345
533 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) 346 if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT))
534 unmask_irq(desc); 347 unmask_irq(desc);
535out_unlock: 348out_unlock:
536 raw_spin_unlock(&desc->lock); 349 raw_spin_unlock(&desc->lock);
537} 350}
538EXPORT_SYMBOL_GPL(handle_level_irq); 351EXPORT_SYMBOL_GPL(handle_level_irq);
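A stand-alone sketch of the level-triggered flow shown above: mask and ack on entry, run the handler, and unmask afterwards only when the line is neither disabled nor marked ONESHOT. The booleans and helpers are simplified stand-ins, not kernel code.

#include <stdio.h>
#include <stdbool.h>

static bool irq_disabled;	/* stand-in for irqd_irq_disabled() */
static bool oneshot;		/* stand-in for IRQS_ONESHOT */

static void mask_ack(void) { printf("mask + ack\n"); }
static void unmask(void)   { printf("unmask\n"); }

static void level_flow(void)
{
	mask_ack();
	if (irq_disabled) {		/* disabled or no action: stay masked */
		printf("left masked\n");
		return;
	}
	printf("handler runs with the line masked\n");
	if (!irq_disabled && !oneshot)
		unmask();
	else
		printf("left masked for the irq thread\n");
}

int main(void)
{
	level_flow();		/* normal level interrupt */
	oneshot = true;
	level_flow();		/* ONESHOT: unmask is deferred */
	return 0;
}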
539 352
353#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
354static inline void preflow_handler(struct irq_desc *desc)
355{
356 if (desc->preflow_handler)
357 desc->preflow_handler(&desc->irq_data);
358}
359#else
360static inline void preflow_handler(struct irq_desc *desc) { }
361#endif
362
540/** 363/**
541 * handle_fasteoi_irq - irq handler for transparent controllers 364 * handle_fasteoi_irq - irq handler for transparent controllers
542 * @irq: the interrupt number 365 * @irq: the interrupt number
@@ -550,42 +373,40 @@ EXPORT_SYMBOL_GPL(handle_level_irq);
550void 373void
551handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) 374handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
552{ 375{
553 struct irqaction *action;
554 irqreturn_t action_ret;
555
556 raw_spin_lock(&desc->lock); 376 raw_spin_lock(&desc->lock);
557 377
558 if (unlikely(desc->status & IRQ_INPROGRESS)) 378 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
559 goto out; 379 if (!irq_check_poll(desc))
380 goto out;
560 381
561 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 382 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
562 kstat_incr_irqs_this_cpu(irq, desc); 383 kstat_incr_irqs_this_cpu(irq, desc);
563 384
564 /* 385 /*
565 * If it's disabled or no action available 386 * If it's disabled or no action available
566 * then mask it and get out of here: 387 * then mask it and get out of here:
567 */ 388 */
568 action = desc->action; 389 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
569 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 390 desc->istate |= IRQS_PENDING;
570 desc->status |= IRQ_PENDING;
571 mask_irq(desc); 391 mask_irq(desc);
572 goto out; 392 goto out;
573 } 393 }
574 394
575 desc->status |= IRQ_INPROGRESS; 395 if (desc->istate & IRQS_ONESHOT)
576 desc->status &= ~IRQ_PENDING; 396 mask_irq(desc);
577 raw_spin_unlock(&desc->lock);
578 397
579 action_ret = handle_IRQ_event(irq, action); 398 preflow_handler(desc);
580 if (!noirqdebug) 399 handle_irq_event(desc);
581 note_interrupt(irq, desc, action_ret);
582 400
583 raw_spin_lock(&desc->lock); 401out_eoi:
584 desc->status &= ~IRQ_INPROGRESS;
585out:
586 desc->irq_data.chip->irq_eoi(&desc->irq_data); 402 desc->irq_data.chip->irq_eoi(&desc->irq_data);
587 403out_unlock:
588 raw_spin_unlock(&desc->lock); 404 raw_spin_unlock(&desc->lock);
405 return;
406out:
407 if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED))
408 goto out_eoi;
409 goto out_unlock;
589} 410}
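The restructured exit paths above still send the EOI for an unhandled interrupt unless the chip advertises IRQCHIP_EOI_IF_HANDLED. A small stand-alone model of just that decision; the flag value is invented.

#include <stdio.h>
#include <stdbool.h>

#define CHIP_EOI_IF_HANDLED	0x1	/* stand-in for IRQCHIP_EOI_IF_HANDLED */

static void send_eoi(void) { printf("EOI sent\n"); }

static void fasteoi_exit(bool handled, unsigned int chip_flags)
{
	if (handled) {
		send_eoi();			/* out_eoi path */
		return;
	}
	/* unhandled: mirror the "out:" path above */
	if (!(chip_flags & CHIP_EOI_IF_HANDLED))
		send_eoi();
	else
		printf("EOI skipped for unhandled irq\n");
}

int main(void)
{
	fasteoi_exit(true, 0);
	fasteoi_exit(false, 0);
	fasteoi_exit(false, CHIP_EOI_IF_HANDLED);
	return 0;
}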
590 411
591/** 412/**
@@ -594,7 +415,7 @@ out:
594 * @desc: the interrupt description structure for this irq 415 * @desc: the interrupt description structure for this irq
595 * 416 *
596 * Interrupt occures on the falling and/or rising edge of a hardware 417 * Interrupt occures on the falling and/or rising edge of a hardware
597 * signal. The occurence is latched into the irq controller hardware 418 * signal. The occurrence is latched into the irq controller hardware
598 * and must be acked in order to be reenabled. After the ack another 419 * and must be acked in order to be reenabled. After the ack another
599 * interrupt can happen on the same source even before the first one 420 * interrupt can happen on the same source even before the first one
600 * is handled by the associated event handler. If this happens it 421 * is handled by the associated event handler. If this happens it
@@ -609,32 +430,27 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
609{ 430{
610 raw_spin_lock(&desc->lock); 431 raw_spin_lock(&desc->lock);
611 432
612 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 433 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
613
614 /* 434 /*
615 * If we're currently running this IRQ, or its disabled, 435 * If we're currently running this IRQ, or its disabled,
616 * we shouldn't process the IRQ. Mark it pending, handle 436 * we shouldn't process the IRQ. Mark it pending, handle
617 * the necessary masking and go out 437 * the necessary masking and go out
618 */ 438 */
619 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || 439 if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
620 !desc->action)) { 440 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
621 desc->status |= (IRQ_PENDING | IRQ_MASKED); 441 if (!irq_check_poll(desc)) {
622 mask_ack_irq(desc); 442 desc->istate |= IRQS_PENDING;
623 goto out_unlock; 443 mask_ack_irq(desc);
444 goto out_unlock;
445 }
624 } 446 }
625 kstat_incr_irqs_this_cpu(irq, desc); 447 kstat_incr_irqs_this_cpu(irq, desc);
626 448
627 /* Start handling the irq */ 449 /* Start handling the irq */
628 desc->irq_data.chip->irq_ack(&desc->irq_data); 450 desc->irq_data.chip->irq_ack(&desc->irq_data);
629 451
630 /* Mark the IRQ currently in progress.*/
631 desc->status |= IRQ_INPROGRESS;
632
633 do { 452 do {
634 struct irqaction *action = desc->action; 453 if (unlikely(!desc->action)) {
635 irqreturn_t action_ret;
636
637 if (unlikely(!action)) {
638 mask_irq(desc); 454 mask_irq(desc);
639 goto out_unlock; 455 goto out_unlock;
640 } 456 }
@@ -644,26 +460,66 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
644 * one, we could have masked the irq. 460 * one, we could have masked the irq.
645 * Re-enable it, if it was not disabled in the meantime. 461 * Re-enable it, if it was not disabled in the meantime.
646 */ 462 */
647 if (unlikely((desc->status & 463 if (unlikely(desc->istate & IRQS_PENDING)) {
648 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 464 if (!irqd_irq_disabled(&desc->irq_data) &&
649 (IRQ_PENDING | IRQ_MASKED))) { 465 irqd_irq_masked(&desc->irq_data))
650 unmask_irq(desc); 466 unmask_irq(desc);
651 } 467 }
652 468
653 desc->status &= ~IRQ_PENDING; 469 handle_irq_event(desc);
654 raw_spin_unlock(&desc->lock);
655 action_ret = handle_IRQ_event(irq, action);
656 if (!noirqdebug)
657 note_interrupt(irq, desc, action_ret);
658 raw_spin_lock(&desc->lock);
659 470
660 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); 471 } while ((desc->istate & IRQS_PENDING) &&
472 !irqd_irq_disabled(&desc->irq_data));
661 473
662 desc->status &= ~IRQ_INPROGRESS;
663out_unlock: 474out_unlock:
664 raw_spin_unlock(&desc->lock); 475 raw_spin_unlock(&desc->lock);
665} 476}
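A stand-alone model of the replay loop above: edges latched while the handler runs mark the interrupt pending, and the loop keeps draining events until nothing is pending. The counter and flag are stand-ins for the hardware latch and IRQS_PENDING.

#include <stdio.h>
#include <stdbool.h>

static int latched;		/* edges recorded by the "hardware" */
static bool pending;		/* stand-in for IRQS_PENDING */

static void device_fires(void) { latched++; }

static void handle_event(void)
{
	printf("handled one edge\n");
	if (latched > 0) {	/* another edge arrived while we were busy */
		latched--;
		pending = true;
	}
}

int main(void)
{
	device_fires();		/* initial edge */
	device_fires();		/* arrives during handling */
	device_fires();

	latched--;		/* ack the first edge */
	do {
		pending = false;	/* like: istate &= ~IRQS_PENDING */
		handle_event();
	} while (pending);		/* like: while (istate & IRQS_PENDING) */
	return 0;
}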
666 477
478#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
479/**
480 * handle_edge_eoi_irq - edge eoi type IRQ handler
481 * @irq: the interrupt number
482 * @desc: the interrupt description structure for this irq
483 *
484 * Similar to the above handle_edge_irq, but using eoi and w/o the
485 * mask/unmask logic.
486 */
487void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
488{
489 struct irq_chip *chip = irq_desc_get_chip(desc);
490
491 raw_spin_lock(&desc->lock);
492
493 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
494 /*
495 * If we're currently running this IRQ, or its disabled,
496 * we shouldn't process the IRQ. Mark it pending, handle
497 * the necessary masking and go out
498 */
499 if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
500 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
501 if (!irq_check_poll(desc)) {
502 desc->istate |= IRQS_PENDING;
503 goto out_eoi;
504 }
505 }
506 kstat_incr_irqs_this_cpu(irq, desc);
507
508 do {
509 if (unlikely(!desc->action))
510 goto out_eoi;
511
512 handle_irq_event(desc);
513
514 } while ((desc->istate & IRQS_PENDING) &&
515 !irqd_irq_disabled(&desc->irq_data));
516
517out_eoi:
518 chip->irq_eoi(&desc->irq_data);
519 raw_spin_unlock(&desc->lock);
520}
521#endif
522
667/** 523/**
668 * handle_percpu_irq - Per CPU local irq handler 524 * handle_percpu_irq - Per CPU local irq handler
669 * @irq: the interrupt number 525 * @irq: the interrupt number
@@ -674,103 +530,145 @@ out_unlock:
674void 530void
675handle_percpu_irq(unsigned int irq, struct irq_desc *desc) 531handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
676{ 532{
677 irqreturn_t action_ret; 533 struct irq_chip *chip = irq_desc_get_chip(desc);
678 534
679 kstat_incr_irqs_this_cpu(irq, desc); 535 kstat_incr_irqs_this_cpu(irq, desc);
680 536
681 if (desc->irq_data.chip->irq_ack) 537 if (chip->irq_ack)
682 desc->irq_data.chip->irq_ack(&desc->irq_data); 538 chip->irq_ack(&desc->irq_data);
683 539
684 action_ret = handle_IRQ_event(irq, desc->action); 540 handle_irq_event_percpu(desc, desc->action);
685 if (!noirqdebug)
686 note_interrupt(irq, desc, action_ret);
687 541
688 if (desc->irq_data.chip->irq_eoi) 542 if (chip->irq_eoi)
689 desc->irq_data.chip->irq_eoi(&desc->irq_data); 543 chip->irq_eoi(&desc->irq_data);
690} 544}
691 545
692void 546void
693__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 547__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
694 const char *name) 548 const char *name)
695{ 549{
696 struct irq_desc *desc = irq_to_desc(irq);
697 unsigned long flags; 550 unsigned long flags;
551 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
698 552
699 if (!desc) { 553 if (!desc)
700 printk(KERN_ERR
701 "Trying to install type control for IRQ%d\n", irq);
702 return; 554 return;
703 }
704 555
705 if (!handle) 556 if (!handle) {
706 handle = handle_bad_irq; 557 handle = handle_bad_irq;
707 else if (desc->irq_data.chip == &no_irq_chip) { 558 } else {
708 printk(KERN_WARNING "Trying to install %sinterrupt handler " 559 if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
709 "for IRQ%d\n", is_chained ? "chained " : "", irq); 560 goto out;
710 /*
711 * Some ARM implementations install a handler for really dumb
712 * interrupt hardware without setting an irq_chip. This worked
713 * with the ARM no_irq_chip but the check in setup_irq would
714 * prevent us to setup the interrupt at all. Switch it to
715 * dummy_irq_chip for easy transition.
716 */
717 desc->irq_data.chip = &dummy_irq_chip;
718 } 561 }
719 562
720 chip_bus_lock(desc);
721 raw_spin_lock_irqsave(&desc->lock, flags);
722
723 /* Uninstall? */ 563 /* Uninstall? */
724 if (handle == handle_bad_irq) { 564 if (handle == handle_bad_irq) {
725 if (desc->irq_data.chip != &no_irq_chip) 565 if (desc->irq_data.chip != &no_irq_chip)
726 mask_ack_irq(desc); 566 mask_ack_irq(desc);
727 desc->status |= IRQ_DISABLED; 567 irq_state_set_disabled(desc);
728 desc->depth = 1; 568 desc->depth = 1;
729 } 569 }
730 desc->handle_irq = handle; 570 desc->handle_irq = handle;
731 desc->name = name; 571 desc->name = name;
732 572
733 if (handle != handle_bad_irq && is_chained) { 573 if (handle != handle_bad_irq && is_chained) {
734 desc->status &= ~IRQ_DISABLED; 574 irq_settings_set_noprobe(desc);
735 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; 575 irq_settings_set_norequest(desc);
736 desc->depth = 0; 576 irq_startup(desc);
737 desc->irq_data.chip->irq_startup(&desc->irq_data);
738 } 577 }
739 raw_spin_unlock_irqrestore(&desc->lock, flags); 578out:
740 chip_bus_sync_unlock(desc); 579 irq_put_desc_busunlock(desc, flags);
741}
742EXPORT_SYMBOL_GPL(__set_irq_handler);
743
744void
745set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
746 irq_flow_handler_t handle)
747{
748 set_irq_chip(irq, chip);
749 __set_irq_handler(irq, handle, 0, NULL);
750} 580}
581EXPORT_SYMBOL_GPL(__irq_set_handler);
751 582
752void 583void
753set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, 584irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
754 irq_flow_handler_t handle, const char *name) 585 irq_flow_handler_t handle, const char *name)
755{ 586{
756 set_irq_chip(irq, chip); 587 irq_set_chip(irq, chip);
757 __set_irq_handler(irq, handle, 0, name); 588 __irq_set_handler(irq, handle, 0, name);
758} 589}
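A hypothetical irqchip-driver fragment showing how the renamed setup helpers above are typically combined. The chip, its callbacks and the way the irq number is obtained are invented; only the calls themselves come from this patch.

#include <linux/irq.h>

static void demo_mask(struct irq_data *d)   { /* mask the line in hardware */ }
static void demo_unmask(struct irq_data *d) { /* unmask the line */ }
static void demo_ack(struct irq_data *d)    { /* clear the latched edge */ }

static struct irq_chip demo_chip = {
	.name		= "demo",
	.irq_mask	= demo_mask,
	.irq_unmask	= demo_unmask,
	.irq_ack	= demo_ack,
};

/* Called once per hardware line, e.g. from the driver's probe path. */
static void demo_map_irq(unsigned int irq, void *priv)
{
	irq_set_chip_and_handler_name(irq, &demo_chip, handle_edge_irq, "edge");
	irq_set_chip_data(irq, priv);
}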
759 590
760void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) 591void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
761{ 592{
762 struct irq_desc *desc = irq_to_desc(irq);
763 unsigned long flags; 593 unsigned long flags;
594 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
764 595
765 if (!desc) 596 if (!desc)
766 return; 597 return;
598 irq_settings_clr_and_set(desc, clr, set);
599
600 irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
601 IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
602 if (irq_settings_has_no_balance_set(desc))
603 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
604 if (irq_settings_is_per_cpu(desc))
605 irqd_set(&desc->irq_data, IRQD_PER_CPU);
606 if (irq_settings_can_move_pcntxt(desc))
607 irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
608 if (irq_settings_is_level(desc))
609 irqd_set(&desc->irq_data, IRQD_LEVEL);
610
611 irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
612
613 irq_put_desc_unlock(desc, flags);
614}
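irq_modify_status() above is the single entry point for tweaking the per-irq settings bits. A hedged usage sketch with an invented caller; the flag names are the ones listed in debug.h later in this patch, and only bits covered by the modify mask take effect.

#include <linux/irq.h>

/* Hypothetical helper: allow probing/requesting again and mark the
 * line as level triggered. */
static void demo_mark_level(unsigned int irq)
{
	irq_modify_status(irq, IRQ_NOREQUEST | IRQ_NOPROBE, IRQ_LEVEL);
}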
615
616/**
617 * irq_cpu_online - Invoke all irq_cpu_online functions.
618 *
619 * Iterate through all irqs and invoke the chip.irq_cpu_online()
620 * for each.
621 */
622void irq_cpu_online(void)
623{
624 struct irq_desc *desc;
625 struct irq_chip *chip;
626 unsigned long flags;
627 unsigned int irq;
628
629 for_each_active_irq(irq) {
630 desc = irq_to_desc(irq);
631 if (!desc)
632 continue;
767 633
768 /* Sanitize flags */ 634 raw_spin_lock_irqsave(&desc->lock, flags);
769 set &= IRQF_MODIFY_MASK;
770 clr &= IRQF_MODIFY_MASK;
771 635
772 raw_spin_lock_irqsave(&desc->lock, flags); 636 chip = irq_data_get_irq_chip(&desc->irq_data);
773 desc->status &= ~clr; 637 if (chip && chip->irq_cpu_online &&
774 desc->status |= set; 638 (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
775 raw_spin_unlock_irqrestore(&desc->lock, flags); 639 !irqd_irq_disabled(&desc->irq_data)))
640 chip->irq_cpu_online(&desc->irq_data);
641
642 raw_spin_unlock_irqrestore(&desc->lock, flags);
643 }
644}
645
646/**
647 * irq_cpu_offline - Invoke all irq_cpu_offline functions.
648 *
649 * Iterate through all irqs and invoke the chip.irq_cpu_offline()
650 * for each.
651 */
652void irq_cpu_offline(void)
653{
654 struct irq_desc *desc;
655 struct irq_chip *chip;
656 unsigned long flags;
657 unsigned int irq;
658
659 for_each_active_irq(irq) {
660 desc = irq_to_desc(irq);
661 if (!desc)
662 continue;
663
664 raw_spin_lock_irqsave(&desc->lock, flags);
665
666 chip = irq_data_get_irq_chip(&desc->irq_data);
667 if (chip && chip->irq_cpu_offline &&
668 (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
669 !irqd_irq_disabled(&desc->irq_data)))
670 chip->irq_cpu_offline(&desc->irq_data);
671
672 raw_spin_unlock_irqrestore(&desc->lock, flags);
673 }
776} 674}
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
new file mode 100644
index 000000000000..306cba37e9a5
--- /dev/null
+++ b/kernel/irq/debug.h
@@ -0,0 +1,44 @@
1/*
2 * Debugging printout:
3 */
4
5#include <linux/kallsyms.h>
6
7#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
8#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
9/* FIXME */
10#define PD(f) do { } while (0)
11
12static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
13{
14 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
15 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
16 printk("->handle_irq(): %p, ", desc->handle_irq);
17 print_symbol("%s\n", (unsigned long)desc->handle_irq);
18 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
19 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
20 printk("->action(): %p\n", desc->action);
21 if (desc->action) {
22 printk("->action->handler(): %p, ", desc->action->handler);
23 print_symbol("%s\n", (unsigned long)desc->action->handler);
24 }
25
26 P(IRQ_LEVEL);
27 P(IRQ_PER_CPU);
28 P(IRQ_NOPROBE);
29 P(IRQ_NOREQUEST);
30 P(IRQ_NOAUTOEN);
31
32 PS(IRQS_AUTODETECT);
33 PS(IRQS_REPLAY);
34 PS(IRQS_WAITING);
35 PS(IRQS_PENDING);
36
37 PD(IRQS_INPROGRESS);
38 PD(IRQS_DISABLED);
39 PD(IRQS_MASKED);
40}
41
42#undef P
43#undef PS
44#undef PD
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 20dc5474947e..b5fcd96c7102 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -31,13 +31,6 @@ static unsigned int noop_ret(struct irq_data *data)
31 return 0; 31 return 0;
32} 32}
33 33
34#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
35static void compat_noop(unsigned int irq) { }
36#define END_INIT .end = compat_noop
37#else
38#define END_INIT
39#endif
40
41/* 34/*
42 * Generic no controller implementation 35 * Generic no controller implementation
43 */ 36 */
@@ -48,7 +41,6 @@ struct irq_chip no_irq_chip = {
48 .irq_enable = noop, 41 .irq_enable = noop,
49 .irq_disable = noop, 42 .irq_disable = noop,
50 .irq_ack = ack_bad, 43 .irq_ack = ack_bad,
51 END_INIT
52}; 44};
53 45
54/* 46/*
@@ -64,5 +56,4 @@ struct irq_chip dummy_irq_chip = {
64 .irq_ack = noop, 56 .irq_ack = noop,
65 .irq_mask = noop, 57 .irq_mask = noop,
66 .irq_unmask = noop, 58 .irq_unmask = noop,
67 END_INIT
68}; 59};
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3540a7190122..90cb55f6d7eb 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -51,30 +51,92 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
51 "but no thread function available.", irq, action->name); 51 "but no thread function available.", irq, action->name);
52} 52}
53 53
54/** 54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
55 * handle_IRQ_event - irq action chain handler 55{
56 * @irq: the interrupt number 56 /*
57 * @action: the interrupt action chain for this irq 57 * Wake up the handler thread for this action. In case the
58 * 58 * thread crashed and was killed we just pretend that we
59 * Handles the action chain of an irq event 59 * handled the interrupt. The hardirq handler has disabled the
60 */ 60 * device interrupt, so no irq storm is lurking. If the
61irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) 61 * RUNTHREAD bit is already set, nothing to do.
62 */
63 if (test_bit(IRQTF_DIED, &action->thread_flags) ||
64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
65 return;
66
67 /*
68 * It's safe to OR the mask lockless here. We have only two
69 * places which write to threads_oneshot: This code and the
70 * irq thread.
71 *
72 * This code is the hard irq context and can never run on two
73 * cpus in parallel. If it ever does we have more serious
74 * problems than this bitmask.
75 *
76 * The irq threads of this irq which clear their "running" bit
77 * in threads_oneshot are serialized via desc->lock against
78 * each other and they are serialized against this code by
79 * IRQS_INPROGRESS.
80 *
81 * Hard irq handler:
82 *
83 * spin_lock(desc->lock);
84 * desc->state |= IRQS_INPROGRESS;
85 * spin_unlock(desc->lock);
86 * set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
87 * desc->threads_oneshot |= mask;
88 * spin_lock(desc->lock);
89 * desc->state &= ~IRQS_INPROGRESS;
90 * spin_unlock(desc->lock);
91 *
92 * irq thread:
93 *
94 * again:
95 * spin_lock(desc->lock);
96 * if (desc->state & IRQS_INPROGRESS) {
97 * spin_unlock(desc->lock);
98 * while(desc->state & IRQS_INPROGRESS)
99 * cpu_relax();
100 * goto again;
101 * }
102 * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
103 * desc->threads_oneshot &= ~mask;
104 * spin_unlock(desc->lock);
105 *
106 * So either the thread waits for us to clear IRQS_INPROGRESS
107 * or we are waiting in the flow handler for desc->lock to be
108 * released before we reach this point. The thread also checks
109 * IRQTF_RUNTHREAD under desc->lock. If set it leaves
110 * threads_oneshot untouched and runs the thread another time.
111 */
112 desc->threads_oneshot |= action->thread_mask;
113 wake_up_process(action->thread);
114}
115
116irqreturn_t
117handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
62{ 118{
63 irqreturn_t ret, retval = IRQ_NONE; 119 irqreturn_t retval = IRQ_NONE;
64 unsigned int status = 0; 120 unsigned int random = 0, irq = desc->irq_data.irq;
65 121
66 do { 122 do {
123 irqreturn_t res;
124
67 trace_irq_handler_entry(irq, action); 125 trace_irq_handler_entry(irq, action);
68 ret = action->handler(irq, action->dev_id); 126 res = action->handler(irq, action->dev_id);
69 trace_irq_handler_exit(irq, action, ret); 127 trace_irq_handler_exit(irq, action, res);
128
129 if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
130 irq, action->handler))
131 local_irq_disable();
70 132
71 switch (ret) { 133 switch (res) {
72 case IRQ_WAKE_THREAD: 134 case IRQ_WAKE_THREAD:
73 /* 135 /*
74 * Set result to handled so the spurious check 136 * Set result to handled so the spurious check
75 * does not trigger. 137 * does not trigger.
76 */ 138 */
77 ret = IRQ_HANDLED; 139 res = IRQ_HANDLED;
78 140
79 /* 141 /*
80 * Catch drivers which return WAKE_THREAD but 142 * Catch drivers which return WAKE_THREAD but
@@ -85,36 +147,41 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
85 break; 147 break;
86 } 148 }
87 149
88 /* 150 irq_wake_thread(desc, action);
89 * Wake up the handler thread for this
90 * action. In case the thread crashed and was
91 * killed we just pretend that we handled the
92 * interrupt. The hardirq handler above has
93 * disabled the device interrupt, so no irq
94 * storm is lurking.
95 */
96 if (likely(!test_bit(IRQTF_DIED,
97 &action->thread_flags))) {
98 set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
99 wake_up_process(action->thread);
100 }
101 151
102 /* Fall through to add to randomness */ 152 /* Fall through to add to randomness */
103 case IRQ_HANDLED: 153 case IRQ_HANDLED:
104 status |= action->flags; 154 random |= action->flags;
105 break; 155 break;
106 156
107 default: 157 default:
108 break; 158 break;
109 } 159 }
110 160
111 retval |= ret; 161 retval |= res;
112 action = action->next; 162 action = action->next;
113 } while (action); 163 } while (action);
114 164
115 if (status & IRQF_SAMPLE_RANDOM) 165 if (random & IRQF_SAMPLE_RANDOM)
116 add_interrupt_randomness(irq); 166 add_interrupt_randomness(irq);
117 local_irq_disable();
118 167
168 if (!noirqdebug)
169 note_interrupt(irq, desc, retval);
119 return retval; 170 return retval;
120} 171}
172
173irqreturn_t handle_irq_event(struct irq_desc *desc)
174{
175 struct irqaction *action = desc->action;
176 irqreturn_t ret;
177
178 desc->istate &= ~IRQS_PENDING;
179 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
180 raw_spin_unlock(&desc->lock);
181
182 ret = handle_irq_event_percpu(desc, action);
183
184 raw_spin_lock(&desc->lock);
185 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
186 return ret;
187}
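For context, a hypothetical driver on the threaded path that irq_wake_thread() above serves: the primary handler returns IRQ_WAKE_THREAD and the real work runs in thread_fn, with IRQF_ONESHOT keeping the line masked until the thread finishes. The device and names are invented; only the request_threaded_irq() interface is assumed from the existing API.

#include <linux/interrupt.h>

static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
	/* quick check in hard irq context, then defer to the thread */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
	/* heavy lifting runs in the per-action irq thread */
	return IRQ_HANDLED;
}

static int demo_setup(unsigned int irq, void *dev)
{
	return request_threaded_irq(irq, demo_hardirq, demo_thread_fn,
				    IRQF_ONESHOT, "demo", dev);
}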
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 4571ae7e085a..6546431447d7 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,27 +1,87 @@
1/* 1/*
2 * IRQ subsystem internal functions and variables: 2 * IRQ subsystem internal functions and variables:
3 *
4 * Do not ever include this file from anything else than
5 * kernel/irq/. Do not even think about using any information outside
6 * of this file for your non core code.
3 */ 7 */
4#include <linux/irqdesc.h> 8#include <linux/irqdesc.h>
5 9
10#ifdef CONFIG_SPARSE_IRQ
11# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
12#else
13# define IRQ_BITMAP_BITS NR_IRQS
14#endif
15
16#define istate core_internal_state__do_not_mess_with_it
17
6extern int noirqdebug; 18extern int noirqdebug;
7 19
8#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) 20/*
21 * Bits used by threaded handlers:
22 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
23 * IRQTF_DIED - handler thread died
24 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
25 * IRQTF_AFFINITY - irq thread is requested to adjust affinity
26 * IRQTF_FORCED_THREAD - irq action is force threaded
27 */
28enum {
29 IRQTF_RUNTHREAD,
30 IRQTF_DIED,
31 IRQTF_WARNED,
32 IRQTF_AFFINITY,
33 IRQTF_FORCED_THREAD,
34};
9 35
10/* Set default functions for irq_chip structures: */ 36/*
11extern void irq_chip_set_defaults(struct irq_chip *chip); 37 * Bit masks for desc->state
38 *
39 * IRQS_AUTODETECT - autodetection in progress
40 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
41 * detection
42 * IRQS_POLL_INPROGRESS - polling in progress
43 * IRQS_ONESHOT - irq is not unmasked in primary handler
44 * IRQS_REPLAY - irq is replayed
45 * IRQS_WAITING - irq is waiting
46 * IRQS_PENDING - irq is pending and replayed later
47 * IRQS_SUSPENDED - irq is suspended
48 */
49enum {
50 IRQS_AUTODETECT = 0x00000001,
51 IRQS_SPURIOUS_DISABLED = 0x00000002,
52 IRQS_POLL_INPROGRESS = 0x00000008,
53 IRQS_ONESHOT = 0x00000020,
54 IRQS_REPLAY = 0x00000040,
55 IRQS_WAITING = 0x00000080,
56 IRQS_PENDING = 0x00000200,
57 IRQS_SUSPENDED = 0x00000800,
58};
59
60#include "debug.h"
61#include "settings.h"
12 62
13/* Set default handler: */ 63#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
14extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
15 64
16extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 65extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
17 unsigned long flags); 66 unsigned long flags);
18extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 67extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
19extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 68extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
20 69
70extern int irq_startup(struct irq_desc *desc);
71extern void irq_shutdown(struct irq_desc *desc);
72extern void irq_enable(struct irq_desc *desc);
73extern void irq_disable(struct irq_desc *desc);
74extern void mask_irq(struct irq_desc *desc);
75extern void unmask_irq(struct irq_desc *desc);
76
21extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 77extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
22 78
79irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
80irqreturn_t handle_irq_event(struct irq_desc *desc);
81
23/* Resending of interrupts :*/ 82/* Resending of interrupts :*/
24void check_irq_resend(struct irq_desc *desc, unsigned int irq); 83void check_irq_resend(struct irq_desc *desc, unsigned int irq);
84bool irq_wait_for_poll(struct irq_desc *desc);
25 85
26#ifdef CONFIG_PROC_FS 86#ifdef CONFIG_PROC_FS
27extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 87extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
@@ -37,20 +97,10 @@ static inline void unregister_handler_proc(unsigned int irq,
37 struct irqaction *action) { } 97 struct irqaction *action) { }
38#endif 98#endif
39 99
40extern int irq_select_affinity_usr(unsigned int irq); 100extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
41 101
42extern void irq_set_thread_affinity(struct irq_desc *desc); 102extern void irq_set_thread_affinity(struct irq_desc *desc);
43 103
44#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
45static inline void irq_end(unsigned int irq, struct irq_desc *desc)
46{
47 if (desc->irq_data.chip && desc->irq_data.chip->end)
48 desc->irq_data.chip->end(irq);
49}
50#else
51static inline void irq_end(unsigned int irq, struct irq_desc *desc) { }
52#endif
53
54/* Inline functions for support of irq chips on slow busses */ 104/* Inline functions for support of irq chips on slow busses */
55static inline void chip_bus_lock(struct irq_desc *desc) 105static inline void chip_bus_lock(struct irq_desc *desc)
56{ 106{
@@ -64,43 +114,58 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
64 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); 114 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
65} 115}
66 116
117struct irq_desc *
118__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus);
119void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
120
121static inline struct irq_desc *
122irq_get_desc_buslock(unsigned int irq, unsigned long *flags)
123{
124 return __irq_get_desc_lock(irq, flags, true);
125}
126
127static inline void
128irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
129{
130 __irq_put_desc_unlock(desc, flags, true);
131}
132
133static inline struct irq_desc *
134irq_get_desc_lock(unsigned int irq, unsigned long *flags)
135{
136 return __irq_get_desc_lock(irq, flags, false);
137}
138
139static inline void
140irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
141{
142 __irq_put_desc_unlock(desc, flags, false);
143}
144
67/* 145/*
68 * Debugging printout: 146 * Manipulation functions for irq_data.state
69 */ 147 */
148static inline void irqd_set_move_pending(struct irq_data *d)
149{
150 d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
151}
70 152
71#include <linux/kallsyms.h> 153static inline void irqd_clr_move_pending(struct irq_data *d)
72 154{
73#define P(f) if (desc->status & f) printk("%14s set\n", #f) 155 d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
156}
74 157
75static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) 158static inline void irqd_clear(struct irq_data *d, unsigned int mask)
76{ 159{
77 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", 160 d->state_use_accessors &= ~mask;
78 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
79 printk("->handle_irq(): %p, ", desc->handle_irq);
80 print_symbol("%s\n", (unsigned long)desc->handle_irq);
81 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
82 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
83 printk("->action(): %p\n", desc->action);
84 if (desc->action) {
85 printk("->action->handler(): %p, ", desc->action->handler);
86 print_symbol("%s\n", (unsigned long)desc->action->handler);
87 }
88
89 P(IRQ_INPROGRESS);
90 P(IRQ_DISABLED);
91 P(IRQ_PENDING);
92 P(IRQ_REPLAY);
93 P(IRQ_AUTODETECT);
94 P(IRQ_WAITING);
95 P(IRQ_LEVEL);
96 P(IRQ_MASKED);
97#ifdef CONFIG_IRQ_PER_CPU
98 P(IRQ_PER_CPU);
99#endif
100 P(IRQ_NOPROBE);
101 P(IRQ_NOREQUEST);
102 P(IRQ_NOAUTOEN);
103} 161}
104 162
105#undef P 163static inline void irqd_set(struct irq_data *d, unsigned int mask)
164{
165 d->state_use_accessors |= mask;
166}
106 167
168static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
169{
170 return d->state_use_accessors & mask;
171}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 282f20230e67..2c039c9b9383 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -79,7 +79,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
79 desc->irq_data.chip_data = NULL; 79 desc->irq_data.chip_data = NULL;
80 desc->irq_data.handler_data = NULL; 80 desc->irq_data.handler_data = NULL;
81 desc->irq_data.msi_desc = NULL; 81 desc->irq_data.msi_desc = NULL;
82 desc->status = IRQ_DEFAULT_INIT_FLAGS; 82 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
83 irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
83 desc->handle_irq = handle_bad_irq; 84 desc->handle_irq = handle_bad_irq;
84 desc->depth = 1; 85 desc->depth = 1;
85 desc->irq_count = 0; 86 desc->irq_count = 0;
@@ -94,7 +95,7 @@ int nr_irqs = NR_IRQS;
94EXPORT_SYMBOL_GPL(nr_irqs); 95EXPORT_SYMBOL_GPL(nr_irqs);
95 96
96static DEFINE_MUTEX(sparse_irq_lock); 97static DEFINE_MUTEX(sparse_irq_lock);
97static DECLARE_BITMAP(allocated_irqs, NR_IRQS); 98static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
98 99
99#ifdef CONFIG_SPARSE_IRQ 100#ifdef CONFIG_SPARSE_IRQ
100 101
@@ -197,13 +198,12 @@ err:
197 return -ENOMEM; 198 return -ENOMEM;
198} 199}
199 200
200struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) 201static int irq_expand_nr_irqs(unsigned int nr)
201{ 202{
202 int res = irq_alloc_descs(irq, irq, 1, node); 203 if (nr > IRQ_BITMAP_BITS)
203 204 return -ENOMEM;
204 if (res == -EEXIST || res == irq) 205 nr_irqs = nr;
205 return irq_to_desc(irq); 206 return 0;
206 return NULL;
207} 207}
208 208
209int __init early_irq_init(void) 209int __init early_irq_init(void)
@@ -217,6 +217,15 @@ int __init early_irq_init(void)
217 initcnt = arch_probe_nr_irqs(); 217 initcnt = arch_probe_nr_irqs();
218 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); 218 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
219 219
220 if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
221 nr_irqs = IRQ_BITMAP_BITS;
222
223 if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
224 initcnt = IRQ_BITMAP_BITS;
225
226 if (initcnt > nr_irqs)
227 nr_irqs = initcnt;
228
220 for (i = 0; i < initcnt; i++) { 229 for (i = 0; i < initcnt; i++) {
221 desc = alloc_desc(i, node); 230 desc = alloc_desc(i, node);
222 set_bit(i, allocated_irqs); 231 set_bit(i, allocated_irqs);
@@ -229,7 +238,6 @@ int __init early_irq_init(void)
229 238
230struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { 239struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
231 [0 ... NR_IRQS-1] = { 240 [0 ... NR_IRQS-1] = {
232 .status = IRQ_DEFAULT_INIT_FLAGS,
233 .handle_irq = handle_bad_irq, 241 .handle_irq = handle_bad_irq,
234 .depth = 1, 242 .depth = 1,
235 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), 243 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
@@ -251,8 +259,8 @@ int __init early_irq_init(void)
251 for (i = 0; i < count; i++) { 259 for (i = 0; i < count; i++) {
252 desc[i].irq_data.irq = i; 260 desc[i].irq_data.irq = i;
253 desc[i].irq_data.chip = &no_irq_chip; 261 desc[i].irq_data.chip = &no_irq_chip;
254 /* TODO : do this allocation on-demand ... */
255 desc[i].kstat_irqs = alloc_percpu(unsigned int); 262 desc[i].kstat_irqs = alloc_percpu(unsigned int);
263 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
256 alloc_masks(desc + i, GFP_KERNEL, node); 264 alloc_masks(desc + i, GFP_KERNEL, node);
257 desc_smp_init(desc + i, node); 265 desc_smp_init(desc + i, node);
258 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 266 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -265,11 +273,6 @@ struct irq_desc *irq_to_desc(unsigned int irq)
265 return (irq < NR_IRQS) ? irq_desc + irq : NULL; 273 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
266} 274}
267 275
268struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
269{
270 return irq_to_desc(irq);
271}
272
273static void free_desc(unsigned int irq) 276static void free_desc(unsigned int irq)
274{ 277{
275 dynamic_irq_cleanup(irq); 278 dynamic_irq_cleanup(irq);
@@ -277,24 +280,14 @@ static void free_desc(unsigned int irq)
277 280
278static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 281static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
279{ 282{
280#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
281 struct irq_desc *desc;
282 unsigned int i;
283
284 for (i = 0; i < cnt; i++) {
285 desc = irq_to_desc(start + i);
286 if (desc && !desc->kstat_irqs) {
287 unsigned int __percpu *stats = alloc_percpu(unsigned int);
288
289 if (!stats)
290 return -1;
291 if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
292 free_percpu(stats);
293 }
294 }
295#endif
296 return start; 283 return start;
297} 284}
285
286static int irq_expand_nr_irqs(unsigned int nr)
287{
288 return -ENOMEM;
289}
290
298#endif /* !CONFIG_SPARSE_IRQ */ 291#endif /* !CONFIG_SPARSE_IRQ */
299 292
300/* Dynamic interrupt handling */ 293/* Dynamic interrupt handling */
@@ -338,14 +331,17 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
338 331
339 mutex_lock(&sparse_irq_lock); 332 mutex_lock(&sparse_irq_lock);
340 333
341 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); 334 start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
335 from, cnt, 0);
342 ret = -EEXIST; 336 ret = -EEXIST;
343 if (irq >=0 && start != irq) 337 if (irq >=0 && start != irq)
344 goto err; 338 goto err;
345 339
346 ret = -ENOMEM; 340 if (start + cnt > nr_irqs) {
347 if (start >= nr_irqs) 341 ret = irq_expand_nr_irqs(start + cnt);
348 goto err; 342 if (ret)
343 goto err;
344 }
349 345
350 bitmap_set(allocated_irqs, start, cnt); 346 bitmap_set(allocated_irqs, start, cnt);
351 mutex_unlock(&sparse_irq_lock); 347 mutex_unlock(&sparse_irq_lock);
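The allocation path above searches the allocated_irqs bitmap for a free range and now grows nr_irqs on demand when the range extends past it. A toy stand-alone version of just the range search, using a plain array instead of the kernel's bitmap helpers:

#include <stdio.h>
#include <stdbool.h>

#define SLOTS 32
static bool used[SLOTS];

/* Find cnt consecutive free slots at or after from, mark them used,
 * and return the start index, or -1 (think -ENOMEM) on failure. */
static int alloc_range(unsigned int from, unsigned int cnt)
{
	for (unsigned int start = from; start + cnt <= SLOTS; start++) {
		bool free_run = true;

		for (unsigned int i = 0; i < cnt; i++) {
			if (used[start + i]) {
				free_run = false;
				break;
			}
		}
		if (free_run) {
			for (unsigned int i = 0; i < cnt; i++)
				used[start + i] = true;
			return (int)start;
		}
	}
	return -1;
}

int main(void)
{
	printf("first range:  %d\n", alloc_range(0, 4));	/* 0 */
	printf("second range: %d\n", alloc_range(0, 4));	/* 4 */
	return 0;
}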
@@ -392,6 +388,26 @@ unsigned int irq_get_next_irq(unsigned int offset)
392 return find_next_bit(allocated_irqs, nr_irqs, offset); 388 return find_next_bit(allocated_irqs, nr_irqs, offset);
393} 389}
394 390
391struct irq_desc *
392__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus)
393{
394 struct irq_desc *desc = irq_to_desc(irq);
395
396 if (desc) {
397 if (bus)
398 chip_bus_lock(desc);
399 raw_spin_lock_irqsave(&desc->lock, *flags);
400 }
401 return desc;
402}
403
404void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
405{
406 raw_spin_unlock_irqrestore(&desc->lock, flags);
407 if (bus)
408 chip_bus_sync_unlock(desc);
409}
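A user-space model of the lookup-and-lock pattern that __irq_get_desc_lock()/__irq_put_desc_unlock() implement above: resolve an index to a descriptor and return it locked, or NULL, and hand it back through the matching put. The bus-lock variant is omitted and a pthread mutex stands in for the raw spinlock.

#include <stdio.h>
#include <pthread.h>

struct desc {
	pthread_mutex_t lock;
	int data;
};

#define NR_DESC 4
static struct desc table[NR_DESC];

static struct desc *get_desc_lock(unsigned int idx)
{
	if (idx >= NR_DESC)
		return NULL;		/* like irq_to_desc() returning NULL */
	pthread_mutex_lock(&table[idx].lock);
	return &table[idx];
}

static void put_desc_unlock(struct desc *d)
{
	pthread_mutex_unlock(&d->lock);
}

int main(void)
{
	for (int i = 0; i < NR_DESC; i++)
		pthread_mutex_init(&table[i].lock, NULL);

	struct desc *d = get_desc_lock(2);
	if (d) {
		d->data = 42;		/* modify only while holding the lock */
		put_desc_unlock(d);
	}
	printf("desc 2 data: %d\n", table[2].data);
	return 0;
}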
410
395/** 411/**
396 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 412 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
397 * @irq: irq number to initialize 413 * @irq: irq number to initialize
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0caa59f747dd..07c1611f3899 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,17 @@
17 17
18#include "internals.h" 18#include "internals.h"
19 19
20#ifdef CONFIG_IRQ_FORCED_THREADING
21__read_mostly bool force_irqthreads;
22
23static int __init setup_forced_irqthreads(char *arg)
24{
25 force_irqthreads = true;
26 return 0;
27}
28early_param("threadirqs", setup_forced_irqthreads);
29#endif
30
20/** 31/**
21 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 32 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
22 * @irq: interrupt number to wait for 33 * @irq: interrupt number to wait for
@@ -30,7 +41,7 @@
30void synchronize_irq(unsigned int irq) 41void synchronize_irq(unsigned int irq)
31{ 42{
32 struct irq_desc *desc = irq_to_desc(irq); 43 struct irq_desc *desc = irq_to_desc(irq);
33 unsigned int status; 44 bool inprogress;
34 45
35 if (!desc) 46 if (!desc)
36 return; 47 return;
@@ -42,16 +53,16 @@ void synchronize_irq(unsigned int irq)
42 * Wait until we're out of the critical section. This might 53 * Wait until we're out of the critical section. This might
43 * give the wrong answer due to the lack of memory barriers. 54 * give the wrong answer due to the lack of memory barriers.
44 */ 55 */
45 while (desc->status & IRQ_INPROGRESS) 56 while (irqd_irq_inprogress(&desc->irq_data))
46 cpu_relax(); 57 cpu_relax();
47 58
48 /* Ok, that indicated we're done: double-check carefully. */ 59 /* Ok, that indicated we're done: double-check carefully. */
49 raw_spin_lock_irqsave(&desc->lock, flags); 60 raw_spin_lock_irqsave(&desc->lock, flags);
50 status = desc->status; 61 inprogress = irqd_irq_inprogress(&desc->irq_data);
51 raw_spin_unlock_irqrestore(&desc->lock, flags); 62 raw_spin_unlock_irqrestore(&desc->lock, flags);
52 63
53 /* Oops, that failed? */ 64 /* Oops, that failed? */
54 } while (status & IRQ_INPROGRESS); 65 } while (inprogress);
55 66
56 /* 67 /*
57 * We made sure that no hardirq handler is running. Now verify 68 * We made sure that no hardirq handler is running. Now verify
@@ -73,8 +84,8 @@ int irq_can_set_affinity(unsigned int irq)
73{ 84{
74 struct irq_desc *desc = irq_to_desc(irq); 85 struct irq_desc *desc = irq_to_desc(irq);
75 86
76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || 87 if (!desc || !irqd_can_balance(&desc->irq_data) ||
77 !desc->irq_data.chip->irq_set_affinity) 88 !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
78 return 0; 89 return 0;
79 90
80 return 1; 91 return 1;
@@ -100,67 +111,180 @@ void irq_set_thread_affinity(struct irq_desc *desc)
100 } 111 }
101} 112}
102 113
114#ifdef CONFIG_GENERIC_PENDING_IRQ
115static inline bool irq_can_move_pcntxt(struct irq_data *data)
116{
117 return irqd_can_move_in_process_context(data);
118}
119static inline bool irq_move_pending(struct irq_data *data)
120{
121 return irqd_is_setaffinity_pending(data);
122}
123static inline void
124irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
125{
126 cpumask_copy(desc->pending_mask, mask);
127}
128static inline void
129irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
130{
131 cpumask_copy(mask, desc->pending_mask);
132}
133#else
134static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; }
135static inline bool irq_move_pending(struct irq_data *data) { return false; }
136static inline void
137irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
138static inline void
139irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
140#endif
141
142int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
143{
144 struct irq_chip *chip = irq_data_get_irq_chip(data);
145 struct irq_desc *desc = irq_data_to_desc(data);
146 int ret = 0;
147
148 if (!chip || !chip->irq_set_affinity)
149 return -EINVAL;
150
151 if (irq_can_move_pcntxt(data)) {
152 ret = chip->irq_set_affinity(data, mask, false);
153 switch (ret) {
154 case IRQ_SET_MASK_OK:
155 cpumask_copy(data->affinity, mask);
156 case IRQ_SET_MASK_OK_NOCOPY:
157 irq_set_thread_affinity(desc);
158 ret = 0;
159 }
160 } else {
161 irqd_set_move_pending(data);
162 irq_copy_pending(desc, mask);
163 }
164
165 if (desc->affinity_notify) {
166 kref_get(&desc->affinity_notify->kref);
167 schedule_work(&desc->affinity_notify->work);
168 }
169 irqd_set(data, IRQD_AFFINITY_SET);
170
171 return ret;
172}
173
103/** 174/**
104 * irq_set_affinity - Set the irq affinity of a given irq 175 * irq_set_affinity - Set the irq affinity of a given irq
105 * @irq: Interrupt to set affinity 176 * @irq: Interrupt to set affinity
106 * @cpumask: cpumask 177 * @mask: cpumask
107 * 178 *
108 */ 179 */
109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) 180int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
110{ 181{
111 struct irq_desc *desc = irq_to_desc(irq); 182 struct irq_desc *desc = irq_to_desc(irq);
112 struct irq_chip *chip = desc->irq_data.chip;
113 unsigned long flags; 183 unsigned long flags;
184 int ret;
114 185
115 if (!chip->irq_set_affinity) 186 if (!desc)
116 return -EINVAL; 187 return -EINVAL;
117 188
118 raw_spin_lock_irqsave(&desc->lock, flags); 189 raw_spin_lock_irqsave(&desc->lock, flags);
119 190 ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask);
120#ifdef CONFIG_GENERIC_PENDING_IRQ
121 if (desc->status & IRQ_MOVE_PCNTXT) {
122 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
123 cpumask_copy(desc->irq_data.affinity, cpumask);
124 irq_set_thread_affinity(desc);
125 }
126 }
127 else {
128 desc->status |= IRQ_MOVE_PENDING;
129 cpumask_copy(desc->pending_mask, cpumask);
130 }
131#else
132 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
133 cpumask_copy(desc->irq_data.affinity, cpumask);
134 irq_set_thread_affinity(desc);
135 }
136#endif
137 desc->status |= IRQ_AFFINITY_SET;
138 raw_spin_unlock_irqrestore(&desc->lock, flags); 191 raw_spin_unlock_irqrestore(&desc->lock, flags);
139 return 0; 192 return ret;
140} 193}
141 194
142int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) 195int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
143{ 196{
197 unsigned long flags;
198 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
199
200 if (!desc)
201 return -EINVAL;
202 desc->affinity_hint = m;
203 irq_put_desc_unlock(desc, flags);
204 return 0;
205}
206EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
207
208static void irq_affinity_notify(struct work_struct *work)
209{
210 struct irq_affinity_notify *notify =
211 container_of(work, struct irq_affinity_notify, work);
212 struct irq_desc *desc = irq_to_desc(notify->irq);
213 cpumask_var_t cpumask;
214 unsigned long flags;
215
216 if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
217 goto out;
218
219 raw_spin_lock_irqsave(&desc->lock, flags);
220 if (irq_move_pending(&desc->irq_data))
221 irq_get_pending(cpumask, desc);
222 else
223 cpumask_copy(cpumask, desc->irq_data.affinity);
224 raw_spin_unlock_irqrestore(&desc->lock, flags);
225
226 notify->notify(notify, cpumask);
227
228 free_cpumask_var(cpumask);
229out:
230 kref_put(&notify->kref, notify->release);
231}
232
233/**
234 * irq_set_affinity_notifier - control notification of IRQ affinity changes
235 * @irq: Interrupt for which to enable/disable notification
236 * @notify: Context for notification, or %NULL to disable
237 * notification. Function pointers must be initialised;
238 * the other fields will be initialised by this function.
239 *
240 * Must be called in process context. Notification may only be enabled
241 * after the IRQ is allocated and must be disabled before the IRQ is
242 * freed using free_irq().
243 */
244int
245irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
246{
144 struct irq_desc *desc = irq_to_desc(irq); 247 struct irq_desc *desc = irq_to_desc(irq);
248 struct irq_affinity_notify *old_notify;
145 unsigned long flags; 249 unsigned long flags;
146 250
251 /* The release function is promised process context */
252 might_sleep();
253
147 if (!desc) 254 if (!desc)
148 return -EINVAL; 255 return -EINVAL;
149 256
257 /* Complete initialisation of *notify */
258 if (notify) {
259 notify->irq = irq;
260 kref_init(&notify->kref);
261 INIT_WORK(&notify->work, irq_affinity_notify);
262 }
263
150 raw_spin_lock_irqsave(&desc->lock, flags); 264 raw_spin_lock_irqsave(&desc->lock, flags);
151 desc->affinity_hint = m; 265 old_notify = desc->affinity_notify;
266 desc->affinity_notify = notify;
152 raw_spin_unlock_irqrestore(&desc->lock, flags); 267 raw_spin_unlock_irqrestore(&desc->lock, flags);
153 268
269 if (old_notify)
270 kref_put(&old_notify->kref, old_notify->release);
271
154 return 0; 272 return 0;
155} 273}
156EXPORT_SYMBOL_GPL(irq_set_affinity_hint); 274EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
157 275
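
A driver-side sketch of how the new notifier interface above is meant to be consumed. The struct my_channel, the callback bodies and my_channel_setup_notify() are assumptions for illustration; only ->notify and ->release need to be filled in before calling irq_set_affinity_notifier(), the rest is initialised by the core.

#include <linux/interrupt.h>
#include <linux/kernel.h>

struct my_channel {
	struct irq_affinity_notify	notify;
	unsigned int			irq;
};

static void my_affinity_notify(struct irq_affinity_notify *notify,
			       const cpumask_t *mask)
{
	struct my_channel *ch = container_of(notify, struct my_channel, notify);

	/* re-target per-channel resources (queues, timers) to 'mask' */
	(void)ch;
}

static void my_affinity_release(struct kref *ref)
{
	/* last reference dropped, runs in process context; nothing to free here */
}

static int my_channel_setup_notify(struct my_channel *ch)
{
	ch->notify.notify  = my_affinity_notify;
	ch->notify.release = my_affinity_release;

	return irq_set_affinity_notifier(ch->irq, &ch->notify);
}
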
158#ifndef CONFIG_AUTO_IRQ_AFFINITY 276#ifndef CONFIG_AUTO_IRQ_AFFINITY
159/* 277/*
160 * Generic version of the affinity autoselector. 278 * Generic version of the affinity autoselector.
161 */ 279 */
162static int setup_affinity(unsigned int irq, struct irq_desc *desc) 280static int
281setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
163{ 282{
283 struct irq_chip *chip = irq_desc_get_chip(desc);
284 struct cpumask *set = irq_default_affinity;
285 int ret;
286
287 /* Excludes PER_CPU and NO_BALANCE interrupts */
164 if (!irq_can_set_affinity(irq)) 288 if (!irq_can_set_affinity(irq))
165 return 0; 289 return 0;
166 290
@@ -168,22 +292,27 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
168 * Preserve an userspace affinity setup, but make sure that 292 * Preserve an userspace affinity setup, but make sure that
169 * one of the targets is online. 293 * one of the targets is online.
170 */ 294 */
171 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 295 if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
172 if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) 296 if (cpumask_intersects(desc->irq_data.affinity,
173 < nr_cpu_ids) 297 cpu_online_mask))
174 goto set_affinity; 298 set = desc->irq_data.affinity;
175 else 299 else
176 desc->status &= ~IRQ_AFFINITY_SET; 300 irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
177 } 301 }
178 302
179 cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); 303 cpumask_and(mask, cpu_online_mask, set);
180set_affinity: 304 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
181 desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); 305 switch (ret) {
182 306 case IRQ_SET_MASK_OK:
307 cpumask_copy(desc->irq_data.affinity, mask);
308 case IRQ_SET_MASK_OK_NOCOPY:
309 irq_set_thread_affinity(desc);
310 }
183 return 0; 311 return 0;
184} 312}
185#else 313#else
186static inline int setup_affinity(unsigned int irq, struct irq_desc *d) 314static inline int
315setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask)
187{ 316{
188 return irq_select_affinity(irq); 317 return irq_select_affinity(irq);
189} 318}
@@ -192,23 +321,21 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
192/* 321/*
193 * Called when affinity is set via /proc/irq 322 * Called when affinity is set via /proc/irq
194 */ 323 */
195int irq_select_affinity_usr(unsigned int irq) 324int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
196{ 325{
197 struct irq_desc *desc = irq_to_desc(irq); 326 struct irq_desc *desc = irq_to_desc(irq);
198 unsigned long flags; 327 unsigned long flags;
199 int ret; 328 int ret;
200 329
201 raw_spin_lock_irqsave(&desc->lock, flags); 330 raw_spin_lock_irqsave(&desc->lock, flags);
202 ret = setup_affinity(irq, desc); 331 ret = setup_affinity(irq, desc, mask);
203 if (!ret)
204 irq_set_thread_affinity(desc);
205 raw_spin_unlock_irqrestore(&desc->lock, flags); 332 raw_spin_unlock_irqrestore(&desc->lock, flags);
206
207 return ret; 333 return ret;
208} 334}
209 335
210#else 336#else
211static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) 337static inline int
338setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
212{ 339{
213 return 0; 340 return 0;
214} 341}
@@ -219,13 +346,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
219 if (suspend) { 346 if (suspend) {
220 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) 347 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
221 return; 348 return;
222 desc->status |= IRQ_SUSPENDED; 349 desc->istate |= IRQS_SUSPENDED;
223 } 350 }
224 351
225 if (!desc->depth++) { 352 if (!desc->depth++)
226 desc->status |= IRQ_DISABLED; 353 irq_disable(desc);
227 desc->irq_data.chip->irq_disable(&desc->irq_data); 354}
228 } 355
356static int __disable_irq_nosync(unsigned int irq)
357{
358 unsigned long flags;
359 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
360
361 if (!desc)
362 return -EINVAL;
363 __disable_irq(desc, irq, false);
364 irq_put_desc_busunlock(desc, flags);
365 return 0;
229} 366}
230 367
231/** 368/**
@@ -241,17 +378,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
241 */ 378 */
242void disable_irq_nosync(unsigned int irq) 379void disable_irq_nosync(unsigned int irq)
243{ 380{
244 struct irq_desc *desc = irq_to_desc(irq); 381 __disable_irq_nosync(irq);
245 unsigned long flags;
246
247 if (!desc)
248 return;
249
250 chip_bus_lock(desc);
251 raw_spin_lock_irqsave(&desc->lock, flags);
252 __disable_irq(desc, irq, false);
253 raw_spin_unlock_irqrestore(&desc->lock, flags);
254 chip_bus_sync_unlock(desc);
255} 382}
256EXPORT_SYMBOL(disable_irq_nosync); 383EXPORT_SYMBOL(disable_irq_nosync);
257 384
@@ -269,21 +396,24 @@ EXPORT_SYMBOL(disable_irq_nosync);
269 */ 396 */
270void disable_irq(unsigned int irq) 397void disable_irq(unsigned int irq)
271{ 398{
272 struct irq_desc *desc = irq_to_desc(irq); 399 if (!__disable_irq_nosync(irq))
273
274 if (!desc)
275 return;
276
277 disable_irq_nosync(irq);
278 if (desc->action)
279 synchronize_irq(irq); 400 synchronize_irq(irq);
280} 401}
281EXPORT_SYMBOL(disable_irq); 402EXPORT_SYMBOL(disable_irq);
282 403
283void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) 404void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
284{ 405{
285 if (resume) 406 if (resume) {
286 desc->status &= ~IRQ_SUSPENDED; 407 if (!(desc->istate & IRQS_SUSPENDED)) {
408 if (!desc->action)
409 return;
410 if (!(desc->action->flags & IRQF_FORCE_RESUME))
411 return;
412 /* Pretend that it got disabled ! */
413 desc->depth++;
414 }
415 desc->istate &= ~IRQS_SUSPENDED;
416 }
287 417
288 switch (desc->depth) { 418 switch (desc->depth) {
289 case 0: 419 case 0:
@@ -291,12 +421,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
291 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); 421 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
292 break; 422 break;
293 case 1: { 423 case 1: {
294 unsigned int status = desc->status & ~IRQ_DISABLED; 424 if (desc->istate & IRQS_SUSPENDED)
295
296 if (desc->status & IRQ_SUSPENDED)
297 goto err_out; 425 goto err_out;
298 /* Prevent probing on this irq: */ 426 /* Prevent probing on this irq: */
299 desc->status = status | IRQ_NOPROBE; 427 irq_settings_set_noprobe(desc);
428 irq_enable(desc);
300 check_irq_resend(desc, irq); 429 check_irq_resend(desc, irq);
301 /* fall-through */ 430 /* fall-through */
302 } 431 }
@@ -318,21 +447,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
318 */ 447 */
319void enable_irq(unsigned int irq) 448void enable_irq(unsigned int irq)
320{ 449{
321 struct irq_desc *desc = irq_to_desc(irq);
322 unsigned long flags; 450 unsigned long flags;
451 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
323 452
324 if (!desc) 453 if (!desc)
325 return; 454 return;
455 if (WARN(!desc->irq_data.chip,
456 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
457 goto out;
326 458
327 if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable,
328 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
329 return;
330
331 chip_bus_lock(desc);
332 raw_spin_lock_irqsave(&desc->lock, flags);
333 __enable_irq(desc, irq, false); 459 __enable_irq(desc, irq, false);
334 raw_spin_unlock_irqrestore(&desc->lock, flags); 460out:
335 chip_bus_sync_unlock(desc); 461 irq_put_desc_busunlock(desc, flags);
336} 462}
337EXPORT_SYMBOL(enable_irq); 463EXPORT_SYMBOL(enable_irq);
338 464
@@ -348,7 +474,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
348} 474}
349 475
350/** 476/**
351 * set_irq_wake - control irq power management wakeup 477 * irq_set_irq_wake - control irq power management wakeup
352 * @irq: interrupt to control 478 * @irq: interrupt to control
353 * @on: enable/disable power management wakeup 479 * @on: enable/disable power management wakeup
354 * 480 *
@@ -359,23 +485,22 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
359 * Wakeup mode lets this IRQ wake the system from sleep 485 * Wakeup mode lets this IRQ wake the system from sleep
360 * states like "suspend to RAM". 486 * states like "suspend to RAM".
361 */ 487 */
362int set_irq_wake(unsigned int irq, unsigned int on) 488int irq_set_irq_wake(unsigned int irq, unsigned int on)
363{ 489{
364 struct irq_desc *desc = irq_to_desc(irq);
365 unsigned long flags; 490 unsigned long flags;
491 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
366 int ret = 0; 492 int ret = 0;
367 493
368 /* wakeup-capable irqs can be shared between drivers that 494 /* wakeup-capable irqs can be shared between drivers that
369 * don't need to have the same sleep mode behaviors. 495 * don't need to have the same sleep mode behaviors.
370 */ 496 */
371 raw_spin_lock_irqsave(&desc->lock, flags);
372 if (on) { 497 if (on) {
373 if (desc->wake_depth++ == 0) { 498 if (desc->wake_depth++ == 0) {
374 ret = set_irq_wake_real(irq, on); 499 ret = set_irq_wake_real(irq, on);
375 if (ret) 500 if (ret)
376 desc->wake_depth = 0; 501 desc->wake_depth = 0;
377 else 502 else
378 desc->status |= IRQ_WAKEUP; 503 irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
379 } 504 }
380 } else { 505 } else {
381 if (desc->wake_depth == 0) { 506 if (desc->wake_depth == 0) {
@@ -385,14 +510,13 @@ int set_irq_wake(unsigned int irq, unsigned int on)
385 if (ret) 510 if (ret)
386 desc->wake_depth = 1; 511 desc->wake_depth = 1;
387 else 512 else
388 desc->status &= ~IRQ_WAKEUP; 513 irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
389 } 514 }
390 } 515 }
391 516 irq_put_desc_busunlock(desc, flags);
392 raw_spin_unlock_irqrestore(&desc->lock, flags);
393 return ret; 517 return ret;
394} 518}
395EXPORT_SYMBOL(set_irq_wake); 519EXPORT_SYMBOL(irq_set_irq_wake);
396 520
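
As a usage sketch, the common pattern that sits on top of irq_set_irq_wake() via the enable_irq_wake()/disable_irq_wake() wrappers; struct my_dev and the dev_pm callbacks are assumptions here, not something this patch introduces:

#include <linux/device.h>
#include <linux/interrupt.h>

struct my_dev {
	unsigned int irq;
};

static int my_suspend(struct device *dev)
{
	struct my_dev *md = dev_get_drvdata(dev);

	if (device_may_wakeup(dev))
		enable_irq_wake(md->irq);	/* irq_set_irq_wake(irq, 1) */
	return 0;
}

static int my_resume(struct device *dev)
{
	struct my_dev *md = dev_get_drvdata(dev);

	if (device_may_wakeup(dev))
		disable_irq_wake(md->irq);	/* irq_set_irq_wake(irq, 0) */
	return 0;
}
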
397/* 521/*
398 * Internal function that tells the architecture code whether a 522 * Internal function that tells the architecture code whether a
@@ -401,43 +525,27 @@ EXPORT_SYMBOL(set_irq_wake);
401 */ 525 */
402int can_request_irq(unsigned int irq, unsigned long irqflags) 526int can_request_irq(unsigned int irq, unsigned long irqflags)
403{ 527{
404 struct irq_desc *desc = irq_to_desc(irq);
405 struct irqaction *action;
406 unsigned long flags; 528 unsigned long flags;
529 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
530 int canrequest = 0;
407 531
408 if (!desc) 532 if (!desc)
409 return 0; 533 return 0;
410 534
411 if (desc->status & IRQ_NOREQUEST) 535 if (irq_settings_can_request(desc)) {
412 return 0; 536 if (desc->action)
413 537 if (irqflags & desc->action->flags & IRQF_SHARED)
414 raw_spin_lock_irqsave(&desc->lock, flags); 538 canrequest = 1;
415 action = desc->action; 539 }
416 if (action) 540 irq_put_desc_unlock(desc, flags);
417 if (irqflags & action->flags & IRQF_SHARED) 541 return canrequest;
418 action = NULL;
419
420 raw_spin_unlock_irqrestore(&desc->lock, flags);
421
422 return !action;
423}
424
425void compat_irq_chip_set_default_handler(struct irq_desc *desc)
426{
427 /*
428 * If the architecture still has not overriden
429 * the flow handler then zap the default. This
430 * should catch incorrect flow-type setting.
431 */
432 if (desc->handle_irq == &handle_bad_irq)
433 desc->handle_irq = NULL;
434} 542}
435 543
436int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 544int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
437 unsigned long flags) 545 unsigned long flags)
438{ 546{
439 int ret;
440 struct irq_chip *chip = desc->irq_data.chip; 547 struct irq_chip *chip = desc->irq_data.chip;
548 int ret, unmask = 0;
441 549
442 if (!chip || !chip->irq_set_type) { 550 if (!chip || !chip->irq_set_type) {
443 /* 551 /*
@@ -449,23 +557,41 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
449 return 0; 557 return 0;
450 } 558 }
451 559
560 flags &= IRQ_TYPE_SENSE_MASK;
561
562 if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
563 if (!irqd_irq_masked(&desc->irq_data))
564 mask_irq(desc);
565 if (!irqd_irq_disabled(&desc->irq_data))
566 unmask = 1;
567 }
568
452 /* caller masked out all except trigger mode flags */ 569 /* caller masked out all except trigger mode flags */
453 ret = chip->irq_set_type(&desc->irq_data, flags); 570 ret = chip->irq_set_type(&desc->irq_data, flags);
454 571
455 if (ret) 572 switch (ret) {
573 case IRQ_SET_MASK_OK:
574 irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
575 irqd_set(&desc->irq_data, flags);
576
577 case IRQ_SET_MASK_OK_NOCOPY:
578 flags = irqd_get_trigger_type(&desc->irq_data);
579 irq_settings_set_trigger_mask(desc, flags);
580 irqd_clear(&desc->irq_data, IRQD_LEVEL);
581 irq_settings_clr_level(desc);
582 if (flags & IRQ_TYPE_LEVEL_MASK) {
583 irq_settings_set_level(desc);
584 irqd_set(&desc->irq_data, IRQD_LEVEL);
585 }
586
587 ret = 0;
588 break;
589 default:
456 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", 590 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
457 flags, irq, chip->irq_set_type); 591 flags, irq, chip->irq_set_type);
458 else {
459 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
460 flags |= IRQ_LEVEL;
461 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
462 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
463 desc->status |= flags;
464
465 if (chip != desc->irq_data.chip)
466 irq_chip_set_defaults(desc->irq_data.chip);
467 } 592 }
468 593 if (unmask)
594 unmask_irq(desc);
469 return ret; 595 return ret;
470} 596}
471 597
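
To illustrate the contract the switch above establishes, a hedged sketch of a chip's ->irq_set_type() callback; my_chip_set_type() and the register write are hypothetical. Returning IRQ_SET_MASK_OK tells the core to copy the new type into irq_data, IRQ_SET_MASK_OK_NOCOPY says the callback already did.

#include <linux/irq.h>

static int my_chip_set_type(struct irq_data *d, unsigned int type)
{
	switch (type) {
	case IRQ_TYPE_EDGE_RISING:
	case IRQ_TYPE_EDGE_FALLING:
	case IRQ_TYPE_LEVEL_HIGH:
		/* program the trigger-mode register for d->irq here */
		break;
	default:
		return -EINVAL;		/* core prints the failure warning */
	}

	/* let __irq_set_trigger() update irq_data and the level settings */
	return IRQ_SET_MASK_OK;
}
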
@@ -509,8 +635,11 @@ static int irq_wait_for_interrupt(struct irqaction *action)
509 * handler finished. unmask if the interrupt has not been disabled and 635 * handler finished. unmask if the interrupt has not been disabled and
510 * is marked MASKED. 636 * is marked MASKED.
511 */ 637 */
512static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 638static void irq_finalize_oneshot(struct irq_desc *desc,
639 struct irqaction *action, bool force)
513{ 640{
641 if (!(desc->istate & IRQS_ONESHOT))
642 return;
514again: 643again:
515 chip_bus_lock(desc); 644 chip_bus_lock(desc);
516 raw_spin_lock_irq(&desc->lock); 645 raw_spin_lock_irq(&desc->lock);
@@ -522,26 +651,42 @@ again:
522 * The thread is faster done than the hard interrupt handler 651 * The thread is faster done than the hard interrupt handler
523 * on the other CPU. If we unmask the irq line then the 652 * on the other CPU. If we unmask the irq line then the
524 * interrupt can come in again and masks the line, leaves due 653 * interrupt can come in again and masks the line, leaves due
525 * to IRQ_INPROGRESS and the irq line is masked forever. 654 * to IRQS_INPROGRESS and the irq line is masked forever.
655 *
656 * This also serializes the state of shared oneshot handlers
657 * versus "desc->threads_oneshot |= action->thread_mask;" in
658 * irq_wake_thread(). See the comment there which explains the
659 * serialization.
526 */ 660 */
527 if (unlikely(desc->status & IRQ_INPROGRESS)) { 661 if (unlikely(irqd_irq_inprogress(&desc->irq_data))) {
528 raw_spin_unlock_irq(&desc->lock); 662 raw_spin_unlock_irq(&desc->lock);
529 chip_bus_sync_unlock(desc); 663 chip_bus_sync_unlock(desc);
530 cpu_relax(); 664 cpu_relax();
531 goto again; 665 goto again;
532 } 666 }
533 667
534 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 668 /*
535 desc->status &= ~IRQ_MASKED; 669 * Now check again, whether the thread should run. Otherwise
536 desc->irq_data.chip->irq_unmask(&desc->irq_data); 670 * we would clear the threads_oneshot bit of this thread which
537 } 671 * was just set.
672 */
673 if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
674 goto out_unlock;
675
676 desc->threads_oneshot &= ~action->thread_mask;
677
678 if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) &&
679 irqd_irq_masked(&desc->irq_data))
680 unmask_irq(desc);
681
682out_unlock:
538 raw_spin_unlock_irq(&desc->lock); 683 raw_spin_unlock_irq(&desc->lock);
539 chip_bus_sync_unlock(desc); 684 chip_bus_sync_unlock(desc);
540} 685}
541 686
542#ifdef CONFIG_SMP 687#ifdef CONFIG_SMP
543/* 688/*
544 * Check whether we need to change the affinity of the interrupt thread. 689
545 */ 690 */
546static void 691static void
547irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) 692irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
@@ -573,6 +718,32 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
573#endif 718#endif
574 719
575/* 720/*
721 * Interrupts which are not explicitly requested as threaded
722 * interrupts rely on the implicit bh/preempt disable of the hard irq
723 * context. So we need to disable bh here to avoid deadlocks and other
724 * side effects.
725 */
726static void
727irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
728{
729 local_bh_disable();
730 action->thread_fn(action->irq, action->dev_id);
731 irq_finalize_oneshot(desc, action, false);
732 local_bh_enable();
733}
734
735/*
736 * Interrupts explicitely requested as threaded interupts want to be
737 * preemtible - many of them need to sleep and wait for slow busses to
738 * complete.
739 */
740static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
741{
742 action->thread_fn(action->irq, action->dev_id);
743 irq_finalize_oneshot(desc, action, false);
744}
745
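
The two thread functions above back the usual request_threaded_irq() pattern; a hedged consumer-side sketch follows, where struct my_dev, my_irq_pending() and the flag choice are assumptions:

#include <linux/interrupt.h>

struct my_dev {
	void __iomem *regs;		/* placeholder for real hardware state */
};

static bool my_irq_pending(struct my_dev *md)
{
	return true;			/* stand-in for a status register read */
}

static irqreturn_t my_hardirq(int irq, void *dev_id)
{
	struct my_dev *md = dev_id;

	if (!my_irq_pending(md))
		return IRQ_NONE;	/* not ours on a shared line */
	return IRQ_WAKE_THREAD;		/* run my_thread_fn() in process context */
}

static irqreturn_t my_thread_fn(int irq, void *dev_id)
{
	/* may sleep here, e.g. talk to the device over I2C or SPI */
	return IRQ_HANDLED;
}

static int my_request(struct my_dev *md, unsigned int irq)
{
	/* IRQF_ONESHOT keeps the line masked until my_thread_fn() returns */
	return request_threaded_irq(irq, my_hardirq, my_thread_fn,
				    IRQF_ONESHOT, "my-dev", md);
}
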
746/*
576 * Interrupt handler thread 747 * Interrupt handler thread
577 */ 748 */
578static int irq_thread(void *data) 749static int irq_thread(void *data)
@@ -582,7 +753,14 @@ static int irq_thread(void *data)
582 }; 753 };
583 struct irqaction *action = data; 754 struct irqaction *action = data;
584 struct irq_desc *desc = irq_to_desc(action->irq); 755 struct irq_desc *desc = irq_to_desc(action->irq);
585 int wake, oneshot = desc->status & IRQ_ONESHOT; 756 void (*handler_fn)(struct irq_desc *desc, struct irqaction *action);
757 int wake;
758
759 if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
760 &action->thread_flags))
761 handler_fn = irq_forced_thread_fn;
762 else
763 handler_fn = irq_thread_fn;
586 764
587 sched_setscheduler(current, SCHED_FIFO, &param); 765 sched_setscheduler(current, SCHED_FIFO, &param);
588 current->irqaction = action; 766 current->irqaction = action;
@@ -594,23 +772,19 @@ static int irq_thread(void *data)
594 atomic_inc(&desc->threads_active); 772 atomic_inc(&desc->threads_active);
595 773
596 raw_spin_lock_irq(&desc->lock); 774 raw_spin_lock_irq(&desc->lock);
597 if (unlikely(desc->status & IRQ_DISABLED)) { 775 if (unlikely(irqd_irq_disabled(&desc->irq_data))) {
598 /* 776 /*
599 * CHECKME: We might need a dedicated 777 * CHECKME: We might need a dedicated
600 * IRQ_THREAD_PENDING flag here, which 778 * IRQ_THREAD_PENDING flag here, which
601 * retriggers the thread in check_irq_resend() 779 * retriggers the thread in check_irq_resend()
602 * but AFAICT IRQ_PENDING should be fine as it 780 * but AFAICT IRQS_PENDING should be fine as it
603 * retriggers the interrupt itself --- tglx 781 * retriggers the interrupt itself --- tglx
604 */ 782 */
605 desc->status |= IRQ_PENDING; 783 desc->istate |= IRQS_PENDING;
606 raw_spin_unlock_irq(&desc->lock); 784 raw_spin_unlock_irq(&desc->lock);
607 } else { 785 } else {
608 raw_spin_unlock_irq(&desc->lock); 786 raw_spin_unlock_irq(&desc->lock);
609 787 handler_fn(desc, action);
610 action->thread_fn(action->irq, action->dev_id);
611
612 if (oneshot)
613 irq_finalize_oneshot(action->irq, desc);
614 } 788 }
615 789
616 wake = atomic_dec_and_test(&desc->threads_active); 790 wake = atomic_dec_and_test(&desc->threads_active);
@@ -619,6 +793,9 @@ static int irq_thread(void *data)
619 wake_up(&desc->wait_for_threads); 793 wake_up(&desc->wait_for_threads);
620 } 794 }
621 795
796 /* Prevent a stale desc->threads_oneshot */
797 irq_finalize_oneshot(desc, action, true);
798
622 /* 799 /*
623 * Clear irqaction. Otherwise exit_irq_thread() would make 800 * Clear irqaction. Otherwise exit_irq_thread() would make
624 * fuzz about an active irq thread going into nirvana. 801 * fuzz about an active irq thread going into nirvana.
@@ -633,6 +810,7 @@ static int irq_thread(void *data)
633void exit_irq_thread(void) 810void exit_irq_thread(void)
634{ 811{
635 struct task_struct *tsk = current; 812 struct task_struct *tsk = current;
813 struct irq_desc *desc;
636 814
637 if (!tsk->irqaction) 815 if (!tsk->irqaction)
638 return; 816 return;
@@ -641,6 +819,14 @@ void exit_irq_thread(void)
641 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 819 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
642 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); 820 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
643 821
822 desc = irq_to_desc(tsk->irqaction->irq);
823
824 /*
825 * Prevent a stale desc->threads_oneshot. Must be called
826 * before setting the IRQTF_DIED flag.
827 */
828 irq_finalize_oneshot(desc, tsk->irqaction, true);
829
644 /* 830 /*
645 * Set the THREAD DIED flag to prevent further wakeups of the 831 * Set the THREAD DIED flag to prevent further wakeups of the
646 * soon to be gone threaded handler. 832 * soon to be gone threaded handler.
@@ -648,6 +834,22 @@ void exit_irq_thread(void)
648 set_bit(IRQTF_DIED, &tsk->irqaction->flags); 834 set_bit(IRQTF_DIED, &tsk->irqaction->flags);
649} 835}
650 836
837static void irq_setup_forced_threading(struct irqaction *new)
838{
839 if (!force_irqthreads)
840 return;
841 if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
842 return;
843
844 new->flags |= IRQF_ONESHOT;
845
846 if (!new->thread_fn) {
847 set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
848 new->thread_fn = new->handler;
849 new->handler = irq_default_primary_handler;
850 }
851}
852
651/* 853/*
652 * Internal function to register an irqaction - typically used to 854 * Internal function to register an irqaction - typically used to
653 * allocate special interrupts that are part of the architecture. 855 * allocate special interrupts that are part of the architecture.
@@ -657,9 +859,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
657{ 859{
658 struct irqaction *old, **old_ptr; 860 struct irqaction *old, **old_ptr;
659 const char *old_name = NULL; 861 const char *old_name = NULL;
660 unsigned long flags; 862 unsigned long flags, thread_mask = 0;
661 int nested, shared = 0; 863 int ret, nested, shared = 0;
662 int ret; 864 cpumask_var_t mask;
663 865
664 if (!desc) 866 if (!desc)
665 return -EINVAL; 867 return -EINVAL;
@@ -683,15 +885,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
683 rand_initialize_irq(irq); 885 rand_initialize_irq(irq);
684 } 886 }
685 887
686 /* Oneshot interrupts are not allowed with shared */
687 if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
688 return -EINVAL;
689
690 /* 888 /*
691 * Check whether the interrupt nests into another interrupt 889 * Check whether the interrupt nests into another interrupt
692 * thread. 890 * thread.
693 */ 891 */
694 nested = desc->status & IRQ_NESTED_THREAD; 892 nested = irq_settings_is_nested_thread(desc);
695 if (nested) { 893 if (nested) {
696 if (!new->thread_fn) 894 if (!new->thread_fn)
697 return -EINVAL; 895 return -EINVAL;
@@ -701,6 +899,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
701 * dummy function which warns when called. 899 * dummy function which warns when called.
702 */ 900 */
703 new->handler = irq_nested_primary_handler; 901 new->handler = irq_nested_primary_handler;
902 } else {
903 irq_setup_forced_threading(new);
704 } 904 }
705 905
706 /* 906 /*
@@ -724,6 +924,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
724 new->thread = t; 924 new->thread = t;
725 } 925 }
726 926
927 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
928 ret = -ENOMEM;
929 goto out_thread;
930 }
931
727 /* 932 /*
728 * The following block of code has to be executed atomically 933 * The following block of code has to be executed atomically
729 */ 934 */
@@ -735,32 +940,41 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
735 * Can't share interrupts unless both agree to and are 940 * Can't share interrupts unless both agree to and are
736 * the same type (level, edge, polarity). So both flag 941 * the same type (level, edge, polarity). So both flag
737 * fields must have IRQF_SHARED set and the bits which 942 * fields must have IRQF_SHARED set and the bits which
738 * set the trigger type must match. 943 * set the trigger type must match. Also all must
944 * agree on ONESHOT.
739 */ 945 */
740 if (!((old->flags & new->flags) & IRQF_SHARED) || 946 if (!((old->flags & new->flags) & IRQF_SHARED) ||
741 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { 947 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
948 ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
742 old_name = old->name; 949 old_name = old->name;
743 goto mismatch; 950 goto mismatch;
744 } 951 }
745 952
746#if defined(CONFIG_IRQ_PER_CPU)
747 /* All handlers must agree on per-cpuness */ 953 /* All handlers must agree on per-cpuness */
748 if ((old->flags & IRQF_PERCPU) != 954 if ((old->flags & IRQF_PERCPU) !=
749 (new->flags & IRQF_PERCPU)) 955 (new->flags & IRQF_PERCPU))
750 goto mismatch; 956 goto mismatch;
751#endif
752 957
753 /* add new interrupt at end of irq queue */ 958 /* add new interrupt at end of irq queue */
754 do { 959 do {
960 thread_mask |= old->thread_mask;
755 old_ptr = &old->next; 961 old_ptr = &old->next;
756 old = *old_ptr; 962 old = *old_ptr;
757 } while (old); 963 } while (old);
758 shared = 1; 964 shared = 1;
759 } 965 }
760 966
761 if (!shared) { 967 /*
762 irq_chip_set_defaults(desc->irq_data.chip); 968 * Setup the thread mask for this irqaction. Unlikely to have
969 * 32 resp 64 irqs sharing one line, but who knows.
970 */
971 if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) {
972 ret = -EBUSY;
973 goto out_mask;
974 }
975 new->thread_mask = 1 << ffz(thread_mask);
763 976
977 if (!shared) {
764 init_waitqueue_head(&desc->wait_for_threads); 978 init_waitqueue_head(&desc->wait_for_threads);
765 979
766 /* Setup the type (level, edge polarity) if configured: */ 980 /* Setup the type (level, edge polarity) if configured: */
@@ -769,42 +983,44 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
769 new->flags & IRQF_TRIGGER_MASK); 983 new->flags & IRQF_TRIGGER_MASK);
770 984
771 if (ret) 985 if (ret)
772 goto out_thread; 986 goto out_mask;
773 } else 987 }
774 compat_irq_chip_set_default_handler(desc); 988
775#if defined(CONFIG_IRQ_PER_CPU) 989 desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
776 if (new->flags & IRQF_PERCPU) 990 IRQS_ONESHOT | IRQS_WAITING);
777 desc->status |= IRQ_PER_CPU; 991 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
778#endif
779 992
780 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | 993 if (new->flags & IRQF_PERCPU) {
781 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 994 irqd_set(&desc->irq_data, IRQD_PER_CPU);
995 irq_settings_set_per_cpu(desc);
996 }
782 997
783 if (new->flags & IRQF_ONESHOT) 998 if (new->flags & IRQF_ONESHOT)
784 desc->status |= IRQ_ONESHOT; 999 desc->istate |= IRQS_ONESHOT;
785 1000
786 if (!(desc->status & IRQ_NOAUTOEN)) { 1001 if (irq_settings_can_autoenable(desc))
787 desc->depth = 0; 1002 irq_startup(desc);
788 desc->status &= ~IRQ_DISABLED; 1003 else
789 desc->irq_data.chip->irq_startup(&desc->irq_data);
790 } else
791 /* Undo nested disables: */ 1004 /* Undo nested disables: */
792 desc->depth = 1; 1005 desc->depth = 1;
793 1006
794 /* Exclude IRQ from balancing if requested */ 1007 /* Exclude IRQ from balancing if requested */
795 if (new->flags & IRQF_NOBALANCING) 1008 if (new->flags & IRQF_NOBALANCING) {
796 desc->status |= IRQ_NO_BALANCING; 1009 irq_settings_set_no_balancing(desc);
1010 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
1011 }
797 1012
798 /* Set default affinity mask once everything is setup */ 1013 /* Set default affinity mask once everything is setup */
799 setup_affinity(irq, desc); 1014 setup_affinity(irq, desc, mask);
800 1015
801 } else if ((new->flags & IRQF_TRIGGER_MASK) 1016 } else if (new->flags & IRQF_TRIGGER_MASK) {
802 && (new->flags & IRQF_TRIGGER_MASK) 1017 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
803 != (desc->status & IRQ_TYPE_SENSE_MASK)) { 1018 unsigned int omsk = irq_settings_get_trigger_mask(desc);
804 /* hope the handler works with the actual trigger mode... */ 1019
805 pr_warning("IRQ %d uses trigger mode %d; requested %d\n", 1020 if (nmsk != omsk)
806 irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), 1021 /* hope the handler works with current trigger mode */
807 (int)(new->flags & IRQF_TRIGGER_MASK)); 1022 pr_warning("IRQ %d uses trigger mode %u; requested %u\n",
1023 irq, nmsk, omsk);
808 } 1024 }
809 1025
810 new->irq = irq; 1026 new->irq = irq;
@@ -818,8 +1034,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
818 * Check whether we disabled the irq via the spurious handler 1034 * Check whether we disabled the irq via the spurious handler
819 * before. Reenable it and give it another chance. 1035 * before. Reenable it and give it another chance.
820 */ 1036 */
821 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { 1037 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
822 desc->status &= ~IRQ_SPURIOUS_DISABLED; 1038 desc->istate &= ~IRQS_SPURIOUS_DISABLED;
823 __enable_irq(desc, irq, false); 1039 __enable_irq(desc, irq, false);
824 } 1040 }
825 1041
@@ -835,6 +1051,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
835 register_irq_proc(irq, desc); 1051 register_irq_proc(irq, desc);
836 new->dir = NULL; 1052 new->dir = NULL;
837 register_handler_proc(irq, new); 1053 register_handler_proc(irq, new);
1054 free_cpumask_var(mask);
838 1055
839 return 0; 1056 return 0;
840 1057
@@ -849,8 +1066,11 @@ mismatch:
849#endif 1066#endif
850 ret = -EBUSY; 1067 ret = -EBUSY;
851 1068
852out_thread: 1069out_mask:
853 raw_spin_unlock_irqrestore(&desc->lock, flags); 1070 raw_spin_unlock_irqrestore(&desc->lock, flags);
1071 free_cpumask_var(mask);
1072
1073out_thread:
854 if (new->thread) { 1074 if (new->thread) {
855 struct task_struct *t = new->thread; 1075 struct task_struct *t = new->thread;
856 1076
@@ -871,9 +1091,14 @@ out_thread:
871 */ 1091 */
872int setup_irq(unsigned int irq, struct irqaction *act) 1092int setup_irq(unsigned int irq, struct irqaction *act)
873{ 1093{
1094 int retval;
874 struct irq_desc *desc = irq_to_desc(irq); 1095 struct irq_desc *desc = irq_to_desc(irq);
875 1096
876 return __setup_irq(irq, desc, act); 1097 chip_bus_lock(desc);
1098 retval = __setup_irq(irq, desc, act);
1099 chip_bus_sync_unlock(desc);
1100
1101 return retval;
877} 1102}
878EXPORT_SYMBOL_GPL(setup_irq); 1103EXPORT_SYMBOL_GPL(setup_irq);
879 1104
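
Before the teardown paths below, a small worked sketch of the thread_mask bookkeeping that __setup_irq() introduces above: every shared oneshot action gets the lowest clear bit of the OR of the masks already on the line. alloc_thread_bit() is an editor's illustration under that assumption, not kernel code.

#include <linux/bitops.h>

/* Mimics the ffz()-based allocation in __setup_irq(): 'line_mask' is the
 * OR of the thread_mask values of all actions already on the line. */
static unsigned long alloc_thread_bit(unsigned long *line_mask)
{
	unsigned long bit;

	if (*line_mask == ~0UL)
		return 0;		/* line full; __setup_irq() returns -EBUSY */

	bit = 1UL << ffz(*line_mask);	/* lowest clear bit: 1, 2, 4, ... */
	*line_mask |= bit;
	return bit;
}
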
@@ -924,13 +1149,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
924#endif 1149#endif
925 1150
926 /* If this was the last handler, shut down the IRQ line: */ 1151 /* If this was the last handler, shut down the IRQ line: */
927 if (!desc->action) { 1152 if (!desc->action)
928 desc->status |= IRQ_DISABLED; 1153 irq_shutdown(desc);
929 if (desc->irq_data.chip->irq_shutdown)
930 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
931 else
932 desc->irq_data.chip->irq_disable(&desc->irq_data);
933 }
934 1154
935#ifdef CONFIG_SMP 1155#ifdef CONFIG_SMP
936 /* make sure affinity_hint is cleaned up */ 1156 /* make sure affinity_hint is cleaned up */
@@ -1004,6 +1224,11 @@ void free_irq(unsigned int irq, void *dev_id)
1004 if (!desc) 1224 if (!desc)
1005 return; 1225 return;
1006 1226
1227#ifdef CONFIG_SMP
1228 if (WARN_ON(desc->affinity_notify))
1229 desc->affinity_notify = NULL;
1230#endif
1231
1007 chip_bus_lock(desc); 1232 chip_bus_lock(desc);
1008 kfree(__free_irq(irq, dev_id)); 1233 kfree(__free_irq(irq, dev_id));
1009 chip_bus_sync_unlock(desc); 1234 chip_bus_sync_unlock(desc);
@@ -1074,7 +1299,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1074 if (!desc) 1299 if (!desc)
1075 return -EINVAL; 1300 return -EINVAL;
1076 1301
1077 if (desc->status & IRQ_NOREQUEST) 1302 if (!irq_settings_can_request(desc))
1078 return -EINVAL; 1303 return -EINVAL;
1079 1304
1080 if (!handler) { 1305 if (!handler) {
@@ -1100,7 +1325,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1100 if (retval) 1325 if (retval)
1101 kfree(action); 1326 kfree(action);
1102 1327
1103#ifdef CONFIG_DEBUG_SHIRQ 1328#ifdef CONFIG_DEBUG_SHIRQ_FIXME
1104 if (!retval && (irqflags & IRQF_SHARED)) { 1329 if (!retval && (irqflags & IRQF_SHARED)) {
1105 /* 1330 /*
1106 * It's a shared IRQ -- the driver ought to be prepared for it 1331 * It's a shared IRQ -- the driver ought to be prepared for it
@@ -1149,7 +1374,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1149 if (!desc) 1374 if (!desc)
1150 return -EINVAL; 1375 return -EINVAL;
1151 1376
1152 if (desc->status & IRQ_NESTED_THREAD) { 1377 if (irq_settings_is_nested_thread(desc)) {
1153 ret = request_threaded_irq(irq, NULL, handler, 1378 ret = request_threaded_irq(irq, NULL, handler,
1154 flags, name, dev_id); 1379 flags, name, dev_id);
1155 return !ret ? IRQC_IS_NESTED : ret; 1380 return !ret ? IRQC_IS_NESTED : ret;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 441fd629ff04..47420908fba0 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,23 +4,23 @@
4 4
5#include "internals.h" 5#include "internals.h"
6 6
7void move_masked_irq(int irq) 7void irq_move_masked_irq(struct irq_data *idata)
8{ 8{
9 struct irq_desc *desc = irq_to_desc(irq); 9 struct irq_desc *desc = irq_data_to_desc(idata);
10 struct irq_chip *chip = desc->irq_data.chip; 10 struct irq_chip *chip = idata->chip;
11 11
12 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 12 if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
13 return; 13 return;
14 14
15 /* 15 /*
16 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. 16 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
17 */ 17 */
18 if (CHECK_IRQ_PER_CPU(desc->status)) { 18 if (!irqd_can_balance(&desc->irq_data)) {
19 WARN_ON(1); 19 WARN_ON(1);
20 return; 20 return;
21 } 21 }
22 22
23 desc->status &= ~IRQ_MOVE_PENDING; 23 irqd_clr_move_pending(&desc->irq_data);
24 24
25 if (unlikely(cpumask_empty(desc->pending_mask))) 25 if (unlikely(cpumask_empty(desc->pending_mask)))
26 return; 26 return;
@@ -35,7 +35,7 @@ void move_masked_irq(int irq)
35 * do the disable, re-program, enable sequence. 35 * do the disable, re-program, enable sequence.
36 * This is *not* particularly important for level triggered 36 * This is *not* particularly important for level triggered
37 * but in an edge trigger case, we might be setting rte 37 * but in an edge trigger case, we might be setting rte
38 * when an active trigger is comming in. This could 38 * when an active trigger is coming in. This could
39 * cause some ioapics to mal-function. 39 * cause some ioapics to mal-function.
40 * Being paranoid I guess! 40 * Being paranoid I guess!
41 * 41 *
@@ -53,15 +53,14 @@ void move_masked_irq(int irq)
53 cpumask_clear(desc->pending_mask); 53 cpumask_clear(desc->pending_mask);
54} 54}
55 55
56void move_native_irq(int irq) 56void irq_move_irq(struct irq_data *idata)
57{ 57{
58 struct irq_desc *desc = irq_to_desc(irq);
59 bool masked; 58 bool masked;
60 59
61 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 60 if (likely(!irqd_is_setaffinity_pending(idata)))
62 return; 61 return;
63 62
64 if (unlikely(desc->status & IRQ_DISABLED)) 63 if (unlikely(irqd_irq_disabled(idata)))
65 return; 64 return;
66 65
67 /* 66 /*
@@ -69,10 +68,10 @@ void move_native_irq(int irq)
69 * threaded interrupt with ONESHOT set, we can end up with an 68 * threaded interrupt with ONESHOT set, we can end up with an
70 * interrupt storm. 69 * interrupt storm.
71 */ 70 */
72 masked = desc->status & IRQ_MASKED; 71 masked = irqd_irq_masked(idata);
73 if (!masked) 72 if (!masked)
74 desc->irq_data.chip->irq_mask(&desc->irq_data); 73 idata->chip->irq_mask(idata);
75 move_masked_irq(irq); 74 irq_move_masked_irq(idata);
76 if (!masked) 75 if (!masked)
77 desc->irq_data.chip->irq_unmask(&desc->irq_data); 76 idata->chip->irq_unmask(idata);
78} 77}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 0d4005d85b03..f76fc00c9877 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -18,7 +18,7 @@
18 * During system-wide suspend or hibernation device drivers need to be prevented 18 * During system-wide suspend or hibernation device drivers need to be prevented
19 * from receiving interrupts and this function is provided for this purpose. 19 * from receiving interrupts and this function is provided for this purpose.
20 * It marks all interrupt lines in use, except for the timer ones, as disabled 20 * It marks all interrupt lines in use, except for the timer ones, as disabled
21 * and sets the IRQ_SUSPENDED flag for each of them. 21 * and sets the IRQS_SUSPENDED flag for each of them.
22 */ 22 */
23void suspend_device_irqs(void) 23void suspend_device_irqs(void)
24{ 24{
@@ -34,7 +34,7 @@ void suspend_device_irqs(void)
34 } 34 }
35 35
36 for_each_irq_desc(irq, desc) 36 for_each_irq_desc(irq, desc)
37 if (desc->status & IRQ_SUSPENDED) 37 if (desc->istate & IRQS_SUSPENDED)
38 synchronize_irq(irq); 38 synchronize_irq(irq);
39} 39}
40EXPORT_SYMBOL_GPL(suspend_device_irqs); 40EXPORT_SYMBOL_GPL(suspend_device_irqs);
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs);
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() 43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 * 44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that 45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQ_SUSPENDED flag set. 46 * have the IRQS_SUSPENDED flag set.
47 */ 47 */
48void resume_device_irqs(void) 48void resume_device_irqs(void)
49{ 49{
@@ -53,9 +53,6 @@ void resume_device_irqs(void)
53 for_each_irq_desc(irq, desc) { 53 for_each_irq_desc(irq, desc) {
54 unsigned long flags; 54 unsigned long flags;
55 55
56 if (!(desc->status & IRQ_SUSPENDED))
57 continue;
58
59 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true); 57 __enable_irq(desc, irq, true);
61 raw_spin_unlock_irqrestore(&desc->lock, flags); 58 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -71,9 +68,24 @@ int check_wakeup_irqs(void)
71 struct irq_desc *desc; 68 struct irq_desc *desc;
72 int irq; 69 int irq;
73 70
74 for_each_irq_desc(irq, desc) 71 for_each_irq_desc(irq, desc) {
75 if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) 72 if (irqd_is_wakeup_set(&desc->irq_data)) {
76 return -EBUSY; 73 if (desc->istate & IRQS_PENDING)
74 return -EBUSY;
75 continue;
76 }
77 /*
78 * Check the non wakeup interrupts whether they need
79 * to be masked before finally going into suspend
80 * state. That's for hardware which has no wakeup
81 * source configuration facility. The chip
82 * implementation indicates that with
83 * IRQCHIP_MASK_ON_SUSPEND.
84 */
85 if (desc->istate & IRQS_SUSPENDED &&
86 irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
87 mask_irq(desc);
88 }
77 89
78 return 0; 90 return 0;
79} 91}
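
A sketch of the chip side that check_wakeup_irqs() above caters for: a controller with no wakeup source configuration of its own announces IRQCHIP_MASK_ON_SUSPEND so that its non-wakeup interrupts get masked before entering suspend. The my_chip_* callbacks and the register comments are assumptions.

#include <linux/irq.h>

static void my_chip_mask(struct irq_data *d)
{
	/* set the mask bit for d->irq in the controller */
}

static void my_chip_unmask(struct irq_data *d)
{
	/* clear the mask bit for d->irq in the controller */
}

/* No .irq_set_wake: the hardware cannot select wakeup sources, so the
 * core masks its non-wakeup interrupts in check_wakeup_irqs() instead. */
static struct irq_chip my_irq_chip = {
	.name		= "my-chip",
	.irq_mask	= my_chip_mask,
	.irq_unmask	= my_chip_unmask,
	.flags		= IRQCHIP_MASK_ON_SUSPEND,
};
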
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6c8a2a9f8a7b..834899f2500f 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -11,6 +11,7 @@
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h>
14 15
15#include "internals.h" 16#include "internals.h"
16 17
@@ -24,7 +25,7 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
24 const struct cpumask *mask = desc->irq_data.affinity; 25 const struct cpumask *mask = desc->irq_data.affinity;
25 26
26#ifdef CONFIG_GENERIC_PENDING_IRQ 27#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING) 28 if (irqd_is_setaffinity_pending(&desc->irq_data))
28 mask = desc->pending_mask; 29 mask = desc->pending_mask;
29#endif 30#endif
30 seq_cpumask(m, mask); 31 seq_cpumask(m, mask);
@@ -65,8 +66,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
65 cpumask_var_t new_value; 66 cpumask_var_t new_value;
66 int err; 67 int err;
67 68
68 if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || 69 if (!irq_can_set_affinity(irq) || no_irq_affinity)
69 irq_balancing_disabled(irq))
70 return -EIO; 70 return -EIO;
71 71
72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
@@ -89,7 +89,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
89 if (!cpumask_intersects(new_value, cpu_online_mask)) { 89 if (!cpumask_intersects(new_value, cpu_online_mask)) {
90 /* Special case for empty set - allow the architecture 90 /* Special case for empty set - allow the architecture
91 code to set default SMP affinity. */ 91 code to set default SMP affinity. */
92 err = irq_select_affinity_usr(irq) ? -EINVAL : count; 92 err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count;
93 } else { 93 } else {
94 irq_set_affinity(irq, new_value); 94 irq_set_affinity(irq, new_value);
95 err = count; 95 err = count;
@@ -357,3 +357,83 @@ void init_irq_proc(void)
357 } 357 }
358} 358}
359 359
360#ifdef CONFIG_GENERIC_IRQ_SHOW
361
362int __weak arch_show_interrupts(struct seq_file *p, int prec)
363{
364 return 0;
365}
366
367#ifndef ACTUAL_NR_IRQS
368# define ACTUAL_NR_IRQS nr_irqs
369#endif
370
371int show_interrupts(struct seq_file *p, void *v)
372{
373 static int prec;
374
375 unsigned long flags, any_count = 0;
376 int i = *(loff_t *) v, j;
377 struct irqaction *action;
378 struct irq_desc *desc;
379
380 if (i > ACTUAL_NR_IRQS)
381 return 0;
382
383 if (i == ACTUAL_NR_IRQS)
384 return arch_show_interrupts(p, prec);
385
386 /* print header and calculate the width of the first column */
387 if (i == 0) {
388 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
389 j *= 10;
390
391 seq_printf(p, "%*s", prec + 8, "");
392 for_each_online_cpu(j)
393 seq_printf(p, "CPU%-8d", j);
394 seq_putc(p, '\n');
395 }
396
397 desc = irq_to_desc(i);
398 if (!desc)
399 return 0;
400
401 raw_spin_lock_irqsave(&desc->lock, flags);
402 for_each_online_cpu(j)
403 any_count |= kstat_irqs_cpu(i, j);
404 action = desc->action;
405 if (!action && !any_count)
406 goto out;
407
408 seq_printf(p, "%*d: ", prec, i);
409 for_each_online_cpu(j)
410 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
411
412 if (desc->irq_data.chip) {
413 if (desc->irq_data.chip->irq_print_chip)
414 desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
415 else if (desc->irq_data.chip->name)
416 seq_printf(p, " %8s", desc->irq_data.chip->name);
417 else
418 seq_printf(p, " %8s", "-");
419 } else {
420 seq_printf(p, " %8s", "None");
421 }
422#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
423 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
424#endif
425 if (desc->name)
426 seq_printf(p, "-%-8s", desc->name);
427
428 if (action) {
429 seq_printf(p, " %s", action->name);
430 while ((action = action->next) != NULL)
431 seq_printf(p, ", %s", action->name);
432 }
433
434 seq_putc(p, '\n');
435out:
436 raw_spin_unlock_irqrestore(&desc->lock, flags);
437 return 0;
438}
439#endif
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 891115a929aa..14dd5761e8c9 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -23,7 +23,7 @@
23#ifdef CONFIG_HARDIRQS_SW_RESEND 23#ifdef CONFIG_HARDIRQS_SW_RESEND
24 24
25/* Bitmap to handle software resend of interrupts: */ 25/* Bitmap to handle software resend of interrupts: */
26static DECLARE_BITMAP(irqs_resend, NR_IRQS); 26static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
27 27
28/* 28/*
29 * Run software resends of IRQ's 29 * Run software resends of IRQ's
@@ -55,20 +55,18 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
55 */ 55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq) 56void check_irq_resend(struct irq_desc *desc, unsigned int irq)
57{ 57{
58 unsigned int status = desc->status;
59
60 /*
61 * Make sure the interrupt is enabled, before resending it:
62 */
63 desc->irq_data.chip->irq_enable(&desc->irq_data);
64
65 /* 58 /*
66 * We do not resend level type interrupts. Level type 59 * We do not resend level type interrupts. Level type
67 * interrupts are resent by hardware when they are still 60 * interrupts are resent by hardware when they are still
68 * active. 61 * active.
69 */ 62 */
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 63 if (irq_settings_is_level(desc))
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 64 return;
65 if (desc->istate & IRQS_REPLAY)
66 return;
67 if (desc->istate & IRQS_PENDING) {
68 desc->istate &= ~IRQS_PENDING;
69 desc->istate |= IRQS_REPLAY;
72 70
73 if (!desc->irq_data.chip->irq_retrigger || 71 if (!desc->irq_data.chip->irq_retrigger ||
74 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { 72 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
new file mode 100644
index 000000000000..0d91730b6330
--- /dev/null
+++ b/kernel/irq/settings.h
@@ -0,0 +1,125 @@
1/*
2 * Internal header to deal with irq_desc->status which will be renamed
3 * to irq_desc->settings.
4 */
5enum {
6 _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS,
7 _IRQ_PER_CPU = IRQ_PER_CPU,
8 _IRQ_LEVEL = IRQ_LEVEL,
9 _IRQ_NOPROBE = IRQ_NOPROBE,
10 _IRQ_NOREQUEST = IRQ_NOREQUEST,
11 _IRQ_NOAUTOEN = IRQ_NOAUTOEN,
12 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
13 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
14 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
15 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
16};
17
18#define IRQ_PER_CPU GOT_YOU_MORON
19#define IRQ_NO_BALANCING GOT_YOU_MORON
20#define IRQ_LEVEL GOT_YOU_MORON
21#define IRQ_NOPROBE GOT_YOU_MORON
22#define IRQ_NOREQUEST GOT_YOU_MORON
23#define IRQ_NOAUTOEN GOT_YOU_MORON
24#define IRQ_NESTED_THREAD GOT_YOU_MORON
25#undef IRQF_MODIFY_MASK
26#define IRQF_MODIFY_MASK GOT_YOU_MORON
27
28static inline void
29irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
30{
31 desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK);
32 desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
33}
34
35static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
36{
37 return desc->status_use_accessors & _IRQ_PER_CPU;
38}
39
40static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
41{
42 desc->status_use_accessors |= _IRQ_PER_CPU;
43}
44
45static inline void irq_settings_set_no_balancing(struct irq_desc *desc)
46{
47 desc->status_use_accessors |= _IRQ_NO_BALANCING;
48}
49
50static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
51{
52 return desc->status_use_accessors & _IRQ_NO_BALANCING;
53}
54
55static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc)
56{
57 return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK;
58}
59
60static inline void
61irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask)
62{
63 desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK;
64 desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK;
65}
66
67static inline bool irq_settings_is_level(struct irq_desc *desc)
68{
69 return desc->status_use_accessors & _IRQ_LEVEL;
70}
71
72static inline void irq_settings_clr_level(struct irq_desc *desc)
73{
74 desc->status_use_accessors &= ~_IRQ_LEVEL;
75}
76
77static inline void irq_settings_set_level(struct irq_desc *desc)
78{
79 desc->status_use_accessors |= _IRQ_LEVEL;
80}
81
82static inline bool irq_settings_can_request(struct irq_desc *desc)
83{
84 return !(desc->status_use_accessors & _IRQ_NOREQUEST);
85}
86
87static inline void irq_settings_clr_norequest(struct irq_desc *desc)
88{
89 desc->status_use_accessors &= ~_IRQ_NOREQUEST;
90}
91
92static inline void irq_settings_set_norequest(struct irq_desc *desc)
93{
94 desc->status_use_accessors |= _IRQ_NOREQUEST;
95}
96
97static inline bool irq_settings_can_probe(struct irq_desc *desc)
98{
99 return !(desc->status_use_accessors & _IRQ_NOPROBE);
100}
101
102static inline void irq_settings_clr_noprobe(struct irq_desc *desc)
103{
104 desc->status_use_accessors &= ~_IRQ_NOPROBE;
105}
106
107static inline void irq_settings_set_noprobe(struct irq_desc *desc)
108{
109 desc->status_use_accessors |= _IRQ_NOPROBE;
110}
111
112static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
113{
114 return desc->status_use_accessors & _IRQ_MOVE_PCNTXT;
115}
116
117static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
118{
119 return !(desc->status_use_accessors & _IRQ_NOAUTOEN);
120}
121
122static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
123{
124 return desc->status_use_accessors & _IRQ_NESTED_THREAD;
125}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 3089d3b9d5f3..dfbd550401b2 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -21,70 +21,93 @@ static int irqfixup __read_mostly;
21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) 21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
22static void poll_spurious_irqs(unsigned long dummy); 22static void poll_spurious_irqs(unsigned long dummy);
23static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); 23static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
24static int irq_poll_cpu;
25static atomic_t irq_poll_active;
26
27/*
28 * We wait here for a poller to finish.
29 *
30 * If the poll runs on this CPU, then we yell loudly and return
31 * false. That will leave the interrupt line disabled in the worst
32 * case, but it should never happen.
33 *
34 * We wait until the poller is done and then recheck disabled and
35 * action (about to be disabled). Only if it's still active, we return
36 * true and let the handler run.
37 */
38bool irq_wait_for_poll(struct irq_desc *desc)
39{
40 if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
41 "irq poll in progress on cpu %d for irq %d\n",
42 smp_processor_id(), desc->irq_data.irq))
43 return false;
44
45#ifdef CONFIG_SMP
46 do {
47 raw_spin_unlock(&desc->lock);
48 while (irqd_irq_inprogress(&desc->irq_data))
49 cpu_relax();
50 raw_spin_lock(&desc->lock);
51 } while (irqd_irq_inprogress(&desc->irq_data));
52 /* Might have been disabled in meantime */
53 return !irqd_irq_disabled(&desc->irq_data) && desc->action;
54#else
55 return false;
56#endif
57}
58
24 59
25/* 60/*
26 * Recovery handler for misrouted interrupts. 61 * Recovery handler for misrouted interrupts.
27 */ 62 */
28static int try_one_irq(int irq, struct irq_desc *desc) 63static int try_one_irq(int irq, struct irq_desc *desc, bool force)
29{ 64{
65 irqreturn_t ret = IRQ_NONE;
30 struct irqaction *action; 66 struct irqaction *action;
31 int ok = 0, work = 0;
32 67
33 raw_spin_lock(&desc->lock); 68 raw_spin_lock(&desc->lock);
34 /* Already running on another processor */
35 if (desc->status & IRQ_INPROGRESS) {
36 /*
37 * Already running: If it is shared get the other
38 * CPU to go looking for our mystery interrupt too
39 */
40 if (desc->action && (desc->action->flags & IRQF_SHARED))
41 desc->status |= IRQ_PENDING;
42 raw_spin_unlock(&desc->lock);
43 return ok;
44 }
45 /* Honour the normal IRQ locking */
46 desc->status |= IRQ_INPROGRESS;
47 action = desc->action;
48 raw_spin_unlock(&desc->lock);
49 69
50 while (action) { 70 /* PER_CPU and nested thread interrupts are never polled */
51 /* Only shared IRQ handlers are safe to call */ 71 if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc))
52 if (action->flags & IRQF_SHARED) { 72 goto out;
53 if (action->handler(irq, action->dev_id) ==
54 IRQ_HANDLED)
55 ok = 1;
56 }
57 action = action->next;
58 }
59 local_irq_disable();
60 /* Now clean up the flags */
61 raw_spin_lock(&desc->lock);
62 action = desc->action;
63 73
64 /* 74 /*
65 * While we were looking for a fixup someone queued a real 75 * Do not poll disabled interrupts unless the spurious
66 * IRQ clashing with our walk: 76 * disabled poller asks explicitly.
67 */ 77 */
68 while ((desc->status & IRQ_PENDING) && action) { 78 if (irqd_irq_disabled(&desc->irq_data) && !force)
79 goto out;
80
81 /*
82 * All handlers must agree on IRQF_SHARED, so we test just the
83 * first. Check for action->next as well.
84 */
85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || !action->next)
88 goto out;
89
90 /* Already running on another processor */
91 if (irqd_irq_inprogress(&desc->irq_data)) {
69 /* 92 /*
70 * Perform real IRQ processing for the IRQ we deferred 93 * Already running: If it is shared get the other
94 * CPU to go looking for our mystery interrupt too
71 */ 95 */
72 work = 1; 96 desc->istate |= IRQS_PENDING;
73 raw_spin_unlock(&desc->lock); 97 goto out;
74 handle_IRQ_event(irq, action);
75 raw_spin_lock(&desc->lock);
76 desc->status &= ~IRQ_PENDING;
77 } 98 }
78 desc->status &= ~IRQ_INPROGRESS;
79 /*
80 * If we did actual work for the real IRQ line we must let the
81 * IRQ controller clean up too
82 */
83 if (work)
84 irq_end(irq, desc);
85 raw_spin_unlock(&desc->lock);
86 99
87 return ok; 100 /* Mark it poll in progress */
101 desc->istate |= IRQS_POLL_INPROGRESS;
102 do {
103 if (handle_irq_event(desc) == IRQ_HANDLED)
104 ret = IRQ_HANDLED;
105 action = desc->action;
106 } while ((desc->istate & IRQS_PENDING) && action);
107 desc->istate &= ~IRQS_POLL_INPROGRESS;
108out:
109 raw_spin_unlock(&desc->lock);
110 return ret == IRQ_HANDLED;
88} 111}
89 112
90static int misrouted_irq(int irq) 113static int misrouted_irq(int irq)
@@ -92,6 +115,11 @@ static int misrouted_irq(int irq)
92 struct irq_desc *desc; 115 struct irq_desc *desc;
93 int i, ok = 0; 116 int i, ok = 0;
94 117
118 if (atomic_inc_return(&irq_poll_active) == 1)
119 goto out;
120
121 irq_poll_cpu = smp_processor_id();
122
95 for_each_irq_desc(i, desc) { 123 for_each_irq_desc(i, desc) {
96 if (!i) 124 if (!i)
97 continue; 125 continue;
@@ -99,9 +127,11 @@ static int misrouted_irq(int irq)
99 if (i == irq) /* Already tried */ 127 if (i == irq) /* Already tried */
100 continue; 128 continue;
101 129
102 if (try_one_irq(i, desc)) 130 if (try_one_irq(i, desc, false))
103 ok = 1; 131 ok = 1;
104 } 132 }
133out:
134 atomic_dec(&irq_poll_active);
105 /* So the caller can adjust the irq error counts */ 135 /* So the caller can adjust the irq error counts */
106 return ok; 136 return ok;
107} 137}
@@ -111,23 +141,28 @@ static void poll_spurious_irqs(unsigned long dummy)
111 struct irq_desc *desc; 141 struct irq_desc *desc;
112 int i; 142 int i;
113 143
144 if (atomic_inc_return(&irq_poll_active) != 1)
145 goto out;
146 irq_poll_cpu = smp_processor_id();
147
114 for_each_irq_desc(i, desc) { 148 for_each_irq_desc(i, desc) {
115 unsigned int status; 149 unsigned int state;
116 150
117 if (!i) 151 if (!i)
118 continue; 152 continue;
119 153
120 /* Racy but it doesn't matter */ 154 /* Racy but it doesn't matter */
121 status = desc->status; 155 state = desc->istate;
122 barrier(); 156 barrier();
123 if (!(status & IRQ_SPURIOUS_DISABLED)) 157 if (!(state & IRQS_SPURIOUS_DISABLED))
124 continue; 158 continue;
125 159
126 local_irq_disable(); 160 local_irq_disable();
127 try_one_irq(i, desc); 161 try_one_irq(i, desc, true);
128 local_irq_enable(); 162 local_irq_enable();
129 } 163 }
130 164out:
165 atomic_dec(&irq_poll_active);
131 mod_timer(&poll_spurious_irq_timer, 166 mod_timer(&poll_spurious_irq_timer,
132 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 167 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
133} 168}
@@ -139,15 +174,13 @@ static void poll_spurious_irqs(unsigned long dummy)
139 * 174 *
140 * (The other 100-of-100,000 interrupts may have been a correctly 175 * (The other 100-of-100,000 interrupts may have been a correctly
141 * functioning device sharing an IRQ with the failing one) 176 * functioning device sharing an IRQ with the failing one)
142 *
143 * Called under desc->lock
144 */ 177 */
145
146static void 178static void
147__report_bad_irq(unsigned int irq, struct irq_desc *desc, 179__report_bad_irq(unsigned int irq, struct irq_desc *desc,
148 irqreturn_t action_ret) 180 irqreturn_t action_ret)
149{ 181{
150 struct irqaction *action; 182 struct irqaction *action;
183 unsigned long flags;
151 184
152 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { 185 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
153 printk(KERN_ERR "irq event %d: bogus return value %x\n", 186 printk(KERN_ERR "irq event %d: bogus return value %x\n",
@@ -159,6 +192,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
159 dump_stack(); 192 dump_stack();
160 printk(KERN_ERR "handlers:\n"); 193 printk(KERN_ERR "handlers:\n");
161 194
195 /*
196 * We need to take desc->lock here. note_interrupt() is called
197 * w/o desc->lock held, but IRQ_PROGRESS set. We might race
198 * with something else removing an action. It's ok to take
199 * desc->lock here. See synchronize_irq().
200 */
201 raw_spin_lock_irqsave(&desc->lock, flags);
162 action = desc->action; 202 action = desc->action;
163 while (action) { 203 while (action) {
164 printk(KERN_ERR "[<%p>]", action->handler); 204 printk(KERN_ERR "[<%p>]", action->handler);
@@ -167,6 +207,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
167 printk("\n"); 207 printk("\n");
168 action = action->next; 208 action = action->next;
169 } 209 }
210 raw_spin_unlock_irqrestore(&desc->lock, flags);
170} 211}
171 212
172static void 213static void
@@ -218,6 +259,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
218void note_interrupt(unsigned int irq, struct irq_desc *desc, 259void note_interrupt(unsigned int irq, struct irq_desc *desc,
219 irqreturn_t action_ret) 260 irqreturn_t action_ret)
220{ 261{
262 if (desc->istate & IRQS_POLL_INPROGRESS)
263 return;
264
221 if (unlikely(action_ret != IRQ_HANDLED)) { 265 if (unlikely(action_ret != IRQ_HANDLED)) {
222 /* 266 /*
223 * If we are seeing only the odd spurious IRQ caused by 267 * If we are seeing only the odd spurious IRQ caused by
@@ -254,9 +298,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
254 * Now kill the IRQ 298 * Now kill the IRQ
255 */ 299 */
256 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 300 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
257 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; 301 desc->istate |= IRQS_SPURIOUS_DISABLED;
258 desc->depth++; 302 desc->depth++;
259 desc->irq_data.chip->irq_disable(&desc->irq_data); 303 irq_disable(desc);
260 304
261 mod_timer(&poll_spurious_irq_timer, 305 mod_timer(&poll_spurious_irq_timer,
262 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 306 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
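A note on the spurious.c changes above: try_one_irq() now marks the descriptor with IRQS_POLL_INPROGRESS while it re-runs handlers (so note_interrupt() can ignore results produced by the poller), and misrouted_irq()/poll_spurious_irqs() bracket their walk with an atomic counter, irq_poll_active, so that only one CPU polls at a time. Below is a minimal user-space sketch of that exclusion pattern using C11 atomics instead of the kernel's atomic_t; poll_active and poll_all_lines() are illustrative names, not kernel API, and this only mirrors the intent of the guard (the first poller proceeds, everyone else backs off).

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int poll_active;          /* counts "CPUs" currently trying to poll */

static bool poll_all_lines(void)
{
    /* stand-in for walking every irq_desc and retrying its handlers */
    printf("polling all interrupt lines\n");
    return true;
}

/* Run the poll only if nobody else is already doing it. */
static bool poll_once(void)
{
    bool ok = false;

    /* First caller moves the counter 0 -> 1 and wins; later callers back off. */
    if (atomic_fetch_add(&poll_active, 1) + 1 != 1)
        goto out;

    ok = poll_all_lines();
out:
    atomic_fetch_sub(&poll_active, 1);
    return ok;
}

int main(void)
{
    printf("poll ran: %d\n", poll_once());
    return 0;
}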
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6f6d091b5757..079f1d39a8b8 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr)
64 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || 64 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
65 arch_is_kernel_text(addr)) 65 arch_is_kernel_text(addr))
66 return 1; 66 return 1;
67 return in_gate_area_no_task(addr); 67 return in_gate_area_no_mm(addr);
68} 68}
69 69
70static inline int is_kernel(unsigned long addr) 70static inline int is_kernel(unsigned long addr)
71{ 71{
72 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) 72 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
73 return 1; 73 return 1;
74 return in_gate_area_no_task(addr); 74 return in_gate_area_no_mm(addr);
75} 75}
76 76
77static int is_ksym_addr(unsigned long addr) 77static int is_ksym_addr(unsigned long addr)
@@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
342} 342}
343 343
344/* Look up a kernel symbol and return it in a text buffer. */ 344/* Look up a kernel symbol and return it in a text buffer. */
345int sprint_symbol(char *buffer, unsigned long address) 345static int __sprint_symbol(char *buffer, unsigned long address,
346 int symbol_offset)
346{ 347{
347 char *modname; 348 char *modname;
348 const char *name; 349 const char *name;
349 unsigned long offset, size; 350 unsigned long offset, size;
350 int len; 351 int len;
351 352
353 address += symbol_offset;
352 name = kallsyms_lookup(address, &size, &offset, &modname, buffer); 354 name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
353 if (!name) 355 if (!name)
354 return sprintf(buffer, "0x%lx", address); 356 return sprintf(buffer, "0x%lx", address);
@@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address)
357 strcpy(buffer, name); 359 strcpy(buffer, name);
358 len = strlen(buffer); 360 len = strlen(buffer);
359 buffer += len; 361 buffer += len;
362 offset -= symbol_offset;
360 363
361 if (modname) 364 if (modname)
362 len += sprintf(buffer, "+%#lx/%#lx [%s]", 365 len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname);
363 offset, size, modname);
364 else 366 else
365 len += sprintf(buffer, "+%#lx/%#lx", offset, size); 367 len += sprintf(buffer, "+%#lx/%#lx", offset, size);
366 368
367 return len; 369 return len;
368} 370}
371
372/**
373 * sprint_symbol - Look up a kernel symbol and return it in a text buffer
374 * @buffer: buffer to be stored
375 * @address: address to lookup
376 *
377 * This function looks up a kernel symbol with @address and stores its name,
378 * offset, size and module name to @buffer if possible. If no symbol was found,
379 * just saves its @address as is.
380 *
381 * This function returns the number of bytes stored in @buffer.
382 */
383int sprint_symbol(char *buffer, unsigned long address)
384{
385 return __sprint_symbol(buffer, address, 0);
386}
387
369EXPORT_SYMBOL_GPL(sprint_symbol); 388EXPORT_SYMBOL_GPL(sprint_symbol);
370 389
390/**
391 * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
392 * @buffer: buffer to be stored
393 * @address: address to lookup
394 *
395 * This function is for stack backtrace and does the same thing as
396 * sprint_symbol() but with modified/decreased @address. If there is a
397 * tail-call to the function marked "noreturn", gcc optimized out code after
398 * the call so that the stack-saved return address could point outside of the
399 * caller. This function ensures that kallsyms will find the original caller
400 * by decreasing @address.
401 *
402 * This function returns the number of bytes stored in @buffer.
403 */
404int sprint_backtrace(char *buffer, unsigned long address)
405{
406 return __sprint_symbol(buffer, address, -1);
407}
408
371/* Look up a kernel symbol and print it to the kernel messages. */ 409/* Look up a kernel symbol and print it to the kernel messages. */
372void __print_symbol(const char *fmt, unsigned long address) 410void __print_symbol(const char *fmt, unsigned long address)
373{ 411{
@@ -477,13 +515,11 @@ static int s_show(struct seq_file *m, void *p)
477 */ 515 */
478 type = iter->exported ? toupper(iter->type) : 516 type = iter->exported ? toupper(iter->type) :
479 tolower(iter->type); 517 tolower(iter->type);
480 seq_printf(m, "%0*lx %c %s\t[%s]\n", 518 seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
481 (int)(2 * sizeof(void *)), 519 type, iter->name, iter->module_name);
482 iter->value, type, iter->name, iter->module_name);
483 } else 520 } else
484 seq_printf(m, "%0*lx %c %s\n", 521 seq_printf(m, "%pK %c %s\n", (void *)iter->value,
485 (int)(2 * sizeof(void *)), 522 iter->type, iter->name);
486 iter->value, iter->type, iter->name);
487 return 0; 523 return 0;
488} 524}
489 525
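The kallsyms change factors sprint_symbol() into __sprint_symbol() with a symbol_offset, so sprint_backtrace() can look up address - 1: a return address saved by a tail call to a noreturn function can point one byte past the end of the caller, and shifting it back makes the lookup land inside the right function, with the offset shifted forward again for printing. A hedged, self-contained sketch of that adjustment; the toy symbol table, addresses and lookup() helper are invented for the example and are not the kallsyms API.

#include <stdio.h>

struct sym { unsigned long start, size; const char *name; };

/* Toy "symbol table"; addresses and sizes are made up. */
static const struct sym table[] = {
    { 0x1000, 0x40, "foo" },
    { 0x1040, 0x80, "bar" },
};

static const struct sym *lookup(unsigned long addr, unsigned long *offset)
{
    for (unsigned i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
        if (addr >= table[i].start && addr < table[i].start + table[i].size) {
            *offset = addr - table[i].start;
            return &table[i];
        }
    }
    return NULL;
}

/* Mirrors the shape of __sprint_symbol(): shift the address before the
 * lookup, then shift the printed offset back. */
static int sprint_sym(char *buf, unsigned long address, int symbol_offset)
{
    unsigned long offset;
    const struct sym *s;

    address += symbol_offset;
    s = lookup(address, &offset);
    if (!s)
        return sprintf(buf, "0x%lx", address);
    offset -= symbol_offset;
    return sprintf(buf, "%s+%#lx/%#lx", s->name, offset, s->size);
}

int main(void)
{
    char buf[64];

    /* 0x1040 is the first byte of bar(); a saved return address that lands
     * exactly there most likely came from the very end of foo(). */
    sprint_sym(buf, 0x1040, 0);  printf("plain    : %s\n", buf);
    sprint_sym(buf, 0x1040, -1); printf("backtrace: %s\n", buf);
    return 0;
}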
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ec19b92c7ebd..87b77de03dd3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h> 35#include <linux/kmsg_dump.h>
36#include <linux/syscore_ops.h>
36 37
37#include <asm/page.h> 38#include <asm/page.h>
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
@@ -144,7 +145,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
144 /* Initialize the list of destination pages */ 145 /* Initialize the list of destination pages */
145 INIT_LIST_HEAD(&image->dest_pages); 146 INIT_LIST_HEAD(&image->dest_pages);
146 147
147 /* Initialize the list of unuseable pages */ 148 /* Initialize the list of unusable pages */
148 INIT_LIST_HEAD(&image->unuseable_pages); 149 INIT_LIST_HEAD(&image->unuseable_pages);
149 150
150 /* Read in the segments */ 151 /* Read in the segments */
@@ -454,7 +455,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
454 /* Deal with the destination pages I have inadvertently allocated. 455 /* Deal with the destination pages I have inadvertently allocated.
455 * 456 *
456 * Ideally I would convert multi-page allocations into single 457 * Ideally I would convert multi-page allocations into single
457 * page allocations, and add everyting to image->dest_pages. 458 * page allocations, and add everything to image->dest_pages.
458 * 459 *
459 * For now it is simpler to just free the pages. 460 * For now it is simpler to just free the pages.
460 */ 461 */
@@ -602,7 +603,7 @@ static void kimage_free_extra_pages(struct kimage *image)
602 /* Walk through and free any extra destination pages I may have */ 603 /* Walk through and free any extra destination pages I may have */
603 kimage_free_page_list(&image->dest_pages); 604 kimage_free_page_list(&image->dest_pages);
604 605
605 /* Walk through and free any unuseable pages I have cached */ 606 /* Walk through and free any unusable pages I have cached */
606 kimage_free_page_list(&image->unuseable_pages); 607 kimage_free_page_list(&image->unuseable_pages);
607 608
608} 609}
@@ -1099,7 +1100,8 @@ size_t crash_get_memory_size(void)
1099 return size; 1100 return size;
1100} 1101}
1101 1102
1102static void free_reserved_phys_range(unsigned long begin, unsigned long end) 1103void __weak crash_free_reserved_phys_range(unsigned long begin,
1104 unsigned long end)
1103{ 1105{
1104 unsigned long addr; 1106 unsigned long addr;
1105 1107
@@ -1135,7 +1137,7 @@ int crash_shrink_memory(unsigned long new_size)
1135 start = roundup(start, PAGE_SIZE); 1137 start = roundup(start, PAGE_SIZE);
1136 end = roundup(start + new_size, PAGE_SIZE); 1138 end = roundup(start + new_size, PAGE_SIZE);
1137 1139
1138 free_reserved_phys_range(end, crashk_res.end); 1140 crash_free_reserved_phys_range(end, crashk_res.end);
1139 1141
1140 if ((start == end) && (crashk_res.parent != NULL)) 1142 if ((start == end) && (crashk_res.parent != NULL))
1141 release_resource(&crashk_res); 1143 release_resource(&crashk_res);
@@ -1531,6 +1533,11 @@ int kernel_kexec(void)
1531 local_irq_disable(); 1533 local_irq_disable();
1532 /* Suspend system devices */ 1534 /* Suspend system devices */
1533 error = sysdev_suspend(PMSG_FREEZE); 1535 error = sysdev_suspend(PMSG_FREEZE);
1536 if (!error) {
1537 error = syscore_suspend();
1538 if (error)
1539 sysdev_resume();
1540 }
1534 if (error) 1541 if (error)
1535 goto Enable_irqs; 1542 goto Enable_irqs;
1536 } else 1543 } else
@@ -1545,6 +1552,7 @@ int kernel_kexec(void)
1545 1552
1546#ifdef CONFIG_KEXEC_JUMP 1553#ifdef CONFIG_KEXEC_JUMP
1547 if (kexec_image->preserve_context) { 1554 if (kexec_image->preserve_context) {
1555 syscore_resume();
1548 sysdev_resume(); 1556 sysdev_resume();
1549 Enable_irqs: 1557 Enable_irqs:
1550 local_irq_enable(); 1558 local_irq_enable();
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c55afba990a3..3b34d2732bce 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@ struct kthread_create_info
27 /* Information passed to kthread() from kthreadd. */ 27 /* Information passed to kthread() from kthreadd. */
28 int (*threadfn)(void *data); 28 int (*threadfn)(void *data);
29 void *data; 29 void *data;
30 int node;
30 31
31 /* Result passed back to kthread_create() from kthreadd. */ 32 /* Result passed back to kthread_create() from kthreadd. */
32 struct task_struct *result; 33 struct task_struct *result;
@@ -98,10 +99,23 @@ static int kthread(void *_create)
98 do_exit(ret); 99 do_exit(ret);
99} 100}
100 101
102/* called from do_fork() to get node information for about to be created task */
103int tsk_fork_get_node(struct task_struct *tsk)
104{
105#ifdef CONFIG_NUMA
106 if (tsk == kthreadd_task)
107 return tsk->pref_node_fork;
108#endif
109 return numa_node_id();
110}
111
101static void create_kthread(struct kthread_create_info *create) 112static void create_kthread(struct kthread_create_info *create)
102{ 113{
103 int pid; 114 int pid;
104 115
116#ifdef CONFIG_NUMA
117 current->pref_node_fork = create->node;
118#endif
105 /* We want our own signal handler (we take no signals by default). */ 119 /* We want our own signal handler (we take no signals by default). */
106 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); 120 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
107 if (pid < 0) { 121 if (pid < 0) {
@@ -111,33 +125,38 @@ static void create_kthread(struct kthread_create_info *create)
111} 125}
112 126
113/** 127/**
114 * kthread_create - create a kthread. 128 * kthread_create_on_node - create a kthread.
115 * @threadfn: the function to run until signal_pending(current). 129 * @threadfn: the function to run until signal_pending(current).
116 * @data: data ptr for @threadfn. 130 * @data: data ptr for @threadfn.
131 * @node: memory node number.
117 * @namefmt: printf-style name for the thread. 132 * @namefmt: printf-style name for the thread.
118 * 133 *
119 * Description: This helper function creates and names a kernel 134 * Description: This helper function creates and names a kernel
120 * thread. The thread will be stopped: use wake_up_process() to start 135 * thread. The thread will be stopped: use wake_up_process() to start
121 * it. See also kthread_run(). 136 * it. See also kthread_run().
122 * 137 *
138 * If thread is going to be bound on a particular cpu, give its node
139 * in @node, to get NUMA affinity for kthread stack, or else give -1.
123 * When woken, the thread will run @threadfn() with @data as its 140 * When woken, the thread will run @threadfn() with @data as its
124 * argument. @threadfn() can either call do_exit() directly if it is a 141 * argument. @threadfn() can either call do_exit() directly if it is a
125 * standalone thread for which noone will call kthread_stop(), or 142 * standalone thread for which no one will call kthread_stop(), or
126 * return when 'kthread_should_stop()' is true (which means 143 * return when 'kthread_should_stop()' is true (which means
127 * kthread_stop() has been called). The return value should be zero 144 * kthread_stop() has been called). The return value should be zero
128 * or a negative error number; it will be passed to kthread_stop(). 145 * or a negative error number; it will be passed to kthread_stop().
129 * 146 *
130 * Returns a task_struct or ERR_PTR(-ENOMEM). 147 * Returns a task_struct or ERR_PTR(-ENOMEM).
131 */ 148 */
132struct task_struct *kthread_create(int (*threadfn)(void *data), 149struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
133 void *data, 150 void *data,
134 const char namefmt[], 151 int node,
135 ...) 152 const char namefmt[],
153 ...)
136{ 154{
137 struct kthread_create_info create; 155 struct kthread_create_info create;
138 156
139 create.threadfn = threadfn; 157 create.threadfn = threadfn;
140 create.data = data; 158 create.data = data;
159 create.node = node;
141 init_completion(&create.done); 160 init_completion(&create.done);
142 161
143 spin_lock(&kthread_create_lock); 162 spin_lock(&kthread_create_lock);
@@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
164 } 183 }
165 return create.result; 184 return create.result;
166} 185}
167EXPORT_SYMBOL(kthread_create); 186EXPORT_SYMBOL(kthread_create_on_node);
168 187
169/** 188/**
170 * kthread_bind - bind a just-created kthread to a cpu. 189 * kthread_bind - bind a just-created kthread to a cpu.
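kthread_create_on_node() simply stashes the requested NUMA node in the kthread_create_info record that kthreadd consumes, and tsk_fork_get_node() hands that hint to the fork path so the new thread's stack can be allocated on the right node (-1 meaning "no preference"). A reduced sketch of carrying one extra field through a variadic create helper; the struct, function names and printf-style naming here are simplifications for illustration, not the kernel interface.

#include <stdarg.h>
#include <stdio.h>

struct create_info {
    int (*threadfn)(void *data);
    void *data;
    int node;          /* NUMA hint, -1 means "no preference" */
    char name[32];
};

static int kthread_create_on_node_sketch(struct create_info *ci,
                                         int (*threadfn)(void *), void *data,
                                         int node, const char *namefmt, ...)
{
    va_list args;

    ci->threadfn = threadfn;
    ci->data = data;
    ci->node = node;   /* the only new piece of state the patch adds */

    va_start(args, namefmt);
    vsnprintf(ci->name, sizeof(ci->name), namefmt, args);
    va_end(args);
    return 0;
}

static int work(void *data) { (void)data; return 0; }

int main(void)
{
    struct create_info ci;

    /* -1 keeps the old behaviour; a real node id asks for node-local stack memory */
    kthread_create_on_node_sketch(&ci, work, NULL, -1, "worker/%d", 3);
    printf("queued '%s' for node %d\n", ci.name, ci.node);
    return 0;
}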
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index ee74b35e528d..376066e10413 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -153,7 +153,7 @@ static inline void store_stacktrace(struct task_struct *tsk,
153} 153}
154 154
155/** 155/**
156 * __account_scheduler_latency - record an occured latency 156 * __account_scheduler_latency - record an occurred latency
157 * @tsk - the task struct of the task hitting the latency 157 * @tsk - the task struct of the task hitting the latency
158 * @usecs - the duration of the latency in microseconds 158 * @usecs - the duration of the latency in microseconds
159 * @inter - 1 if the sleep was interruptible, 0 if uninterruptible 159 * @inter - 1 if the sleep was interruptible, 0 if uninterruptible
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0d2058da80f5..53a68956f131 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2309,7 +2309,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2309 if (unlikely(curr->hardirqs_enabled)) { 2309 if (unlikely(curr->hardirqs_enabled)) {
2310 /* 2310 /*
2311 * Neither irq nor preemption are disabled here 2311 * Neither irq nor preemption are disabled here
2312 * so this is racy by nature but loosing one hit 2312 * so this is racy by nature but losing one hit
2313 * in a stat is not a big deal. 2313 * in a stat is not a big deal.
2314 */ 2314 */
2315 __debug_atomic_inc(redundant_hardirqs_on); 2315 __debug_atomic_inc(redundant_hardirqs_on);
@@ -2620,7 +2620,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2620 if (!graph_lock()) 2620 if (!graph_lock())
2621 return 0; 2621 return 0;
2622 /* 2622 /*
2623 * Make sure we didnt race: 2623 * Make sure we didn't race:
2624 */ 2624 */
2625 if (unlikely(hlock_class(this)->usage_mask & new_mask)) { 2625 if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
2626 graph_unlock(); 2626 graph_unlock();
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 1969d2fc4b36..71edd2f60c02 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
225 nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, 225 nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
226 nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, 226 nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
227 nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, 227 nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
228 sum_forward_deps = 0, factor = 0; 228 sum_forward_deps = 0;
229 229
230 list_for_each_entry(class, &all_lock_classes, lock_entry) { 230 list_for_each_entry(class, &all_lock_classes, lock_entry) {
231 231
@@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
283 nr_hardirq_unsafe * nr_hardirq_safe + 283 nr_hardirq_unsafe * nr_hardirq_safe +
284 nr_list_entries); 284 nr_list_entries);
285 285
286 /*
287 * Estimated factor between direct and indirect
288 * dependencies:
289 */
290 if (nr_list_entries)
291 factor = sum_forward_deps / nr_list_entries;
292
293#ifdef CONFIG_PROVE_LOCKING 286#ifdef CONFIG_PROVE_LOCKING
294 seq_printf(m, " dependency chains: %11lu [max: %lu]\n", 287 seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
295 nr_lock_chains, MAX_LOCKDEP_CHAINS); 288 nr_lock_chains, MAX_LOCKDEP_CHAINS);
diff --git a/kernel/module.c b/kernel/module.c
index efa290ea94bf..d5938a5c19c4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -809,7 +809,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
809 wait_for_zero_refcount(mod); 809 wait_for_zero_refcount(mod);
810 810
811 mutex_unlock(&module_mutex); 811 mutex_unlock(&module_mutex);
812 /* Final destruction now noone is using it. */ 812 /* Final destruction now no one is using it. */
813 if (mod->exit != NULL) 813 if (mod->exit != NULL)
814 mod->exit(); 814 mod->exit();
815 blocking_notifier_call_chain(&module_notify_list, 815 blocking_notifier_call_chain(&module_notify_list,
@@ -1168,7 +1168,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
1168{ 1168{
1169 struct module_sect_attr *sattr = 1169 struct module_sect_attr *sattr =
1170 container_of(mattr, struct module_sect_attr, mattr); 1170 container_of(mattr, struct module_sect_attr, mattr);
1171 return sprintf(buf, "0x%lx\n", sattr->address); 1171 return sprintf(buf, "0x%pK\n", (void *)sattr->address);
1172} 1172}
1173 1173
1174static void free_sect_attrs(struct module_sect_attrs *sect_attrs) 1174static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -2777,7 +2777,7 @@ static struct module *load_module(void __user *umod,
2777 mod->state = MODULE_STATE_COMING; 2777 mod->state = MODULE_STATE_COMING;
2778 2778
2779 /* Now sew it into the lists so we can get lockdep and oops 2779 /* Now sew it into the lists so we can get lockdep and oops
2780 * info during argument parsing. Noone should access us, since 2780 * info during argument parsing. No one should access us, since
2781 * strong_try_module_get() will fail. 2781 * strong_try_module_get() will fail.
2782 * lockdep/oops can run asynchronous, so use the RCU list insertion 2782 * lockdep/oops can run asynchronous, so use the RCU list insertion
2783 * function to insert in a way safe to concurrent readers. 2783 * function to insert in a way safe to concurrent readers.
@@ -2971,7 +2971,7 @@ static const char *get_ksymbol(struct module *mod,
2971 else 2971 else
2972 nextval = (unsigned long)mod->module_core+mod->core_text_size; 2972 nextval = (unsigned long)mod->module_core+mod->core_text_size;
2973 2973
2974 /* Scan for closest preceeding symbol, and next symbol. (ELF 2974 /* Scan for closest preceding symbol, and next symbol. (ELF
2975 starts real symbols at 1). */ 2975 starts real symbols at 1). */
2976 for (i = 1; i < mod->num_symtab; i++) { 2976 for (i = 1; i < mod->num_symtab; i++) {
2977 if (mod->symtab[i].st_shndx == SHN_UNDEF) 2977 if (mod->symtab[i].st_shndx == SHN_UNDEF)
@@ -3224,7 +3224,7 @@ static int m_show(struct seq_file *m, void *p)
3224 mod->state == MODULE_STATE_COMING ? "Loading": 3224 mod->state == MODULE_STATE_COMING ? "Loading":
3225 "Live"); 3225 "Live");
3226 /* Used by oprofile and other similar tools. */ 3226 /* Used by oprofile and other similar tools. */
3227 seq_printf(m, " 0x%p", mod->module_core); 3227 seq_printf(m, " 0x%pK", mod->module_core);
3228 3228
3229 /* Taints info */ 3229 /* Taints info */
3230 if (mod->taints) 3230 if (mod->taints)
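The module.c hunks above switch several %p/%lx users to %pK, the printk extension that shows a kernel pointer as zeros to unprivileged readers when kptr_restrict is set. Ordinary printf has no such modifier, so this hedged sketch only reproduces the gating decision around a plain %p; kptr_restrict here is an ordinary global standing in for the real sysctl, and the privilege check is a stub.

#include <stdbool.h>
#include <stdio.h>

static int kptr_restrict = 1;   /* stand-in for /proc/sys/kernel/kptr_restrict */

static bool reader_is_privileged(void) { return false; }  /* pretend: no CAP_SYSLOG */

static void print_kernel_pointer(const void *ptr)
{
    /* %pK semantics, roughly: hide the value unless the reader may see it */
    if (kptr_restrict && !reader_is_privileged())
        printf("%p\n", (void *)0);
    else
        printf("%p\n", ptr);
}

int main(void)
{
    int dummy;

    print_kernel_pointer(&dummy);   /* censored: prints a null-looking pointer */
    kptr_restrict = 0;
    print_kernel_pointer(&dummy);   /* now prints the real address */
    return 0;
}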
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a5889fb28ecf..c4195fa98900 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -245,7 +245,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
245 } 245 }
246 __set_task_state(task, state); 246 __set_task_state(task, state);
247 247
248 /* didnt get the lock, go to sleep: */ 248 /* didn't get the lock, go to sleep: */
249 spin_unlock_mutex(&lock->wait_lock, flags); 249 spin_unlock_mutex(&lock->wait_lock, flags);
250 preempt_enable_no_resched(); 250 preempt_enable_no_resched();
251 schedule(); 251 schedule();
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f74e6c00e26d..a05d191ffdd9 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -69,13 +69,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
69 goto out_ns; 69 goto out_ns;
70 } 70 }
71 71
72 new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); 72 new_nsp->uts_ns = copy_utsname(flags, tsk);
73 if (IS_ERR(new_nsp->uts_ns)) { 73 if (IS_ERR(new_nsp->uts_ns)) {
74 err = PTR_ERR(new_nsp->uts_ns); 74 err = PTR_ERR(new_nsp->uts_ns);
75 goto out_uts; 75 goto out_uts;
76 } 76 }
77 77
78 new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); 78 new_nsp->ipc_ns = copy_ipcs(flags, tsk);
79 if (IS_ERR(new_nsp->ipc_ns)) { 79 if (IS_ERR(new_nsp->ipc_ns)) {
80 err = PTR_ERR(new_nsp->ipc_ns); 80 err = PTR_ERR(new_nsp->ipc_ns);
81 goto out_ipc; 81 goto out_ipc;
diff --git a/kernel/padata.c b/kernel/padata.c
index 751019415d23..b91941df5e63 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -262,7 +262,7 @@ static void padata_reorder(struct parallel_data *pd)
262 /* 262 /*
263 * This cpu has to do the parallel processing of the next 263 * This cpu has to do the parallel processing of the next
264 * object. It's waiting in the cpu's parallelization queue, 264 * object. It's waiting in the cpu's parallelization queue,
265 * so exit imediately. 265 * so exit immediately.
266 */ 266 */
267 if (PTR_ERR(padata) == -ENODATA) { 267 if (PTR_ERR(padata) == -ENODATA) {
268 del_timer(&pd->timer); 268 del_timer(&pd->timer);
@@ -284,7 +284,7 @@ static void padata_reorder(struct parallel_data *pd)
284 /* 284 /*
285 * The next object that needs serialization might have arrived to 285 * The next object that needs serialization might have arrived to
286 * the reorder queues in the meantime, we will be called again 286 * the reorder queues in the meantime, we will be called again
287 * from the timer function if noone else cares for it. 287 * from the timer function if no one else cares for it.
288 */ 288 */
289 if (atomic_read(&pd->reorder_objects) 289 if (atomic_read(&pd->reorder_objects)
290 && !(pinst->flags & PADATA_RESET)) 290 && !(pinst->flags & PADATA_RESET))
@@ -515,7 +515,7 @@ static void __padata_stop(struct padata_instance *pinst)
515 put_online_cpus(); 515 put_online_cpus();
516} 516}
517 517
518/* Replace the internal control stucture with a new one. */ 518/* Replace the internal control structure with a new one. */
519static void padata_replace(struct padata_instance *pinst, 519static void padata_replace(struct padata_instance *pinst,
520 struct parallel_data *pd_new) 520 struct parallel_data *pd_new)
521{ 521{
@@ -768,7 +768,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
768} 768}
769 769
770 /** 770 /**
771 * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) 771 * padata_remove_cpu - remove a cpu from the one or both(serial and parallel)
772 * padata cpumasks. 772 * padata cpumasks.
773 * 773 *
774 * @pinst: padata instance 774 * @pinst: padata instance
diff --git a/kernel/panic.c b/kernel/panic.c
index 991bb87a1704..69231670eb95 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
433 433
434core_param(panic, panic_timeout, int, 0644); 434core_param(panic, panic_timeout, int, 0644);
435core_param(pause_on_oops, pause_on_oops, int, 0644); 435core_param(pause_on_oops, pause_on_oops, int, 0644);
436
437static int __init oops_setup(char *s)
438{
439 if (!s)
440 return -EINVAL;
441 if (!strcmp(s, "panic"))
442 panic_on_oops = 1;
443 return 0;
444}
445early_param("oops", oops_setup);
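The new oops_setup() handler makes "oops=panic" on the kernel command line behave like setting panic_on_oops. Boot parameters of this kind are just key=value tokens, so the parse-and-set step can be sketched in user space like this (the command-line string and tokenizing loop are invented for the example; early_param() itself does the real registration):

#include <stdio.h>
#include <string.h>

static int panic_on_oops;

/* Mirrors oops_setup(): accept only the value "panic", ignore anything else. */
static int oops_setup_sketch(const char *s)
{
    if (!s)
        return -1;
    if (!strcmp(s, "panic"))
        panic_on_oops = 1;
    return 0;
}

int main(void)
{
    char cmdline[] = "quiet oops=panic loglevel=3";

    for (char *tok = strtok(cmdline, " "); tok; tok = strtok(NULL, " ")) {
        if (!strncmp(tok, "oops=", 5))
            oops_setup_sketch(tok + 5);
    }
    printf("panic_on_oops = %d\n", panic_on_oops);
    return 0;
}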
diff --git a/kernel/params.c b/kernel/params.c
index 0da1411222b9..7ab388a48a2e 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -95,7 +95,7 @@ static int parse_one(char *param,
95 /* Find parameter */ 95 /* Find parameter */
96 for (i = 0; i < num_params; i++) { 96 for (i = 0; i < num_params; i++) {
97 if (parameq(param, params[i].name)) { 97 if (parameq(param, params[i].name)) {
98 /* Noone handled NULL, so do it here. */ 98 /* No one handled NULL, so do it here. */
99 if (!val && params[i].ops->set != param_set_bool) 99 if (!val && params[i].ops->set != param_set_bool)
100 return -EINVAL; 100 return -EINVAL;
101 DEBUGP("They are equal! Calling %p\n", 101 DEBUGP("They are equal! Calling %p\n",
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 999835b6112b..8e81a9860a0d 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -38,13 +38,96 @@
38 38
39#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
40 40
41struct remote_function_call {
42 struct task_struct *p;
43 int (*func)(void *info);
44 void *info;
45 int ret;
46};
47
48static void remote_function(void *data)
49{
50 struct remote_function_call *tfc = data;
51 struct task_struct *p = tfc->p;
52
53 if (p) {
54 tfc->ret = -EAGAIN;
55 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
56 return;
57 }
58
59 tfc->ret = tfc->func(tfc->info);
60}
61
62/**
63 * task_function_call - call a function on the cpu on which a task runs
64 * @p: the task to evaluate
65 * @func: the function to be called
66 * @info: the function call argument
67 *
68 * Calls the function @func when the task is currently running. This might
69 * be on the current CPU, which just calls the function directly
70 *
71 * returns: @func return value, or
72 * -ESRCH - when the process isn't running
73 * -EAGAIN - when the process moved away
74 */
75static int
76task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
77{
78 struct remote_function_call data = {
79 .p = p,
80 .func = func,
81 .info = info,
82 .ret = -ESRCH, /* No such (running) process */
83 };
84
85 if (task_curr(p))
86 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
87
88 return data.ret;
89}
90
91/**
92 * cpu_function_call - call a function on the cpu
93 * @func: the function to be called
94 * @info: the function call argument
95 *
96 * Calls the function @func on the remote cpu.
97 *
98 * returns: @func return value or -ENXIO when the cpu is offline
99 */
100static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
101{
102 struct remote_function_call data = {
103 .p = NULL,
104 .func = func,
105 .info = info,
106 .ret = -ENXIO, /* No such CPU */
107 };
108
109 smp_call_function_single(cpu, remote_function, &data, 1);
110
111 return data.ret;
112}
113
114#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
115 PERF_FLAG_FD_OUTPUT |\
116 PERF_FLAG_PID_CGROUP)
117
41enum event_type_t { 118enum event_type_t {
42 EVENT_FLEXIBLE = 0x1, 119 EVENT_FLEXIBLE = 0x1,
43 EVENT_PINNED = 0x2, 120 EVENT_PINNED = 0x2,
44 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 121 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
45}; 122};
46 123
47atomic_t perf_task_events __read_mostly; 124/*
125 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */
128atomic_t perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
130
48static atomic_t nr_mmap_events __read_mostly; 131static atomic_t nr_mmap_events __read_mostly;
49static atomic_t nr_comm_events __read_mostly; 132static atomic_t nr_comm_events __read_mostly;
50static atomic_t nr_task_events __read_mostly; 133static atomic_t nr_task_events __read_mostly;
@@ -62,12 +145,30 @@ static struct srcu_struct pmus_srcu;
62 */ 145 */
63int sysctl_perf_event_paranoid __read_mostly = 1; 146int sysctl_perf_event_paranoid __read_mostly = 1;
64 147
65int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 148/* Minimum for 512 kiB + 1 user control page */
149int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
66 150
67/* 151/*
68 * max perf event sample rate 152 * max perf event sample rate
69 */ 153 */
70int sysctl_perf_event_sample_rate __read_mostly = 100000; 154#define DEFAULT_MAX_SAMPLE_RATE 100000
155int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
156static int max_samples_per_tick __read_mostly =
157 DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
158
159int perf_proc_update_handler(struct ctl_table *table, int write,
160 void __user *buffer, size_t *lenp,
161 loff_t *ppos)
162{
163 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
164
165 if (ret || !write)
166 return ret;
167
168 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
169
170 return 0;
171}
71 172
72static atomic64_t perf_event_id; 173static atomic64_t perf_event_id;
73 174
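perf_proc_update_handler() in the hunk above re-derives max_samples_per_tick whenever the sysctl changes, as DIV_ROUND_UP(sample_rate, HZ). A quick check of that arithmetic; HZ is assumed to be 1000 here purely for the example, since the real value is a build-time config option.

#include <stdio.h>

#define HZ 1000                            /* assumption for this example */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    int sample_rate = 100000;              /* DEFAULT_MAX_SAMPLE_RATE */

    /* 100000 samples/sec spread over 1000 ticks/sec -> 100 samples per tick */
    printf("max_samples_per_tick = %d\n", DIV_ROUND_UP(sample_rate, HZ));

    sample_rate = 25000;                   /* pretend the sysctl was lowered */
    printf("max_samples_per_tick = %d\n", DIV_ROUND_UP(sample_rate, HZ));
    return 0;
}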
@@ -75,7 +176,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
75 enum event_type_t event_type); 176 enum event_type_t event_type);
76 177
77static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 178static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
78 enum event_type_t event_type); 179 enum event_type_t event_type,
180 struct task_struct *task);
181
182static void update_context_time(struct perf_event_context *ctx);
183static u64 perf_event_time(struct perf_event *event);
79 184
80void __weak perf_event_print_debug(void) { } 185void __weak perf_event_print_debug(void) { }
81 186
@@ -89,6 +194,361 @@ static inline u64 perf_clock(void)
89 return local_clock(); 194 return local_clock();
90} 195}
91 196
197static inline struct perf_cpu_context *
198__get_cpu_context(struct perf_event_context *ctx)
199{
200 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
201}
202
203#ifdef CONFIG_CGROUP_PERF
204
205/*
206 * Must ensure cgroup is pinned (css_get) before calling
207 * this function. In other words, we cannot call this function
208 * if there is no cgroup event for the current CPU context.
209 */
210static inline struct perf_cgroup *
211perf_cgroup_from_task(struct task_struct *task)
212{
213 return container_of(task_subsys_state(task, perf_subsys_id),
214 struct perf_cgroup, css);
215}
216
217static inline bool
218perf_cgroup_match(struct perf_event *event)
219{
220 struct perf_event_context *ctx = event->ctx;
221 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
222
223 return !event->cgrp || event->cgrp == cpuctx->cgrp;
224}
225
226static inline void perf_get_cgroup(struct perf_event *event)
227{
228 css_get(&event->cgrp->css);
229}
230
231static inline void perf_put_cgroup(struct perf_event *event)
232{
233 css_put(&event->cgrp->css);
234}
235
236static inline void perf_detach_cgroup(struct perf_event *event)
237{
238 perf_put_cgroup(event);
239 event->cgrp = NULL;
240}
241
242static inline int is_cgroup_event(struct perf_event *event)
243{
244 return event->cgrp != NULL;
245}
246
247static inline u64 perf_cgroup_event_time(struct perf_event *event)
248{
249 struct perf_cgroup_info *t;
250
251 t = per_cpu_ptr(event->cgrp->info, event->cpu);
252 return t->time;
253}
254
255static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
256{
257 struct perf_cgroup_info *info;
258 u64 now;
259
260 now = perf_clock();
261
262 info = this_cpu_ptr(cgrp->info);
263
264 info->time += now - info->timestamp;
265 info->timestamp = now;
266}
267
268static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
269{
270 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
271 if (cgrp_out)
272 __update_cgrp_time(cgrp_out);
273}
274
275static inline void update_cgrp_time_from_event(struct perf_event *event)
276{
277 struct perf_cgroup *cgrp;
278
279 /*
280 * ensure we access cgroup data only when needed and
281 * when we know the cgroup is pinned (css_get)
282 */
283 if (!is_cgroup_event(event))
284 return;
285
286 cgrp = perf_cgroup_from_task(current);
287 /*
288 * Do not update time when cgroup is not active
289 */
290 if (cgrp == event->cgrp)
291 __update_cgrp_time(event->cgrp);
292}
293
294static inline void
295perf_cgroup_set_timestamp(struct task_struct *task,
296 struct perf_event_context *ctx)
297{
298 struct perf_cgroup *cgrp;
299 struct perf_cgroup_info *info;
300
301 /*
302 * ctx->lock held by caller
303 * ensure we do not access cgroup data
304 * unless we have the cgroup pinned (css_get)
305 */
306 if (!task || !ctx->nr_cgroups)
307 return;
308
309 cgrp = perf_cgroup_from_task(task);
310 info = this_cpu_ptr(cgrp->info);
311 info->timestamp = ctx->timestamp;
312}
313
314#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
315#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
316
317/*
318 * reschedule events based on the cgroup constraint of task.
319 *
320 * mode SWOUT : schedule out everything
321 * mode SWIN : schedule in based on cgroup for next
322 */
323void perf_cgroup_switch(struct task_struct *task, int mode)
324{
325 struct perf_cpu_context *cpuctx;
326 struct pmu *pmu;
327 unsigned long flags;
328
329 /*
330 * disable interrupts to avoid geting nr_cgroup
330	 * disable interrupts to avoid getting nr_cgroup
331 * changes via __perf_event_disable(). Also
332 * avoids preemption.
333 */
334 local_irq_save(flags);
335
336 /*
337 * we reschedule only in the presence of cgroup
338 * constrained events.
339 */
340 rcu_read_lock();
341
342 list_for_each_entry_rcu(pmu, &pmus, entry) {
343
344 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
345
346 perf_pmu_disable(cpuctx->ctx.pmu);
347
348 /*
349 * perf_cgroup_events says at least one
350 * context on this CPU has cgroup events.
351 *
352 * ctx->nr_cgroups reports the number of cgroup
353 * events for a context.
354 */
355 if (cpuctx->ctx.nr_cgroups > 0) {
356
357 if (mode & PERF_CGROUP_SWOUT) {
358 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
359 /*
360 * must not be done before ctxswout due
361 * to event_filter_match() in event_sched_out()
362 */
363 cpuctx->cgrp = NULL;
364 }
365
366 if (mode & PERF_CGROUP_SWIN) {
367 WARN_ON_ONCE(cpuctx->cgrp);
368 /* set cgrp before ctxsw in to
369 * allow event_filter_match() to not
370 * have to pass task around
371 */
372 cpuctx->cgrp = perf_cgroup_from_task(task);
373 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
374 }
375 }
376
377 perf_pmu_enable(cpuctx->ctx.pmu);
378 }
379
380 rcu_read_unlock();
381
382 local_irq_restore(flags);
383}
384
385static inline void perf_cgroup_sched_out(struct task_struct *task)
386{
387 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
388}
389
390static inline void perf_cgroup_sched_in(struct task_struct *task)
391{
392 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
393}
394
395static inline int perf_cgroup_connect(int fd, struct perf_event *event,
396 struct perf_event_attr *attr,
397 struct perf_event *group_leader)
398{
399 struct perf_cgroup *cgrp;
400 struct cgroup_subsys_state *css;
401 struct file *file;
402 int ret = 0, fput_needed;
403
404 file = fget_light(fd, &fput_needed);
405 if (!file)
406 return -EBADF;
407
408 css = cgroup_css_from_dir(file, perf_subsys_id);
409 if (IS_ERR(css)) {
410 ret = PTR_ERR(css);
411 goto out;
412 }
413
414 cgrp = container_of(css, struct perf_cgroup, css);
415 event->cgrp = cgrp;
416
417 /* must be done before we fput() the file */
418 perf_get_cgroup(event);
419
420 /*
421 * all events in a group must monitor
422 * the same cgroup because a task belongs
423 * to only one perf cgroup at a time
424 */
425 if (group_leader && group_leader->cgrp != cgrp) {
426 perf_detach_cgroup(event);
427 ret = -EINVAL;
428 }
429out:
430 fput_light(file, fput_needed);
431 return ret;
432}
433
434static inline void
435perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
436{
437 struct perf_cgroup_info *t;
438 t = per_cpu_ptr(event->cgrp->info, event->cpu);
439 event->shadow_ctx_time = now - t->timestamp;
440}
441
442static inline void
443perf_cgroup_defer_enabled(struct perf_event *event)
444{
445 /*
446 * when the current task's perf cgroup does not match
447 * the event's, we need to remember to call the
448 * perf_mark_enable() function the first time a task with
449 * a matching perf cgroup is scheduled in.
450 */
451 if (is_cgroup_event(event) && !perf_cgroup_match(event))
452 event->cgrp_defer_enabled = 1;
453}
454
455static inline void
456perf_cgroup_mark_enabled(struct perf_event *event,
457 struct perf_event_context *ctx)
458{
459 struct perf_event *sub;
460 u64 tstamp = perf_event_time(event);
461
462 if (!event->cgrp_defer_enabled)
463 return;
464
465 event->cgrp_defer_enabled = 0;
466
467 event->tstamp_enabled = tstamp - event->total_time_enabled;
468 list_for_each_entry(sub, &event->sibling_list, group_entry) {
469 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
470 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
471 sub->cgrp_defer_enabled = 0;
472 }
473 }
474}
475#else /* !CONFIG_CGROUP_PERF */
476
477static inline bool
478perf_cgroup_match(struct perf_event *event)
479{
480 return true;
481}
482
483static inline void perf_detach_cgroup(struct perf_event *event)
484{}
485
486static inline int is_cgroup_event(struct perf_event *event)
487{
488 return 0;
489}
490
491static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
492{
493 return 0;
494}
495
496static inline void update_cgrp_time_from_event(struct perf_event *event)
497{
498}
499
500static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
501{
502}
503
504static inline void perf_cgroup_sched_out(struct task_struct *task)
505{
506}
507
508static inline void perf_cgroup_sched_in(struct task_struct *task)
509{
510}
511
512static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
513 struct perf_event_attr *attr,
514 struct perf_event *group_leader)
515{
516 return -EINVAL;
517}
518
519static inline void
520perf_cgroup_set_timestamp(struct task_struct *task,
521 struct perf_event_context *ctx)
522{
523}
524
525void
526perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
527{
528}
529
530static inline void
531perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
532{
533}
534
535static inline u64 perf_cgroup_event_time(struct perf_event *event)
536{
537 return 0;
538}
539
540static inline void
541perf_cgroup_defer_enabled(struct perf_event *event)
542{
543}
544
545static inline void
546perf_cgroup_mark_enabled(struct perf_event *event,
547 struct perf_event_context *ctx)
548{
549}
550#endif
551
92void perf_pmu_disable(struct pmu *pmu) 552void perf_pmu_disable(struct pmu *pmu)
93{ 553{
94 int *count = this_cpu_ptr(pmu->pmu_disable_count); 554 int *count = this_cpu_ptr(pmu->pmu_disable_count);
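In the cgroup hunk above, __update_cgrp_time() keeps per-cgroup time the same way the context clock works: each update folds "now - timestamp" into an accumulator and moves the timestamp forward, so time only advances while the cgroup is actually being monitored on that CPU. A stripped-down sketch of that accounting; the struct is a placeholder and POSIX clock_gettime() stands in for the kernel's perf_clock().

#include <stdio.h>
#include <time.h>

struct cgrp_time {
    unsigned long long time;        /* accumulated monitored nanoseconds */
    unsigned long long timestamp;   /* last time we folded the clock in */
};

static unsigned long long now_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Equivalent of __update_cgrp_time(): accumulate, then restart the window. */
static void update_cgrp_time(struct cgrp_time *t)
{
    unsigned long long now = now_ns();

    t->time += now - t->timestamp;
    t->timestamp = now;
}

int main(void)
{
    struct cgrp_time t = { .time = 0, .timestamp = now_ns() };

    for (volatile long i = 0; i < 10000000; i++)
        ;                            /* burn a little CPU inside the "window" */
    update_cgrp_time(&t);
    printf("accumulated %llu ns\n", t.time);
    return 0;
}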
@@ -254,7 +714,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
254 raw_spin_lock_irqsave(&ctx->lock, flags); 714 raw_spin_lock_irqsave(&ctx->lock, flags);
255 --ctx->pin_count; 715 --ctx->pin_count;
256 raw_spin_unlock_irqrestore(&ctx->lock, flags); 716 raw_spin_unlock_irqrestore(&ctx->lock, flags);
257 put_ctx(ctx);
258} 717}
259 718
260/* 719/*
@@ -271,6 +730,10 @@ static void update_context_time(struct perf_event_context *ctx)
271static u64 perf_event_time(struct perf_event *event) 730static u64 perf_event_time(struct perf_event *event)
272{ 731{
273 struct perf_event_context *ctx = event->ctx; 732 struct perf_event_context *ctx = event->ctx;
733
734 if (is_cgroup_event(event))
735 return perf_cgroup_event_time(event);
736
274 return ctx ? ctx->time : 0; 737 return ctx ? ctx->time : 0;
275} 738}
276 739
@@ -285,9 +748,20 @@ static void update_event_times(struct perf_event *event)
285 if (event->state < PERF_EVENT_STATE_INACTIVE || 748 if (event->state < PERF_EVENT_STATE_INACTIVE ||
286 event->group_leader->state < PERF_EVENT_STATE_INACTIVE) 749 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
287 return; 750 return;
288 751 /*
289 if (ctx->is_active) 752 * in cgroup mode, time_enabled represents
753 * the time the event was enabled AND active
754 * tasks were in the monitored cgroup. This is
755 * independent of the activity of the context as
756 * there may be a mix of cgroup and non-cgroup events.
757 *
758 * That is why we treat cgroup events differently
759 * here.
760 */
761 if (is_cgroup_event(event))
290 run_end = perf_event_time(event); 762 run_end = perf_event_time(event);
763 else if (ctx->is_active)
764 run_end = ctx->time;
291 else 765 else
292 run_end = event->tstamp_stopped; 766 run_end = event->tstamp_stopped;
293 767
@@ -299,6 +773,7 @@ static void update_event_times(struct perf_event *event)
299 run_end = perf_event_time(event); 773 run_end = perf_event_time(event);
300 774
301 event->total_time_running = run_end - event->tstamp_running; 775 event->total_time_running = run_end - event->tstamp_running;
776
302} 777}
303 778
304/* 779/*
@@ -347,6 +822,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
347 list_add_tail(&event->group_entry, list); 822 list_add_tail(&event->group_entry, list);
348 } 823 }
349 824
825 if (is_cgroup_event(event))
826 ctx->nr_cgroups++;
827
350 list_add_rcu(&event->event_entry, &ctx->event_list); 828 list_add_rcu(&event->event_entry, &ctx->event_list);
351 if (!ctx->nr_events) 829 if (!ctx->nr_events)
352 perf_pmu_rotate_start(ctx->pmu); 830 perf_pmu_rotate_start(ctx->pmu);
@@ -465,6 +943,7 @@ static void perf_group_attach(struct perf_event *event)
465static void 943static void
466list_del_event(struct perf_event *event, struct perf_event_context *ctx) 944list_del_event(struct perf_event *event, struct perf_event_context *ctx)
467{ 945{
946 struct perf_cpu_context *cpuctx;
468 /* 947 /*
469 * We can have double detach due to exit/hot-unplug + close. 948 * We can have double detach due to exit/hot-unplug + close.
470 */ 949 */
@@ -473,6 +952,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
473 952
474 event->attach_state &= ~PERF_ATTACH_CONTEXT; 953 event->attach_state &= ~PERF_ATTACH_CONTEXT;
475 954
955 if (is_cgroup_event(event)) {
956 ctx->nr_cgroups--;
957 cpuctx = __get_cpu_context(ctx);
958 /*
959 * if there are no more cgroup events
960 * then cler cgrp to avoid stale pointer
960	 * then clear cgrp to avoid stale pointer
961 * in update_cgrp_time_from_cpuctx()
962 */
963 if (!ctx->nr_cgroups)
964 cpuctx->cgrp = NULL;
965 }
966
476 ctx->nr_events--; 967 ctx->nr_events--;
477 if (event->attr.inherit_stat) 968 if (event->attr.inherit_stat)
478 ctx->nr_stat--; 969 ctx->nr_stat--;
@@ -544,7 +1035,8 @@ out:
544static inline int 1035static inline int
545event_filter_match(struct perf_event *event) 1036event_filter_match(struct perf_event *event)
546{ 1037{
547 return event->cpu == -1 || event->cpu == smp_processor_id(); 1038 return (event->cpu == -1 || event->cpu == smp_processor_id())
1039 && perf_cgroup_match(event);
548} 1040}
549 1041
550static void 1042static void
@@ -562,7 +1054,7 @@ event_sched_out(struct perf_event *event,
562 */ 1054 */
563 if (event->state == PERF_EVENT_STATE_INACTIVE 1055 if (event->state == PERF_EVENT_STATE_INACTIVE
564 && !event_filter_match(event)) { 1056 && !event_filter_match(event)) {
565 delta = ctx->time - event->tstamp_stopped; 1057 delta = tstamp - event->tstamp_stopped;
566 event->tstamp_running += delta; 1058 event->tstamp_running += delta;
567 event->tstamp_stopped = tstamp; 1059 event->tstamp_stopped = tstamp;
568 } 1060 }
@@ -606,47 +1098,30 @@ group_sched_out(struct perf_event *group_event,
606 cpuctx->exclusive = 0; 1098 cpuctx->exclusive = 0;
607} 1099}
608 1100
609static inline struct perf_cpu_context *
610__get_cpu_context(struct perf_event_context *ctx)
611{
612 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
613}
614
615/* 1101/*
616 * Cross CPU call to remove a performance event 1102 * Cross CPU call to remove a performance event
617 * 1103 *
618 * We disable the event on the hardware level first. After that we 1104 * We disable the event on the hardware level first. After that we
619 * remove it from the context list. 1105 * remove it from the context list.
620 */ 1106 */
621static void __perf_event_remove_from_context(void *info) 1107static int __perf_remove_from_context(void *info)
622{ 1108{
623 struct perf_event *event = info; 1109 struct perf_event *event = info;
624 struct perf_event_context *ctx = event->ctx; 1110 struct perf_event_context *ctx = event->ctx;
625 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1111 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
626 1112
627 /*
628 * If this is a task context, we need to check whether it is
629 * the current task context of this cpu. If not it has been
630 * scheduled out before the smp call arrived.
631 */
632 if (ctx->task && cpuctx->task_ctx != ctx)
633 return;
634
635 raw_spin_lock(&ctx->lock); 1113 raw_spin_lock(&ctx->lock);
636
637 event_sched_out(event, cpuctx, ctx); 1114 event_sched_out(event, cpuctx, ctx);
638
639 list_del_event(event, ctx); 1115 list_del_event(event, ctx);
640
641 raw_spin_unlock(&ctx->lock); 1116 raw_spin_unlock(&ctx->lock);
1117
1118 return 0;
642} 1119}
643 1120
644 1121
645/* 1122/*
646 * Remove the event from a task's (or a CPU's) list of events. 1123 * Remove the event from a task's (or a CPU's) list of events.
647 * 1124 *
648 * Must be called with ctx->mutex held.
649 *
650 * CPU events are removed with a smp call. For task events we only 1125 * CPU events are removed with a smp call. For task events we only
651 * call when the task is on a CPU. 1126 * call when the task is on a CPU.
652 * 1127 *
@@ -657,49 +1132,48 @@ static void __perf_event_remove_from_context(void *info)
657 * When called from perf_event_exit_task, it's OK because the 1132 * When called from perf_event_exit_task, it's OK because the
658 * context has been detached from its task. 1133 * context has been detached from its task.
659 */ 1134 */
660static void perf_event_remove_from_context(struct perf_event *event) 1135static void perf_remove_from_context(struct perf_event *event)
661{ 1136{
662 struct perf_event_context *ctx = event->ctx; 1137 struct perf_event_context *ctx = event->ctx;
663 struct task_struct *task = ctx->task; 1138 struct task_struct *task = ctx->task;
664 1139
1140 lockdep_assert_held(&ctx->mutex);
1141
665 if (!task) { 1142 if (!task) {
666 /* 1143 /*
667 * Per cpu events are removed via an smp call and 1144 * Per cpu events are removed via an smp call and
668 * the removal is always successful. 1145 * the removal is always successful.
669 */ 1146 */
670 smp_call_function_single(event->cpu, 1147 cpu_function_call(event->cpu, __perf_remove_from_context, event);
671 __perf_event_remove_from_context,
672 event, 1);
673 return; 1148 return;
674 } 1149 }
675 1150
676retry: 1151retry:
677 task_oncpu_function_call(task, __perf_event_remove_from_context, 1152 if (!task_function_call(task, __perf_remove_from_context, event))
678 event); 1153 return;
679 1154
680 raw_spin_lock_irq(&ctx->lock); 1155 raw_spin_lock_irq(&ctx->lock);
681 /* 1156 /*
682 * If the context is active we need to retry the smp call. 1157 * If we failed to find a running task, but find the context active now
1158 * that we've acquired the ctx->lock, retry.
683 */ 1159 */
684 if (ctx->nr_active && !list_empty(&event->group_entry)) { 1160 if (ctx->is_active) {
685 raw_spin_unlock_irq(&ctx->lock); 1161 raw_spin_unlock_irq(&ctx->lock);
686 goto retry; 1162 goto retry;
687 } 1163 }
688 1164
689 /* 1165 /*
690 * The lock prevents that this context is scheduled in so we 1166 * Since the task isn't running, its safe to remove the event, us
691 * can remove the event safely, if the call above did not 1167 * holding the ctx->lock ensures the task won't get scheduled in.
692 * succeed.
693 */ 1168 */
694 if (!list_empty(&event->group_entry)) 1169 list_del_event(event, ctx);
695 list_del_event(event, ctx);
696 raw_spin_unlock_irq(&ctx->lock); 1170 raw_spin_unlock_irq(&ctx->lock);
697} 1171}
698 1172
699/* 1173/*
700 * Cross CPU call to disable a performance event 1174 * Cross CPU call to disable a performance event
701 */ 1175 */
702static void __perf_event_disable(void *info) 1176static int __perf_event_disable(void *info)
703{ 1177{
704 struct perf_event *event = info; 1178 struct perf_event *event = info;
705 struct perf_event_context *ctx = event->ctx; 1179 struct perf_event_context *ctx = event->ctx;
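The perf_remove_from_context() hunk above encodes a small protocol: try to run __perf_remove_from_context() on the task's CPU via task_function_call(); if the task was not running, take ctx->lock and re-check ctx->is_active, because the task may have been scheduled in meanwhile; only when it is still inactive is it safe to delete the event directly under the lock. The control flow, reduced to stubs below; the helpers, the "context" struct and the termination hack are placeholders, not the perf API.

#include <stdbool.h>
#include <stdio.h>

struct ctx { bool is_active; };

/* Pretend cross-CPU call: returns true if the remote handler actually ran. */
static bool run_on_tasks_cpu(struct ctx *c)
{
    (void)c;
    return false;                 /* simulate "task was not running" */
}

static void lock(struct ctx *c)   { (void)c; }
static void unlock(struct ctx *c) { (void)c; }

static void remove_event(struct ctx *c)
{
    int tries = 0;

retry:
    if (run_on_tasks_cpu(c))      /* fast path: the remote handler did the removal */
        return;

    lock(c);
    if (c->is_active) {           /* task got scheduled in after all: retry */
        unlock(c);
        c->is_active = (++tries < 3);   /* contrived: let the demo terminate */
        goto retry;
    }
    /* The task is not running and cannot be scheduled in while we hold the
     * lock, so the list manipulation is safe right here. */
    printf("removed directly after %d retr%s\n", tries, tries == 1 ? "y" : "ies");
    unlock(c);
}

int main(void)
{
    struct ctx c = { .is_active = true };

    remove_event(&c);
    return 0;
}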
@@ -708,9 +1182,12 @@ static void __perf_event_disable(void *info)
708 /* 1182 /*
709 * If this is a per-task event, need to check whether this 1183 * If this is a per-task event, need to check whether this
710 * event's task is the current task on this cpu. 1184 * event's task is the current task on this cpu.
1185 *
1186 * Can trigger due to concurrent perf_event_context_sched_out()
1187 * flipping contexts around.
711 */ 1188 */
712 if (ctx->task && cpuctx->task_ctx != ctx) 1189 if (ctx->task && cpuctx->task_ctx != ctx)
713 return; 1190 return -EINVAL;
714 1191
715 raw_spin_lock(&ctx->lock); 1192 raw_spin_lock(&ctx->lock);
716 1193
@@ -720,6 +1197,7 @@ static void __perf_event_disable(void *info)
720 */ 1197 */
721 if (event->state >= PERF_EVENT_STATE_INACTIVE) { 1198 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
722 update_context_time(ctx); 1199 update_context_time(ctx);
1200 update_cgrp_time_from_event(event);
723 update_group_times(event); 1201 update_group_times(event);
724 if (event == event->group_leader) 1202 if (event == event->group_leader)
725 group_sched_out(event, cpuctx, ctx); 1203 group_sched_out(event, cpuctx, ctx);
@@ -729,6 +1207,8 @@ static void __perf_event_disable(void *info)
729 } 1207 }
730 1208
731 raw_spin_unlock(&ctx->lock); 1209 raw_spin_unlock(&ctx->lock);
1210
1211 return 0;
732} 1212}
733 1213
734/* 1214/*
@@ -753,13 +1233,13 @@ void perf_event_disable(struct perf_event *event)
753 /* 1233 /*
754 * Disable the event on the cpu that it's on 1234 * Disable the event on the cpu that it's on
755 */ 1235 */
756 smp_call_function_single(event->cpu, __perf_event_disable, 1236 cpu_function_call(event->cpu, __perf_event_disable, event);
757 event, 1);
758 return; 1237 return;
759 } 1238 }
760 1239
761retry: 1240retry:
762 task_oncpu_function_call(task, __perf_event_disable, event); 1241 if (!task_function_call(task, __perf_event_disable, event))
1242 return;
763 1243
764 raw_spin_lock_irq(&ctx->lock); 1244 raw_spin_lock_irq(&ctx->lock);
765 /* 1245 /*
@@ -767,6 +1247,11 @@ retry:
767 */ 1247 */
768 if (event->state == PERF_EVENT_STATE_ACTIVE) { 1248 if (event->state == PERF_EVENT_STATE_ACTIVE) {
769 raw_spin_unlock_irq(&ctx->lock); 1249 raw_spin_unlock_irq(&ctx->lock);
1250 /*
1251 * Reload the task pointer, it might have been changed by
1252 * a concurrent perf_event_context_sched_out().
1253 */
1254 task = ctx->task;
770 goto retry; 1255 goto retry;
771 } 1256 }
772 1257
@@ -778,10 +1263,48 @@ retry:
778 update_group_times(event); 1263 update_group_times(event);
779 event->state = PERF_EVENT_STATE_OFF; 1264 event->state = PERF_EVENT_STATE_OFF;
780 } 1265 }
781
782 raw_spin_unlock_irq(&ctx->lock); 1266 raw_spin_unlock_irq(&ctx->lock);
783} 1267}
784 1268
1269static void perf_set_shadow_time(struct perf_event *event,
1270 struct perf_event_context *ctx,
1271 u64 tstamp)
1272{
1273 /*
1274 * use the correct time source for the time snapshot
1275 *
1276 * We could get by without this by leveraging the
1277 * fact that to get to this function, the caller
1278 * has most likely already called update_context_time()
1279 * and update_cgrp_time_xx() and thus both timestamp
1280 * are identical (or very close). Given that tstamp is,
1281 * already adjusted for cgroup, we could say that:
1282 * tstamp - ctx->timestamp
1283 * is equivalent to
1284 * tstamp - cgrp->timestamp.
1285 *
1286 * Then, in perf_output_read(), the calculation would
1287 * work with no changes because:
1288 * - event is guaranteed scheduled in
1289 * - no scheduled out in between
1290 * - thus the timestamp would be the same
1291 *
1292 * But this is a bit hairy.
1293 *
1294 * So instead, we have an explicit cgroup call to remain
1295	 * within the time source all along. We believe it
1296 * is cleaner and simpler to understand.
1297 */
1298 if (is_cgroup_event(event))
1299 perf_cgroup_set_shadow_time(event, tstamp);
1300 else
1301 event->shadow_ctx_time = tstamp - ctx->timestamp;
1302}
1303
1304#define MAX_INTERRUPTS (~0ULL)
1305
1306static void perf_log_throttle(struct perf_event *event, int enable);
1307
785static int 1308static int
786event_sched_in(struct perf_event *event, 1309event_sched_in(struct perf_event *event,
787 struct perf_cpu_context *cpuctx, 1310 struct perf_cpu_context *cpuctx,
@@ -794,6 +1317,17 @@ event_sched_in(struct perf_event *event,
794 1317
795 event->state = PERF_EVENT_STATE_ACTIVE; 1318 event->state = PERF_EVENT_STATE_ACTIVE;
796 event->oncpu = smp_processor_id(); 1319 event->oncpu = smp_processor_id();
1320
1321 /*
1322 * Unthrottle events, since we scheduled we might have missed several
1323 * ticks already, also for a heavily scheduling task there is little
1324 * guarantee it'll get a tick in a timely manner.
1325 */
1326 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1327 perf_log_throttle(event, 1);
1328 event->hw.interrupts = 0;
1329 }
1330
797 /* 1331 /*
798 * The new state must be visible before we turn it on in the hardware: 1332 * The new state must be visible before we turn it on in the hardware:
799 */ 1333 */
@@ -807,7 +1341,7 @@ event_sched_in(struct perf_event *event,
807 1341
808 event->tstamp_running += tstamp - event->tstamp_stopped; 1342 event->tstamp_running += tstamp - event->tstamp_stopped;
809 1343
810 event->shadow_ctx_time = tstamp - ctx->timestamp; 1344 perf_set_shadow_time(event, ctx, tstamp);
811 1345
812 if (!is_software_event(event)) 1346 if (!is_software_event(event))
813 cpuctx->active_oncpu++; 1347 cpuctx->active_oncpu++;
@@ -928,12 +1462,15 @@ static void add_event_to_ctx(struct perf_event *event,
928 event->tstamp_stopped = tstamp; 1462 event->tstamp_stopped = tstamp;
929} 1463}
930 1464
1465static void perf_event_context_sched_in(struct perf_event_context *ctx,
1466 struct task_struct *tsk);
1467
931/* 1468/*
932 * Cross CPU call to install and enable a performance event 1469 * Cross CPU call to install and enable a performance event
933 * 1470 *
934 * Must be called with ctx->mutex held 1471 * Must be called with ctx->mutex held
935 */ 1472 */
936static void __perf_install_in_context(void *info) 1473static int __perf_install_in_context(void *info)
937{ 1474{
938 struct perf_event *event = info; 1475 struct perf_event *event = info;
939 struct perf_event_context *ctx = event->ctx; 1476 struct perf_event_context *ctx = event->ctx;
@@ -942,21 +1479,22 @@ static void __perf_install_in_context(void *info)
942 int err; 1479 int err;
943 1480
944 /* 1481 /*
945 * If this is a task context, we need to check whether it is 1482 * In case we're installing a new context to an already running task,
946 * the current task context of this cpu. If not it has been 1483 * could also happen before perf_event_task_sched_in() on architectures
947 * scheduled out before the smp call arrived. 1484 * which do context switches with IRQs enabled.
948 * Or possibly this is the right context but it isn't
949 * on this cpu because it had no events.
950 */ 1485 */
951 if (ctx->task && cpuctx->task_ctx != ctx) { 1486 if (ctx->task && !cpuctx->task_ctx)
952 if (cpuctx->task_ctx || ctx->task != current) 1487 perf_event_context_sched_in(ctx, ctx->task);
953 return;
954 cpuctx->task_ctx = ctx;
955 }
956 1488
957 raw_spin_lock(&ctx->lock); 1489 raw_spin_lock(&ctx->lock);
958 ctx->is_active = 1; 1490 ctx->is_active = 1;
959 update_context_time(ctx); 1491 update_context_time(ctx);
1492 /*
1493 * update cgrp time only if current cgrp
1494 * matches event->cgrp. Must be done before
1495 * calling add_event_to_ctx()
1496 */
1497 update_cgrp_time_from_event(event);
960 1498
961 add_event_to_ctx(event, ctx); 1499 add_event_to_ctx(event, ctx);
962 1500
@@ -997,6 +1535,8 @@ static void __perf_install_in_context(void *info)
997 1535
998unlock: 1536unlock:
999 raw_spin_unlock(&ctx->lock); 1537 raw_spin_unlock(&ctx->lock);
1538
1539 return 0;
1000} 1540}
1001 1541
1002/* 1542/*
@@ -1008,8 +1548,6 @@ unlock:
1008 * If the event is attached to a task which is on a CPU we use a smp 1548 * If the event is attached to a task which is on a CPU we use a smp
1009 * call to enable it in the task context. The task might have been 1549 * call to enable it in the task context. The task might have been
1010 * scheduled away, but we check this in the smp call again. 1550 * scheduled away, but we check this in the smp call again.
1011 *
1012 * Must be called with ctx->mutex held.
1013 */ 1551 */
1014static void 1552static void
1015perf_install_in_context(struct perf_event_context *ctx, 1553perf_install_in_context(struct perf_event_context *ctx,
@@ -1018,6 +1556,8 @@ perf_install_in_context(struct perf_event_context *ctx,
1018{ 1556{
1019 struct task_struct *task = ctx->task; 1557 struct task_struct *task = ctx->task;
1020 1558
1559 lockdep_assert_held(&ctx->mutex);
1560
1021 event->ctx = ctx; 1561 event->ctx = ctx;
1022 1562
1023 if (!task) { 1563 if (!task) {
@@ -1025,31 +1565,29 @@ perf_install_in_context(struct perf_event_context *ctx,
1025 * Per cpu events are installed via an smp call and 1565 * Per cpu events are installed via an smp call and
1026 * the install is always successful. 1566 * the install is always successful.
1027 */ 1567 */
1028 smp_call_function_single(cpu, __perf_install_in_context, 1568 cpu_function_call(cpu, __perf_install_in_context, event);
1029 event, 1);
1030 return; 1569 return;
1031 } 1570 }
1032 1571
1033retry: 1572retry:
1034 task_oncpu_function_call(task, __perf_install_in_context, 1573 if (!task_function_call(task, __perf_install_in_context, event))
1035 event); 1574 return;
1036 1575
1037 raw_spin_lock_irq(&ctx->lock); 1576 raw_spin_lock_irq(&ctx->lock);
1038 /* 1577 /*
1039 * we need to retry the smp call. 1578 * If we failed to find a running task, but find the context active now
1579 * that we've acquired the ctx->lock, retry.
1040 */ 1580 */
1041 if (ctx->is_active && list_empty(&event->group_entry)) { 1581 if (ctx->is_active) {
1042 raw_spin_unlock_irq(&ctx->lock); 1582 raw_spin_unlock_irq(&ctx->lock);
1043 goto retry; 1583 goto retry;
1044 } 1584 }
1045 1585
1046 /* 1586 /*
1047 * The lock prevents that this context is scheduled in so we 1587 * Since the task isn't running, it's safe to add the event; holding
1048 * can add the event safely, if it the call above did not 1588 * the ctx->lock ensures the task won't get scheduled in.
1049 * succeed.
1050 */ 1589 */
1051 if (list_empty(&event->group_entry)) 1590 add_event_to_ctx(event, ctx);
1052 add_event_to_ctx(event, ctx);
1053 raw_spin_unlock_irq(&ctx->lock); 1591 raw_spin_unlock_irq(&ctx->lock);
1054} 1592}
1055 1593
@@ -1078,7 +1616,7 @@ static void __perf_event_mark_enabled(struct perf_event *event,
1078/* 1616/*
1079 * Cross CPU call to enable a performance event 1617 * Cross CPU call to enable a performance event
1080 */ 1618 */
1081static void __perf_event_enable(void *info) 1619static int __perf_event_enable(void *info)
1082{ 1620{
1083 struct perf_event *event = info; 1621 struct perf_event *event = info;
1084 struct perf_event_context *ctx = event->ctx; 1622 struct perf_event_context *ctx = event->ctx;
@@ -1086,26 +1624,27 @@ static void __perf_event_enable(void *info)
1086 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1624 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1087 int err; 1625 int err;
1088 1626
1089 /* 1627 if (WARN_ON_ONCE(!ctx->is_active))
1090 * If this is a per-task event, need to check whether this 1628 return -EINVAL;
1091 * event's task is the current task on this cpu.
1092 */
1093 if (ctx->task && cpuctx->task_ctx != ctx) {
1094 if (cpuctx->task_ctx || ctx->task != current)
1095 return;
1096 cpuctx->task_ctx = ctx;
1097 }
1098 1629
1099 raw_spin_lock(&ctx->lock); 1630 raw_spin_lock(&ctx->lock);
1100 ctx->is_active = 1;
1101 update_context_time(ctx); 1631 update_context_time(ctx);
1102 1632
1103 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1633 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1104 goto unlock; 1634 goto unlock;
1635
1636 /*
1637 * set current task's cgroup time reference point
1638 */
1639 perf_cgroup_set_timestamp(current, ctx);
1640
1105 __perf_event_mark_enabled(event, ctx); 1641 __perf_event_mark_enabled(event, ctx);
1106 1642
1107 if (!event_filter_match(event)) 1643 if (!event_filter_match(event)) {
1644 if (is_cgroup_event(event))
1645 perf_cgroup_defer_enabled(event);
1108 goto unlock; 1646 goto unlock;
1647 }
1109 1648
1110 /* 1649 /*
1111 * If the event is in a group and isn't the group leader, 1650 * If the event is in a group and isn't the group leader,
@@ -1138,6 +1677,8 @@ static void __perf_event_enable(void *info)
1138 1677
1139unlock: 1678unlock:
1140 raw_spin_unlock(&ctx->lock); 1679 raw_spin_unlock(&ctx->lock);
1680
1681 return 0;
1141} 1682}
1142 1683
1143/* 1684/*
@@ -1158,8 +1699,7 @@ void perf_event_enable(struct perf_event *event)
1158 /* 1699 /*
1159 * Enable the event on the cpu that it's on 1700 * Enable the event on the cpu that it's on
1160 */ 1701 */
1161 smp_call_function_single(event->cpu, __perf_event_enable, 1702 cpu_function_call(event->cpu, __perf_event_enable, event);
1162 event, 1);
1163 return; 1703 return;
1164 } 1704 }
1165 1705
@@ -1178,8 +1718,15 @@ void perf_event_enable(struct perf_event *event)
1178 event->state = PERF_EVENT_STATE_OFF; 1718 event->state = PERF_EVENT_STATE_OFF;
1179 1719
1180retry: 1720retry:
1721 if (!ctx->is_active) {
1722 __perf_event_mark_enabled(event, ctx);
1723 goto out;
1724 }
1725
1181 raw_spin_unlock_irq(&ctx->lock); 1726 raw_spin_unlock_irq(&ctx->lock);
1182 task_oncpu_function_call(task, __perf_event_enable, event); 1727
1728 if (!task_function_call(task, __perf_event_enable, event))
1729 return;
1183 1730
1184 raw_spin_lock_irq(&ctx->lock); 1731 raw_spin_lock_irq(&ctx->lock);
1185 1732
@@ -1187,15 +1734,14 @@ retry:
1187 * If the context is active and the event is still off, 1734 * If the context is active and the event is still off,
1188 * we need to retry the cross-call. 1735 * we need to retry the cross-call.
1189 */ 1736 */
1190 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) 1737 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1738 /*
1739 * task could have been flipped by a concurrent
1740 * perf_event_context_sched_out()
1741 */
1742 task = ctx->task;
1191 goto retry; 1743 goto retry;
1192 1744 }
1193 /*
1194 * Since we have the lock this context can't be scheduled
1195 * in, so we can change the state safely.
1196 */
1197 if (event->state == PERF_EVENT_STATE_OFF)
1198 __perf_event_mark_enabled(event, ctx);
1199 1745
1200out: 1746out:
1201 raw_spin_unlock_irq(&ctx->lock); 1747 raw_spin_unlock_irq(&ctx->lock);
@@ -1227,6 +1773,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1227 if (likely(!ctx->nr_events)) 1773 if (likely(!ctx->nr_events))
1228 goto out; 1774 goto out;
1229 update_context_time(ctx); 1775 update_context_time(ctx);
1776 update_cgrp_time_from_cpuctx(cpuctx);
1230 1777
1231 if (!ctx->nr_active) 1778 if (!ctx->nr_active)
1232 goto out; 1779 goto out;
@@ -1339,8 +1886,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1339 } 1886 }
1340} 1887}
1341 1888
1342void perf_event_context_sched_out(struct task_struct *task, int ctxn, 1889static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1343 struct task_struct *next) 1890 struct task_struct *next)
1344{ 1891{
1345 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 1892 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1346 struct perf_event_context *next_ctx; 1893 struct perf_event_context *next_ctx;
@@ -1416,6 +1963,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
1416 1963
1417 for_each_task_context_nr(ctxn) 1964 for_each_task_context_nr(ctxn)
1418 perf_event_context_sched_out(task, ctxn, next); 1965 perf_event_context_sched_out(task, ctxn, next);
1966
1967 /*
1968 * if cgroup events exist on this CPU, then we need
1969 * to check if we have to switch out PMU state.
            1970 * cgroup events are system-wide mode only
1971 */
1972 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1973 perf_cgroup_sched_out(task);
1419} 1974}
1420 1975
1421static void task_ctx_sched_out(struct perf_event_context *ctx, 1976static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1454,6 +2009,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1454 if (!event_filter_match(event)) 2009 if (!event_filter_match(event))
1455 continue; 2010 continue;
1456 2011
2012 /* may need to reset tstamp_enabled */
2013 if (is_cgroup_event(event))
2014 perf_cgroup_mark_enabled(event, ctx);
2015
1457 if (group_can_go_on(event, cpuctx, 1)) 2016 if (group_can_go_on(event, cpuctx, 1))
1458 group_sched_in(event, cpuctx, ctx); 2017 group_sched_in(event, cpuctx, ctx);
1459 2018
@@ -1486,6 +2045,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1486 if (!event_filter_match(event)) 2045 if (!event_filter_match(event))
1487 continue; 2046 continue;
1488 2047
2048 /* may need to reset tstamp_enabled */
2049 if (is_cgroup_event(event))
2050 perf_cgroup_mark_enabled(event, ctx);
2051
1489 if (group_can_go_on(event, cpuctx, can_add_hw)) { 2052 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1490 if (group_sched_in(event, cpuctx, ctx)) 2053 if (group_sched_in(event, cpuctx, ctx))
1491 can_add_hw = 0; 2054 can_add_hw = 0;
@@ -1496,15 +2059,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1496static void 2059static void
1497ctx_sched_in(struct perf_event_context *ctx, 2060ctx_sched_in(struct perf_event_context *ctx,
1498 struct perf_cpu_context *cpuctx, 2061 struct perf_cpu_context *cpuctx,
1499 enum event_type_t event_type) 2062 enum event_type_t event_type,
2063 struct task_struct *task)
1500{ 2064{
2065 u64 now;
2066
1501 raw_spin_lock(&ctx->lock); 2067 raw_spin_lock(&ctx->lock);
1502 ctx->is_active = 1; 2068 ctx->is_active = 1;
1503 if (likely(!ctx->nr_events)) 2069 if (likely(!ctx->nr_events))
1504 goto out; 2070 goto out;
1505 2071
1506 ctx->timestamp = perf_clock(); 2072 now = perf_clock();
1507 2073 ctx->timestamp = now;
2074 perf_cgroup_set_timestamp(task, ctx);
1508 /* 2075 /*
1509 * First go through the list and put on any pinned groups 2076 * First go through the list and put on any pinned groups
1510 * in order to give them the best chance of going on. 2077 * in order to give them the best chance of going on.
@@ -1521,11 +2088,12 @@ out:
1521} 2088}
1522 2089
1523static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2090static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1524 enum event_type_t event_type) 2091 enum event_type_t event_type,
2092 struct task_struct *task)
1525{ 2093{
1526 struct perf_event_context *ctx = &cpuctx->ctx; 2094 struct perf_event_context *ctx = &cpuctx->ctx;
1527 2095
1528 ctx_sched_in(ctx, cpuctx, event_type); 2096 ctx_sched_in(ctx, cpuctx, event_type, task);
1529} 2097}
1530 2098
1531static void task_ctx_sched_in(struct perf_event_context *ctx, 2099static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1533,15 +2101,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
1533{ 2101{
1534 struct perf_cpu_context *cpuctx; 2102 struct perf_cpu_context *cpuctx;
1535 2103
1536 cpuctx = __get_cpu_context(ctx); 2104 cpuctx = __get_cpu_context(ctx);
1537 if (cpuctx->task_ctx == ctx) 2105 if (cpuctx->task_ctx == ctx)
1538 return; 2106 return;
1539 2107
1540 ctx_sched_in(ctx, cpuctx, event_type); 2108 ctx_sched_in(ctx, cpuctx, event_type, NULL);
1541 cpuctx->task_ctx = ctx; 2109 cpuctx->task_ctx = ctx;
1542} 2110}
1543 2111
1544void perf_event_context_sched_in(struct perf_event_context *ctx) 2112static void perf_event_context_sched_in(struct perf_event_context *ctx,
2113 struct task_struct *task)
1545{ 2114{
1546 struct perf_cpu_context *cpuctx; 2115 struct perf_cpu_context *cpuctx;
1547 2116
@@ -1557,9 +2126,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
1557 */ 2126 */
1558 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2127 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1559 2128
1560 ctx_sched_in(ctx, cpuctx, EVENT_PINNED); 2129 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1561 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2130 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1562 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 2131 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1563 2132
1564 cpuctx->task_ctx = ctx; 2133 cpuctx->task_ctx = ctx;
1565 2134
@@ -1592,14 +2161,17 @@ void __perf_event_task_sched_in(struct task_struct *task)
1592 if (likely(!ctx)) 2161 if (likely(!ctx))
1593 continue; 2162 continue;
1594 2163
1595 perf_event_context_sched_in(ctx); 2164 perf_event_context_sched_in(ctx, task);
1596 } 2165 }
2166 /*
2167 * if cgroup events exist on this CPU, then we need
2168 * to check if we have to switch in PMU state.
2169 * cgroup event are system-wide mode only
            2170 * cgroup events are system-wide mode only
2171 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2172 perf_cgroup_sched_in(task);
1597} 2173}
1598 2174
1599#define MAX_INTERRUPTS (~0ULL)
1600
1601static void perf_log_throttle(struct perf_event *event, int enable);
1602
1603static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2175static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1604{ 2176{
1605 u64 frequency = event->attr.sample_freq; 2177 u64 frequency = event->attr.sample_freq;
@@ -1627,7 +2199,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1627 * Reduce accuracy by one bit such that @a and @b converge 2199 * Reduce accuracy by one bit such that @a and @b converge
1628 * to a similar magnitude. 2200 * to a similar magnitude.
1629 */ 2201 */
1630#define REDUCE_FLS(a, b) \ 2202#define REDUCE_FLS(a, b) \
1631do { \ 2203do { \
1632 if (a##_fls > b##_fls) { \ 2204 if (a##_fls > b##_fls) { \
1633 a >>= 1; \ 2205 a >>= 1; \
@@ -1797,7 +2369,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1797 if (ctx) 2369 if (ctx)
1798 rotate_ctx(ctx); 2370 rotate_ctx(ctx);
1799 2371
1800 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2372 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
1801 if (ctx) 2373 if (ctx)
1802 task_ctx_sched_in(ctx, EVENT_FLEXIBLE); 2374 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1803 2375
@@ -1852,6 +2424,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1852 if (!ctx || !ctx->nr_events) 2424 if (!ctx || !ctx->nr_events)
1853 goto out; 2425 goto out;
1854 2426
2427 /*
2428 * We must ctxsw out cgroup events to avoid conflict
2429 * when invoking perf_task_event_sched_in() later on
2430 * in this function. Otherwise we end up trying to
2431 * ctxswin cgroup events which are already scheduled
2432 * in.
2433 */
2434 perf_cgroup_sched_out(current);
1855 task_ctx_sched_out(ctx, EVENT_ALL); 2435 task_ctx_sched_out(ctx, EVENT_ALL);
1856 2436
1857 raw_spin_lock(&ctx->lock); 2437 raw_spin_lock(&ctx->lock);
@@ -1876,7 +2456,10 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1876 2456
1877 raw_spin_unlock(&ctx->lock); 2457 raw_spin_unlock(&ctx->lock);
1878 2458
1879 perf_event_context_sched_in(ctx); 2459 /*
2460 * Also calls ctxswin for cgroup events, if any:
2461 */
2462 perf_event_context_sched_in(ctx, ctx->task);
1880out: 2463out:
1881 local_irq_restore(flags); 2464 local_irq_restore(flags);
1882} 2465}
@@ -1901,8 +2484,10 @@ static void __perf_event_read(void *info)
1901 return; 2484 return;
1902 2485
1903 raw_spin_lock(&ctx->lock); 2486 raw_spin_lock(&ctx->lock);
1904 if (ctx->is_active) 2487 if (ctx->is_active) {
1905 update_context_time(ctx); 2488 update_context_time(ctx);
2489 update_cgrp_time_from_event(event);
2490 }
1906 update_event_times(event); 2491 update_event_times(event);
1907 if (event->state == PERF_EVENT_STATE_ACTIVE) 2492 if (event->state == PERF_EVENT_STATE_ACTIVE)
1908 event->pmu->read(event); 2493 event->pmu->read(event);
@@ -1933,8 +2518,10 @@ static u64 perf_event_read(struct perf_event *event)
1933 * (e.g., thread is blocked), in that case 2518 * (e.g., thread is blocked), in that case
1934 * we cannot update context time 2519 * we cannot update context time
1935 */ 2520 */
1936 if (ctx->is_active) 2521 if (ctx->is_active) {
1937 update_context_time(ctx); 2522 update_context_time(ctx);
2523 update_cgrp_time_from_event(event);
2524 }
1938 update_event_times(event); 2525 update_event_times(event);
1939 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2526 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1940 } 2527 }
@@ -2213,6 +2800,9 @@ errout:
2213 2800
2214} 2801}
2215 2802
2803/*
2804 * Returns a matching context with refcount and pincount.
2805 */
2216static struct perf_event_context * 2806static struct perf_event_context *
2217find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) 2807find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2218{ 2808{
@@ -2237,6 +2827,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2237 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 2827 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2238 ctx = &cpuctx->ctx; 2828 ctx = &cpuctx->ctx;
2239 get_ctx(ctx); 2829 get_ctx(ctx);
2830 ++ctx->pin_count;
2240 2831
2241 return ctx; 2832 return ctx;
2242 } 2833 }
@@ -2250,6 +2841,7 @@ retry:
2250 ctx = perf_lock_task_context(task, ctxn, &flags); 2841 ctx = perf_lock_task_context(task, ctxn, &flags);
2251 if (ctx) { 2842 if (ctx) {
2252 unclone_ctx(ctx); 2843 unclone_ctx(ctx);
2844 ++ctx->pin_count;
2253 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2845 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2254 } 2846 }
2255 2847
@@ -2271,8 +2863,10 @@ retry:
2271 err = -ESRCH; 2863 err = -ESRCH;
2272 else if (task->perf_event_ctxp[ctxn]) 2864 else if (task->perf_event_ctxp[ctxn])
2273 err = -EAGAIN; 2865 err = -EAGAIN;
2274 else 2866 else {
2867 ++ctx->pin_count;
2275 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 2868 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2869 }
2276 mutex_unlock(&task->perf_event_mutex); 2870 mutex_unlock(&task->perf_event_mutex);
2277 2871
2278 if (unlikely(err)) { 2872 if (unlikely(err)) {
@@ -2312,7 +2906,7 @@ static void free_event(struct perf_event *event)
2312 2906
2313 if (!event->parent) { 2907 if (!event->parent) {
2314 if (event->attach_state & PERF_ATTACH_TASK) 2908 if (event->attach_state & PERF_ATTACH_TASK)
2315 jump_label_dec(&perf_task_events); 2909 jump_label_dec(&perf_sched_events);
2316 if (event->attr.mmap || event->attr.mmap_data) 2910 if (event->attr.mmap || event->attr.mmap_data)
2317 atomic_dec(&nr_mmap_events); 2911 atomic_dec(&nr_mmap_events);
2318 if (event->attr.comm) 2912 if (event->attr.comm)
@@ -2321,6 +2915,10 @@ static void free_event(struct perf_event *event)
2321 atomic_dec(&nr_task_events); 2915 atomic_dec(&nr_task_events);
2322 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 2916 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2323 put_callchain_buffers(); 2917 put_callchain_buffers();
2918 if (is_cgroup_event(event)) {
2919 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2920 jump_label_dec(&perf_sched_events);
2921 }
2324 } 2922 }
2325 2923
2326 if (event->buffer) { 2924 if (event->buffer) {
@@ -2328,6 +2926,9 @@ static void free_event(struct perf_event *event)
2328 event->buffer = NULL; 2926 event->buffer = NULL;
2329 } 2927 }
2330 2928
2929 if (is_cgroup_event(event))
2930 perf_detach_cgroup(event);
2931
2331 if (event->destroy) 2932 if (event->destroy)
2332 event->destroy(event); 2933 event->destroy(event);
2333 2934
@@ -4395,26 +4996,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4395 if (unlikely(!is_sampling_event(event))) 4996 if (unlikely(!is_sampling_event(event)))
4396 return 0; 4997 return 0;
4397 4998
4398 if (!throttle) { 4999 if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
4399 hwc->interrupts++; 5000 if (throttle) {
4400 } else { 5001 hwc->interrupts = MAX_INTERRUPTS;
4401 if (hwc->interrupts != MAX_INTERRUPTS) { 5002 perf_log_throttle(event, 0);
4402 hwc->interrupts++;
4403 if (HZ * hwc->interrupts >
4404 (u64)sysctl_perf_event_sample_rate) {
4405 hwc->interrupts = MAX_INTERRUPTS;
4406 perf_log_throttle(event, 0);
4407 ret = 1;
4408 }
4409 } else {
4410 /*
4411 * Keep re-disabling events even though on the previous
4412 * pass we disabled it - just in case we raced with a
4413 * sched-in and the event got enabled again:
4414 */
4415 ret = 1; 5003 ret = 1;
4416 } 5004 }
4417 } 5005 } else
5006 hwc->interrupts++;
4418 5007
4419 if (event->attr.freq) { 5008 if (event->attr.freq) {
4420 u64 now = perf_clock(); 5009 u64 now = perf_clock();
@@ -4556,7 +5145,7 @@ static int perf_exclude_event(struct perf_event *event,
4556 struct pt_regs *regs) 5145 struct pt_regs *regs)
4557{ 5146{
4558 if (event->hw.state & PERF_HES_STOPPED) 5147 if (event->hw.state & PERF_HES_STOPPED)
4559 return 0; 5148 return 1;
4560 5149
4561 if (regs) { 5150 if (regs) {
4562 if (event->attr.exclude_user && user_mode(regs)) 5151 if (event->attr.exclude_user && user_mode(regs))
@@ -4912,6 +5501,8 @@ static int perf_tp_event_match(struct perf_event *event,
4912 struct perf_sample_data *data, 5501 struct perf_sample_data *data,
4913 struct pt_regs *regs) 5502 struct pt_regs *regs)
4914{ 5503{
5504 if (event->hw.state & PERF_HES_STOPPED)
5505 return 0;
4915 /* 5506 /*
4916 * All tracepoints are from kernel-space. 5507 * All tracepoints are from kernel-space.
4917 */ 5508 */
@@ -5051,6 +5642,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5051 u64 period; 5642 u64 period;
5052 5643
5053 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 5644 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
5645
5646 if (event->state != PERF_EVENT_STATE_ACTIVE)
5647 return HRTIMER_NORESTART;
5648
5054 event->pmu->read(event); 5649 event->pmu->read(event);
5055 5650
5056 perf_sample_data_init(&data, 0); 5651 perf_sample_data_init(&data, 0);
@@ -5077,9 +5672,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
5077 if (!is_sampling_event(event)) 5672 if (!is_sampling_event(event))
5078 return; 5673 return;
5079 5674
5080 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5081 hwc->hrtimer.function = perf_swevent_hrtimer;
5082
5083 period = local64_read(&hwc->period_left); 5675 period = local64_read(&hwc->period_left);
5084 if (period) { 5676 if (period) {
5085 if (period < 0) 5677 if (period < 0)
@@ -5106,6 +5698,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5106 } 5698 }
5107} 5699}
5108 5700
5701static void perf_swevent_init_hrtimer(struct perf_event *event)
5702{
5703 struct hw_perf_event *hwc = &event->hw;
5704
5705 if (!is_sampling_event(event))
5706 return;
5707
5708 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5709 hwc->hrtimer.function = perf_swevent_hrtimer;
5710
5711 /*
5712 * Since hrtimers have a fixed rate, we can do a static freq->period
5713 * mapping and avoid the whole period adjust feedback stuff.
5714 */
5715 if (event->attr.freq) {
5716 long freq = event->attr.sample_freq;
5717
5718 event->attr.sample_period = NSEC_PER_SEC / freq;
5719 hwc->sample_period = event->attr.sample_period;
5720 local64_set(&hwc->period_left, hwc->sample_period);
5721 event->attr.freq = 0;
5722 }
5723}
5724
5109/* 5725/*
5110 * Software event: cpu wall time clock 5726 * Software event: cpu wall time clock
5111 */ 5727 */
@@ -5158,6 +5774,8 @@ static int cpu_clock_event_init(struct perf_event *event)
5158 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 5774 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5159 return -ENOENT; 5775 return -ENOENT;
5160 5776
5777 perf_swevent_init_hrtimer(event);
5778
5161 return 0; 5779 return 0;
5162} 5780}
5163 5781
@@ -5213,16 +5831,9 @@ static void task_clock_event_del(struct perf_event *event, int flags)
5213 5831
5214static void task_clock_event_read(struct perf_event *event) 5832static void task_clock_event_read(struct perf_event *event)
5215{ 5833{
5216 u64 time; 5834 u64 now = perf_clock();
5217 5835 u64 delta = now - event->ctx->timestamp;
5218 if (!in_nmi()) { 5836 u64 time = event->ctx->time + delta;
5219 update_context_time(event->ctx);
5220 time = event->ctx->time;
5221 } else {
5222 u64 now = perf_clock();
5223 u64 delta = now - event->ctx->timestamp;
5224 time = event->ctx->time + delta;
5225 }
5226 5837
5227 task_clock_event_update(event, time); 5838 task_clock_event_update(event, time);
5228} 5839}
@@ -5235,6 +5846,8 @@ static int task_clock_event_init(struct perf_event *event)
5235 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 5846 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5236 return -ENOENT; 5847 return -ENOENT;
5237 5848
5849 perf_swevent_init_hrtimer(event);
5850
5238 return 0; 5851 return 0;
5239} 5852}
5240 5853
@@ -5506,17 +6119,22 @@ struct pmu *perf_init_event(struct perf_event *event)
5506{ 6119{
5507 struct pmu *pmu = NULL; 6120 struct pmu *pmu = NULL;
5508 int idx; 6121 int idx;
6122 int ret;
5509 6123
5510 idx = srcu_read_lock(&pmus_srcu); 6124 idx = srcu_read_lock(&pmus_srcu);
5511 6125
5512 rcu_read_lock(); 6126 rcu_read_lock();
5513 pmu = idr_find(&pmu_idr, event->attr.type); 6127 pmu = idr_find(&pmu_idr, event->attr.type);
5514 rcu_read_unlock(); 6128 rcu_read_unlock();
5515 if (pmu) 6129 if (pmu) {
6130 ret = pmu->event_init(event);
6131 if (ret)
6132 pmu = ERR_PTR(ret);
5516 goto unlock; 6133 goto unlock;
6134 }
5517 6135
5518 list_for_each_entry_rcu(pmu, &pmus, entry) { 6136 list_for_each_entry_rcu(pmu, &pmus, entry) {
5519 int ret = pmu->event_init(event); 6137 ret = pmu->event_init(event);
5520 if (!ret) 6138 if (!ret)
5521 goto unlock; 6139 goto unlock;
5522 6140
@@ -5642,7 +6260,7 @@ done:
5642 6260
5643 if (!event->parent) { 6261 if (!event->parent) {
5644 if (event->attach_state & PERF_ATTACH_TASK) 6262 if (event->attach_state & PERF_ATTACH_TASK)
5645 jump_label_inc(&perf_task_events); 6263 jump_label_inc(&perf_sched_events);
5646 if (event->attr.mmap || event->attr.mmap_data) 6264 if (event->attr.mmap || event->attr.mmap_data)
5647 atomic_inc(&nr_mmap_events); 6265 atomic_inc(&nr_mmap_events);
5648 if (event->attr.comm) 6266 if (event->attr.comm)
@@ -5817,7 +6435,7 @@ SYSCALL_DEFINE5(perf_event_open,
5817 int err; 6435 int err;
5818 6436
5819 /* for future expandability... */ 6437 /* for future expandability... */
5820 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) 6438 if (flags & ~PERF_FLAG_ALL)
5821 return -EINVAL; 6439 return -EINVAL;
5822 6440
5823 err = perf_copy_attr(attr_uptr, &attr); 6441 err = perf_copy_attr(attr_uptr, &attr);
@@ -5834,6 +6452,15 @@ SYSCALL_DEFINE5(perf_event_open,
5834 return -EINVAL; 6452 return -EINVAL;
5835 } 6453 }
5836 6454
6455 /*
6456 * In cgroup mode, the pid argument is used to pass the fd
6457 * opened to the cgroup directory in cgroupfs. The cpu argument
6458 * designates the cpu on which to monitor threads from that
6459 * cgroup.
6460 */
6461 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6462 return -EINVAL;
6463
5837 event_fd = get_unused_fd_flags(O_RDWR); 6464 event_fd = get_unused_fd_flags(O_RDWR);
5838 if (event_fd < 0) 6465 if (event_fd < 0)
5839 return event_fd; 6466 return event_fd;
@@ -5851,7 +6478,7 @@ SYSCALL_DEFINE5(perf_event_open,
5851 group_leader = NULL; 6478 group_leader = NULL;
5852 } 6479 }
5853 6480
5854 if (pid != -1) { 6481 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
5855 task = find_lively_task_by_vpid(pid); 6482 task = find_lively_task_by_vpid(pid);
5856 if (IS_ERR(task)) { 6483 if (IS_ERR(task)) {
5857 err = PTR_ERR(task); 6484 err = PTR_ERR(task);
@@ -5865,6 +6492,19 @@ SYSCALL_DEFINE5(perf_event_open,
5865 goto err_task; 6492 goto err_task;
5866 } 6493 }
5867 6494
6495 if (flags & PERF_FLAG_PID_CGROUP) {
6496 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6497 if (err)
6498 goto err_alloc;
6499 /*
6500 * one more event:
6501 * - that has cgroup constraint on event->cpu
6502 * - that may need work on context switch
6503 */
6504 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6505 jump_label_inc(&perf_sched_events);
6506 }
6507
5868 /* 6508 /*
5869 * Special case software events and allow them to be part of 6509 * Special case software events and allow them to be part of
5870 * any hardware group. 6510 * any hardware group.
@@ -5903,6 +6543,11 @@ SYSCALL_DEFINE5(perf_event_open,
5903 goto err_alloc; 6543 goto err_alloc;
5904 } 6544 }
5905 6545
6546 if (task) {
6547 put_task_struct(task);
6548 task = NULL;
6549 }
6550
5906 /* 6551 /*
5907 * Look up the group leader (we will attach this event to it): 6552 * Look up the group leader (we will attach this event to it):
5908 */ 6553 */
@@ -5950,10 +6595,10 @@ SYSCALL_DEFINE5(perf_event_open,
5950 struct perf_event_context *gctx = group_leader->ctx; 6595 struct perf_event_context *gctx = group_leader->ctx;
5951 6596
5952 mutex_lock(&gctx->mutex); 6597 mutex_lock(&gctx->mutex);
5953 perf_event_remove_from_context(group_leader); 6598 perf_remove_from_context(group_leader);
5954 list_for_each_entry(sibling, &group_leader->sibling_list, 6599 list_for_each_entry(sibling, &group_leader->sibling_list,
5955 group_entry) { 6600 group_entry) {
5956 perf_event_remove_from_context(sibling); 6601 perf_remove_from_context(sibling);
5957 put_ctx(gctx); 6602 put_ctx(gctx);
5958 } 6603 }
5959 mutex_unlock(&gctx->mutex); 6604 mutex_unlock(&gctx->mutex);
@@ -5976,6 +6621,7 @@ SYSCALL_DEFINE5(perf_event_open,
5976 6621
5977 perf_install_in_context(ctx, event, cpu); 6622 perf_install_in_context(ctx, event, cpu);
5978 ++ctx->generation; 6623 ++ctx->generation;
6624 perf_unpin_context(ctx);
5979 mutex_unlock(&ctx->mutex); 6625 mutex_unlock(&ctx->mutex);
5980 6626
5981 event->owner = current; 6627 event->owner = current;
@@ -6001,6 +6647,7 @@ SYSCALL_DEFINE5(perf_event_open,
6001 return event_fd; 6647 return event_fd;
6002 6648
6003err_context: 6649err_context:
6650 perf_unpin_context(ctx);
6004 put_ctx(ctx); 6651 put_ctx(ctx);
6005err_alloc: 6652err_alloc:
6006 free_event(event); 6653 free_event(event);
@@ -6051,6 +6698,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6051 mutex_lock(&ctx->mutex); 6698 mutex_lock(&ctx->mutex);
6052 perf_install_in_context(ctx, event, cpu); 6699 perf_install_in_context(ctx, event, cpu);
6053 ++ctx->generation; 6700 ++ctx->generation;
6701 perf_unpin_context(ctx);
6054 mutex_unlock(&ctx->mutex); 6702 mutex_unlock(&ctx->mutex);
6055 6703
6056 return event; 6704 return event;
@@ -6102,17 +6750,20 @@ __perf_event_exit_task(struct perf_event *child_event,
6102 struct perf_event_context *child_ctx, 6750 struct perf_event_context *child_ctx,
6103 struct task_struct *child) 6751 struct task_struct *child)
6104{ 6752{
6105 struct perf_event *parent_event; 6753 if (child_event->parent) {
6754 raw_spin_lock_irq(&child_ctx->lock);
6755 perf_group_detach(child_event);
6756 raw_spin_unlock_irq(&child_ctx->lock);
6757 }
6106 6758
6107 perf_event_remove_from_context(child_event); 6759 perf_remove_from_context(child_event);
6108 6760
6109 parent_event = child_event->parent;
6110 /* 6761 /*
6111 * It can happen that parent exits first, and has events 6762 * It can happen that the parent exits first, and has events
6112 * that are still around due to the child reference. These 6763 * that are still around due to the child reference. These
6113 * events need to be zapped - but otherwise linger. 6764 * events need to be zapped.
6114 */ 6765 */
6115 if (parent_event) { 6766 if (child_event->parent) {
6116 sync_child_event(child_event, child); 6767 sync_child_event(child_event, child);
6117 free_event(child_event); 6768 free_event(child_event);
6118 } 6769 }
@@ -6411,7 +7062,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
6411 return 0; 7062 return 0;
6412 } 7063 }
6413 7064
6414 child_ctx = child->perf_event_ctxp[ctxn]; 7065 child_ctx = child->perf_event_ctxp[ctxn];
6415 if (!child_ctx) { 7066 if (!child_ctx) {
6416 /* 7067 /*
6417 * This is executed from the parent task context, so 7068 * This is executed from the parent task context, so
@@ -6526,6 +7177,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6526 mutex_unlock(&parent_ctx->mutex); 7177 mutex_unlock(&parent_ctx->mutex);
6527 7178
6528 perf_unpin_context(parent_ctx); 7179 perf_unpin_context(parent_ctx);
7180 put_ctx(parent_ctx);
6529 7181
6530 return ret; 7182 return ret;
6531} 7183}
@@ -6595,9 +7247,9 @@ static void __perf_event_exit_context(void *__info)
6595 perf_pmu_rotate_stop(ctx->pmu); 7247 perf_pmu_rotate_stop(ctx->pmu);
6596 7248
6597 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 7249 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
6598 __perf_event_remove_from_context(event); 7250 __perf_remove_from_context(event);
6599 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 7251 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
6600 __perf_event_remove_from_context(event); 7252 __perf_remove_from_context(event);
6601} 7253}
6602 7254
6603static void perf_event_exit_cpu_context(int cpu) 7255static void perf_event_exit_cpu_context(int cpu)
@@ -6721,3 +7373,83 @@ unlock:
6721 return ret; 7373 return ret;
6722} 7374}
6723device_initcall(perf_event_sysfs_init); 7375device_initcall(perf_event_sysfs_init);
7376
7377#ifdef CONFIG_CGROUP_PERF
7378static struct cgroup_subsys_state *perf_cgroup_create(
7379 struct cgroup_subsys *ss, struct cgroup *cont)
7380{
7381 struct perf_cgroup *jc;
7382
7383 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
7384 if (!jc)
7385 return ERR_PTR(-ENOMEM);
7386
7387 jc->info = alloc_percpu(struct perf_cgroup_info);
7388 if (!jc->info) {
7389 kfree(jc);
7390 return ERR_PTR(-ENOMEM);
7391 }
7392
7393 return &jc->css;
7394}
7395
7396static void perf_cgroup_destroy(struct cgroup_subsys *ss,
7397 struct cgroup *cont)
7398{
7399 struct perf_cgroup *jc;
7400 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
7401 struct perf_cgroup, css);
7402 free_percpu(jc->info);
7403 kfree(jc);
7404}
7405
7406static int __perf_cgroup_move(void *info)
7407{
7408 struct task_struct *task = info;
7409 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
7410 return 0;
7411}
7412
7413static void perf_cgroup_move(struct task_struct *task)
7414{
7415 task_function_call(task, __perf_cgroup_move, task);
7416}
7417
7418static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7419 struct cgroup *old_cgrp, struct task_struct *task,
7420 bool threadgroup)
7421{
7422 perf_cgroup_move(task);
7423 if (threadgroup) {
7424 struct task_struct *c;
7425 rcu_read_lock();
7426 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7427 perf_cgroup_move(c);
7428 }
7429 rcu_read_unlock();
7430 }
7431}
7432
7433static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7434 struct cgroup *old_cgrp, struct task_struct *task)
7435{
7436 /*
7437 * cgroup_exit() is called in the copy_process() failure path.
 7438 * Ignore this case since the task hasn't run yet; this avoids
 7439 * trying to poke a half-freed task state from generic code.
7440 */
7441 if (!(task->flags & PF_EXITING))
7442 return;
7443
7444 perf_cgroup_move(task);
7445}
7446
7447struct cgroup_subsys perf_subsys = {
7448 .name = "perf_event",
7449 .subsys_id = perf_subsys_id,
7450 .create = perf_cgroup_create,
7451 .destroy = perf_cgroup_destroy,
7452 .exit = perf_cgroup_exit,
7453 .attach = perf_cgroup_attach,
7454};
7455#endif /* CONFIG_CGROUP_PERF */
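
A minimal userspace sketch of the PERF_FLAG_PID_CGROUP mode added above: as the comment in the perf_event_open() hunk notes, the pid argument carries an fd to a cgroup directory and cpu selects the CPU to monitor. The cgroup mount point and group name below are assumptions for illustration only, not part of the patch.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	int cgrp_fd, ev_fd;

	memset(&attr, 0, sizeof(attr));
	attr.size   = sizeof(attr);
	attr.type   = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/* with PERF_FLAG_PID_CGROUP, "pid" is an fd to the cgroup directory */
	cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
	if (cgrp_fd < 0)
		return 1;

	/* cpu must be a concrete CPU number: cgroup events are system-wide only */
	ev_fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 0 /* cpu */,
			-1 /* group_fd */, PERF_FLAG_PID_CGROUP);
	if (ev_fd < 0)
		perror("perf_event_open");

	return ev_fd < 0 ? 1 : 0;
}
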
diff --git a/kernel/pid.c b/kernel/pid.c
index 39b65b69584f..57a8346a270e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
217 return -1; 217 return -1;
218} 218}
219 219
220int next_pidmap(struct pid_namespace *pid_ns, int last) 220int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
221{ 221{
222 int offset; 222 int offset;
223 struct pidmap *map, *end; 223 struct pidmap *map, *end;
224 224
225 if (last >= PID_MAX_LIMIT)
226 return -1;
227
225 offset = (last + 1) & BITS_PER_PAGE_MASK; 228 offset = (last + 1) & BITS_PER_PAGE_MASK;
226 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; 229 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
227 end = &pid_ns->pidmap[PIDMAP_ENTRIES]; 230 end = &pid_ns->pidmap[PIDMAP_ENTRIES];
@@ -435,6 +438,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
435 rcu_read_unlock(); 438 rcu_read_unlock();
436 return pid; 439 return pid;
437} 440}
441EXPORT_SYMBOL_GPL(get_task_pid);
438 442
439struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) 443struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
440{ 444{
@@ -446,6 +450,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
446 rcu_read_unlock(); 450 rcu_read_unlock();
447 return result; 451 return result;
448} 452}
453EXPORT_SYMBOL_GPL(get_pid_task);
449 454
450struct pid *find_get_pid(pid_t nr) 455struct pid *find_get_pid(pid_t nr)
451{ 456{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94e1f0b..e9c9adc84ca6 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -14,6 +14,7 @@
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h>
17 18
18#define BITS_PER_PAGE (PAGE_SIZE*8) 19#define BITS_PER_PAGE (PAGE_SIZE*8)
19 20
@@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
72{ 73{
73 struct pid_namespace *ns; 74 struct pid_namespace *ns;
74 unsigned int level = parent_pid_ns->level + 1; 75 unsigned int level = parent_pid_ns->level + 1;
75 int i; 76 int i, err = -ENOMEM;
76 77
77 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 78 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
78 if (ns == NULL) 79 if (ns == NULL)
@@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
96 for (i = 1; i < PIDMAP_ENTRIES; i++) 97 for (i = 1; i < PIDMAP_ENTRIES; i++)
97 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 98 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
98 99
100 err = pid_ns_prepare_proc(ns);
101 if (err)
102 goto out_put_parent_pid_ns;
103
99 return ns; 104 return ns;
100 105
106out_put_parent_pid_ns:
107 put_pid_ns(parent_pid_ns);
101out_free_map: 108out_free_map:
102 kfree(ns->pidmap[0].page); 109 kfree(ns->pidmap[0].page);
103out_free: 110out_free:
104 kmem_cache_free(pid_ns_cachep, ns); 111 kmem_cache_free(pid_ns_cachep, ns);
105out: 112out:
106 return ERR_PTR(-ENOMEM); 113 return ERR_PTR(err);
107} 114}
108 115
109static void destroy_pid_namespace(struct pid_namespace *ns) 116static void destroy_pid_namespace(struct pid_namespace *ns)
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index aeaa7f846821..0da058bff8eb 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -103,11 +103,14 @@ static struct pm_qos_object *pm_qos_array[] = {
103 103
104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
105 size_t count, loff_t *f_pos); 105 size_t count, loff_t *f_pos);
106static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
107 size_t count, loff_t *f_pos);
106static int pm_qos_power_open(struct inode *inode, struct file *filp); 108static int pm_qos_power_open(struct inode *inode, struct file *filp);
107static int pm_qos_power_release(struct inode *inode, struct file *filp); 109static int pm_qos_power_release(struct inode *inode, struct file *filp);
108 110
109static const struct file_operations pm_qos_power_fops = { 111static const struct file_operations pm_qos_power_fops = {
110 .write = pm_qos_power_write, 112 .write = pm_qos_power_write,
113 .read = pm_qos_power_read,
111 .open = pm_qos_power_open, 114 .open = pm_qos_power_open,
112 .release = pm_qos_power_release, 115 .release = pm_qos_power_release,
113 .llseek = noop_llseek, 116 .llseek = noop_llseek,
@@ -376,6 +379,27 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
376} 379}
377 380
378 381
382static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
383 size_t count, loff_t *f_pos)
384{
385 s32 value;
386 unsigned long flags;
387 struct pm_qos_object *o;
 388	struct pm_qos_request_list *pm_qos_req = filp->private_data;
389
390 if (!pm_qos_req)
391 return -EINVAL;
392 if (!pm_qos_request_active(pm_qos_req))
393 return -EINVAL;
394
395 o = pm_qos_array[pm_qos_req->pm_qos_class];
396 spin_lock_irqsave(&pm_qos_lock, flags);
397 value = pm_qos_get_value(o);
398 spin_unlock_irqrestore(&pm_qos_lock, flags);
399
400 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
401}
402
379static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 403static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
380 size_t count, loff_t *f_pos) 404 size_t count, loff_t *f_pos)
381{ 405{
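
A hedged sketch of what the new read handler enables from userspace: a process holding a PM QoS request fd can now read back the aggregated target as a raw s32. The device name assumes the usual cpu_dma_latency misc device; other classes work the same way.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int32_t req = 10, cur = 0;
	int fd = open("/dev/cpu_dma_latency", O_RDWR);

	if (fd < 0)
		return 1;

	/* writing a binary s32 registers/updates this fd's request (pre-existing) */
	if (write(fd, &req, sizeof(req)) != sizeof(req))
		perror("write");

	/* new with this patch: reading returns the aggregated s32 target value */
	if (read(fd, &cur, sizeof(cur)) == sizeof(cur))
		printf("current target: %d\n", (int)cur);

	close(fd);	/* closing the fd drops the request */
	return 0;
}
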
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 05bb7173850e..0791b13df7bf 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p)
176 return p->utime; 176 return p->utime;
177} 177}
178 178
179int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 179static int
180posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
180{ 181{
181 int error = check_clock(which_clock); 182 int error = check_clock(which_clock);
182 if (!error) { 183 if (!error) {
@@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
194 return error; 195 return error;
195} 196}
196 197
197int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) 198static int
199posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
198{ 200{
199 /* 201 /*
200 * You can never reset a CPU clock, but we check for other errors 202 * You can never reset a CPU clock, but we check for other errors
@@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
317} 319}
318 320
319 321
320int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 322static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
321{ 323{
322 const pid_t pid = CPUCLOCK_PID(which_clock); 324 const pid_t pid = CPUCLOCK_PID(which_clock);
323 int error = -EINVAL; 325 int error = -EINVAL;
@@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
379 * This is called from sys_timer_create() and do_cpu_nanosleep() with the 381 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
380 * new timer already all-zeros initialized. 382 * new timer already all-zeros initialized.
381 */ 383 */
382int posix_cpu_timer_create(struct k_itimer *new_timer) 384static int posix_cpu_timer_create(struct k_itimer *new_timer)
383{ 385{
384 int ret = 0; 386 int ret = 0;
385 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); 387 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
@@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
425 * If we return TIMER_RETRY, it's necessary to release the timer's lock 427 * If we return TIMER_RETRY, it's necessary to release the timer's lock
426 * and try again. (This happens when the timer is in the middle of firing.) 428 * and try again. (This happens when the timer is in the middle of firing.)
427 */ 429 */
428int posix_cpu_timer_del(struct k_itimer *timer) 430static int posix_cpu_timer_del(struct k_itimer *timer)
429{ 431{
430 struct task_struct *p = timer->it.cpu.task; 432 struct task_struct *p = timer->it.cpu.task;
431 int ret = 0; 433 int ret = 0;
@@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
665 * If we return TIMER_RETRY, it's necessary to release the timer's lock 667 * If we return TIMER_RETRY, it's necessary to release the timer's lock
666 * and try again. (This happens when the timer is in the middle of firing.) 668 * and try again. (This happens when the timer is in the middle of firing.)
667 */ 669 */
668int posix_cpu_timer_set(struct k_itimer *timer, int flags, 670static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
669 struct itimerspec *new, struct itimerspec *old) 671 struct itimerspec *new, struct itimerspec *old)
670{ 672{
671 struct task_struct *p = timer->it.cpu.task; 673 struct task_struct *p = timer->it.cpu.task;
672 union cpu_time_count old_expires, new_expires, old_incr, val; 674 union cpu_time_count old_expires, new_expires, old_incr, val;
@@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
820 return ret; 822 return ret;
821} 823}
822 824
823void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) 825static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
824{ 826{
825 union cpu_time_count now; 827 union cpu_time_count now;
826 struct task_struct *p = timer->it.cpu.task; 828 struct task_struct *p = timer->it.cpu.task;
@@ -1345,7 +1347,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1345 1347
1346 /* 1348 /*
1347 * Now that all the timers on our list have the firing flag, 1349 * Now that all the timers on our list have the firing flag,
1348 * noone will touch their list entries but us. We'll take 1350 * no one will touch their list entries but us. We'll take
1349 * each timer's lock before clearing its firing flag, so no 1351 * each timer's lock before clearing its firing flag, so no
1350 * timer call will interfere. 1352 * timer call will interfere.
1351 */ 1353 */
@@ -1481,11 +1483,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1481 return error; 1483 return error;
1482} 1484}
1483 1485
1484int posix_cpu_nsleep(const clockid_t which_clock, int flags, 1486static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1485 struct timespec *rqtp, struct timespec __user *rmtp) 1487
1488static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1489 struct timespec *rqtp, struct timespec __user *rmtp)
1486{ 1490{
1487 struct restart_block *restart_block = 1491 struct restart_block *restart_block =
1488 &current_thread_info()->restart_block; 1492 &current_thread_info()->restart_block;
1489 struct itimerspec it; 1493 struct itimerspec it;
1490 int error; 1494 int error;
1491 1495
@@ -1501,56 +1505,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1501 1505
1502 if (error == -ERESTART_RESTARTBLOCK) { 1506 if (error == -ERESTART_RESTARTBLOCK) {
1503 1507
1504 if (flags & TIMER_ABSTIME) 1508 if (flags & TIMER_ABSTIME)
1505 return -ERESTARTNOHAND; 1509 return -ERESTARTNOHAND;
1506 /* 1510 /*
1507 * Report back to the user the time still remaining. 1511 * Report back to the user the time still remaining.
1508 */ 1512 */
1509 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1513 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1510 return -EFAULT; 1514 return -EFAULT;
1511 1515
1512 restart_block->fn = posix_cpu_nsleep_restart; 1516 restart_block->fn = posix_cpu_nsleep_restart;
1513 restart_block->arg0 = which_clock; 1517 restart_block->nanosleep.index = which_clock;
1514 restart_block->arg1 = (unsigned long) rmtp; 1518 restart_block->nanosleep.rmtp = rmtp;
1515 restart_block->arg2 = rqtp->tv_sec; 1519 restart_block->nanosleep.expires = timespec_to_ns(rqtp);
1516 restart_block->arg3 = rqtp->tv_nsec;
1517 } 1520 }
1518 return error; 1521 return error;
1519} 1522}
1520 1523
1521long posix_cpu_nsleep_restart(struct restart_block *restart_block) 1524static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1522{ 1525{
1523 clockid_t which_clock = restart_block->arg0; 1526 clockid_t which_clock = restart_block->nanosleep.index;
1524 struct timespec __user *rmtp;
1525 struct timespec t; 1527 struct timespec t;
1526 struct itimerspec it; 1528 struct itimerspec it;
1527 int error; 1529 int error;
1528 1530
1529 rmtp = (struct timespec __user *) restart_block->arg1; 1531 t = ns_to_timespec(restart_block->nanosleep.expires);
1530 t.tv_sec = restart_block->arg2;
1531 t.tv_nsec = restart_block->arg3;
1532 1532
1533 restart_block->fn = do_no_restart_syscall;
1534 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); 1533 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1535 1534
1536 if (error == -ERESTART_RESTARTBLOCK) { 1535 if (error == -ERESTART_RESTARTBLOCK) {
1536 struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
1537 /* 1537 /*
1538 * Report back to the user the time still remaining. 1538 * Report back to the user the time still remaining.
1539 */ 1539 */
1540 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1540 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1541 return -EFAULT; 1541 return -EFAULT;
1542 1542
1543 restart_block->fn = posix_cpu_nsleep_restart; 1543 restart_block->nanosleep.expires = timespec_to_ns(&t);
1544 restart_block->arg0 = which_clock;
1545 restart_block->arg1 = (unsigned long) rmtp;
1546 restart_block->arg2 = t.tv_sec;
1547 restart_block->arg3 = t.tv_nsec;
1548 } 1544 }
1549 return error; 1545 return error;
1550 1546
1551} 1547}
1552 1548
1553
1554#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) 1549#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
1555#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) 1550#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1556 1551
@@ -1594,38 +1589,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
1594 timer->it_clock = THREAD_CLOCK; 1589 timer->it_clock = THREAD_CLOCK;
1595 return posix_cpu_timer_create(timer); 1590 return posix_cpu_timer_create(timer);
1596} 1591}
1597static int thread_cpu_nsleep(const clockid_t which_clock, int flags, 1592
1598 struct timespec *rqtp, struct timespec __user *rmtp) 1593struct k_clock clock_posix_cpu = {
1599{ 1594 .clock_getres = posix_cpu_clock_getres,
1600 return -EINVAL; 1595 .clock_set = posix_cpu_clock_set,
1601} 1596 .clock_get = posix_cpu_clock_get,
1602static long thread_cpu_nsleep_restart(struct restart_block *restart_block) 1597 .timer_create = posix_cpu_timer_create,
1603{ 1598 .nsleep = posix_cpu_nsleep,
1604 return -EINVAL; 1599 .nsleep_restart = posix_cpu_nsleep_restart,
1605} 1600 .timer_set = posix_cpu_timer_set,
1601 .timer_del = posix_cpu_timer_del,
1602 .timer_get = posix_cpu_timer_get,
1603};
1606 1604
1607static __init int init_posix_cpu_timers(void) 1605static __init int init_posix_cpu_timers(void)
1608{ 1606{
1609 struct k_clock process = { 1607 struct k_clock process = {
1610 .clock_getres = process_cpu_clock_getres, 1608 .clock_getres = process_cpu_clock_getres,
1611 .clock_get = process_cpu_clock_get, 1609 .clock_get = process_cpu_clock_get,
1612 .clock_set = do_posix_clock_nosettime, 1610 .timer_create = process_cpu_timer_create,
1613 .timer_create = process_cpu_timer_create, 1611 .nsleep = process_cpu_nsleep,
1614 .nsleep = process_cpu_nsleep, 1612 .nsleep_restart = process_cpu_nsleep_restart,
1615 .nsleep_restart = process_cpu_nsleep_restart,
1616 }; 1613 };
1617 struct k_clock thread = { 1614 struct k_clock thread = {
1618 .clock_getres = thread_cpu_clock_getres, 1615 .clock_getres = thread_cpu_clock_getres,
1619 .clock_get = thread_cpu_clock_get, 1616 .clock_get = thread_cpu_clock_get,
1620 .clock_set = do_posix_clock_nosettime, 1617 .timer_create = thread_cpu_timer_create,
1621 .timer_create = thread_cpu_timer_create,
1622 .nsleep = thread_cpu_nsleep,
1623 .nsleep_restart = thread_cpu_nsleep_restart,
1624 }; 1618 };
1625 struct timespec ts; 1619 struct timespec ts;
1626 1620
1627 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); 1621 posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1628 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); 1622 posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1629 1623
1630 cputime_to_timespec(cputime_one_jiffy, &ts); 1624 cputime_to_timespec(cputime_one_jiffy, &ts);
1631 onecputick = ts.tv_nsec; 1625 onecputick = ts.tv_nsec;
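
For reference, the nsleep path reworked above is reached from userspace via clock_nanosleep() on the process CPU-time clock; a minimal caller (standard API usage, nothing patch-specific) looks like this. The sleep only completes once the whole process has consumed the requested CPU time, so a spare busy thread is used here to keep the clock advancing.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>

static void *burn(void *arg)
{
	for (;;)
		;	/* keep the process CPU clock advancing */
}

int main(void)
{
	struct timespec req = { .tv_sec = 1, .tv_nsec = 0 }, rem;
	pthread_t t;
	int err;

	pthread_create(&t, NULL, burn, NULL);

	/* flags == 0: relative sleep; with TIMER_ABSTIME no remaining time is reported */
	err = clock_nanosleep(CLOCK_PROCESS_CPUTIME_ID, 0, &req, &rem);
	if (err == EINTR)
		printf("interrupted, %ld.%09ld CPU-seconds left\n",
		       (long)rem.tv_sec, rem.tv_nsec);
	else if (!err)
		printf("process consumed one more CPU-second\n");
	return err ? 1 : 0;
}
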
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 93bd2eb2bc53..e5498d7405c3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -41,6 +41,7 @@
41#include <linux/init.h> 41#include <linux/init.h>
42#include <linux/compiler.h> 42#include <linux/compiler.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/posix-clock.h>
44#include <linux/posix-timers.h> 45#include <linux/posix-timers.h>
45#include <linux/syscalls.h> 46#include <linux/syscalls.h>
46#include <linux/wait.h> 47#include <linux/wait.h>
@@ -81,6 +82,14 @@ static DEFINE_SPINLOCK(idr_lock);
81#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" 82#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
82#endif 83#endif
83 84
85/*
86 * parisc wants ENOTSUP instead of EOPNOTSUPP
87 */
88#ifndef ENOTSUP
89# define ENANOSLEEP_NOTSUP EOPNOTSUPP
90#else
91# define ENANOSLEEP_NOTSUP ENOTSUP
92#endif
84 93
85/* 94/*
86 * The timer ID is turned into a timer address by idr_find(). 95 * The timer ID is turned into a timer address by idr_find().
@@ -94,11 +103,7 @@ static DEFINE_SPINLOCK(idr_lock);
94/* 103/*
95 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us 104 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
96 * to implement others. This structure defines the various 105 * to implement others. This structure defines the various
97 * clocks and allows the possibility of adding others. We 106 * clocks.
98 * provide an interface to add clocks to the table and expect
99 * the "arch" code to add at least one clock that is high
100 * resolution. Here we define the standard CLOCK_REALTIME as a
101 * 1/HZ resolution clock.
102 * 107 *
103 * RESOLUTION: Clock resolution is used to round up timer and interval 108 * RESOLUTION: Clock resolution is used to round up timer and interval
104 * times, NOT to report clock times, which are reported with as 109 * times, NOT to report clock times, which are reported with as
@@ -108,20 +113,13 @@ static DEFINE_SPINLOCK(idr_lock);
108 * necessary code is written. The standard says we should say 113 * necessary code is written. The standard says we should say
109 * something about this issue in the documentation... 114 * something about this issue in the documentation...
110 * 115 *
111 * FUNCTIONS: The CLOCKs structure defines possible functions to handle 116 * FUNCTIONS: The CLOCKs structure defines possible functions to
112 * various clock functions. For clocks that use the standard 117 * handle various clock functions.
113 * system timer code these entries should be NULL. This will
114 * allow dispatch without the overhead of indirect function
115 * calls. CLOCKS that depend on other sources (e.g. WWV or GPS)
116 * must supply functions here, even if the function just returns
117 * ENOSYS. The standard POSIX timer management code assumes the
118 * following: 1.) The k_itimer struct (sched.h) is used for the
119 * timer. 2.) The list, it_lock, it_clock, it_id and it_pid
120 * fields are not modified by timer code.
121 * 118 *
122 * At this time all functions EXCEPT clock_nanosleep can be 119 * The standard POSIX timer management code assumes the
123 * redirected by the CLOCKS structure. Clock_nanosleep is in 120 * following: 1.) The k_itimer struct (sched.h) is used for
124 * there, but the code ignores it. 121 * the timer. 2.) The list, it_lock, it_clock, it_id and
122 * it_pid fields are not modified by timer code.
125 * 123 *
126 * Permissions: It is assumed that the clock_settime() function defined 124 * Permissions: It is assumed that the clock_settime() function defined
127 * for each clock will take care of permission checks. Some 125 * for each clock will take care of permission checks. Some
@@ -138,6 +136,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
138 */ 136 */
139static int common_nsleep(const clockid_t, int flags, struct timespec *t, 137static int common_nsleep(const clockid_t, int flags, struct timespec *t,
140 struct timespec __user *rmtp); 138 struct timespec __user *rmtp);
139static int common_timer_create(struct k_itimer *new_timer);
141static void common_timer_get(struct k_itimer *, struct itimerspec *); 140static void common_timer_get(struct k_itimer *, struct itimerspec *);
142static int common_timer_set(struct k_itimer *, int, 141static int common_timer_set(struct k_itimer *, int,
143 struct itimerspec *, struct itimerspec *); 142 struct itimerspec *, struct itimerspec *);
@@ -158,76 +157,24 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
158 spin_unlock_irqrestore(&timr->it_lock, flags); 157 spin_unlock_irqrestore(&timr->it_lock, flags);
159} 158}
160 159
161/* 160/* Get clock_realtime */
162 * Call the k_clock hook function if non-null, or the default function. 161static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
163 */
164#define CLOCK_DISPATCH(clock, call, arglist) \
165 ((clock) < 0 ? posix_cpu_##call arglist : \
166 (posix_clocks[clock].call != NULL \
167 ? (*posix_clocks[clock].call) arglist : common_##call arglist))
168
169/*
170 * Default clock hook functions when the struct k_clock passed
171 * to register_posix_clock leaves a function pointer null.
172 *
173 * The function common_CALL is the default implementation for
174 * the function pointer CALL in struct k_clock.
175 */
176
177static inline int common_clock_getres(const clockid_t which_clock,
178 struct timespec *tp)
179{
180 tp->tv_sec = 0;
181 tp->tv_nsec = posix_clocks[which_clock].res;
182 return 0;
183}
184
185/*
186 * Get real time for posix timers
187 */
188static int common_clock_get(clockid_t which_clock, struct timespec *tp)
189{ 162{
190 ktime_get_real_ts(tp); 163 ktime_get_real_ts(tp);
191 return 0; 164 return 0;
192} 165}
193 166
194static inline int common_clock_set(const clockid_t which_clock, 167/* Set clock_realtime */
195 struct timespec *tp) 168static int posix_clock_realtime_set(const clockid_t which_clock,
169 const struct timespec *tp)
196{ 170{
197 return do_sys_settimeofday(tp, NULL); 171 return do_sys_settimeofday(tp, NULL);
198} 172}
199 173
200static int common_timer_create(struct k_itimer *new_timer) 174static int posix_clock_realtime_adj(const clockid_t which_clock,
201{ 175 struct timex *t)
202 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
203 return 0;
204}
205
206static int no_timer_create(struct k_itimer *new_timer)
207{
208 return -EOPNOTSUPP;
209}
210
211static int no_nsleep(const clockid_t which_clock, int flags,
212 struct timespec *tsave, struct timespec __user *rmtp)
213{
214 return -EOPNOTSUPP;
215}
216
217/*
218 * Return nonzero if we know a priori this clockid_t value is bogus.
219 */
220static inline int invalid_clockid(const clockid_t which_clock)
221{ 176{
222 if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ 177 return do_adjtimex(t);
223 return 0;
224 if ((unsigned) which_clock >= MAX_CLOCKS)
225 return 1;
226 if (posix_clocks[which_clock].clock_getres != NULL)
227 return 0;
228 if (posix_clocks[which_clock].res != 0)
229 return 0;
230 return 1;
231} 178}
232 179
233/* 180/*
@@ -240,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
240} 187}
241 188
242/* 189/*
243 * Get monotonic time for posix timers 190 * Get monotonic-raw time for posix timers
244 */ 191 */
245static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) 192static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
246{ 193{
@@ -267,46 +214,70 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp
267 *tp = ktime_to_timespec(KTIME_LOW_RES); 214 *tp = ktime_to_timespec(KTIME_LOW_RES);
268 return 0; 215 return 0;
269} 216}
217
218static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
219{
220 get_monotonic_boottime(tp);
221 return 0;
222}
223
224
270/* 225/*
271 * Initialize everything, well, just everything in Posix clocks/timers ;) 226 * Initialize everything, well, just everything in Posix clocks/timers ;)
272 */ 227 */
273static __init int init_posix_timers(void) 228static __init int init_posix_timers(void)
274{ 229{
275 struct k_clock clock_realtime = { 230 struct k_clock clock_realtime = {
276 .clock_getres = hrtimer_get_res, 231 .clock_getres = hrtimer_get_res,
232 .clock_get = posix_clock_realtime_get,
233 .clock_set = posix_clock_realtime_set,
234 .clock_adj = posix_clock_realtime_adj,
235 .nsleep = common_nsleep,
236 .nsleep_restart = hrtimer_nanosleep_restart,
237 .timer_create = common_timer_create,
238 .timer_set = common_timer_set,
239 .timer_get = common_timer_get,
240 .timer_del = common_timer_del,
277 }; 241 };
278 struct k_clock clock_monotonic = { 242 struct k_clock clock_monotonic = {
279 .clock_getres = hrtimer_get_res, 243 .clock_getres = hrtimer_get_res,
280 .clock_get = posix_ktime_get_ts, 244 .clock_get = posix_ktime_get_ts,
281 .clock_set = do_posix_clock_nosettime, 245 .nsleep = common_nsleep,
246 .nsleep_restart = hrtimer_nanosleep_restart,
247 .timer_create = common_timer_create,
248 .timer_set = common_timer_set,
249 .timer_get = common_timer_get,
250 .timer_del = common_timer_del,
282 }; 251 };
283 struct k_clock clock_monotonic_raw = { 252 struct k_clock clock_monotonic_raw = {
284 .clock_getres = hrtimer_get_res, 253 .clock_getres = hrtimer_get_res,
285 .clock_get = posix_get_monotonic_raw, 254 .clock_get = posix_get_monotonic_raw,
286 .clock_set = do_posix_clock_nosettime,
287 .timer_create = no_timer_create,
288 .nsleep = no_nsleep,
289 }; 255 };
290 struct k_clock clock_realtime_coarse = { 256 struct k_clock clock_realtime_coarse = {
291 .clock_getres = posix_get_coarse_res, 257 .clock_getres = posix_get_coarse_res,
292 .clock_get = posix_get_realtime_coarse, 258 .clock_get = posix_get_realtime_coarse,
293 .clock_set = do_posix_clock_nosettime,
294 .timer_create = no_timer_create,
295 .nsleep = no_nsleep,
296 }; 259 };
297 struct k_clock clock_monotonic_coarse = { 260 struct k_clock clock_monotonic_coarse = {
298 .clock_getres = posix_get_coarse_res, 261 .clock_getres = posix_get_coarse_res,
299 .clock_get = posix_get_monotonic_coarse, 262 .clock_get = posix_get_monotonic_coarse,
300 .clock_set = do_posix_clock_nosettime, 263 };
301 .timer_create = no_timer_create, 264 struct k_clock clock_boottime = {
302 .nsleep = no_nsleep, 265 .clock_getres = hrtimer_get_res,
266 .clock_get = posix_get_boottime,
267 .nsleep = common_nsleep,
268 .nsleep_restart = hrtimer_nanosleep_restart,
269 .timer_create = common_timer_create,
270 .timer_set = common_timer_set,
271 .timer_get = common_timer_get,
272 .timer_del = common_timer_del,
303 }; 273 };
304 274
305 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 275 posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
306 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 276 posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
307 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); 277 posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
308 register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); 278 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
309 register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); 279 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
280 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
310 281
311 posix_timers_cache = kmem_cache_create("posix_timers_cache", 282 posix_timers_cache = kmem_cache_create("posix_timers_cache",
312 sizeof (struct k_itimer), 0, SLAB_PANIC, 283 sizeof (struct k_itimer), 0, SLAB_PANIC,
@@ -342,7 +313,7 @@ static void schedule_next_timer(struct k_itimer *timr)
342 * restarted (i.e. we have flagged this in the sys_private entry of the 313 * restarted (i.e. we have flagged this in the sys_private entry of the
343 * info block). 314 * info block).
344 * 315 *
345 * To protect aginst the timer going away while the interrupt is queued, 316 * To protect against the timer going away while the interrupt is queued,
346 * we require that the it_requeue_pending flag be set. 317 * we require that the it_requeue_pending flag be set.
347 */ 318 */
348void do_schedule_next_timer(struct siginfo *info) 319void do_schedule_next_timer(struct siginfo *info)
@@ -482,17 +453,29 @@ static struct pid *good_sigevent(sigevent_t * event)
482 return task_pid(rtn); 453 return task_pid(rtn);
483} 454}
484 455
485void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) 456void posix_timers_register_clock(const clockid_t clock_id,
457 struct k_clock *new_clock)
486{ 458{
487 if ((unsigned) clock_id >= MAX_CLOCKS) { 459 if ((unsigned) clock_id >= MAX_CLOCKS) {
488 printk("POSIX clock register failed for clock_id %d\n", 460 printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
461 clock_id);
462 return;
463 }
464
465 if (!new_clock->clock_get) {
466 printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
467 clock_id);
468 return;
469 }
470 if (!new_clock->clock_getres) {
471 printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
489 clock_id); 472 clock_id);
490 return; 473 return;
491 } 474 }
492 475
493 posix_clocks[clock_id] = *new_clock; 476 posix_clocks[clock_id] = *new_clock;
494} 477}
495EXPORT_SYMBOL_GPL(register_posix_clock); 478EXPORT_SYMBOL_GPL(posix_timers_register_clock);
496 479
497static struct k_itimer * alloc_posix_timer(void) 480static struct k_itimer * alloc_posix_timer(void)
498{ 481{
@@ -523,19 +506,39 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
523 kmem_cache_free(posix_timers_cache, tmr); 506 kmem_cache_free(posix_timers_cache, tmr);
524} 507}
525 508
509static struct k_clock *clockid_to_kclock(const clockid_t id)
510{
511 if (id < 0)
512 return (id & CLOCKFD_MASK) == CLOCKFD ?
513 &clock_posix_dynamic : &clock_posix_cpu;
514
515 if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
516 return NULL;
517 return &posix_clocks[id];
518}
519
520static int common_timer_create(struct k_itimer *new_timer)
521{
522 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
523 return 0;
524}
525
526/* Create a POSIX.1b interval timer. */ 526/* Create a POSIX.1b interval timer. */
527 527
528SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, 528SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
529 struct sigevent __user *, timer_event_spec, 529 struct sigevent __user *, timer_event_spec,
530 timer_t __user *, created_timer_id) 530 timer_t __user *, created_timer_id)
531{ 531{
532 struct k_clock *kc = clockid_to_kclock(which_clock);
532 struct k_itimer *new_timer; 533 struct k_itimer *new_timer;
533 int error, new_timer_id; 534 int error, new_timer_id;
534 sigevent_t event; 535 sigevent_t event;
535 int it_id_set = IT_ID_NOT_SET; 536 int it_id_set = IT_ID_NOT_SET;
536 537
537 if (invalid_clockid(which_clock)) 538 if (!kc)
538 return -EINVAL; 539 return -EINVAL;
540 if (!kc->timer_create)
541 return -EOPNOTSUPP;
539 542
540 new_timer = alloc_posix_timer(); 543 new_timer = alloc_posix_timer();
541 if (unlikely(!new_timer)) 544 if (unlikely(!new_timer))
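With clockid_to_kclock() in place, timer_create() refuses clocks whose k_clock lacks a .timer_create hook, so the per-clock no_timer_create() stubs removed earlier in this diff are no longer needed. A small user-space sketch of the visible behaviour (it assumes glibc's timer_create() wrapper, linked with -lrt on older toolchains; the fallback value 4 for CLOCK_MONOTONIC_RAW matches the kernel's assignment at the time but is an assumption):

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <signal.h>
#include <time.h>

#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4	/* assumed kernel value; older headers lack it */
#endif

int main(void)
{
	timer_t tid;
	struct sigevent sev = {
		.sigev_notify = SIGEV_SIGNAL,
		.sigev_signo  = SIGRTMIN,
	};

	/* CLOCK_MONOTONIC provides .timer_create = common_timer_create */
	if (timer_create(CLOCK_MONOTONIC, &sev, &tid) == 0)
		puts("CLOCK_MONOTONIC: timer created");

	/* CLOCK_MONOTONIC_RAW has no .timer_create hook -> EOPNOTSUPP */
	if (timer_create(CLOCK_MONOTONIC_RAW, &sev, &tid) < 0)
		printf("CLOCK_MONOTONIC_RAW: %s\n", strerror(errno));

	return 0;
}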
@@ -597,7 +600,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
597 goto out; 600 goto out;
598 } 601 }
599 602
600 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); 603 error = kc->timer_create(new_timer);
601 if (error) 604 if (error)
602 goto out; 605 goto out;
603 606
@@ -607,7 +610,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
607 spin_unlock_irq(&current->sighand->siglock); 610 spin_unlock_irq(&current->sighand->siglock);
608 611
609 return 0; 612 return 0;
610 /* 613 /*
611 * In the case of the timer belonging to another task, after 614 * In the case of the timer belonging to another task, after
612 * the task is unlocked, the timer is owned by the other task 615 * the task is unlocked, the timer is owned by the other task
613 * and may cease to exist at any time. Don't use or modify 616 * and may cease to exist at any time. Don't use or modify
@@ -709,22 +712,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
709SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, 712SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
710 struct itimerspec __user *, setting) 713 struct itimerspec __user *, setting)
711{ 714{
712 struct k_itimer *timr;
713 struct itimerspec cur_setting; 715 struct itimerspec cur_setting;
716 struct k_itimer *timr;
717 struct k_clock *kc;
714 unsigned long flags; 718 unsigned long flags;
719 int ret = 0;
715 720
716 timr = lock_timer(timer_id, &flags); 721 timr = lock_timer(timer_id, &flags);
717 if (!timr) 722 if (!timr)
718 return -EINVAL; 723 return -EINVAL;
719 724
720 CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); 725 kc = clockid_to_kclock(timr->it_clock);
726 if (WARN_ON_ONCE(!kc || !kc->timer_get))
727 ret = -EINVAL;
728 else
729 kc->timer_get(timr, &cur_setting);
721 730
722 unlock_timer(timr, flags); 731 unlock_timer(timr, flags);
723 732
724 if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) 733 if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
725 return -EFAULT; 734 return -EFAULT;
726 735
727 return 0; 736 return ret;
728} 737}
729 738
730/* 739/*
@@ -813,6 +822,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
813 int error = 0; 822 int error = 0;
814 unsigned long flag; 823 unsigned long flag;
815 struct itimerspec *rtn = old_setting ? &old_spec : NULL; 824 struct itimerspec *rtn = old_setting ? &old_spec : NULL;
825 struct k_clock *kc;
816 826
817 if (!new_setting) 827 if (!new_setting)
818 return -EINVAL; 828 return -EINVAL;
@@ -828,8 +838,11 @@ retry:
828 if (!timr) 838 if (!timr)
829 return -EINVAL; 839 return -EINVAL;
830 840
831 error = CLOCK_DISPATCH(timr->it_clock, timer_set, 841 kc = clockid_to_kclock(timr->it_clock);
832 (timr, flags, &new_spec, rtn)); 842 if (WARN_ON_ONCE(!kc || !kc->timer_set))
843 error = -EINVAL;
844 else
845 error = kc->timer_set(timr, flags, &new_spec, rtn);
833 846
834 unlock_timer(timr, flag); 847 unlock_timer(timr, flag);
835 if (error == TIMER_RETRY) { 848 if (error == TIMER_RETRY) {
@@ -844,7 +857,7 @@ retry:
844 return error; 857 return error;
845} 858}
846 859
847static inline int common_timer_del(struct k_itimer *timer) 860static int common_timer_del(struct k_itimer *timer)
848{ 861{
849 timer->it.real.interval.tv64 = 0; 862 timer->it.real.interval.tv64 = 0;
850 863
@@ -855,7 +868,11 @@ static inline int common_timer_del(struct k_itimer *timer)
855 868
856static inline int timer_delete_hook(struct k_itimer *timer) 869static inline int timer_delete_hook(struct k_itimer *timer)
857{ 870{
858 return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); 871 struct k_clock *kc = clockid_to_kclock(timer->it_clock);
872
873 if (WARN_ON_ONCE(!kc || !kc->timer_del))
874 return -EINVAL;
875 return kc->timer_del(timer);
859} 876}
860 877
861/* Delete a POSIX.1b interval timer. */ 878/* Delete a POSIX.1b interval timer. */
@@ -927,69 +944,76 @@ void exit_itimers(struct signal_struct *sig)
927 } 944 }
928} 945}
929 946
930/* Not available / possible... functions */
931int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
932{
933 return -EINVAL;
934}
935EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
936
937int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
938 struct timespec *t, struct timespec __user *r)
939{
940#ifndef ENOTSUP
941 return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */
942#else /* parisc does define it separately. */
943 return -ENOTSUP;
944#endif
945}
946EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
947
948SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, 947SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
949 const struct timespec __user *, tp) 948 const struct timespec __user *, tp)
950{ 949{
950 struct k_clock *kc = clockid_to_kclock(which_clock);
951 struct timespec new_tp; 951 struct timespec new_tp;
952 952
953 if (invalid_clockid(which_clock)) 953 if (!kc || !kc->clock_set)
954 return -EINVAL; 954 return -EINVAL;
955
955 if (copy_from_user(&new_tp, tp, sizeof (*tp))) 956 if (copy_from_user(&new_tp, tp, sizeof (*tp)))
956 return -EFAULT; 957 return -EFAULT;
957 958
958 return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); 959 return kc->clock_set(which_clock, &new_tp);
959} 960}
960 961
961SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 962SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
962 struct timespec __user *,tp) 963 struct timespec __user *,tp)
963{ 964{
965 struct k_clock *kc = clockid_to_kclock(which_clock);
964 struct timespec kernel_tp; 966 struct timespec kernel_tp;
965 int error; 967 int error;
966 968
967 if (invalid_clockid(which_clock)) 969 if (!kc)
968 return -EINVAL; 970 return -EINVAL;
969 error = CLOCK_DISPATCH(which_clock, clock_get, 971
970 (which_clock, &kernel_tp)); 972 error = kc->clock_get(which_clock, &kernel_tp);
973
971 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) 974 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
972 error = -EFAULT; 975 error = -EFAULT;
973 976
974 return error; 977 return error;
978}
979
980SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
981 struct timex __user *, utx)
982{
983 struct k_clock *kc = clockid_to_kclock(which_clock);
984 struct timex ktx;
985 int err;
986
987 if (!kc)
988 return -EINVAL;
989 if (!kc->clock_adj)
990 return -EOPNOTSUPP;
991
992 if (copy_from_user(&ktx, utx, sizeof(ktx)))
993 return -EFAULT;
994
995 err = kc->clock_adj(which_clock, &ktx);
996
997 if (!err && copy_to_user(utx, &ktx, sizeof(ktx)))
998 return -EFAULT;
975 999
1000 return err;
976} 1001}
977 1002
978SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, 1003SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
979 struct timespec __user *, tp) 1004 struct timespec __user *, tp)
980{ 1005{
1006 struct k_clock *kc = clockid_to_kclock(which_clock);
981 struct timespec rtn_tp; 1007 struct timespec rtn_tp;
982 int error; 1008 int error;
983 1009
984 if (invalid_clockid(which_clock)) 1010 if (!kc)
985 return -EINVAL; 1011 return -EINVAL;
986 1012
987 error = CLOCK_DISPATCH(which_clock, clock_getres, 1013 error = kc->clock_getres(which_clock, &rtn_tp);
988 (which_clock, &rtn_tp));
989 1014
990 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { 1015 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
991 error = -EFAULT; 1016 error = -EFAULT;
992 }
993 1017
994 return error; 1018 return error;
995} 1019}
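The same hunk wires up the new clock_adjtime() syscall through the optional .clock_adj hook; in this file only CLOCK_REALTIME provides one, via do_adjtimex(). A user-space sketch follows, assuming the kernel headers export __NR_clock_adjtime for the target architecture, since no libc wrapper existed yet:

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <time.h>
#include <unistd.h>
#include <sys/timex.h>
#include <sys/syscall.h>

int main(void)
{
	struct timex tx;

	memset(&tx, 0, sizeof(tx));	/* tx.modes == 0: read-only query */

	if (syscall(__NR_clock_adjtime, CLOCK_REALTIME, &tx) < 0) {
		/* EOPNOTSUPP means the clock has no .clock_adj hook */
		perror("clock_adjtime");
		return 1;
	}

	printf("freq offset: %ld, status: 0x%x\n", tx.freq, (unsigned)tx.status);
	return 0;
}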
@@ -1009,10 +1033,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1009 const struct timespec __user *, rqtp, 1033 const struct timespec __user *, rqtp,
1010 struct timespec __user *, rmtp) 1034 struct timespec __user *, rmtp)
1011{ 1035{
1036 struct k_clock *kc = clockid_to_kclock(which_clock);
1012 struct timespec t; 1037 struct timespec t;
1013 1038
1014 if (invalid_clockid(which_clock)) 1039 if (!kc)
1015 return -EINVAL; 1040 return -EINVAL;
1041 if (!kc->nsleep)
1042 return -ENANOSLEEP_NOTSUP;
1016 1043
1017 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 1044 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
1018 return -EFAULT; 1045 return -EFAULT;
@@ -1020,27 +1047,20 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1020 if (!timespec_valid(&t)) 1047 if (!timespec_valid(&t))
1021 return -EINVAL; 1048 return -EINVAL;
1022 1049
1023 return CLOCK_DISPATCH(which_clock, nsleep, 1050 return kc->nsleep(which_clock, flags, &t, rmtp);
1024 (which_clock, flags, &t, rmtp));
1025}
1026
1027/*
1028 * nanosleep_restart for monotonic and realtime clocks
1029 */
1030static int common_nsleep_restart(struct restart_block *restart_block)
1031{
1032 return hrtimer_nanosleep_restart(restart_block);
1033} 1051}
1034 1052
1035/* 1053/*
1036 * This will restart clock_nanosleep. This is required only by 1054 * This will restart clock_nanosleep. This is required only by
1037 * compat_clock_nanosleep_restart for now. 1055 * compat_clock_nanosleep_restart for now.
1038 */ 1056 */
1039long 1057long clock_nanosleep_restart(struct restart_block *restart_block)
1040clock_nanosleep_restart(struct restart_block *restart_block)
1041{ 1058{
1042 clockid_t which_clock = restart_block->arg0; 1059 clockid_t which_clock = restart_block->nanosleep.index;
1060 struct k_clock *kc = clockid_to_kclock(which_clock);
1061
1062 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
1063 return -EINVAL;
1043 1064
1044 return CLOCK_DISPATCH(which_clock, nsleep_restart, 1065 return kc->nsleep_restart(restart_block);
1045 (restart_block));
1046} 1066}
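The file now also registers CLOCK_BOOTTIME, a monotonic clock that keeps advancing across suspend. A user-space sketch of reading it; contemporary libc headers may not define the constant yet, so the value 7 is hard-coded here as an assumption matching the kernel's clockid table:

#include <stdio.h>
#include <time.h>

#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7	/* assumed kernel clockid at the time */
#endif

int main(void)
{
	struct timespec mono, boot;

	clock_gettime(CLOCK_MONOTONIC, &mono);
	clock_gettime(CLOCK_BOOTTIME, &boot);

	/* CLOCK_BOOTTIME counts across suspend, so it runs ahead of
	 * CLOCK_MONOTONIC by roughly the accumulated suspend time. */
	printf("monotonic: %ld.%09ld\n", (long)mono.tv_sec, mono.tv_nsec);
	printf("boottime:  %ld.%09ld\n", (long)boot.tv_sec, boot.tv_nsec);
	return 0;
}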
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 265729966ece..6de9a8fc3417 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,125 +1,12 @@
1config PM
2 bool "Power Management support"
3 depends on !IA64_HP_SIM
4 ---help---
5 "Power Management" means that parts of your computer are shut
6 off or put into a power conserving "sleep" mode if they are not
7 being used. There are two competing standards for doing this: APM
8 and ACPI. If you want to use either one, say Y here and then also
9 to the requisite support below.
10
11 Power Management is most important for battery powered laptop
12 computers; if you have a laptop, check out the Linux Laptop home
13 page on the WWW at <http://www.linux-on-laptops.com/> or
14 Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
15 and the Battery Powered Linux mini-HOWTO, available from
16 <http://www.tldp.org/docs.html#howto>.
17
18 Note that, even if you say N here, Linux on the x86 architecture
19 will issue the hlt instruction if nothing is to be done, thereby
20 sending the processor to sleep and saving power.
21
22config PM_DEBUG
23 bool "Power Management Debug Support"
24 depends on PM
25 ---help---
26 This option enables various debugging support in the Power Management
27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support.
29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
39config PM_VERBOSE
40 bool "Verbose Power Management debugging"
41 depends on PM_DEBUG
42 default n
43 ---help---
44 This option enables verbose messages from the Power Management code.
45
46config CAN_PM_TRACE
47 def_bool y
48 depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL
49
50config PM_TRACE
51 bool
52 help
53 This enables code to save the last PM event point across
54 reboot. The architecture needs to support this, x86 for
55 example does by saving things in the RTC, see below.
56
57 The architecture specific code must provide the extern
58 functions from <linux/resume-trace.h> as well as the
59 <asm/resume-trace.h> header with a TRACE_RESUME() macro.
60
61 The way the information is presented is architecture-
62 dependent, x86 will print the information during a
63 late_initcall.
64
65config PM_TRACE_RTC
66 bool "Suspend/resume event tracing"
67 depends on CAN_PM_TRACE
68 depends on X86
69 select PM_TRACE
70 default n
71 ---help---
72 This enables some cheesy code to save the last PM event point in the
73 RTC across reboots, so that you can debug a machine that just hangs
74 during suspend (or more commonly, during resume).
75
76 To use this debugging feature you should attempt to suspend the
77 machine, reboot it and then run
78
79 dmesg -s 1000000 | grep 'hash matches'
80
81 CAUTION: this option will cause your machine's real-time clock to be
82 set to an invalid time after a resume.
83
84config PM_SLEEP_SMP
85 bool
86 depends on SMP
87 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
88 depends on PM_SLEEP
89 select HOTPLUG
90 select HOTPLUG_CPU
91 default y
92
93config PM_SLEEP
94 bool
95 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
96 default y
97
98config PM_SLEEP_ADVANCED_DEBUG
99 bool
100 depends on PM_ADVANCED_DEBUG
101 default n
102
103config SUSPEND 1config SUSPEND
104 bool "Suspend to RAM and standby" 2 bool "Suspend to RAM and standby"
105 depends on PM && ARCH_SUSPEND_POSSIBLE 3 depends on ARCH_SUSPEND_POSSIBLE
106 default y 4 default y
107 ---help--- 5 ---help---
108 Allow the system to enter sleep states in which main memory is 6 Allow the system to enter sleep states in which main memory is
109 powered and thus its contents are preserved, such as the 7 powered and thus its contents are preserved, such as the
110 suspend-to-RAM state (e.g. the ACPI S3 state). 8 suspend-to-RAM state (e.g. the ACPI S3 state).
111 9
112config PM_TEST_SUSPEND
113 bool "Test suspend/resume and wakealarm during bootup"
114 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
115 ---help---
116 This option will let you suspend your machine during bootup, and
117 make it wake up a few seconds later using an RTC wakeup alarm.
118 Enable this with a kernel parameter like "test_suspend=mem".
119
120 You probably want to have your system's RTC driver statically
121 linked, ensuring that it's available when this test runs.
122
123config SUSPEND_FREEZER 10config SUSPEND_FREEZER
124 bool "Enable freezer for suspend to RAM/standby" \ 11 bool "Enable freezer for suspend to RAM/standby" \
125 if ARCH_WANTS_FREEZER_CONTROL || BROKEN 12 if ARCH_WANTS_FREEZER_CONTROL || BROKEN
@@ -131,9 +18,13 @@ config SUSPEND_FREEZER
131 18
132 Turning OFF this setting is NOT recommended! If in doubt, say Y. 19 Turning OFF this setting is NOT recommended! If in doubt, say Y.
133 20
21config HIBERNATE_CALLBACKS
22 bool
23
134config HIBERNATION 24config HIBERNATION
135 bool "Hibernation (aka 'suspend to disk')" 25 bool "Hibernation (aka 'suspend to disk')"
136 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 26 depends on SWAP && ARCH_HIBERNATION_POSSIBLE
27 select HIBERNATE_CALLBACKS
137 select LZO_COMPRESS 28 select LZO_COMPRESS
138 select LZO_DECOMPRESS 29 select LZO_DECOMPRESS
139 ---help--- 30 ---help---
@@ -196,6 +87,106 @@ config PM_STD_PARTITION
196 suspended image to. It will simply pick the first available swap 87 suspended image to. It will simply pick the first available swap
197 device. 88 device.
198 89
90config PM_SLEEP
91 def_bool y
92 depends on SUSPEND || HIBERNATE_CALLBACKS
93
94config PM_SLEEP_SMP
95 def_bool y
96 depends on SMP
97 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
98 depends on PM_SLEEP
99 select HOTPLUG
100 select HOTPLUG_CPU
101
102config PM_RUNTIME
103 bool "Run-time PM core functionality"
104 depends on !IA64_HP_SIM
105 ---help---
106 Enable functionality allowing I/O devices to be put into energy-saving
107 (low power) states at run time (or autosuspended) after a specified
108 period of inactivity and woken up in response to a hardware-generated
109 wake-up event or a driver's request.
110
111 Hardware support is generally required for this functionality to work
112 and the bus type drivers of the buses the devices are on are
113 responsible for the actual handling of the autosuspend requests and
114 wake-up events.
115
116config PM
117 def_bool y
118 depends on PM_SLEEP || PM_RUNTIME
119
120config PM_DEBUG
121 bool "Power Management Debug Support"
122 depends on PM
123 ---help---
124 This option enables various debugging support in the Power Management
125 code. This is helpful when debugging and reporting PM bugs, like
126 suspend support.
127
128config PM_VERBOSE
129 bool "Verbose Power Management debugging"
130 depends on PM_DEBUG
131 ---help---
132 This option enables verbose messages from the Power Management code.
133
134config PM_ADVANCED_DEBUG
135 bool "Extra PM attributes in sysfs for low-level debugging/testing"
136 depends on PM_DEBUG
137 ---help---
138 Add extra sysfs attributes allowing one to access some Power Management
139 fields of device objects from user space. If you are not a kernel
140 developer interested in debugging/testing Power Management, say "no".
141
142config PM_TEST_SUSPEND
143 bool "Test suspend/resume and wakealarm during bootup"
144 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
145 ---help---
146 This option will let you suspend your machine during bootup, and
147 make it wake up a few seconds later using an RTC wakeup alarm.
148 Enable this with a kernel parameter like "test_suspend=mem".
149
150 You probably want to have your system's RTC driver statically
151 linked, ensuring that it's available when this test runs.
152
153config CAN_PM_TRACE
154 def_bool y
155 depends on PM_DEBUG && PM_SLEEP
156
157config PM_TRACE
158 bool
159 help
160 This enables code to save the last PM event point across
161 reboot. The architecture needs to support this, x86 for
162 example does by saving things in the RTC, see below.
163
164 The architecture specific code must provide the extern
165 functions from <linux/resume-trace.h> as well as the
166 <asm/resume-trace.h> header with a TRACE_RESUME() macro.
167
168 The way the information is presented is architecture-
169 dependent, x86 will print the information during a
170 late_initcall.
171
172config PM_TRACE_RTC
173 bool "Suspend/resume event tracing"
174 depends on CAN_PM_TRACE
175 depends on X86
176 select PM_TRACE
177 ---help---
178 This enables some cheesy code to save the last PM event point in the
179 RTC across reboots, so that you can debug a machine that just hangs
180 during suspend (or more commonly, during resume).
181
182 To use this debugging feature you should attempt to suspend the
183 machine, reboot it and then run
184
185 dmesg -s 1000000 | grep 'hash matches'
186
187 CAUTION: this option will cause your machine's real-time clock to be
188 set to an invalid time after a resume.
189
199config APM_EMULATION 190config APM_EMULATION
200 tristate "Advanced Power Management Emulation" 191 tristate "Advanced Power Management Emulation"
201 depends on PM && SYS_SUPPORTS_APM_EMULATION 192 depends on PM && SYS_SUPPORTS_APM_EMULATION
@@ -222,31 +213,11 @@ config APM_EMULATION
222 anything, try disabling/enabling this option (or disabling/enabling 213 anything, try disabling/enabling this option (or disabling/enabling
223 APM in your BIOS). 214 APM in your BIOS).
224 215
225config PM_RUNTIME
226 bool "Run-time PM core functionality"
227 depends on PM
228 ---help---
229 Enable functionality allowing I/O devices to be put into energy-saving
230 (low power) states at run time (or autosuspended) after a specified
231 period of inactivity and woken up in response to a hardware-generated
232 wake-up event or a driver's request.
233
234 Hardware support is generally required for this functionality to work
235 and the bus type drivers of the buses the devices are on are
236 responsible for the actual handling of the autosuspend requests and
237 wake-up events.
238
239config PM_OPS
240 bool
241 depends on PM_SLEEP || PM_RUNTIME
242 default y
243
244config ARCH_HAS_OPP 216config ARCH_HAS_OPP
245 bool 217 bool
246 218
247config PM_OPP 219config PM_OPP
248 bool "Operating Performance Point (OPP) Layer library" 220 bool "Operating Performance Point (OPP) Layer library"
249 depends on PM
250 depends on ARCH_HAS_OPP 221 depends on ARCH_HAS_OPP
251 ---help--- 222 ---help---
252 SOCs have a standard set of tuples consisting of frequency and 223 SOCs have a standard set of tuples consisting of frequency and
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c350e18b53e3..c5ebc6a90643 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,4 +1,5 @@
1ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
2 3
3obj-$(CONFIG_PM) += main.o 4obj-$(CONFIG_PM) += main.o
4obj-$(CONFIG_PM_SLEEP) += console.o 5obj-$(CONFIG_PM_SLEEP) += console.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 83bbc7c02df9..d09dd10c5a5e 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
28static int submit(int rw, struct block_device *bdev, sector_t sector, 28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain) 29 struct page *page, struct bio **bio_chain)
30{ 30{
31 const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; 31 const int bio_rw = rw | REQ_SYNC;
32 struct bio *bio; 32 struct bio *bio;
33 33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1832bd264219..50aae660174d 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -23,6 +23,7 @@
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <linux/syscore_ops.h>
26#include <scsi/scsi_scan.h> 27#include <scsi/scsi_scan.h>
27#include <asm/suspend.h> 28#include <asm/suspend.h>
28 29
@@ -272,6 +273,11 @@ static int create_image(int platform_mode)
272 local_irq_disable(); 273 local_irq_disable();
273 274
274 error = sysdev_suspend(PMSG_FREEZE); 275 error = sysdev_suspend(PMSG_FREEZE);
276 if (!error) {
277 error = syscore_suspend();
278 if (error)
279 sysdev_resume();
280 }
275 if (error) { 281 if (error) {
276 printk(KERN_ERR "PM: Some system devices failed to power down, " 282 printk(KERN_ERR "PM: Some system devices failed to power down, "
277 "aborting hibernation\n"); 283 "aborting hibernation\n");
@@ -295,6 +301,7 @@ static int create_image(int platform_mode)
295 } 301 }
296 302
297 Power_up: 303 Power_up:
304 syscore_resume();
298 sysdev_resume(); 305 sysdev_resume();
299 /* NOTE: dpm_resume_noirq() is just a resume() for devices 306 /* NOTE: dpm_resume_noirq() is just a resume() for devices
300 * that suspended with irqs off ... no overall powerup. 307 * that suspended with irqs off ... no overall powerup.
@@ -403,6 +410,11 @@ static int resume_target_kernel(bool platform_mode)
403 local_irq_disable(); 410 local_irq_disable();
404 411
405 error = sysdev_suspend(PMSG_QUIESCE); 412 error = sysdev_suspend(PMSG_QUIESCE);
413 if (!error) {
414 error = syscore_suspend();
415 if (error)
416 sysdev_resume();
417 }
406 if (error) 418 if (error)
407 goto Enable_irqs; 419 goto Enable_irqs;
408 420
@@ -429,6 +441,7 @@ static int resume_target_kernel(bool platform_mode)
429 restore_processor_state(); 441 restore_processor_state();
430 touch_softlockup_watchdog(); 442 touch_softlockup_watchdog();
431 443
444 syscore_resume();
432 sysdev_resume(); 445 sysdev_resume();
433 446
434 Enable_irqs: 447 Enable_irqs:
@@ -516,6 +529,7 @@ int hibernation_platform_enter(void)
516 529
517 local_irq_disable(); 530 local_irq_disable();
518 sysdev_suspend(PMSG_HIBERNATE); 531 sysdev_suspend(PMSG_HIBERNATE);
532 syscore_suspend();
519 if (pm_wakeup_pending()) { 533 if (pm_wakeup_pending()) {
520 error = -EAGAIN; 534 error = -EAGAIN;
521 goto Power_up; 535 goto Power_up;
@@ -526,6 +540,7 @@ int hibernation_platform_enter(void)
526 while (1); 540 while (1);
527 541
528 Power_up: 542 Power_up:
543 syscore_resume();
529 sysdev_resume(); 544 sysdev_resume();
530 local_irq_enable(); 545 local_irq_enable();
531 enable_nonboot_cpus(); 546 enable_nonboot_cpus();
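create_image(), resume_target_kernel() and hibernation_platform_enter() now call syscore_suspend()/syscore_resume() just inside the sysdev_suspend()/sysdev_resume() pair, so syscore users run with one CPU online and interrupts off. A hedged sketch of the consumer side of that interface; the callbacks and names below are illustrative, not taken from this commit:

#include <linux/init.h>
#include <linux/syscore_ops.h>

/* Save a small amount of core state late in suspend/hibernation. */
static int my_core_suspend(void)
{
	/* runs with interrupts disabled; returning non-zero aborts */
	return 0;
}

/* Undo whatever my_core_suspend() did, first thing on resume. */
static void my_core_resume(void)
{
}

static struct syscore_ops my_syscore_ops = {
	.suspend = my_core_suspend,
	.resume  = my_core_resume,
};

static int __init my_core_init(void)
{
	register_syscore_ops(&my_syscore_ops);
	return 0;
}
device_initcall(my_core_init);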
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 701853042c28..de9aef8742f4 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -17,9 +17,6 @@
17 17
18DEFINE_MUTEX(pm_mutex); 18DEFINE_MUTEX(pm_mutex);
19 19
20unsigned int pm_flags;
21EXPORT_SYMBOL(pm_flags);
22
23#ifdef CONFIG_PM_SLEEP 20#ifdef CONFIG_PM_SLEEP
24 21
25/* Routines for PM-transition notifications */ 22/* Routines for PM-transition notifications */
@@ -227,7 +224,7 @@ power_attr(state);
227 * writing to 'state'. It first should read from 'wakeup_count' and store 224 * writing to 'state'. It first should read from 'wakeup_count' and store
228 * the read value. Then, after carrying out its own preparations for the system 225 * the read value. Then, after carrying out its own preparations for the system
229 * transition to a sleep state, it should write the stored value to 226 * transition to a sleep state, it should write the stored value to
230 * 'wakeup_count'. If that fails, at least one wakeup event has occured since 227 * 'wakeup_count'. If that fails, at least one wakeup event has occurred since
231 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it 228 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
232 * is allowed to write to 'state', but the transition will be aborted if there 229 * is allowed to write to 'state', but the transition will be aborted if there
233 * are any wakeup events detected after 'wakeup_count' was written to. 230 * are any wakeup events detected after 'wakeup_count' was written to.
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 64db648ff911..ca0aacc24874 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -42,15 +42,15 @@ static void swsusp_unset_page_forbidden(struct page *);
42 42
43/* 43/*
44 * Preferred image size in bytes (tunable via /sys/power/image_size). 44 * Preferred image size in bytes (tunable via /sys/power/image_size).
45 * When it is set to N, swsusp will do its best to ensure the image 45 * When it is set to N, the image creating code will do its best to
46 * size will not exceed N bytes, but if that is impossible, it will 46 * ensure the image size will not exceed N bytes, but if that is
47 * try to create the smallest image possible. 47 * impossible, it will try to create the smallest image possible.
48 */ 48 */
49unsigned long image_size; 49unsigned long image_size;
50 50
51void __init hibernate_image_size_init(void) 51void __init hibernate_image_size_init(void)
52{ 52{
53 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; 53 image_size = (totalram_pages / 3) * PAGE_SIZE;
54} 54}
55 55
56/* List of PBEs needed for restoring the pages that were allocated before 56/* List of PBEs needed for restoring the pages that were allocated before
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index de6f86bfa303..8935369d503a 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,7 @@
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/suspend.h> 24#include <linux/suspend.h>
25#include <linux/syscore_ops.h>
25#include <trace/events/power.h> 26#include <trace/events/power.h>
26 27
27#include "power.h" 28#include "power.h"
@@ -164,10 +165,16 @@ static int suspend_enter(suspend_state_t state)
164 165
165 error = sysdev_suspend(PMSG_SUSPEND); 166 error = sysdev_suspend(PMSG_SUSPEND);
166 if (!error) { 167 if (!error) {
168 error = syscore_suspend();
169 if (error)
170 sysdev_resume();
171 }
172 if (!error) {
167 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { 173 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
168 error = suspend_ops->enter(state); 174 error = suspend_ops->enter(state);
169 events_check_enabled = false; 175 events_check_enabled = false;
170 } 176 }
177 syscore_resume();
171 sysdev_resume(); 178 sysdev_resume();
172 } 179 }
173 180
diff --git a/kernel/printk.c b/kernel/printk.c
index 36231525e22f..da8ca817eae3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -53,7 +53,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
53#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 53#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
54 54
55/* printk's without a loglevel use this.. */ 55/* printk's without a loglevel use this.. */
56#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ 56#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
57 57
58/* We show everything that is MORE important than this.. */ 58/* We show everything that is MORE important than this.. */
59#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ 59#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
@@ -113,6 +113,11 @@ static unsigned con_start; /* Index into log_buf: next char to be sent to consol
113static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ 113static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
114 114
115/* 115/*
116 * If exclusive_console is non-NULL then only this console is to be printed to.
117 */
118static struct console *exclusive_console;
119
120/*
116 * Array of consoles built from command line options (console=) 121 * Array of consoles built from command line options (console=)
117 */ 122 */
118struct console_cmdline 123struct console_cmdline
@@ -476,6 +481,8 @@ static void __call_console_drivers(unsigned start, unsigned end)
476 struct console *con; 481 struct console *con;
477 482
478 for_each_console(con) { 483 for_each_console(con) {
484 if (exclusive_console && con != exclusive_console)
485 continue;
479 if ((con->flags & CON_ENABLED) && con->write && 486 if ((con->flags & CON_ENABLED) && con->write &&
480 (cpu_online(smp_processor_id()) || 487 (cpu_online(smp_processor_id()) ||
481 (con->flags & CON_ANYTIME))) 488 (con->flags & CON_ANYTIME)))
@@ -515,6 +522,71 @@ static void _call_console_drivers(unsigned start,
515} 522}
516 523
517/* 524/*
 525 * Parse the syslog header <[0-9]*>. The decimal value is a 32-bit quantity; the
 526 * lower 3 bits are the log level and the rest is the log facility. In case
527 * userspace passes usual userspace syslog messages to /dev/kmsg or
528 * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
529 * to extract the correct log level for in-kernel processing, and not mangle
530 * the original value.
531 *
532 * If a prefix is found, the length of the prefix is returned. If 'level' is
533 * passed, it will be filled in with the log level without a possible facility
534 * value. If 'special' is passed, the special printk prefix chars are accepted
535 * and returned. If no valid header is found, 0 is returned and the passed
536 * variables are not touched.
537 */
538static size_t log_prefix(const char *p, unsigned int *level, char *special)
539{
540 unsigned int lev = 0;
541 char sp = '\0';
542 size_t len;
543
544 if (p[0] != '<' || !p[1])
545 return 0;
546 if (p[2] == '>') {
547 /* usual single digit level number or special char */
548 switch (p[1]) {
549 case '0' ... '7':
550 lev = p[1] - '0';
551 break;
552 case 'c': /* KERN_CONT */
553 case 'd': /* KERN_DEFAULT */
554 sp = p[1];
555 break;
556 default:
557 return 0;
558 }
559 len = 3;
560 } else {
561 /* multi digit including the level and facility number */
562 char *endp = NULL;
563
 564 if (p[1] < '0' || p[1] > '9')
565 return 0;
566
567 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
568 if (endp == NULL || endp[0] != '>')
569 return 0;
570 len = (endp + 1) - p;
571 }
572
573 /* do not accept special char if not asked for */
574 if (sp && !special)
575 return 0;
576
577 if (special) {
578 *special = sp;
579 /* return special char, do not touch level */
580 if (sp)
581 return len;
582 }
583
584 if (level)
585 *level = lev;
586 return len;
587}
588
589/*
518 * Call the console drivers, asking them to write out 590 * Call the console drivers, asking them to write out
519 * log_buf[start] to log_buf[end - 1]. 591 * log_buf[start] to log_buf[end - 1].
520 * The console_lock must be held. 592 * The console_lock must be held.
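log_prefix() exists mainly so that records injected from user space keep their original facility. The prefix is the usual syslog priority, facility * 8 + level, so "<11>" encodes facility 1 (user) with level 3 (error). A user-space sketch of feeding such a record to the kernel log (writing /dev/kmsg normally requires root):

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	const char msg[] = "<11>example: user-space error forwarded to the kernel log\n";
	int fd = open("/dev/kmsg", O_WRONLY);

	if (fd < 0) {
		perror("open /dev/kmsg");
		return 1;
	}
	if (write(fd, msg, strlen(msg)) < 0)
		perror("write");
	close(fd);
	return 0;
}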
@@ -529,13 +601,9 @@ static void call_console_drivers(unsigned start, unsigned end)
529 cur_index = start; 601 cur_index = start;
530 start_print = start; 602 start_print = start;
531 while (cur_index != end) { 603 while (cur_index != end) {
532 if (msg_level < 0 && ((end - cur_index) > 2) && 604 if (msg_level < 0 && ((end - cur_index) > 2)) {
533 LOG_BUF(cur_index + 0) == '<' && 605 /* strip log prefix */
534 LOG_BUF(cur_index + 1) >= '0' && 606 cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
535 LOG_BUF(cur_index + 1) <= '7' &&
536 LOG_BUF(cur_index + 2) == '>') {
537 msg_level = LOG_BUF(cur_index + 1) - '0';
538 cur_index += 3;
539 start_print = cur_index; 607 start_print = cur_index;
540 } 608 }
541 while (cur_index != end) { 609 while (cur_index != end) {
@@ -733,6 +801,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
733 unsigned long flags; 801 unsigned long flags;
734 int this_cpu; 802 int this_cpu;
735 char *p; 803 char *p;
804 size_t plen;
805 char special;
736 806
737 boot_delay_msec(); 807 boot_delay_msec();
738 printk_delay(); 808 printk_delay();
@@ -773,45 +843,52 @@ asmlinkage int vprintk(const char *fmt, va_list args)
773 printed_len += vscnprintf(printk_buf + printed_len, 843 printed_len += vscnprintf(printk_buf + printed_len,
774 sizeof(printk_buf) - printed_len, fmt, args); 844 sizeof(printk_buf) - printed_len, fmt, args);
775 845
776
777 p = printk_buf; 846 p = printk_buf;
778 847
779 /* Do we have a loglevel in the string? */ 848 /* Read log level and handle special printk prefix */
780 if (p[0] == '<') { 849 plen = log_prefix(p, &current_log_level, &special);
781 unsigned char c = p[1]; 850 if (plen) {
782 if (c && p[2] == '>') { 851 p += plen;
783 switch (c) { 852
784 case '0' ... '7': /* loglevel */ 853 switch (special) {
785 current_log_level = c - '0'; 854 case 'c': /* Strip <c> KERN_CONT, continue line */
786 /* Fallthrough - make sure we're on a new line */ 855 plen = 0;
787 case 'd': /* KERN_DEFAULT */ 856 break;
788 if (!new_text_line) { 857 case 'd': /* Strip <d> KERN_DEFAULT, start new line */
789 emit_log_char('\n'); 858 plen = 0;
790 new_text_line = 1; 859 default:
791 } 860 if (!new_text_line) {
792 /* Fallthrough - skip the loglevel */ 861 emit_log_char('\n');
793 case 'c': /* KERN_CONT */ 862 new_text_line = 1;
794 p += 3;
795 break;
796 } 863 }
797 } 864 }
798 } 865 }
799 866
800 /* 867 /*
801 * Copy the output into log_buf. If the caller didn't provide 868 * Copy the output into log_buf. If the caller didn't provide
802 * appropriate log level tags, we insert them here 869 * the appropriate log prefix, we insert them here
803 */ 870 */
804 for ( ; *p; p++) { 871 for (; *p; p++) {
805 if (new_text_line) { 872 if (new_text_line) {
806 /* Always output the token */
807 emit_log_char('<');
808 emit_log_char(current_log_level + '0');
809 emit_log_char('>');
810 printed_len += 3;
811 new_text_line = 0; 873 new_text_line = 0;
812 874
875 if (plen) {
876 /* Copy original log prefix */
877 int i;
878
879 for (i = 0; i < plen; i++)
880 emit_log_char(printk_buf[i]);
881 printed_len += plen;
882 } else {
883 /* Add log prefix */
884 emit_log_char('<');
885 emit_log_char(current_log_level + '0');
886 emit_log_char('>');
887 printed_len += 3;
888 }
889
813 if (printk_time) { 890 if (printk_time) {
814 /* Follow the token with the time */ 891 /* Add the current time stamp */
815 char tbuf[50], *tp; 892 char tbuf[50], *tp;
816 unsigned tlen; 893 unsigned tlen;
817 unsigned long long t; 894 unsigned long long t;
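After the rewrite, vprintk() copies a caller-supplied prefix through verbatim, inserts DEFAULT_MESSAGE_LOGLEVEL when none is given, and treats KERN_CONT as "no prefix, no forced newline". A short illustrative kernel-side snippet (not from this commit):

#include <linux/kernel.h>

static void example_logging(int id)
{
	/* explicit level: the "<4>" prefix is copied through unchanged */
	printk(KERN_WARNING "foo%d: device misbehaving", id);

	/* KERN_CONT: no level inserted, text continues the previous record */
	printk(KERN_CONT " (retrying)\n");

	/* no prefix at all: DEFAULT_MESSAGE_LOGLEVEL is inserted for us */
	printk("foo%d: back to normal\n", id);
}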
@@ -1160,6 +1237,11 @@ void console_unlock(void)
1160 local_irq_restore(flags); 1237 local_irq_restore(flags);
1161 } 1238 }
1162 console_locked = 0; 1239 console_locked = 0;
1240
1241 /* Release the exclusive_console once it is used */
1242 if (unlikely(exclusive_console))
1243 exclusive_console = NULL;
1244
1163 up(&console_sem); 1245 up(&console_sem);
1164 spin_unlock_irqrestore(&logbuf_lock, flags); 1246 spin_unlock_irqrestore(&logbuf_lock, flags);
1165 if (wake_klogd) 1247 if (wake_klogd)
@@ -1246,6 +1328,18 @@ void console_start(struct console *console)
1246} 1328}
1247EXPORT_SYMBOL(console_start); 1329EXPORT_SYMBOL(console_start);
1248 1330
1331static int __read_mostly keep_bootcon;
1332
1333static int __init keep_bootcon_setup(char *str)
1334{
1335 keep_bootcon = 1;
1336 printk(KERN_INFO "debug: skip boot console de-registration.\n");
1337
1338 return 0;
1339}
1340
1341early_param("keep_bootcon", keep_bootcon_setup);
1342
1249/* 1343/*
1250 * The console driver calls this routine during kernel initialization 1344 * The console driver calls this routine during kernel initialization
1251 * to register the console printing procedure with printk() and to 1345 * to register the console printing procedure with printk() and to
@@ -1382,6 +1476,12 @@ void register_console(struct console *newcon)
1382 spin_lock_irqsave(&logbuf_lock, flags); 1476 spin_lock_irqsave(&logbuf_lock, flags);
1383 con_start = log_start; 1477 con_start = log_start;
1384 spin_unlock_irqrestore(&logbuf_lock, flags); 1478 spin_unlock_irqrestore(&logbuf_lock, flags);
1479 /*
1480 * We're about to replay the log buffer. Only do this to the
1481 * just-registered console to avoid excessive message spam to
1482 * the already-registered consoles.
1483 */
1484 exclusive_console = newcon;
1385 } 1485 }
1386 console_unlock(); 1486 console_unlock();
1387 console_sysfs_notify(); 1487 console_sysfs_notify();
@@ -1393,7 +1493,9 @@ void register_console(struct console *newcon)
1393 * users know there might be something in the kernel's log buffer that 1493 * users know there might be something in the kernel's log buffer that
1394 * went to the bootconsole (that they do not see on the real console) 1494 * went to the bootconsole (that they do not see on the real console)
1395 */ 1495 */
1396 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { 1496 if (bcon &&
1497 ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
1498 !keep_bootcon) {
1397 /* we need to iterate through twice, to make sure we print 1499 /* we need to iterate through twice, to make sure we print
1398 * everything out, before we unregister the console(s) 1500 * everything out, before we unregister the console(s)
1399 */ 1501 */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1708b1e2972d..dc7ab65f3b36 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
22#include <linux/syscalls.h> 22#include <linux/syscalls.h>
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/regset.h> 24#include <linux/regset.h>
25#include <linux/hw_breakpoint.h>
25 26
26 27
27/* 28/*
@@ -134,21 +135,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
134 return 0; 135 return 0;
135 rcu_read_lock(); 136 rcu_read_lock();
136 tcred = __task_cred(task); 137 tcred = __task_cred(task);
137 if ((cred->uid != tcred->euid || 138 if (cred->user->user_ns == tcred->user->user_ns &&
138 cred->uid != tcred->suid || 139 (cred->uid == tcred->euid &&
139 cred->uid != tcred->uid || 140 cred->uid == tcred->suid &&
140 cred->gid != tcred->egid || 141 cred->uid == tcred->uid &&
141 cred->gid != tcred->sgid || 142 cred->gid == tcred->egid &&
142 cred->gid != tcred->gid) && 143 cred->gid == tcred->sgid &&
143 !capable(CAP_SYS_PTRACE)) { 144 cred->gid == tcred->gid))
144 rcu_read_unlock(); 145 goto ok;
145 return -EPERM; 146 if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
146 } 147 goto ok;
148 rcu_read_unlock();
149 return -EPERM;
150ok:
147 rcu_read_unlock(); 151 rcu_read_unlock();
148 smp_rmb(); 152 smp_rmb();
149 if (task->mm) 153 if (task->mm)
150 dumpable = get_dumpable(task->mm); 154 dumpable = get_dumpable(task->mm);
151 if (!dumpable && !capable(CAP_SYS_PTRACE)) 155 if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
152 return -EPERM; 156 return -EPERM;
153 157
154 return security_ptrace_access_check(task, mode); 158 return security_ptrace_access_check(task, mode);
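__ptrace_may_access() is now user-namespace aware: matching credentials only count when both tasks share a user namespace, and the capability fallback checks CAP_SYS_PTRACE in the target's namespace via ns_capable()/task_ns_capable(). From user space the failure mode is still a plain EPERM, as in this sketch (not part of the commit):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(int argc, char **argv)
{
	pid_t pid;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	pid = (pid_t)atol(argv[1]);

	if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) < 0) {
		/* EPERM: credentials differ, or no CAP_SYS_PTRACE in the
		 * target's user namespace */
		fprintf(stderr, "attach to %ld: %s\n", (long)pid, strerror(errno));
		return 1;
	}
	waitpid(pid, NULL, 0);		/* wait for the attach stop */
	ptrace(PTRACE_DETACH, pid, NULL, NULL);
	return 0;
}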
@@ -163,7 +167,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
163 return !err; 167 return !err;
164} 168}
165 169
166int ptrace_attach(struct task_struct *task) 170static int ptrace_attach(struct task_struct *task)
167{ 171{
168 int retval; 172 int retval;
169 173
@@ -198,7 +202,7 @@ int ptrace_attach(struct task_struct *task)
198 goto unlock_tasklist; 202 goto unlock_tasklist;
199 203
200 task->ptrace = PT_PTRACED; 204 task->ptrace = PT_PTRACED;
201 if (capable(CAP_SYS_PTRACE)) 205 if (task_ns_capable(task, CAP_SYS_PTRACE))
202 task->ptrace |= PT_PTRACE_CAP; 206 task->ptrace |= PT_PTRACE_CAP;
203 207
204 __ptrace_link(task, current); 208 __ptrace_link(task, current);
@@ -219,7 +223,7 @@ out:
219 * Performs checks and sets PT_PTRACED. 223 * Performs checks and sets PT_PTRACED.
220 * Should be used by all ptrace implementations for PTRACE_TRACEME. 224 * Should be used by all ptrace implementations for PTRACE_TRACEME.
221 */ 225 */
222int ptrace_traceme(void) 226static int ptrace_traceme(void)
223{ 227{
224 int ret = -EPERM; 228 int ret = -EPERM;
225 229
@@ -293,7 +297,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
293 return false; 297 return false;
294} 298}
295 299
296int ptrace_detach(struct task_struct *child, unsigned int data) 300static int ptrace_detach(struct task_struct *child, unsigned int data)
297{ 301{
298 bool dead = false; 302 bool dead = false;
299 303
@@ -876,3 +880,19 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
876 return ret; 880 return ret;
877} 881}
878#endif /* CONFIG_COMPAT */ 882#endif /* CONFIG_COMPAT */
883
884#ifdef CONFIG_HAVE_HW_BREAKPOINT
885int ptrace_get_breakpoints(struct task_struct *tsk)
886{
887 if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt))
888 return 0;
889
890 return -1;
891}
892
893void ptrace_put_breakpoints(struct task_struct *tsk)
894{
895 if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt))
896 flush_ptrace_hw_breakpoint(tsk);
897}
898#endif /* CONFIG_HAVE_HW_BREAKPOINT */
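ptrace_get_breakpoints()/ptrace_put_breakpoints() give architecture ptrace code a refcount to pin a task's hardware-breakpoint state while it is being modified. A hedged sketch of a caller; example_set_hw_breakpoint() and arch_write_dr7() are invented stand-ins for the real per-arch code:

#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/sched.h>

/* Stand-in for the real architecture-specific debug-register update. */
static int arch_write_dr7(struct task_struct *child, unsigned long data)
{
	return 0;
}

static int example_set_hw_breakpoint(struct task_struct *child,
				     unsigned long data)
{
	int ret;

	/* fails once the child has started tearing down its breakpoints */
	if (ptrace_get_breakpoints(child))
		return -ESRCH;

	ret = arch_write_dr7(child, data);

	ptrace_put_breakpoints(child);
	return ret;
}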
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a23a57a976d1..f3240e987928 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -214,11 +214,12 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
214 * Ensure that queued callbacks are all executed. 214 * Ensure that queued callbacks are all executed.
215 * If we detect that we are nested in a RCU read-side critical 215 * If we detect that we are nested in a RCU read-side critical
216 * section, we should simply fail, otherwise we would deadlock. 216 * section, we should simply fail, otherwise we would deadlock.
217 * Note that the machinery to reliably determine whether
218 * or not we are in an RCU read-side critical section
219 * exists only in the preemptible RCU implementations
220 * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why
221 * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT.
217 */ 222 */
218#ifndef CONFIG_PREEMPT
219 WARN_ON(1);
220 return 0;
221#else
222 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 223 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
223 irqs_disabled()) { 224 irqs_disabled()) {
224 WARN_ON(1); 225 WARN_ON(1);
@@ -229,7 +230,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
229 rcu_barrier_bh(); 230 rcu_barrier_bh();
230 debug_object_free(head, &rcuhead_debug_descr); 231 debug_object_free(head, &rcuhead_debug_descr);
231 return 1; 232 return 1;
232#endif
233 default: 233 default:
234 return 0; 234 return 0;
235 } 235 }
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 015abaea962a..3cb8e362e883 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -852,7 +852,7 @@ void exit_rcu(void)
852 if (t->rcu_read_lock_nesting == 0) 852 if (t->rcu_read_lock_nesting == 0)
853 return; 853 return;
854 t->rcu_read_lock_nesting = 1; 854 t->rcu_read_lock_nesting = 1;
855 rcu_read_unlock(); 855 __rcu_read_unlock();
856} 856}
857 857
858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 89613f97ff26..c224da41890c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,7 +47,6 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50#include <linux/sched.h>
51 50
52MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index c7eaa37a768b..34683efa2cce 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
126 pos, buf, s - buf); 126 pos, buf, s - buf);
127} 127}
128 128
129#if BITS_PER_LONG == 32
130u64 res_counter_read_u64(struct res_counter *counter, int member)
131{
132 unsigned long flags;
133 u64 ret;
134
135 spin_lock_irqsave(&counter->lock, flags);
136 ret = *res_counter_member(counter, member);
137 spin_unlock_irqrestore(&counter->lock, flags);
138
139 return ret;
140}
141#else
129u64 res_counter_read_u64(struct res_counter *counter, int member) 142u64 res_counter_read_u64(struct res_counter *counter, int member)
130{ 143{
131 return *res_counter_member(counter, member); 144 return *res_counter_member(counter, member);
132} 145}
146#endif
133 147
134int res_counter_memparse_write_strategy(const char *buf, 148int res_counter_memparse_write_strategy(const char *buf,
135 unsigned long long *res) 149 unsigned long long *res)
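
The res_counter hunk above guards the u64 read with the counter's lock only when BITS_PER_LONG == 32: on a 32-bit machine a 64-bit load is two word-sized loads, so a reader racing with an updater can see a torn, half-updated value. The following is a userspace illustration of the difference using pthreads, not the kernel code; struct counter and the helper names are made up for the example.

#include <pthread.h>
#include <stdint.h>

struct counter {
    pthread_spinlock_t lock;
    uint64_t usage;
};

/* safe on any word size: reader and writer serialize on the lock */
static uint64_t counter_read_locked(struct counter *c)
{
    uint64_t val;

    pthread_spin_lock(&c->lock);
    val = c->usage;
    pthread_spin_unlock(&c->lock);
    return val;
}

/* fine on 64-bit (single aligned load), but may tear on 32-bit */
static uint64_t counter_read_plain(struct counter *c)
{
    return c->usage;
}

int main(void)
{
    struct counter c = { .usage = 0 };

    pthread_spin_init(&c.lock, PTHREAD_PROCESS_PRIVATE);
    c.usage = 1ULL << 40;
    return counter_read_locked(&c) == counter_read_plain(&c) ? 0 : 1;
}
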
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index ddabb54bb5c8..3c7cbc2c33be 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
215 put_pid(waiter->deadlock_task_pid); 215 put_pid(waiter->deadlock_task_pid);
216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); 216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
218 TRACE_WARN_ON(waiter->task);
219 memset(waiter, 0x22, sizeof(*waiter)); 218 memset(waiter, 0x22, sizeof(*waiter));
220} 219}
221 220
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 66cb89bc5ef1..5c9ccd380966 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -9,7 +9,6 @@
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/smp_lock.h>
13#include <linux/spinlock.h> 12#include <linux/spinlock.h>
14#include <linux/sysdev.h> 13#include <linux/sysdev.h>
15#include <linux/timer.h> 14#include <linux/timer.h>
@@ -27,7 +26,6 @@ struct test_thread_data {
27 int opcode; 26 int opcode;
28 int opdata; 27 int opdata;
29 int mutexes[MAX_RT_TEST_MUTEXES]; 28 int mutexes[MAX_RT_TEST_MUTEXES];
30 int bkl;
31 int event; 29 int event;
32 struct sys_device sysdev; 30 struct sys_device sysdev;
33}; 31};
@@ -46,9 +44,8 @@ enum test_opcodes {
46 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ 44 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
47 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ 45 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
48 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ 46 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
49 RTTEST_LOCKBKL, /* 9 Lock BKL */ 47 /* 9, 10 - reserved for BKL commemoration */
50 RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ 48 RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
51 RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
52 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ 49 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
53 RTTEST_RESET = 99, /* 99 Reset all pending operations */ 50 RTTEST_RESET = 99, /* 99 Reset all pending operations */
54}; 51};
@@ -74,13 +71,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
74 td->mutexes[i] = 0; 71 td->mutexes[i] = 0;
75 } 72 }
76 } 73 }
77
78 if (!lockwakeup && td->bkl == 4) {
79#ifdef CONFIG_LOCK_KERNEL
80 unlock_kernel();
81#endif
82 td->bkl = 0;
83 }
84 return 0; 74 return 0;
85 75
86 case RTTEST_RESETEVENT: 76 case RTTEST_RESETEVENT:
@@ -131,25 +121,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
131 td->mutexes[id] = 0; 121 td->mutexes[id] = 0;
132 return 0; 122 return 0;
133 123
134 case RTTEST_LOCKBKL:
135 if (td->bkl)
136 return 0;
137 td->bkl = 1;
138#ifdef CONFIG_LOCK_KERNEL
139 lock_kernel();
140#endif
141 td->bkl = 4;
142 return 0;
143
144 case RTTEST_UNLOCKBKL:
145 if (td->bkl != 4)
146 break;
147#ifdef CONFIG_LOCK_KERNEL
148 unlock_kernel();
149#endif
150 td->bkl = 0;
151 return 0;
152
153 default: 124 default:
154 break; 125 break;
155 } 126 }
@@ -196,7 +167,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
196 td->event = atomic_add_return(1, &rttest_event); 167 td->event = atomic_add_return(1, &rttest_event);
197 break; 168 break;
198 169
199 case RTTEST_LOCKBKL:
200 default: 170 default:
201 break; 171 break;
202 } 172 }
@@ -229,8 +199,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
229 td->event = atomic_add_return(1, &rttest_event); 199 td->event = atomic_add_return(1, &rttest_event);
230 return; 200 return;
231 201
232 case RTTEST_LOCKBKL:
233 return;
234 default: 202 default:
235 return; 203 return;
236 } 204 }
@@ -380,11 +348,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
380 spin_lock(&rttest_lock); 348 spin_lock(&rttest_lock);
381 349
382 curr += sprintf(curr, 350 curr += sprintf(curr,
383 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", 351 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
384 td->opcode, td->event, tsk->state, 352 td->opcode, td->event, tsk->state,
385 (MAX_RT_PRIO - 1) - tsk->prio, 353 (MAX_RT_PRIO - 1) - tsk->prio,
386 (MAX_RT_PRIO - 1) - tsk->normal_prio, 354 (MAX_RT_PRIO - 1) - tsk->normal_prio,
387 tsk->pi_blocked_on, td->bkl); 355 tsk->pi_blocked_on);
388 356
389 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) 357 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
390 curr += sprintf(curr, "%d", td->mutexes[i]); 358 curr += sprintf(curr, "%d", td->mutexes[i]);
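
One detail worth noting in the rtmutex-tester hunk: when the BKL opcodes are removed, their numeric slots (9 and 10) are left reserved and RTTEST_SIGNAL is pinned to 11, so the opcode values that existing test scripts pass in do not shift. A tiny illustrative enum of the same trick (names invented):

enum test_opcodes_sketch {
    OP_NOP = 0,
    OP_LOCK,                 /* 1 */
    /* 2, 3 - reserved for retired opcodes */
    OP_UNLOCK = 4,           /* pinned so later values stay stable */
    OP_RESET = 99,
};
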
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a9604815786a..ab449117aaf2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -20,41 +20,34 @@
20/* 20/*
21 * lock->owner state tracking: 21 * lock->owner state tracking:
22 * 22 *
23 * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 23 * lock->owner holds the task_struct pointer of the owner. Bit 0
24 * are used to keep track of the "owner is pending" and "lock has 24 * is used to keep track of the "lock has waiters" state.
25 * waiters" state.
26 * 25 *
27 * owner bit1 bit0 26 * owner bit0
28 * NULL 0 0 lock is free (fast acquire possible) 27 * NULL 0 lock is free (fast acquire possible)
29 * NULL 0 1 invalid state 28 * NULL 1 lock is free and has waiters and the top waiter
30 * NULL 1 0 Transitional State* 29 * is going to take the lock*
31 * NULL 1 1 invalid state 30 * taskpointer 0 lock is held (fast release possible)
32 * taskpointer 0 0 lock is held (fast release possible) 31 * taskpointer 1 lock is held and has waiters**
33 * taskpointer 0 1 task is pending owner
34 * taskpointer 1 0 lock is held and has waiters
35 * taskpointer 1 1 task is pending owner and lock has more waiters
36 *
37 * Pending ownership is assigned to the top (highest priority)
38 * waiter of the lock, when the lock is released. The thread is woken
39 * up and can now take the lock. Until the lock is taken (bit 0
40 * cleared) a competing higher priority thread can steal the lock
41 * which puts the woken up thread back on the waiters list.
42 * 32 *
43 * The fast atomic compare exchange based acquire and release is only 33 * The fast atomic compare exchange based acquire and release is only
44 * possible when bit 0 and 1 of lock->owner are 0. 34 * possible when bit 0 of lock->owner is 0.
35 *
36 * (*) It also can be a transitional state when grabbing the lock
37 * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
38 * we need to set the bit0 before looking at the lock, and the owner may be
39 * NULL in this small time, hence this can be a transitional state.
45 * 40 *
46 * (*) There's a small time where the owner can be NULL and the 41 * (**) There is a small time when bit 0 is set but there are no
47 * "lock has waiters" bit is set. This can happen when grabbing the lock. 42 * waiters. This can happen when grabbing the lock in the slow path.
48 * To prevent a cmpxchg of the owner releasing the lock, we need to set this 43 * To prevent a cmpxchg of the owner releasing the lock, we need to
49 * bit before looking at the lock, hence the reason this is a transitional 44 * set this bit before looking at the lock.
50 * state.
51 */ 45 */
52 46
53static void 47static void
54rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, 48rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
55 unsigned long mask)
56{ 49{
57 unsigned long val = (unsigned long)owner | mask; 50 unsigned long val = (unsigned long)owner;
58 51
59 if (rt_mutex_has_waiters(lock)) 52 if (rt_mutex_has_waiters(lock))
60 val |= RT_MUTEX_HAS_WAITERS; 53 val |= RT_MUTEX_HAS_WAITERS;
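
The rewritten lock->owner comment above describes a pointer-plus-flag encoding: the owner task pointer and the "has waiters" flag share one word, with bit 0 of the (aligned) pointer carrying the flag. A minimal plain-C sketch of that encoding follows; it is an illustration, not the kernel implementation, and struct task stands in for struct task_struct.

#include <stdint.h>

#define HAS_WAITERS   1UL
#define OWNER_MASKALL 1UL

struct task;                         /* stand-in for struct task_struct */

static uintptr_t encode_owner(struct task *owner, int has_waiters)
{
    return (uintptr_t)owner | (has_waiters ? HAS_WAITERS : 0);
}

static struct task *decode_owner(uintptr_t word)
{
    return (struct task *)(word & ~OWNER_MASKALL);
}

static int lock_has_waiters(uintptr_t word)
{
    return word & HAS_WAITERS;
}

int main(void)
{
    struct task *t = (struct task *)(uintptr_t)0x1000;  /* fake aligned ptr */
    uintptr_t w = encode_owner(t, 1);

    return (decode_owner(w) == t && lock_has_waiters(w)) ? 0 : 1;
}
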
@@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
203 * reached or the state of the chain has changed while we 196 * reached or the state of the chain has changed while we
204 * dropped the locks. 197 * dropped the locks.
205 */ 198 */
206 if (!waiter || !waiter->task) 199 if (!waiter)
207 goto out_unlock_pi; 200 goto out_unlock_pi;
208 201
209 /* 202 /*
210 * Check the orig_waiter state. After we dropped the locks, 203 * Check the orig_waiter state. After we dropped the locks,
211 * the previous owner of the lock might have released the lock 204 * the previous owner of the lock might have released the lock.
212 * and made us the pending owner:
213 */ 205 */
214 if (orig_waiter && !orig_waiter->task) 206 if (orig_waiter && !rt_mutex_owner(orig_lock))
215 goto out_unlock_pi; 207 goto out_unlock_pi;
216 208
217 /* 209 /*
@@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
254 246
255 /* Release the task */ 247 /* Release the task */
256 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 248 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
249 if (!rt_mutex_owner(lock)) {
250 /*
251 * If the requeue above changed the top waiter, then we need
252 * to wake the new top waiter up to try to get the lock.
253 */
254
255 if (top_waiter != rt_mutex_top_waiter(lock))
256 wake_up_process(rt_mutex_top_waiter(lock)->task);
257 raw_spin_unlock(&lock->wait_lock);
258 goto out_put_task;
259 }
257 put_task_struct(task); 260 put_task_struct(task);
258 261
259 /* Grab the next task */ 262 /* Grab the next task */
@@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
296} 299}
297 300
298/* 301/*
299 * Optimization: check if we can steal the lock from the
300 * assigned pending owner [which might not have taken the
301 * lock yet]:
302 */
303static inline int try_to_steal_lock(struct rt_mutex *lock,
304 struct task_struct *task)
305{
306 struct task_struct *pendowner = rt_mutex_owner(lock);
307 struct rt_mutex_waiter *next;
308 unsigned long flags;
309
310 if (!rt_mutex_owner_pending(lock))
311 return 0;
312
313 if (pendowner == task)
314 return 1;
315
316 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
317 if (task->prio >= pendowner->prio) {
318 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 0;
320 }
321
322 /*
323 * Check if a waiter is enqueued on the pending owners
324 * pi_waiters list. Remove it and readjust pending owners
325 * priority.
326 */
327 if (likely(!rt_mutex_has_waiters(lock))) {
328 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
329 return 1;
330 }
331
332 /* No chain handling, pending owner is not blocked on anything: */
333 next = rt_mutex_top_waiter(lock);
334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
335 __rt_mutex_adjust_prio(pendowner);
336 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
337
338 /*
339 * We are going to steal the lock and a waiter was
340 * enqueued on the pending owners pi_waiters queue. So
341 * we have to enqueue this waiter into
342 * task->pi_waiters list. This covers the case,
343 * where task is boosted because it holds another
344 * lock and gets unboosted because the booster is
345 * interrupted, so we would delay a waiter with higher
346 * priority as task->normal_prio.
347 *
348 * Note: in the rare case of a SCHED_OTHER task changing
349 * its priority and thus stealing the lock, next->task
350 * might be task:
351 */
352 if (likely(next->task != task)) {
353 raw_spin_lock_irqsave(&task->pi_lock, flags);
354 plist_add(&next->pi_list_entry, &task->pi_waiters);
355 __rt_mutex_adjust_prio(task);
356 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
357 }
358 return 1;
359}
360
361/*
362 * Try to take an rt-mutex 302 * Try to take an rt-mutex
363 * 303 *
364 * This fails
365 * - when the lock has a real owner
366 * - when a different pending owner exists and has higher priority than current
367 *
368 * Must be called with lock->wait_lock held. 304 * Must be called with lock->wait_lock held.
305 *
306 * @lock: the lock to be acquired.
307 * @task: the task which wants to acquire the lock
308 * @waiter: the waiter that is queued to the lock's wait list. (could be NULL)
369 */ 309 */
370static int try_to_take_rt_mutex(struct rt_mutex *lock) 310static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
311 struct rt_mutex_waiter *waiter)
371{ 312{
372 /* 313 /*
373 * We have to be careful here if the atomic speedups are 314 * We have to be careful here if the atomic speedups are
@@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
390 */ 331 */
391 mark_rt_mutex_waiters(lock); 332 mark_rt_mutex_waiters(lock);
392 333
393 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) 334 if (rt_mutex_owner(lock))
394 return 0; 335 return 0;
395 336
337 /*
338 * It will get the lock because of one of these conditions:
339 * 1) there is no waiter
340 * 2) higher priority than waiters
341 * 3) it is top waiter
342 */
343 if (rt_mutex_has_waiters(lock)) {
344 if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) {
345 if (!waiter || waiter != rt_mutex_top_waiter(lock))
346 return 0;
347 }
348 }
349
350 if (waiter || rt_mutex_has_waiters(lock)) {
351 unsigned long flags;
352 struct rt_mutex_waiter *top;
353
354 raw_spin_lock_irqsave(&task->pi_lock, flags);
355
356 /* remove the queued waiter. */
357 if (waiter) {
358 plist_del(&waiter->list_entry, &lock->wait_list);
359 task->pi_blocked_on = NULL;
360 }
361
362 /*
363 * We have to enqueue the top waiter(if it exists) into
364 * task->pi_waiters list.
365 */
366 if (rt_mutex_has_waiters(lock)) {
367 top = rt_mutex_top_waiter(lock);
368 top->pi_list_entry.prio = top->list_entry.prio;
369 plist_add(&top->pi_list_entry, &task->pi_waiters);
370 }
371 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
372 }
373
396 /* We got the lock. */ 374 /* We got the lock. */
397 debug_rt_mutex_lock(lock); 375 debug_rt_mutex_lock(lock);
398 376
399 rt_mutex_set_owner(lock, current, 0); 377 rt_mutex_set_owner(lock, task);
400 378
401 rt_mutex_deadlock_account_lock(lock, current); 379 rt_mutex_deadlock_account_lock(lock, task);
402 380
403 return 1; 381 return 1;
404} 382}
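
The rewritten try_to_take_rt_mutex() above reduces to a three-way gate: the task gets the lock if there are no waiters, if it has strictly better priority than the current top waiter, or if it is the top waiter itself. A condensed, illustrative version of just that decision (not kernel code; lower numeric prio means higher priority, as in the kernel):

int may_take_lock(int task_prio, int top_waiter_prio,
                  int has_waiters, int is_top_waiter)
{
    if (!has_waiters)
        return 1;                /* 1) nobody else is queued */
    if (task_prio < top_waiter_prio)
        return 1;                /* 2) strictly better than the top waiter */
    return is_top_waiter;        /* 3) it is the top waiter itself */
}
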
@@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
436 414
437 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 415 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
438 416
417 if (!owner)
418 return 0;
419
439 if (waiter == rt_mutex_top_waiter(lock)) { 420 if (waiter == rt_mutex_top_waiter(lock)) {
440 raw_spin_lock_irqsave(&owner->pi_lock, flags); 421 raw_spin_lock_irqsave(&owner->pi_lock, flags);
441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 422 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
@@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
472/* 453/*
473 * Wake up the next waiter on the lock. 454 * Wake up the next waiter on the lock.
474 * 455 *
475 * Remove the top waiter from the current tasks waiter list and from 456 * Remove the top waiter from the current tasks waiter list and wake it up.
476 * the lock waiter list. Set it as pending owner. Then wake it up.
477 * 457 *
478 * Called with lock->wait_lock held. 458 * Called with lock->wait_lock held.
479 */ 459 */
480static void wakeup_next_waiter(struct rt_mutex *lock) 460static void wakeup_next_waiter(struct rt_mutex *lock)
481{ 461{
482 struct rt_mutex_waiter *waiter; 462 struct rt_mutex_waiter *waiter;
483 struct task_struct *pendowner;
484 unsigned long flags; 463 unsigned long flags;
485 464
486 raw_spin_lock_irqsave(&current->pi_lock, flags); 465 raw_spin_lock_irqsave(&current->pi_lock, flags);
487 466
488 waiter = rt_mutex_top_waiter(lock); 467 waiter = rt_mutex_top_waiter(lock);
489 plist_del(&waiter->list_entry, &lock->wait_list);
490 468
491 /* 469 /*
492 * Remove it from current->pi_waiters. We do not adjust a 470 * Remove it from current->pi_waiters. We do not adjust a
@@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
495 * lock->wait_lock. 473 * lock->wait_lock.
496 */ 474 */
497 plist_del(&waiter->pi_list_entry, &current->pi_waiters); 475 plist_del(&waiter->pi_list_entry, &current->pi_waiters);
498 pendowner = waiter->task;
499 waiter->task = NULL;
500 476
501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); 477 rt_mutex_set_owner(lock, NULL);
502 478
503 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 479 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
504 480
505 /* 481 wake_up_process(waiter->task);
506 * Clear the pi_blocked_on variable and enqueue a possible
507 * waiter into the pi_waiters list of the pending owner. This
508 * prevents that in case the pending owner gets unboosted a
509 * waiter with higher priority than pending-owner->normal_prio
510 * is blocked on the unboosted (pending) owner.
511 */
512 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
513
514 WARN_ON(!pendowner->pi_blocked_on);
515 WARN_ON(pendowner->pi_blocked_on != waiter);
516 WARN_ON(pendowner->pi_blocked_on->lock != lock);
517
518 pendowner->pi_blocked_on = NULL;
519
520 if (rt_mutex_has_waiters(lock)) {
521 struct rt_mutex_waiter *next;
522
523 next = rt_mutex_top_waiter(lock);
524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
525 }
526 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
527
528 wake_up_process(pendowner);
529} 482}
530 483
531/* 484/*
532 * Remove a waiter from a lock 485 * Remove a waiter from a lock and give up
533 * 486 *
534 * Must be called with lock->wait_lock held 487 * Must be called with lock->wait_lock held and
488 * have just failed to try_to_take_rt_mutex().
535 */ 489 */
536static void remove_waiter(struct rt_mutex *lock, 490static void remove_waiter(struct rt_mutex *lock,
537 struct rt_mutex_waiter *waiter) 491 struct rt_mutex_waiter *waiter)
@@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock,
543 497
544 raw_spin_lock_irqsave(&current->pi_lock, flags); 498 raw_spin_lock_irqsave(&current->pi_lock, flags);
545 plist_del(&waiter->list_entry, &lock->wait_list); 499 plist_del(&waiter->list_entry, &lock->wait_list);
546 waiter->task = NULL;
547 current->pi_blocked_on = NULL; 500 current->pi_blocked_on = NULL;
548 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 501 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
549 502
550 if (first && owner != current) { 503 if (!owner)
504 return;
505
506 if (first) {
551 507
552 raw_spin_lock_irqsave(&owner->pi_lock, flags); 508 raw_spin_lock_irqsave(&owner->pi_lock, flags);
553 509
@@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task)
614 * or TASK_UNINTERRUPTIBLE) 570 * or TASK_UNINTERRUPTIBLE)
615 * @timeout: the pre-initialized and started timer, or NULL for none 571 * @timeout: the pre-initialized and started timer, or NULL for none
616 * @waiter: the pre-initialized rt_mutex_waiter 572 * @waiter: the pre-initialized rt_mutex_waiter
617 * @detect_deadlock: passed to task_blocks_on_rt_mutex
618 * 573 *
619 * lock->wait_lock must be held by the caller. 574 * lock->wait_lock must be held by the caller.
620 */ 575 */
621static int __sched 576static int __sched
622__rt_mutex_slowlock(struct rt_mutex *lock, int state, 577__rt_mutex_slowlock(struct rt_mutex *lock, int state,
623 struct hrtimer_sleeper *timeout, 578 struct hrtimer_sleeper *timeout,
624 struct rt_mutex_waiter *waiter, 579 struct rt_mutex_waiter *waiter)
625 int detect_deadlock)
626{ 580{
627 int ret = 0; 581 int ret = 0;
628 582
629 for (;;) { 583 for (;;) {
630 /* Try to acquire the lock: */ 584 /* Try to acquire the lock: */
631 if (try_to_take_rt_mutex(lock)) 585 if (try_to_take_rt_mutex(lock, current, waiter))
632 break; 586 break;
633 587
634 /* 588 /*
@@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
645 break; 599 break;
646 } 600 }
647 601
648 /*
649 * waiter->task is NULL the first time we come here and
650 * when we have been woken up by the previous owner
651 * but the lock got stolen by a higher prio task.
652 */
653 if (!waiter->task) {
654 ret = task_blocks_on_rt_mutex(lock, waiter, current,
655 detect_deadlock);
656 /*
657 * If we got woken up by the owner then start loop
658 * all over without going into schedule to try
659 * to get the lock now:
660 */
661 if (unlikely(!waiter->task)) {
662 /*
663 * Reset the return value. We might
664 * have returned with -EDEADLK and the
665 * owner released the lock while we
666 * were walking the pi chain.
667 */
668 ret = 0;
669 continue;
670 }
671 if (unlikely(ret))
672 break;
673 }
674
675 raw_spin_unlock(&lock->wait_lock); 602 raw_spin_unlock(&lock->wait_lock);
676 603
677 debug_rt_mutex_print_deadlock(waiter); 604 debug_rt_mutex_print_deadlock(waiter);
678 605
679 if (waiter->task) 606 schedule_rt_mutex(lock);
680 schedule_rt_mutex(lock);
681 607
682 raw_spin_lock(&lock->wait_lock); 608 raw_spin_lock(&lock->wait_lock);
683 set_current_state(state); 609 set_current_state(state);
@@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
698 int ret = 0; 624 int ret = 0;
699 625
700 debug_rt_mutex_init_waiter(&waiter); 626 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL;
702 627
703 raw_spin_lock(&lock->wait_lock); 628 raw_spin_lock(&lock->wait_lock);
704 629
705 /* Try to acquire the lock again: */ 630 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) { 631 if (try_to_take_rt_mutex(lock, current, NULL)) {
707 raw_spin_unlock(&lock->wait_lock); 632 raw_spin_unlock(&lock->wait_lock);
708 return 0; 633 return 0;
709 } 634 }
@@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
717 timeout->task = NULL; 642 timeout->task = NULL;
718 } 643 }
719 644
720 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, 645 ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
721 detect_deadlock); 646
647 if (likely(!ret))
648 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
722 649
723 set_current_state(TASK_RUNNING); 650 set_current_state(TASK_RUNNING);
724 651
725 if (unlikely(waiter.task)) 652 if (unlikely(ret))
726 remove_waiter(lock, &waiter); 653 remove_waiter(lock, &waiter);
727 654
728 /* 655 /*
@@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
737 if (unlikely(timeout)) 664 if (unlikely(timeout))
738 hrtimer_cancel(&timeout->timer); 665 hrtimer_cancel(&timeout->timer);
739 666
740 /*
741 * Readjust priority, when we did not get the lock. We might
742 * have been the pending owner and boosted. Since we did not
743 * take the lock, the PI boost has to go.
744 */
745 if (unlikely(ret))
746 rt_mutex_adjust_prio(current);
747
748 debug_rt_mutex_free_waiter(&waiter); 667 debug_rt_mutex_free_waiter(&waiter);
749 668
750 return ret; 669 return ret;
@@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
762 681
763 if (likely(rt_mutex_owner(lock) != current)) { 682 if (likely(rt_mutex_owner(lock) != current)) {
764 683
765 ret = try_to_take_rt_mutex(lock); 684 ret = try_to_take_rt_mutex(lock, current, NULL);
766 /* 685 /*
767 * try_to_take_rt_mutex() sets the lock waiters 686 * try_to_take_rt_mutex() sets the lock waiters
768 * bit unconditionally. Clean this up. 687 * bit unconditionally. Clean this up.
@@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
992{ 911{
993 __rt_mutex_init(lock, NULL); 912 __rt_mutex_init(lock, NULL);
994 debug_rt_mutex_proxy_lock(lock, proxy_owner); 913 debug_rt_mutex_proxy_lock(lock, proxy_owner);
995 rt_mutex_set_owner(lock, proxy_owner, 0); 914 rt_mutex_set_owner(lock, proxy_owner);
996 rt_mutex_deadlock_account_lock(lock, proxy_owner); 915 rt_mutex_deadlock_account_lock(lock, proxy_owner);
997} 916}
998 917
@@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
1008 struct task_struct *proxy_owner) 927 struct task_struct *proxy_owner)
1009{ 928{
1010 debug_rt_mutex_proxy_unlock(lock); 929 debug_rt_mutex_proxy_unlock(lock);
1011 rt_mutex_set_owner(lock, NULL, 0); 930 rt_mutex_set_owner(lock, NULL);
1012 rt_mutex_deadlock_account_unlock(proxy_owner); 931 rt_mutex_deadlock_account_unlock(proxy_owner);
1013} 932}
1014 933
@@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1034 953
1035 raw_spin_lock(&lock->wait_lock); 954 raw_spin_lock(&lock->wait_lock);
1036 955
1037 mark_rt_mutex_waiters(lock); 956 if (try_to_take_rt_mutex(lock, task, NULL)) {
1038
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0);
1043 raw_spin_unlock(&lock->wait_lock); 957 raw_spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1; 958 return 1;
1046 } 959 }
1047 960
1048 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 961 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1049 962
1050 if (ret && !waiter->task) { 963 if (ret && !rt_mutex_owner(lock)) {
1051 /* 964 /*
1052 * Reset the return value. We might have 965 * Reset the return value. We might have
1053 * returned with -EDEADLK and the owner 966 * returned with -EDEADLK and the owner
@@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1056 */ 969 */
1057 ret = 0; 970 ret = 0;
1058 } 971 }
972
973 if (unlikely(ret))
974 remove_waiter(lock, waiter);
975
1059 raw_spin_unlock(&lock->wait_lock); 976 raw_spin_unlock(&lock->wait_lock);
1060 977
1061 debug_rt_mutex_print_deadlock(waiter); 978 debug_rt_mutex_print_deadlock(waiter);
@@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1110 1027
1111 set_current_state(TASK_INTERRUPTIBLE); 1028 set_current_state(TASK_INTERRUPTIBLE);
1112 1029
1113 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, 1030 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
1114 detect_deadlock);
1115 1031
1116 set_current_state(TASK_RUNNING); 1032 set_current_state(TASK_RUNNING);
1117 1033
1118 if (unlikely(waiter->task)) 1034 if (unlikely(ret))
1119 remove_waiter(lock, waiter); 1035 remove_waiter(lock, waiter);
1120 1036
1121 /* 1037 /*
@@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1126 1042
1127 raw_spin_unlock(&lock->wait_lock); 1043 raw_spin_unlock(&lock->wait_lock);
1128 1044
1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been
1131 * the pending owner and boosted. Since we did not take the lock, the
1132 * PI boost has to go.
1133 */
1134 if (unlikely(ret))
1135 rt_mutex_adjust_prio(current);
1136
1137 return ret; 1045 return ret;
1138} 1046}
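
Taken together, the rtmutex.c changes above remove the "pending owner" hand-off: on unlock, wakeup_next_waiter() now just clears the owner (leaving the waiters bit set so the fast-path cmpxchg stays disabled) and wakes the top waiter, which then competes for the lock via try_to_take_rt_mutex(). The toy below illustrates only that single-word hand-off with C11 atomics; it is not the kernel code, which additionally manipulates the wait list and PI state under lock->wait_lock, and which re-sets the waiters bit if more waiters remain after the acquisition.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define HAS_WAITERS 1UL

static atomic_uintptr_t owner_word;

static void release_to_waiters(void)
{
    /* leave only the waiters bit: fast-path acquire/release stays off */
    atomic_store(&owner_word, HAS_WAITERS);
    /* ... then wake_up_process(top_waiter) in the real code ... */
}

static int waiter_take(uintptr_t me)
{
    uintptr_t expected = HAS_WAITERS;

    /* the woken waiter competes for the lock rather than being gifted it */
    return atomic_compare_exchange_strong(&owner_word, &expected, me);
}

int main(void)
{
    uintptr_t me = 0x2000;               /* fake, aligned task pointer value */

    atomic_store(&owner_word, 0x1000);   /* some other task owns the lock */
    release_to_waiters();
    printf("waiter got lock: %d\n", waiter_take(me));
    return 0;
}
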
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 97a2f81866af..53a66c85261b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p)
91/* 91/*
92 * lock->owner state tracking: 92 * lock->owner state tracking:
93 */ 93 */
94#define RT_MUTEX_OWNER_PENDING 1UL 94#define RT_MUTEX_HAS_WAITERS 1UL
95#define RT_MUTEX_HAS_WAITERS 2UL 95#define RT_MUTEX_OWNER_MASKALL 1UL
96#define RT_MUTEX_OWNER_MASKALL 3UL
97 96
98static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) 97static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
99{ 98{
@@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
101 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); 100 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
102} 101}
103 102
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{
106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108}
109
110static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
111{
112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
113}
114
115/* 103/*
116 * PI-futex support (proxy locking functions, etc.): 104 * PI-futex support (proxy locking functions, etc.):
117 */ 105 */
diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4ec7ba..312f8b95c2d4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/uaccess.h> 33#include <linux/uaccess.h>
34#include <linux/highmem.h> 34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h> 35#include <asm/mmu_context.h>
37#include <linux/interrupt.h> 36#include <linux/interrupt.h>
38#include <linux/capability.h> 37#include <linux/capability.h>
@@ -324,7 +323,7 @@ struct cfs_rq {
324 * 'curr' points to currently running entity on this cfs_rq. 323 * 'curr' points to currently running entity on this cfs_rq.
325 * It is set to NULL otherwise (i.e when none are currently running). 324 * It is set to NULL otherwise (i.e when none are currently running).
326 */ 325 */
327 struct sched_entity *curr, *next, *last; 326 struct sched_entity *curr, *next, *last, *skip;
328 327
329 unsigned int nr_spread_over; 328 unsigned int nr_spread_over;
330 329
@@ -606,9 +605,6 @@ static inline struct task_group *task_group(struct task_struct *p)
606 struct task_group *tg; 605 struct task_group *tg;
607 struct cgroup_subsys_state *css; 606 struct cgroup_subsys_state *css;
608 607
609 if (p->flags & PF_EXITING)
610 return &root_task_group;
611
612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 608 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
613 lockdep_is_held(&task_rq(p)->lock)); 609 lockdep_is_held(&task_rq(p)->lock));
614 tg = container_of(css, struct task_group, css); 610 tg = container_of(css, struct task_group, css);
@@ -664,10 +660,9 @@ static void update_rq_clock(struct rq *rq)
664#endif 660#endif
665 661
666/** 662/**
667 * runqueue_is_locked 663 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
668 * @cpu: the processor in question. 664 * @cpu: the processor in question.
669 * 665 *
670 * Returns true if the current cpu runqueue is locked.
671 * This interface allows printk to be called with the runqueue lock 666 * This interface allows printk to be called with the runqueue lock
672 * held and know whether or not it is OK to wake up the klogd. 667 * held and know whether or not it is OK to wake up the klogd.
673 */ 668 */
@@ -1686,6 +1681,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1686 __release(rq2->lock); 1681 __release(rq2->lock);
1687} 1682}
1688 1683
1684#else /* CONFIG_SMP */
1685
1686/*
1687 * double_rq_lock - safely lock two runqueues
1688 *
1689 * Note this does not disable interrupts like task_rq_lock,
1690 * you need to do so manually before calling.
1691 */
1692static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1693 __acquires(rq1->lock)
1694 __acquires(rq2->lock)
1695{
1696 BUG_ON(!irqs_disabled());
1697 BUG_ON(rq1 != rq2);
1698 raw_spin_lock(&rq1->lock);
1699 __acquire(rq2->lock); /* Fake it out ;) */
1700}
1701
1702/*
1703 * double_rq_unlock - safely unlock two runqueues
1704 *
1705 * Note this does not restore interrupts like task_rq_unlock,
1706 * you need to do so manually after calling.
1707 */
1708static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1709 __releases(rq1->lock)
1710 __releases(rq2->lock)
1711{
1712 BUG_ON(rq1 != rq2);
1713 raw_spin_unlock(&rq1->lock);
1714 __release(rq2->lock);
1715}
1716
1689#endif 1717#endif
1690 1718
1691static void calc_load_account_idle(struct rq *this_rq); 1719static void calc_load_account_idle(struct rq *this_rq);
@@ -1880,7 +1908,7 @@ void account_system_vtime(struct task_struct *curr)
1880 */ 1908 */
1881 if (hardirq_count()) 1909 if (hardirq_count())
1882 __this_cpu_add(cpu_hardirq_time, delta); 1910 __this_cpu_add(cpu_hardirq_time, delta);
1883 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) 1911 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
1884 __this_cpu_add(cpu_softirq_time, delta); 1912 __this_cpu_add(cpu_softirq_time, delta);
1885 1913
1886 irq_time_write_end(); 1914 irq_time_write_end();
@@ -1920,8 +1948,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
1920 sched_rt_avg_update(rq, irq_delta); 1948 sched_rt_avg_update(rq, irq_delta);
1921} 1949}
1922 1950
1951static int irqtime_account_hi_update(void)
1952{
1953 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1954 unsigned long flags;
1955 u64 latest_ns;
1956 int ret = 0;
1957
1958 local_irq_save(flags);
1959 latest_ns = this_cpu_read(cpu_hardirq_time);
1960 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
1961 ret = 1;
1962 local_irq_restore(flags);
1963 return ret;
1964}
1965
1966static int irqtime_account_si_update(void)
1967{
1968 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1969 unsigned long flags;
1970 u64 latest_ns;
1971 int ret = 0;
1972
1973 local_irq_save(flags);
1974 latest_ns = this_cpu_read(cpu_softirq_time);
1975 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
1976 ret = 1;
1977 local_irq_restore(flags);
1978 return ret;
1979}
1980
1923#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 1981#else /* CONFIG_IRQ_TIME_ACCOUNTING */
1924 1982
1983#define sched_clock_irqtime (0)
1984
1925static void update_rq_clock_task(struct rq *rq, s64 delta) 1985static void update_rq_clock_task(struct rq *rq, s64 delta)
1926{ 1986{
1927 rq->clock_task += delta; 1987 rq->clock_task += delta;
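
The new irqtime_account_hi_update()/irqtime_account_si_update() helpers above decide whether the next tick should be charged to the irq or softirq bucket: they compare the fine-grained per-cpu irq time against what cpustat has already absorbed and report whether any un-accounted time is outstanding. A simplified sketch of that comparison in plain 64-bit nanoseconds (the kernel converts to cputime64 and disables interrupts around the check; the struct and names here are illustrative only):

#include <stdint.h>

struct irq_accounting {
    uint64_t hardirq_time_ns;   /* accumulated by the irq entry/exit hooks */
    uint64_t accounted_ns;      /* amount already folded into cpustat->irq */
};

int hardirq_tick_pending(const struct irq_accounting *a)
{
    /* charge the next tick to the irq bucket only if time is outstanding */
    return a->hardirq_time_ns > a->accounted_ns;
}
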
@@ -2025,14 +2085,14 @@ inline int task_curr(const struct task_struct *p)
2025 2085
2026static inline void check_class_changed(struct rq *rq, struct task_struct *p, 2086static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2027 const struct sched_class *prev_class, 2087 const struct sched_class *prev_class,
2028 int oldprio, int running) 2088 int oldprio)
2029{ 2089{
2030 if (prev_class != p->sched_class) { 2090 if (prev_class != p->sched_class) {
2031 if (prev_class->switched_from) 2091 if (prev_class->switched_from)
2032 prev_class->switched_from(rq, p, running); 2092 prev_class->switched_from(rq, p);
2033 p->sched_class->switched_to(rq, p, running); 2093 p->sched_class->switched_to(rq, p);
2034 } else 2094 } else if (oldprio != p->prio)
2035 p->sched_class->prio_changed(rq, p, oldprio, running); 2095 p->sched_class->prio_changed(rq, p, oldprio);
2036} 2096}
2037 2097
2038static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 2098static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
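
After this hunk, check_class_changed() fires the class-switch callbacks when the scheduling class changed and the priority callback only when the priority actually differs, instead of passing a "running" flag around. An illustrative dispatch with invented struct and function names (not the kernel types):

struct entity;

struct sched_ops {
    void (*switched_from)(struct entity *e);
    void (*switched_to)(struct entity *e);
    void (*prio_changed)(struct entity *e, int oldprio);
};

void notify_change(struct entity *e,
                   const struct sched_ops *prev,
                   const struct sched_ops *cur,
                   int oldprio, int newprio)
{
    if (prev != cur) {
        if (prev->switched_from)
            prev->switched_from(e);
        cur->switched_to(e);
    } else if (oldprio != newprio) {
        cur->prio_changed(e, oldprio);
    }
}
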
@@ -2224,7 +2284,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2224 * yield - it could be a while. 2284 * yield - it could be a while.
2225 */ 2285 */
2226 if (unlikely(on_rq)) { 2286 if (unlikely(on_rq)) {
2227 schedule_timeout_uninterruptible(1); 2287 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2288
2289 set_current_state(TASK_UNINTERRUPTIBLE);
2290 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2228 continue; 2291 continue;
2229 } 2292 }
2230 2293
@@ -2246,7 +2309,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2246 * Cause a process which is running on another CPU to enter 2309 * Cause a process which is running on another CPU to enter
2247 * kernel-mode, without any delay. (to get signals handled.) 2310 * kernel-mode, without any delay. (to get signals handled.)
2248 * 2311 *
2249 * NOTE: this function doesnt have to take the runqueue lock, 2312 * NOTE: this function doesn't have to take the runqueue lock,
2250 * because all it wants to ensure is that the remote task enters 2313 * because all it wants to ensure is that the remote task enters
2251 * the kernel. If the IPI races and the task has been migrated 2314 * the kernel. If the IPI races and the task has been migrated
2252 * to another CPU then no harm is done and the purpose has been 2315 * to another CPU then no harm is done and the purpose has been
@@ -2265,27 +2328,6 @@ void kick_process(struct task_struct *p)
2265EXPORT_SYMBOL_GPL(kick_process); 2328EXPORT_SYMBOL_GPL(kick_process);
2266#endif /* CONFIG_SMP */ 2329#endif /* CONFIG_SMP */
2267 2330
2268/**
2269 * task_oncpu_function_call - call a function on the cpu on which a task runs
2270 * @p: the task to evaluate
2271 * @func: the function to be called
2272 * @info: the function call argument
2273 *
2274 * Calls the function @func when the task is currently running. This might
2275 * be on the current CPU, which just calls the function directly
2276 */
2277void task_oncpu_function_call(struct task_struct *p,
2278 void (*func) (void *info), void *info)
2279{
2280 int cpu;
2281
2282 preempt_disable();
2283 cpu = task_cpu(p);
2284 if (task_curr(p))
2285 smp_call_function_single(cpu, func, info, 1);
2286 preempt_enable();
2287}
2288
2289#ifdef CONFIG_SMP 2331#ifdef CONFIG_SMP
2290/* 2332/*
2291 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2333 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
@@ -2566,6 +2608,7 @@ static void __sched_fork(struct task_struct *p)
2566 p->se.sum_exec_runtime = 0; 2608 p->se.sum_exec_runtime = 0;
2567 p->se.prev_sum_exec_runtime = 0; 2609 p->se.prev_sum_exec_runtime = 0;
2568 p->se.nr_migrations = 0; 2610 p->se.nr_migrations = 0;
2611 p->se.vruntime = 0;
2569 2612
2570#ifdef CONFIG_SCHEDSTATS 2613#ifdef CONFIG_SCHEDSTATS
2571 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2614 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -2776,9 +2819,12 @@ static inline void
2776prepare_task_switch(struct rq *rq, struct task_struct *prev, 2819prepare_task_switch(struct rq *rq, struct task_struct *prev,
2777 struct task_struct *next) 2820 struct task_struct *next)
2778{ 2821{
2822 sched_info_switch(prev, next);
2823 perf_event_task_sched_out(prev, next);
2779 fire_sched_out_preempt_notifiers(prev, next); 2824 fire_sched_out_preempt_notifiers(prev, next);
2780 prepare_lock_switch(rq, next); 2825 prepare_lock_switch(rq, next);
2781 prepare_arch_switch(next); 2826 prepare_arch_switch(next);
2827 trace_sched_switch(prev, next);
2782} 2828}
2783 2829
2784/** 2830/**
@@ -2911,7 +2957,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2911 struct mm_struct *mm, *oldmm; 2957 struct mm_struct *mm, *oldmm;
2912 2958
2913 prepare_task_switch(rq, prev, next); 2959 prepare_task_switch(rq, prev, next);
2914 trace_sched_switch(prev, next); 2960
2915 mm = next->mm; 2961 mm = next->mm;
2916 oldmm = prev->active_mm; 2962 oldmm = prev->active_mm;
2917 /* 2963 /*
@@ -3568,6 +3614,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3568} 3614}
3569 3615
3570/* 3616/*
3617 * Account system cpu time to a process and desired cpustat field
3618 * @p: the process that the cpu time gets accounted to
3619 * @cputime: the cpu time spent in kernel space since the last update
3620 * @cputime_scaled: cputime scaled by cpu frequency
3621 * @target_cputime64: pointer to cpustat field that has to be updated
3622 */
3623static inline
3624void __account_system_time(struct task_struct *p, cputime_t cputime,
3625 cputime_t cputime_scaled, cputime64_t *target_cputime64)
3626{
3627 cputime64_t tmp = cputime_to_cputime64(cputime);
3628
3629 /* Add system time to process. */
3630 p->stime = cputime_add(p->stime, cputime);
3631 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3632 account_group_system_time(p, cputime);
3633
3634 /* Add system time to cpustat. */
3635 *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3636 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3637
3638 /* Account for system time used */
3639 acct_update_integrals(p);
3640}
3641
3642/*
3571 * Account system cpu time to a process. 3643 * Account system cpu time to a process.
3572 * @p: the process that the cpu time gets accounted to 3644 * @p: the process that the cpu time gets accounted to
3573 * @hardirq_offset: the offset to subtract from hardirq_count() 3645 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3578,36 +3650,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3578 cputime_t cputime, cputime_t cputime_scaled) 3650 cputime_t cputime, cputime_t cputime_scaled)
3579{ 3651{
3580 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3652 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3581 cputime64_t tmp; 3653 cputime64_t *target_cputime64;
3582 3654
3583 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3655 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3584 account_guest_time(p, cputime, cputime_scaled); 3656 account_guest_time(p, cputime, cputime_scaled);
3585 return; 3657 return;
3586 } 3658 }
3587 3659
3588 /* Add system time to process. */
3589 p->stime = cputime_add(p->stime, cputime);
3590 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3591 account_group_system_time(p, cputime);
3592
3593 /* Add system time to cpustat. */
3594 tmp = cputime_to_cputime64(cputime);
3595 if (hardirq_count() - hardirq_offset) 3660 if (hardirq_count() - hardirq_offset)
3596 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3661 target_cputime64 = &cpustat->irq;
3597 else if (in_serving_softirq()) 3662 else if (in_serving_softirq())
3598 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3663 target_cputime64 = &cpustat->softirq;
3599 else 3664 else
3600 cpustat->system = cputime64_add(cpustat->system, tmp); 3665 target_cputime64 = &cpustat->system;
3601
3602 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3603 3666
3604 /* Account for system time used */ 3667 __account_system_time(p, cputime, cputime_scaled, target_cputime64);
3605 acct_update_integrals(p);
3606} 3668}
3607 3669
3608/* 3670/*
3609 * Account for involuntary wait time. 3671 * Account for involuntary wait time.
3610 * @steal: the cpu time spent in involuntary wait 3672 * @cputime: the cpu time spent in involuntary wait
3611 */ 3673 */
3612void account_steal_time(cputime_t cputime) 3674void account_steal_time(cputime_t cputime)
3613{ 3675{
@@ -3635,6 +3697,73 @@ void account_idle_time(cputime_t cputime)
3635 3697
3636#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3698#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3637 3699
3700#ifdef CONFIG_IRQ_TIME_ACCOUNTING
3701/*
3702 * Account a tick to a process and cpustat
3703 * @p: the process that the cpu time gets accounted to
3704 * @user_tick: is the tick from userspace
3705 * @rq: the pointer to rq
3706 *
3707 * Tick demultiplexing follows the order
3708 * - pending hardirq update
3709 * - pending softirq update
3710 * - user_time
3711 * - idle_time
3712 * - system time
3713 * - check for guest_time
3714 * - else account as system_time
3715 *
3716 * Check for hardirq is done both for system and user time as there is
3717 * no timer going off while we are on hardirq and hence we may never get an
3718 * opportunity to update it solely in system time.
3719 * p->stime and friends are only updated on system time and not on irq
3720 * softirq as those do not count in task exec_runtime any more.
3721 */
3722static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3723 struct rq *rq)
3724{
3725 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3726 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3727 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3728
3729 if (irqtime_account_hi_update()) {
3730 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3731 } else if (irqtime_account_si_update()) {
3732 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3733 } else if (this_cpu_ksoftirqd() == p) {
3734 /*
3735 * ksoftirqd time do not get accounted in cpu_softirq_time.
3736 * So, we have to handle it separately here.
3737 * Also, p->stime needs to be updated for ksoftirqd.
3738 */
3739 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3740 &cpustat->softirq);
3741 } else if (user_tick) {
3742 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3743 } else if (p == rq->idle) {
3744 account_idle_time(cputime_one_jiffy);
3745 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3746 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3747 } else {
3748 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3749 &cpustat->system);
3750 }
3751}
3752
3753static void irqtime_account_idle_ticks(int ticks)
3754{
3755 int i;
3756 struct rq *rq = this_rq();
3757
3758 for (i = 0; i < ticks; i++)
3759 irqtime_account_process_tick(current, 0, rq);
3760}
3761#else /* CONFIG_IRQ_TIME_ACCOUNTING */
3762static void irqtime_account_idle_ticks(int ticks) {}
3763static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3764 struct rq *rq) {}
3765#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3766
3638/* 3767/*
3639 * Account a single tick of cpu time. 3768 * Account a single tick of cpu time.
3640 * @p: the process that the cpu time gets accounted to 3769 * @p: the process that the cpu time gets accounted to
@@ -3645,6 +3774,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
3645 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3774 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3646 struct rq *rq = this_rq(); 3775 struct rq *rq = this_rq();
3647 3776
3777 if (sched_clock_irqtime) {
3778 irqtime_account_process_tick(p, user_tick, rq);
3779 return;
3780 }
3781
3648 if (user_tick) 3782 if (user_tick)
3649 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3783 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3650 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 3784 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3670,6 +3804,12 @@ void account_steal_ticks(unsigned long ticks)
3670 */ 3804 */
3671void account_idle_ticks(unsigned long ticks) 3805void account_idle_ticks(unsigned long ticks)
3672{ 3806{
3807
3808 if (sched_clock_irqtime) {
3809 irqtime_account_idle_ticks(ticks);
3810 return;
3811 }
3812
3673 account_idle_time(jiffies_to_cputime(ticks)); 3813 account_idle_time(jiffies_to_cputime(ticks));
3674} 3814}
3675 3815
@@ -3945,9 +4085,6 @@ need_resched:
3945 rcu_note_context_switch(cpu); 4085 rcu_note_context_switch(cpu);
3946 prev = rq->curr; 4086 prev = rq->curr;
3947 4087
3948 release_kernel_lock(prev);
3949need_resched_nonpreemptible:
3950
3951 schedule_debug(prev); 4088 schedule_debug(prev);
3952 4089
3953 if (sched_feat(HRTICK)) 4090 if (sched_feat(HRTICK))
@@ -3974,6 +4111,16 @@ need_resched_nonpreemptible:
3974 try_to_wake_up_local(to_wakeup); 4111 try_to_wake_up_local(to_wakeup);
3975 } 4112 }
3976 deactivate_task(rq, prev, DEQUEUE_SLEEP); 4113 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4114
4115 /*
4116 * If we are going to sleep and we have plugged IO queued, make
4117 * sure to submit it to avoid deadlocks.
4118 */
4119 if (blk_needs_flush_plug(prev)) {
4120 raw_spin_unlock(&rq->lock);
4121 blk_schedule_flush_plug(prev);
4122 raw_spin_lock(&rq->lock);
4123 }
3977 } 4124 }
3978 switch_count = &prev->nvcsw; 4125 switch_count = &prev->nvcsw;
3979 } 4126 }
@@ -3989,9 +4136,6 @@ need_resched_nonpreemptible:
3989 rq->skip_clock_update = 0; 4136 rq->skip_clock_update = 0;
3990 4137
3991 if (likely(prev != next)) { 4138 if (likely(prev != next)) {
3992 sched_info_switch(prev, next);
3993 perf_event_task_sched_out(prev, next);
3994
3995 rq->nr_switches++; 4139 rq->nr_switches++;
3996 rq->curr = next; 4140 rq->curr = next;
3997 ++*switch_count; 4141 ++*switch_count;
@@ -4010,9 +4154,6 @@ need_resched_nonpreemptible:
4010 4154
4011 post_schedule(rq); 4155 post_schedule(rq);
4012 4156
4013 if (unlikely(reacquire_kernel_lock(prev)))
4014 goto need_resched_nonpreemptible;
4015
4016 preempt_enable_no_resched(); 4157 preempt_enable_no_resched();
4017 if (need_resched()) 4158 if (need_resched())
4018 goto need_resched; 4159 goto need_resched;
@@ -4213,6 +4354,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4213{ 4354{
4214 __wake_up_common(q, mode, 1, 0, key); 4355 __wake_up_common(q, mode, 1, 0, key);
4215} 4356}
4357EXPORT_SYMBOL_GPL(__wake_up_locked_key);
4216 4358
4217/** 4359/**
4218 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 4360 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4570,11 +4712,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4570 4712
4571 if (running) 4713 if (running)
4572 p->sched_class->set_curr_task(rq); 4714 p->sched_class->set_curr_task(rq);
4573 if (on_rq) { 4715 if (on_rq)
4574 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4716 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4575 4717
4576 check_class_changed(rq, p, prev_class, oldprio, running); 4718 check_class_changed(rq, p, prev_class, oldprio);
4577 }
4578 task_rq_unlock(rq, &flags); 4719 task_rq_unlock(rq, &flags);
4579} 4720}
4580 4721
@@ -4761,8 +4902,11 @@ static bool check_same_owner(struct task_struct *p)
4761 4902
4762 rcu_read_lock(); 4903 rcu_read_lock();
4763 pcred = __task_cred(p); 4904 pcred = __task_cred(p);
4764 match = (cred->euid == pcred->euid || 4905 if (cred->user->user_ns == pcred->user->user_ns)
4765 cred->euid == pcred->uid); 4906 match = (cred->euid == pcred->euid ||
4907 cred->euid == pcred->uid);
4908 else
4909 match = false;
4766 rcu_read_unlock(); 4910 rcu_read_unlock();
4767 return match; 4911 return match;
4768} 4912}
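
The check_same_owner() hunk above adds a user-namespace guard: uid/euid values are only comparable when both tasks live in the same user namespace, and the ownership check simply fails across namespaces. A short sketch of that rule with illustrative types (not the kernel's struct cred):

#include <stdbool.h>

struct user_ns;

struct creds {
    unsigned int uid, euid;
    const struct user_ns *ns;
};

bool same_owner(const struct creds *me, const struct creds *other)
{
    if (me->ns != other->ns)
        return false;                   /* different uid number spaces */
    return me->euid == other->euid || me->euid == other->uid;
}
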
@@ -4822,12 +4966,15 @@ recheck:
4822 param->sched_priority > rlim_rtprio) 4966 param->sched_priority > rlim_rtprio)
4823 return -EPERM; 4967 return -EPERM;
4824 } 4968 }
4969
4825 /* 4970 /*
4826 * Like positive nice levels, dont allow tasks to 4971 * Treat SCHED_IDLE as nice 20. Only allow a switch to
4827 * move out of SCHED_IDLE either: 4972 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4828 */ 4973 */
4829 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 4974 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4830 return -EPERM; 4975 if (!can_nice(p, TASK_NICE(p)))
4976 return -EPERM;
4977 }
4831 4978
4832 /* can't change other user's priorities */ 4979 /* can't change other user's priorities */
4833 if (!check_same_owner(p)) 4980 if (!check_same_owner(p))
@@ -4850,7 +4997,7 @@ recheck:
4850 */ 4997 */
4851 raw_spin_lock_irqsave(&p->pi_lock, flags); 4998 raw_spin_lock_irqsave(&p->pi_lock, flags);
4852 /* 4999 /*
4853 * To be able to change p->policy safely, the apropriate 5000 * To be able to change p->policy safely, the appropriate
4854 * runqueue lock must be held. 5001 * runqueue lock must be held.
4855 */ 5002 */
4856 rq = __task_rq_lock(p); 5003 rq = __task_rq_lock(p);
@@ -4864,6 +5011,17 @@ recheck:
4864 return -EINVAL; 5011 return -EINVAL;
4865 } 5012 }
4866 5013
5014 /*
5015 * If not changing anything there's no need to proceed further:
5016 */
5017 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
5018 param->sched_priority == p->rt_priority))) {
5019
5020 __task_rq_unlock(rq);
5021 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5022 return 0;
5023 }
5024
4867#ifdef CONFIG_RT_GROUP_SCHED 5025#ifdef CONFIG_RT_GROUP_SCHED
4868 if (user) { 5026 if (user) {
4869 /* 5027 /*
@@ -4902,11 +5060,10 @@ recheck:
4902 5060
4903 if (running) 5061 if (running)
4904 p->sched_class->set_curr_task(rq); 5062 p->sched_class->set_curr_task(rq);
4905 if (on_rq) { 5063 if (on_rq)
4906 activate_task(rq, p, 0); 5064 activate_task(rq, p, 0);
4907 5065
4908 check_class_changed(rq, p, prev_class, oldprio, running); 5066 check_class_changed(rq, p, prev_class, oldprio);
4909 }
4910 __task_rq_unlock(rq); 5067 __task_rq_unlock(rq);
4911 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5068 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4912 5069
@@ -5088,7 +5245,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5088 goto out_free_cpus_allowed; 5245 goto out_free_cpus_allowed;
5089 } 5246 }
5090 retval = -EPERM; 5247 retval = -EPERM;
5091 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5248 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
5092 goto out_unlock; 5249 goto out_unlock;
5093 5250
5094 retval = security_task_setscheduler(p); 5251 retval = security_task_setscheduler(p);
@@ -5323,6 +5480,67 @@ void __sched yield(void)
5323} 5480}
5324EXPORT_SYMBOL(yield); 5481EXPORT_SYMBOL(yield);
5325 5482
5483/**
5484 * yield_to - yield the current processor to another thread in
5485 * your thread group, or accelerate that thread toward the
5486 * processor it's on.
5487 * @p: target task
5488 * @preempt: whether task preemption is allowed or not
5489 *
5490 * It's the caller's job to ensure that the target task struct
5491 * can't go away on us before we can do any checks.
5492 *
5493 * Returns true if we indeed boosted the target task.
5494 */
5495bool __sched yield_to(struct task_struct *p, bool preempt)
5496{
5497 struct task_struct *curr = current;
5498 struct rq *rq, *p_rq;
5499 unsigned long flags;
5500 bool yielded = 0;
5501
5502 local_irq_save(flags);
5503 rq = this_rq();
5504
5505again:
5506 p_rq = task_rq(p);
5507 double_rq_lock(rq, p_rq);
5508 while (task_rq(p) != p_rq) {
5509 double_rq_unlock(rq, p_rq);
5510 goto again;
5511 }
5512
5513 if (!curr->sched_class->yield_to_task)
5514 goto out;
5515
5516 if (curr->sched_class != p->sched_class)
5517 goto out;
5518
5519 if (task_running(p_rq, p) || p->state)
5520 goto out;
5521
5522 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5523 if (yielded) {
5524 schedstat_inc(rq, yld_count);
5525 /*
5526 * Make p's CPU reschedule; pick_next_entity takes care of
5527 * fairness.
5528 */
5529 if (preempt && rq != p_rq)
5530 resched_task(p_rq->curr);
5531 }
5532
5533out:
5534 double_rq_unlock(rq, p_rq);
5535 local_irq_restore(flags);
5536
5537 if (yielded)
5538 schedule();
5539
5540 return yielded;
5541}
5542EXPORT_SYMBOL_GPL(yield_to);
5543
5326/* 5544/*
5327 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5545 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5328 * that process accounting knows that this is a task in IO wait state. 5546 * that process accounting knows that this is a task in IO wait state.
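
The new yield_to() above uses a lock-then-recheck loop to pin down the target's runqueue: task_rq(p) can change between reading it and locking it, so both runqueues are locked, the read is repeated, and the whole step retried if the task migrated in between. The pthread-based sketch below shows only that retry idiom; it is a userspace analogue with invented types (struct rq, struct task, lock_pair), not the kernel's double_rq_lock machinery.

#include <pthread.h>
#include <stdatomic.h>

struct rq {
    pthread_mutex_t lock;
};

struct task {
    _Atomic(struct rq *) rq;        /* which runqueue the task is on now */
};

static void lock_pair(struct rq *a, struct rq *b)
{
    /* lock in address order to avoid deadlock; same queue: lock once */
    if (a == b) {
        pthread_mutex_lock(&a->lock);
    } else if (a < b) {
        pthread_mutex_lock(&a->lock);
        pthread_mutex_lock(&b->lock);
    } else {
        pthread_mutex_lock(&b->lock);
        pthread_mutex_lock(&a->lock);
    }
}

static void unlock_pair(struct rq *a, struct rq *b)
{
    pthread_mutex_unlock(&a->lock);
    if (a != b)
        pthread_mutex_unlock(&b->lock);
}

static struct rq *lock_task_rq(struct rq *my_rq, struct task *p)
{
    struct rq *p_rq;

    for (;;) {
        p_rq = atomic_load(&p->rq);     /* racy read of the target's rq */
        lock_pair(my_rq, p_rq);
        if (atomic_load(&p->rq) == p_rq)
            return p_rq;                /* still on the rq we locked */
        unlock_pair(my_rq, p_rq);       /* it migrated meanwhile: retry */
    }
}

int main(void)
{
    struct rq rq0 = { PTHREAD_MUTEX_INITIALIZER };
    struct rq rq1 = { PTHREAD_MUTEX_INITIALIZER };
    struct task t = { &rq1 };
    struct rq *locked = lock_task_rq(&rq0, &t);

    unlock_pair(&rq0, locked);
    return locked == &rq1 ? 0 : 1;
}
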
@@ -5333,6 +5551,7 @@ void __sched io_schedule(void)
5333 5551
5334 delayacct_blkio_start(); 5552 delayacct_blkio_start();
5335 atomic_inc(&rq->nr_iowait); 5553 atomic_inc(&rq->nr_iowait);
5554 blk_flush_plug(current);
5336 current->in_iowait = 1; 5555 current->in_iowait = 1;
5337 schedule(); 5556 schedule();
5338 current->in_iowait = 0; 5557 current->in_iowait = 0;
@@ -5348,6 +5567,7 @@ long __sched io_schedule_timeout(long timeout)
5348 5567
5349 delayacct_blkio_start(); 5568 delayacct_blkio_start();
5350 atomic_inc(&rq->nr_iowait); 5569 atomic_inc(&rq->nr_iowait);
5570 blk_flush_plug(current);
5351 current->in_iowait = 1; 5571 current->in_iowait = 1;
5352 ret = schedule_timeout(timeout); 5572 ret = schedule_timeout(timeout);
5353 current->in_iowait = 0; 5573 current->in_iowait = 0;
@@ -5496,7 +5716,7 @@ void show_state_filter(unsigned long state_filter)
5496 do_each_thread(g, p) { 5716 do_each_thread(g, p) {
5497 /* 5717 /*
5498 * reset the NMI-timeout, listing all files on a slow 5718 * reset the NMI-timeout, listing all files on a slow
5499 * console might take alot of time: 5719 * console might take a lot of time:
5500 */ 5720 */
5501 touch_nmi_watchdog(); 5721 touch_nmi_watchdog();
5502 if (!state_filter || (p->state & state_filter)) 5722 if (!state_filter || (p->state & state_filter))
@@ -5571,7 +5791,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5571 * The idle tasks have their own, simple scheduling class: 5791 * The idle tasks have their own, simple scheduling class:
5572 */ 5792 */
5573 idle->sched_class = &idle_sched_class; 5793 idle->sched_class = &idle_sched_class;
5574 ftrace_graph_init_task(idle); 5794 ftrace_graph_init_idle_task(idle, cpu);
5575} 5795}
5576 5796
5577/* 5797/*
@@ -6111,6 +6331,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6111 break; 6331 break;
6112#endif 6332#endif
6113 } 6333 }
6334
6335 update_max_interval();
6336
6114 return NOTIFY_OK; 6337 return NOTIFY_OK;
6115} 6338}
6116 6339
@@ -7796,6 +8019,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7796 INIT_LIST_HEAD(&cfs_rq->tasks); 8019 INIT_LIST_HEAD(&cfs_rq->tasks);
7797#ifdef CONFIG_FAIR_GROUP_SCHED 8020#ifdef CONFIG_FAIR_GROUP_SCHED
7798 cfs_rq->rq = rq; 8021 cfs_rq->rq = rq;
8022 /* allow initial update_cfs_load() to truncate */
8023#ifdef CONFIG_SMP
8024 cfs_rq->load_stamp = 1;
8025#endif
7799#endif 8026#endif
7800 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 8027 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7801} 8028}
@@ -8074,7 +8301,7 @@ static inline int preempt_count_equals(int preempt_offset)
8074{ 8301{
8075 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8302 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
8076 8303
8077 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 8304 return (nested == preempt_offset);
8078} 8305}
8079 8306
8080void __might_sleep(const char *file, int line, int preempt_offset) 8307void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8109,6 +8336,8 @@ EXPORT_SYMBOL(__might_sleep);
8109#ifdef CONFIG_MAGIC_SYSRQ 8336#ifdef CONFIG_MAGIC_SYSRQ
8110static void normalize_task(struct rq *rq, struct task_struct *p) 8337static void normalize_task(struct rq *rq, struct task_struct *p)
8111{ 8338{
8339 const struct sched_class *prev_class = p->sched_class;
8340 int old_prio = p->prio;
8112 int on_rq; 8341 int on_rq;
8113 8342
8114 on_rq = p->se.on_rq; 8343 on_rq = p->se.on_rq;
@@ -8119,6 +8348,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8119 activate_task(rq, p, 0); 8348 activate_task(rq, p, 0);
8120 resched_task(rq->curr); 8349 resched_task(rq->curr);
8121 } 8350 }
8351
8352 check_class_changed(rq, p, prev_class, old_prio);
8122} 8353}
8123 8354
8124void normalize_rt_tasks(void) 8355void normalize_rt_tasks(void)
@@ -8234,7 +8465,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8234{ 8465{
8235 struct cfs_rq *cfs_rq; 8466 struct cfs_rq *cfs_rq;
8236 struct sched_entity *se; 8467 struct sched_entity *se;
8237 struct rq *rq;
8238 int i; 8468 int i;
8239 8469
8240 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8470 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8247,8 +8477,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8247 tg->shares = NICE_0_LOAD; 8477 tg->shares = NICE_0_LOAD;
8248 8478
8249 for_each_possible_cpu(i) { 8479 for_each_possible_cpu(i) {
8250 rq = cpu_rq(i);
8251
8252 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8480 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8253 GFP_KERNEL, cpu_to_node(i)); 8481 GFP_KERNEL, cpu_to_node(i));
8254 if (!cfs_rq) 8482 if (!cfs_rq)
@@ -8510,7 +8738,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8510 /* Propagate contribution to hierarchy */ 8738 /* Propagate contribution to hierarchy */
8511 raw_spin_lock_irqsave(&rq->lock, flags); 8739 raw_spin_lock_irqsave(&rq->lock, flags);
8512 for_each_sched_entity(se) 8740 for_each_sched_entity(se)
8513 update_cfs_shares(group_cfs_rq(se), 0); 8741 update_cfs_shares(group_cfs_rq(se));
8514 raw_spin_unlock_irqrestore(&rq->lock, flags); 8742 raw_spin_unlock_irqrestore(&rq->lock, flags);
8515 } 8743 }
8516 8744
@@ -8884,7 +9112,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8884} 9112}
8885 9113
8886static void 9114static void
8887cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) 9115cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
9116 struct cgroup *old_cgrp, struct task_struct *task)
8888{ 9117{
8889 /* 9118 /*
8890 * cgroup_exit() is called in the copy_process() failure path. 9119 * cgroup_exit() is called in the copy_process() failure path.
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 9fb656283157..429242f3c484 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -12,7 +12,6 @@ static atomic_t autogroup_seq_nr;
12static void __init autogroup_init(struct task_struct *init_task) 12static void __init autogroup_init(struct task_struct *init_task)
13{ 13{
14 autogroup_default.tg = &root_task_group; 14 autogroup_default.tg = &root_task_group;
15 root_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref); 15 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock); 16 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default; 17 init_task->signal->autogroup = &autogroup_default;
@@ -130,7 +129,7 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
130 129
131static inline bool task_group_is_autogroup(struct task_group *tg) 130static inline bool task_group_is_autogroup(struct task_group *tg)
132{ 131{
133 return tg != &root_task_group && tg->autogroup; 132 return !!tg->autogroup;
134} 133}
135 134
136static inline struct task_group * 135static inline struct task_group *
@@ -161,11 +160,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
161 160
162 p->signal->autogroup = autogroup_kref_get(ag); 161 p->signal->autogroup = autogroup_kref_get(ag);
163 162
163 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
164 goto out;
165
164 t = p; 166 t = p;
165 do { 167 do {
166 sched_move_task(t); 168 sched_move_task(t);
167 } while_each_thread(p, t); 169 } while_each_thread(p, t);
168 170
171out:
169 unlock_task_sighand(p, &flags); 172 unlock_task_sighand(p, &flags);
170 autogroup_kref_put(prev); 173 autogroup_kref_put(prev);
171} 174}
@@ -176,7 +179,7 @@ void sched_autogroup_create_attach(struct task_struct *p)
176 struct autogroup *ag = autogroup_create(); 179 struct autogroup *ag = autogroup_create();
177 180
178 autogroup_move_group(p, ag); 181 autogroup_move_group(p, ag);
179 /* drop extra refrence added by autogroup_create() */ 182 /* drop extra reference added by autogroup_create() */
180 autogroup_kref_put(ag); 183 autogroup_kref_put(ag);
181} 184}
182EXPORT_SYMBOL(sched_autogroup_create_attach); 185EXPORT_SYMBOL(sched_autogroup_create_attach);
@@ -247,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
247{ 250{
248 struct autogroup *ag = autogroup_task_get(p); 251 struct autogroup *ag = autogroup_task_get(p);
249 252
253 if (!task_group_is_autogroup(ag->tg))
254 goto out;
255
250 down_read(&ag->lock); 256 down_read(&ag->lock);
251 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); 257 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
252 up_read(&ag->lock); 258 up_read(&ag->lock);
253 259
260out:
254 autogroup_kref_put(ag); 261 autogroup_kref_put(ag);
255} 262}
256#endif /* CONFIG_PROC_FS */ 263#endif /* CONFIG_PROC_FS */
@@ -258,9 +265,7 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
258#ifdef CONFIG_SCHED_DEBUG 265#ifdef CONFIG_SCHED_DEBUG
259static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
260{ 267{
261 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); 268 if (!task_group_is_autogroup(tg))
262
263 if (!enabled || !tg->autogroup)
264 return 0; 269 return 0;
265 270
266 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 271 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 7b859ffe5dad..05577055cfca 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -1,6 +1,11 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3struct autogroup { 3struct autogroup {
4 /*
 5 * The reference count does not track how many threads are
 6 * attached to this autogroup right now; it is the number of
 7 * tasks that could use this autogroup.
8 */
4 struct kref kref; 9 struct kref kref;
5 struct task_group *tg; 10 struct task_group *tg;
6 struct rw_semaphore lock; 11 struct rw_semaphore lock;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index eb6cb8edd075..7bacd83a4158 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
179 179
180 raw_spin_lock_irqsave(&rq->lock, flags); 180 raw_spin_lock_irqsave(&rq->lock, flags);
181 if (cfs_rq->rb_leftmost) 181 if (cfs_rq->rb_leftmost)
182 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 182 MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
183 last = __pick_last_entity(cfs_rq); 183 last = __pick_last_entity(cfs_rq);
184 if (last) 184 if (last)
185 max_vruntime = last->vruntime; 185 max_vruntime = last->vruntime;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c26e2df450e..6fa833ab2cb8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h>
25 26
26/* 27/*
27 * Targeted preemption latency for CPU-bound tasks: 28 * Targeted preemption latency for CPU-bound tasks:
@@ -69,14 +70,6 @@ static unsigned int sched_nr_latency = 8;
69unsigned int sysctl_sched_child_runs_first __read_mostly; 70unsigned int sysctl_sched_child_runs_first __read_mostly;
70 71
71/* 72/*
72 * sys_sched_yield() compat mode
73 *
74 * This option switches the agressive yield implementation of the
75 * old scheduler back on.
76 */
77unsigned int __read_mostly sysctl_sched_compat_yield;
78
79/*
80 * SCHED_OTHER wake-up granularity. 73 * SCHED_OTHER wake-up granularity.
81 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 74 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
82 * 75 *
@@ -419,7 +412,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
419 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 412 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
420} 413}
421 414
422static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 415static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
423{ 416{
424 struct rb_node *left = cfs_rq->rb_leftmost; 417 struct rb_node *left = cfs_rq->rb_leftmost;
425 418
@@ -429,6 +422,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
429 return rb_entry(left, struct sched_entity, run_node); 422 return rb_entry(left, struct sched_entity, run_node);
430} 423}
431 424
425static struct sched_entity *__pick_next_entity(struct sched_entity *se)
426{
427 struct rb_node *next = rb_next(&se->run_node);
428
429 if (!next)
430 return NULL;
431
432 return rb_entry(next, struct sched_entity, run_node);
433}
434
435#ifdef CONFIG_SCHED_DEBUG
432static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 436static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
433{ 437{
434 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 438 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -443,7 +447,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
443 * Scheduling class statistics methods: 447 * Scheduling class statistics methods:
444 */ 448 */
445 449
446#ifdef CONFIG_SCHED_DEBUG
447int sched_proc_update_handler(struct ctl_table *table, int write, 450int sched_proc_update_handler(struct ctl_table *table, int write,
448 void __user *buffer, size_t *lenp, 451 void __user *buffer, size_t *lenp,
449 loff_t *ppos) 452 loff_t *ppos)
@@ -540,7 +543,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
540} 543}
541 544
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); 545static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); 546static void update_cfs_shares(struct cfs_rq *cfs_rq);
544 547
545/* 548/*
546 * Update the current task's runtime statistics. Skip current tasks that 549 * Update the current task's runtime statistics. Skip current tasks that
@@ -733,6 +736,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
733 now - cfs_rq->load_last > 4 * period) { 736 now - cfs_rq->load_last > 4 * period) {
734 cfs_rq->load_period = 0; 737 cfs_rq->load_period = 0;
735 cfs_rq->load_avg = 0; 738 cfs_rq->load_avg = 0;
739 delta = period - 1;
736 } 740 }
737 741
738 cfs_rq->load_stamp = now; 742 cfs_rq->load_stamp = now;
@@ -763,16 +767,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
763 list_del_leaf_cfs_rq(cfs_rq); 767 list_del_leaf_cfs_rq(cfs_rq);
764} 768}
765 769
766static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, 770static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
767 long weight_delta)
768{ 771{
769 long load_weight, load, shares; 772 long load_weight, load, shares;
770 773
771 load = cfs_rq->load.weight + weight_delta; 774 load = cfs_rq->load.weight;
772 775
773 load_weight = atomic_read(&tg->load_weight); 776 load_weight = atomic_read(&tg->load_weight);
774 load_weight -= cfs_rq->load_contribution;
775 load_weight += load; 777 load_weight += load;
778 load_weight -= cfs_rq->load_contribution;
776 779
777 shares = (tg->shares * load); 780 shares = (tg->shares * load);
778 if (load_weight) 781 if (load_weight)
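
With the weight_delta argument gone, calc_cfs_shares() above works purely from the cfs_rq's current load. A worked example of the resulting arithmetic, using invented numbers rather than anything measured:

/* Invented numbers; only the formula mirrors calc_cfs_shares() above. */
#include <stdio.h>

int main(void)
{
	long tg_shares      = 1024;	/* tg->shares */
	long tg_load_weight = 3072;	/* atomic_read(&tg->load_weight) */
	long load           = 2048;	/* cfs_rq->load.weight */
	long contribution   = 1024;	/* cfs_rq->load_contribution */

	long load_weight = tg_load_weight + load - contribution;	/* 4096 */
	long shares = tg_shares * load;
	if (load_weight)
		shares /= load_weight;

	printf("shares = %ld\n", shares);	/* 1024 * 2048 / 4096 = 512 */
	return 0;
}

So a group entity backing half of the group-wide weight ends up with roughly half of tg->shares, before any clamping applied elsewhere.
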
@@ -790,7 +793,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
790{ 793{
791 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { 794 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
792 update_cfs_load(cfs_rq, 0); 795 update_cfs_load(cfs_rq, 0);
793 update_cfs_shares(cfs_rq, 0); 796 update_cfs_shares(cfs_rq);
794 } 797 }
795} 798}
796# else /* CONFIG_SMP */ 799# else /* CONFIG_SMP */
@@ -798,8 +801,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
798{ 801{
799} 802}
800 803
801static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, 804static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
802 long weight_delta)
803{ 805{
804 return tg->shares; 806 return tg->shares;
805} 807}
@@ -824,7 +826,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
824 account_entity_enqueue(cfs_rq, se); 826 account_entity_enqueue(cfs_rq, se);
825} 827}
826 828
827static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) 829static void update_cfs_shares(struct cfs_rq *cfs_rq)
828{ 830{
829 struct task_group *tg; 831 struct task_group *tg;
830 struct sched_entity *se; 832 struct sched_entity *se;
@@ -838,7 +840,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
838 if (likely(se->load.weight == tg->shares)) 840 if (likely(se->load.weight == tg->shares))
839 return; 841 return;
840#endif 842#endif
841 shares = calc_cfs_shares(cfs_rq, tg, weight_delta); 843 shares = calc_cfs_shares(cfs_rq, tg);
842 844
843 reweight_entity(cfs_rq_of(se), se, shares); 845 reweight_entity(cfs_rq_of(se), se, shares);
844} 846}
@@ -847,7 +849,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
847{ 849{
848} 850}
849 851
850static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) 852static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
851{ 853{
852} 854}
853 855
@@ -978,8 +980,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
978 */ 980 */
979 update_curr(cfs_rq); 981 update_curr(cfs_rq);
980 update_cfs_load(cfs_rq, 0); 982 update_cfs_load(cfs_rq, 0);
981 update_cfs_shares(cfs_rq, se->load.weight);
982 account_entity_enqueue(cfs_rq, se); 983 account_entity_enqueue(cfs_rq, se);
984 update_cfs_shares(cfs_rq);
983 985
984 if (flags & ENQUEUE_WAKEUP) { 986 if (flags & ENQUEUE_WAKEUP) {
985 place_entity(cfs_rq, se, 0); 987 place_entity(cfs_rq, se, 0);
@@ -996,19 +998,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
996 list_add_leaf_cfs_rq(cfs_rq); 998 list_add_leaf_cfs_rq(cfs_rq);
997} 999}
998 1000
999static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1001static void __clear_buddies_last(struct sched_entity *se)
1000{ 1002{
1001 if (!se || cfs_rq->last == se) 1003 for_each_sched_entity(se) {
1002 cfs_rq->last = NULL; 1004 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1005 if (cfs_rq->last == se)
1006 cfs_rq->last = NULL;
1007 else
1008 break;
1009 }
1010}
1003 1011
1004 if (!se || cfs_rq->next == se) 1012static void __clear_buddies_next(struct sched_entity *se)
1005 cfs_rq->next = NULL; 1013{
1014 for_each_sched_entity(se) {
1015 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1016 if (cfs_rq->next == se)
1017 cfs_rq->next = NULL;
1018 else
1019 break;
1020 }
1021}
1022
1023static void __clear_buddies_skip(struct sched_entity *se)
1024{
1025 for_each_sched_entity(se) {
1026 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1027 if (cfs_rq->skip == se)
1028 cfs_rq->skip = NULL;
1029 else
1030 break;
1031 }
1006} 1032}
1007 1033
1008static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1034static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1009{ 1035{
1010 for_each_sched_entity(se) 1036 if (cfs_rq->last == se)
1011 __clear_buddies(cfs_rq_of(se), se); 1037 __clear_buddies_last(se);
1038
1039 if (cfs_rq->next == se)
1040 __clear_buddies_next(se);
1041
1042 if (cfs_rq->skip == se)
1043 __clear_buddies_skip(se);
1012} 1044}
1013 1045
1014static void 1046static void
@@ -1041,7 +1073,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1041 update_cfs_load(cfs_rq, 0); 1073 update_cfs_load(cfs_rq, 0);
1042 account_entity_dequeue(cfs_rq, se); 1074 account_entity_dequeue(cfs_rq, se);
1043 update_min_vruntime(cfs_rq); 1075 update_min_vruntime(cfs_rq);
1044 update_cfs_shares(cfs_rq, 0); 1076 update_cfs_shares(cfs_rq);
1045 1077
1046 /* 1078 /*
1047 * Normalize the entity after updating the min_vruntime because the 1079 * Normalize the entity after updating the min_vruntime because the
@@ -1084,7 +1116,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1084 return; 1116 return;
1085 1117
1086 if (cfs_rq->nr_running > 1) { 1118 if (cfs_rq->nr_running > 1) {
1087 struct sched_entity *se = __pick_next_entity(cfs_rq); 1119 struct sched_entity *se = __pick_first_entity(cfs_rq);
1088 s64 delta = curr->vruntime - se->vruntime; 1120 s64 delta = curr->vruntime - se->vruntime;
1089 1121
1090 if (delta < 0) 1122 if (delta < 0)
@@ -1128,13 +1160,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1128static int 1160static int
1129wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); 1161wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1130 1162
1163/*
1164 * Pick the next process, keeping these things in mind, in this order:
1165 * 1) keep things fair between processes/task groups
1166 * 2) pick the "next" process, since someone really wants that to run
1167 * 3) pick the "last" process, for cache locality
1168 * 4) do not run the "skip" process, if something else is available
1169 */
1131static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 1170static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1132{ 1171{
1133 struct sched_entity *se = __pick_next_entity(cfs_rq); 1172 struct sched_entity *se = __pick_first_entity(cfs_rq);
1134 struct sched_entity *left = se; 1173 struct sched_entity *left = se;
1135 1174
1136 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) 1175 /*
1137 se = cfs_rq->next; 1176 * Avoid running the skip buddy, if running something else can
1177 * be done without getting too unfair.
1178 */
1179 if (cfs_rq->skip == se) {
1180 struct sched_entity *second = __pick_next_entity(se);
1181 if (second && wakeup_preempt_entity(second, left) < 1)
1182 se = second;
1183 }
1138 1184
1139 /* 1185 /*
1140 * Prefer last buddy, try to return the CPU to a preempted task. 1186 * Prefer last buddy, try to return the CPU to a preempted task.
@@ -1142,6 +1188,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1142 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) 1188 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1143 se = cfs_rq->last; 1189 se = cfs_rq->last;
1144 1190
1191 /*
1192 * Someone really wants this to run. If it's not unfair, run it.
1193 */
1194 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1195 se = cfs_rq->next;
1196
1145 clear_buddies(cfs_rq, se); 1197 clear_buddies(cfs_rq, se);
1146 1198
1147 return se; 1199 return se;
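
The ordering comment added above leans on the wakeup_preempt_entity(buddy, left) < 1 test, which, on the usual reading of that helper, accepts a buddy only while its vruntime leads the leftmost entity's by at most one wakeup granularity. A standalone illustration with invented numbers; close_enough() is a stand-in, not a kernel function:

/* Invented numbers; close_enough() mimics wakeup_preempt_entity(buddy, left) < 1. */
#include <stdbool.h>
#include <stdio.h>

static bool close_enough(long long buddy_vrt, long long left_vrt, long long gran)
{
	return buddy_vrt - left_vrt <= gran;	/* buddy may lead by at most one granularity */
}

int main(void)
{
	long long left = 10000000LL, gran = 4000000LL;	/* nanosecond-scale values */

	/* a next/last buddy 3ms ahead of the leftmost entity is still picked */
	printf("%d\n", close_enough(left + 3000000LL, left, gran));	/* 1 */
	/* one 9ms ahead would be too unfair, so the leftmost entity runs */
	printf("%d\n", close_enough(left + 9000000LL, left, gran));	/* 0 */
	return 0;
}
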
@@ -1282,7 +1334,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1282 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1334 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1283 1335
1284 update_cfs_load(cfs_rq, 0); 1336 update_cfs_load(cfs_rq, 0);
1285 update_cfs_shares(cfs_rq, 0); 1337 update_cfs_shares(cfs_rq);
1286 } 1338 }
1287 1339
1288 hrtick_update(rq); 1340 hrtick_update(rq);
@@ -1312,58 +1364,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1312 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1364 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1313 1365
1314 update_cfs_load(cfs_rq, 0); 1366 update_cfs_load(cfs_rq, 0);
1315 update_cfs_shares(cfs_rq, 0); 1367 update_cfs_shares(cfs_rq);
1316 } 1368 }
1317 1369
1318 hrtick_update(rq); 1370 hrtick_update(rq);
1319} 1371}
1320 1372
1321/*
1322 * sched_yield() support is very simple - we dequeue and enqueue.
1323 *
1324 * If compat_yield is turned on then we requeue to the end of the tree.
1325 */
1326static void yield_task_fair(struct rq *rq)
1327{
1328 struct task_struct *curr = rq->curr;
1329 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1330 struct sched_entity *rightmost, *se = &curr->se;
1331
1332 /*
1333 * Are we the only task in the tree?
1334 */
1335 if (unlikely(cfs_rq->nr_running == 1))
1336 return;
1337
1338 clear_buddies(cfs_rq, se);
1339
1340 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
1341 update_rq_clock(rq);
1342 /*
1343 * Update run-time statistics of the 'current'.
1344 */
1345 update_curr(cfs_rq);
1346
1347 return;
1348 }
1349 /*
1350 * Find the rightmost entry in the rbtree:
1351 */
1352 rightmost = __pick_last_entity(cfs_rq);
1353 /*
1354 * Already in the rightmost position?
1355 */
1356 if (unlikely(!rightmost || entity_before(rightmost, se)))
1357 return;
1358
1359 /*
1360 * Minimally necessary key value to be last in the tree:
1361 * Upon rescheduling, sched_class::put_prev_task() will place
1362 * 'current' within the tree based on its new key value.
1363 */
1364 se->vruntime = rightmost->vruntime + 1;
1365}
1366
1367#ifdef CONFIG_SMP 1373#ifdef CONFIG_SMP
1368 1374
1369static void task_waking_fair(struct rq *rq, struct task_struct *p) 1375static void task_waking_fair(struct rq *rq, struct task_struct *p)
@@ -1834,6 +1840,14 @@ static void set_next_buddy(struct sched_entity *se)
1834 } 1840 }
1835} 1841}
1836 1842
1843static void set_skip_buddy(struct sched_entity *se)
1844{
1845 if (likely(task_of(se)->policy != SCHED_IDLE)) {
1846 for_each_sched_entity(se)
1847 cfs_rq_of(se)->skip = se;
1848 }
1849}
1850
1837/* 1851/*
1838 * Preempt the current task with a newly woken task if needed: 1852 * Preempt the current task with a newly woken task if needed:
1839 */ 1853 */
@@ -1857,16 +1871,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1857 if (test_tsk_need_resched(curr)) 1871 if (test_tsk_need_resched(curr))
1858 return; 1872 return;
1859 1873
1874 /* Idle tasks are by definition preempted by non-idle tasks. */
1875 if (unlikely(curr->policy == SCHED_IDLE) &&
1876 likely(p->policy != SCHED_IDLE))
1877 goto preempt;
1878
1860 /* 1879 /*
1861 * Batch and idle tasks do not preempt (their preemption is driven by 1880 * Batch and idle tasks do not preempt non-idle tasks (their preemption
1862 * the tick): 1881 * is driven by the tick):
1863 */ 1882 */
1864 if (unlikely(p->policy != SCHED_NORMAL)) 1883 if (unlikely(p->policy != SCHED_NORMAL))
1865 return; 1884 return;
1866 1885
1867 /* Idle tasks are by definition preempted by everybody. */
1868 if (unlikely(curr->policy == SCHED_IDLE))
1869 goto preempt;
1870 1886
1871 if (!sched_feat(WAKEUP_PREEMPT)) 1887 if (!sched_feat(WAKEUP_PREEMPT))
1872 return; 1888 return;
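
After the reordering above, a waking SCHED_NORMAL or SCHED_BATCH task always preempts a SCHED_IDLE current task, while batch and idle wakers themselves still wait for the tick. For reference, the standard user-space way to drop a helper task into SCHED_IDLE (plain glibc API, nothing specific to this patch):

#define _GNU_SOURCE		/* SCHED_IDLE is guarded by __USE_GNU in <sched.h> */
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>

/* Make pid a SCHED_IDLE task; it now gets preempted by any non-idle wakeup. */
static int make_sched_idle(pid_t pid)
{
	struct sched_param sp = { .sched_priority = 0 };	/* must be 0 for SCHED_IDLE */

	if (sched_setscheduler(pid, SCHED_IDLE, &sp) == -1) {
		perror("sched_setscheduler");
		return -1;
	}
	return 0;
}

int main(void)
{
	return make_sched_idle(0);	/* 0 == the calling process */
}
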
@@ -1932,6 +1948,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1932 } 1948 }
1933} 1949}
1934 1950
1951/*
1952 * sched_yield() is very simple
1953 *
1954 * The magic of dealing with the ->skip buddy is in pick_next_entity.
1955 */
1956static void yield_task_fair(struct rq *rq)
1957{
1958 struct task_struct *curr = rq->curr;
1959 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1960 struct sched_entity *se = &curr->se;
1961
1962 /*
1963 * Are we the only task in the tree?
1964 */
1965 if (unlikely(rq->nr_running == 1))
1966 return;
1967
1968 clear_buddies(cfs_rq, se);
1969
1970 if (curr->policy != SCHED_BATCH) {
1971 update_rq_clock(rq);
1972 /*
1973 * Update run-time statistics of the 'current'.
1974 */
1975 update_curr(cfs_rq);
1976 }
1977
1978 set_skip_buddy(se);
1979}
1980
1981static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
1982{
1983 struct sched_entity *se = &p->se;
1984
1985 if (!se->on_rq)
1986 return false;
1987
1988 /* Tell the scheduler that we'd really like pse to run next. */
1989 set_next_buddy(se);
1990
1991 yield_task_fair(rq);
1992
1993 return true;
1994}
1995
1935#ifdef CONFIG_SMP 1996#ifdef CONFIG_SMP
1936/************************************************** 1997/**************************************************
1937 * Fair scheduling class load-balancing methods: 1998 * Fair scheduling class load-balancing methods:
@@ -2043,21 +2104,20 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2043 enum cpu_idle_type idle, int *all_pinned, 2104 enum cpu_idle_type idle, int *all_pinned,
2044 int *this_best_prio, struct cfs_rq *busiest_cfs_rq) 2105 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
2045{ 2106{
2046 int loops = 0, pulled = 0, pinned = 0; 2107 int loops = 0, pulled = 0;
2047 long rem_load_move = max_load_move; 2108 long rem_load_move = max_load_move;
2048 struct task_struct *p, *n; 2109 struct task_struct *p, *n;
2049 2110
2050 if (max_load_move == 0) 2111 if (max_load_move == 0)
2051 goto out; 2112 goto out;
2052 2113
2053 pinned = 1;
2054
2055 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 2114 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
2056 if (loops++ > sysctl_sched_nr_migrate) 2115 if (loops++ > sysctl_sched_nr_migrate)
2057 break; 2116 break;
2058 2117
2059 if ((p->se.load.weight >> 1) > rem_load_move || 2118 if ((p->se.load.weight >> 1) > rem_load_move ||
2060 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) 2119 !can_migrate_task(p, busiest, this_cpu, sd, idle,
2120 all_pinned))
2061 continue; 2121 continue;
2062 2122
2063 pull_task(busiest, p, this_rq, this_cpu); 2123 pull_task(busiest, p, this_rq, this_cpu);
@@ -2092,9 +2152,6 @@ out:
2092 */ 2152 */
2093 schedstat_add(sd, lb_gained[idle], pulled); 2153 schedstat_add(sd, lb_gained[idle], pulled);
2094 2154
2095 if (all_pinned)
2096 *all_pinned = pinned;
2097
2098 return max_load_move - rem_load_move; 2155 return max_load_move - rem_load_move;
2099} 2156}
2100 2157
@@ -2123,7 +2180,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
2123 * We need to update shares after updating tg->load_weight in 2180 * We need to update shares after updating tg->load_weight in
2124 * order to adjust the weight of groups with long running tasks. 2181 * order to adjust the weight of groups with long running tasks.
2125 */ 2182 */
2126 update_cfs_shares(cfs_rq, 0); 2183 update_cfs_shares(cfs_rq);
2127 2184
2128 raw_spin_unlock_irqrestore(&rq->lock, flags); 2185 raw_spin_unlock_irqrestore(&rq->lock, flags);
2129 2186
@@ -2610,7 +2667,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2610 * @this_cpu: Cpu for which load balance is currently performed. 2667 * @this_cpu: Cpu for which load balance is currently performed.
2611 * @idle: Idle status of this_cpu 2668 * @idle: Idle status of this_cpu
2612 * @load_idx: Load index of sched_domain of this_cpu for load calc. 2669 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2613 * @sd_idle: Idle status of the sched_domain containing group.
2614 * @local_group: Does group contain this_cpu. 2670 * @local_group: Does group contain this_cpu.
2615 * @cpus: Set of cpus considered for load balancing. 2671 * @cpus: Set of cpus considered for load balancing.
2616 * @balance: Should we balance. 2672 * @balance: Should we balance.
@@ -2618,7 +2674,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2618 */ 2674 */
2619static inline void update_sg_lb_stats(struct sched_domain *sd, 2675static inline void update_sg_lb_stats(struct sched_domain *sd,
2620 struct sched_group *group, int this_cpu, 2676 struct sched_group *group, int this_cpu,
2621 enum cpu_idle_type idle, int load_idx, int *sd_idle, 2677 enum cpu_idle_type idle, int load_idx,
2622 int local_group, const struct cpumask *cpus, 2678 int local_group, const struct cpumask *cpus,
2623 int *balance, struct sg_lb_stats *sgs) 2679 int *balance, struct sg_lb_stats *sgs)
2624{ 2680{
@@ -2638,9 +2694,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2638 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 2694 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2639 struct rq *rq = cpu_rq(i); 2695 struct rq *rq = cpu_rq(i);
2640 2696
2641 if (*sd_idle && rq->nr_running)
2642 *sd_idle = 0;
2643
2644 /* Bias balancing toward cpus of our domain */ 2697 /* Bias balancing toward cpus of our domain */
2645 if (local_group) { 2698 if (local_group) {
2646 if (idle_cpu(i) && !first_idle_cpu) { 2699 if (idle_cpu(i) && !first_idle_cpu) {
@@ -2685,7 +2738,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2685 2738
2686 /* 2739 /*
2687 * Consider the group unbalanced when the imbalance is larger 2740 * Consider the group unbalanced when the imbalance is larger
2688 * than the average weight of two tasks. 2741 * than the average weight of a task.
2689 * 2742 *
2690 * APZ: with cgroup the avg task weight can vary wildly and 2743 * APZ: with cgroup the avg task weight can vary wildly and
2691 * might not be a suitable number - should we keep a 2744 * might not be a suitable number - should we keep a
@@ -2695,7 +2748,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2695 if (sgs->sum_nr_running) 2748 if (sgs->sum_nr_running)
2696 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 2749 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2697 2750
2698 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) 2751 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
2699 sgs->group_imb = 1; 2752 sgs->group_imb = 1;
2700 2753
2701 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2754 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
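
The hunk above lowers the intra-group imbalance threshold: the spread between a group's most and least loaded CPU now only has to reach one average task weight (it previously had to exceed two) before group_imb is set, provided more than one task is running. A quick numeric check with invented loads:

/* Invented numbers; only the comparison mirrors the hunk above. */
#include <stdio.h>

int main(void)
{
	unsigned long max_cpu_load = 3072, min_cpu_load = 1024;
	unsigned long sum_weighted_load = 4096, sum_nr_running = 4;
	unsigned long avg_load_per_task = sum_weighted_load / sum_nr_running;	/* 1024 */

	/* old test: 2048 > 2 * 1024 is false; new test: 2048 >= 1024 is true */
	int group_imb = (max_cpu_load - min_cpu_load) >= avg_load_per_task;

	printf("group_imb = %d\n", group_imb);
	return 0;
}
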
@@ -2755,15 +2808,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
2755 * @sd: sched_domain whose statistics are to be updated. 2808 * @sd: sched_domain whose statistics are to be updated.
2756 * @this_cpu: Cpu for which load balance is currently performed. 2809 * @this_cpu: Cpu for which load balance is currently performed.
2757 * @idle: Idle status of this_cpu 2810 * @idle: Idle status of this_cpu
2758 * @sd_idle: Idle status of the sched_domain containing sg.
2759 * @cpus: Set of cpus considered for load balancing. 2811 * @cpus: Set of cpus considered for load balancing.
2760 * @balance: Should we balance. 2812 * @balance: Should we balance.
2761 * @sds: variable to hold the statistics for this sched_domain. 2813 * @sds: variable to hold the statistics for this sched_domain.
2762 */ 2814 */
2763static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 2815static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2764 enum cpu_idle_type idle, int *sd_idle, 2816 enum cpu_idle_type idle, const struct cpumask *cpus,
2765 const struct cpumask *cpus, int *balance, 2817 int *balance, struct sd_lb_stats *sds)
2766 struct sd_lb_stats *sds)
2767{ 2818{
2768 struct sched_domain *child = sd->child; 2819 struct sched_domain *child = sd->child;
2769 struct sched_group *sg = sd->groups; 2820 struct sched_group *sg = sd->groups;
@@ -2781,7 +2832,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2781 2832
2782 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 2833 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2783 memset(&sgs, 0, sizeof(sgs)); 2834 memset(&sgs, 0, sizeof(sgs));
2784 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, 2835 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
2785 local_group, cpus, balance, &sgs); 2836 local_group, cpus, balance, &sgs);
2786 2837
2787 if (local_group && !(*balance)) 2838 if (local_group && !(*balance))
@@ -3007,7 +3058,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3007 3058
3008 /* 3059 /*
3009 * if *imbalance is less than the average load per runnable task 3060 * if *imbalance is less than the average load per runnable task
3010 * there is no gaurantee that any tasks will be moved so we'll have 3061 * there is no guarantee that any tasks will be moved so we'll have
3011 * a think about bumping its value to force at least one task to be 3062 * a think about bumping its value to force at least one task to be
3012 * moved 3063 * moved
3013 */ 3064 */
@@ -3033,7 +3084,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3033 * @imbalance: Variable which stores amount of weighted load which should 3084 * @imbalance: Variable which stores amount of weighted load which should
3034 * be moved to restore balance/put a group to idle. 3085 * be moved to restore balance/put a group to idle.
3035 * @idle: The idle status of this_cpu. 3086 * @idle: The idle status of this_cpu.
3036 * @sd_idle: The idleness of sd
3037 * @cpus: The set of CPUs under consideration for load-balancing. 3087 * @cpus: The set of CPUs under consideration for load-balancing.
3038 * @balance: Pointer to a variable indicating if this_cpu 3088 * @balance: Pointer to a variable indicating if this_cpu
3039 * is the appropriate cpu to perform load balancing at this_level. 3089 * is the appropriate cpu to perform load balancing at this_level.
@@ -3046,7 +3096,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3046static struct sched_group * 3096static struct sched_group *
3047find_busiest_group(struct sched_domain *sd, int this_cpu, 3097find_busiest_group(struct sched_domain *sd, int this_cpu,
3048 unsigned long *imbalance, enum cpu_idle_type idle, 3098 unsigned long *imbalance, enum cpu_idle_type idle,
3049 int *sd_idle, const struct cpumask *cpus, int *balance) 3099 const struct cpumask *cpus, int *balance)
3050{ 3100{
3051 struct sd_lb_stats sds; 3101 struct sd_lb_stats sds;
3052 3102
@@ -3056,22 +3106,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 3056 * Compute the various statistics relevant for load balancing at 3106 * Compute the various statistics relevant for load balancing at
3057 * this level. 3107 * this level.
3058 */ 3108 */
3059 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, 3109 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
3060 balance, &sds); 3110
3061 3111 /*
3062 /* Cases where imbalance does not exist from POV of this_cpu */ 3112 * this_cpu is not the appropriate cpu to perform load balancing at
3063 /* 1) this_cpu is not the appropriate cpu to perform load balancing 3113 * this level.
3064 * at this level.
3065 * 2) There is no busy sibling group to pull from.
3066 * 3) This group is the busiest group.
3067 * 4) This group is more busy than the avg busieness at this
3068 * sched_domain.
3069 * 5) The imbalance is within the specified limit.
3070 *
3071 * Note: when doing newidle balance, if the local group has excess
3072 * capacity (i.e. nr_running < group_capacity) and the busiest group
3073 * does not have any capacity, we force a load balance to pull tasks
3074 * to the local group. In this case, we skip past checks 3, 4 and 5.
3075 */ 3114 */
3076 if (!(*balance)) 3115 if (!(*balance))
3077 goto ret; 3116 goto ret;
@@ -3080,41 +3119,56 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3080 check_asym_packing(sd, &sds, this_cpu, imbalance)) 3119 check_asym_packing(sd, &sds, this_cpu, imbalance))
3081 return sds.busiest; 3120 return sds.busiest;
3082 3121
3122 /* There is no busy sibling group to pull tasks from */
3083 if (!sds.busiest || sds.busiest_nr_running == 0) 3123 if (!sds.busiest || sds.busiest_nr_running == 0)
3084 goto out_balanced; 3124 goto out_balanced;
3085 3125
3086 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 3126 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3127
3128 /*
3129 * If the busiest group is imbalanced the below checks don't
 3130 * work because they assume all things are equal, which typically
3131 * isn't true due to cpus_allowed constraints and the like.
3132 */
3133 if (sds.group_imb)
3134 goto force_balance;
3135
3136 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
3087 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 3137 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
3088 !sds.busiest_has_capacity) 3138 !sds.busiest_has_capacity)
3089 goto force_balance; 3139 goto force_balance;
3090 3140
3141 /*
3142 * If the local group is more busy than the selected busiest group
3143 * don't try and pull any tasks.
3144 */
3091 if (sds.this_load >= sds.max_load) 3145 if (sds.this_load >= sds.max_load)
3092 goto out_balanced; 3146 goto out_balanced;
3093 3147
3094 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; 3148 /*
3095 3149 * Don't pull any tasks if this group is already above the domain
3150 * average load.
3151 */
3096 if (sds.this_load >= sds.avg_load) 3152 if (sds.this_load >= sds.avg_load)
3097 goto out_balanced; 3153 goto out_balanced;
3098 3154
3099 /* 3155 if (idle == CPU_IDLE) {
3100 * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
3101 * And to check for busy balance use !idle_cpu instead of
3102 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
3103 * even when they are idle.
3104 */
3105 if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
3106 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3107 goto out_balanced;
3108 } else {
3109 /* 3156 /*
3110 * This cpu is idle. If the busiest group load doesn't 3157 * This cpu is idle. If the busiest group load doesn't
3111 * have more tasks than the number of available cpu's and 3158 * have more tasks than the number of available cpu's and
3112 * there is no imbalance between this and busiest group 3159 * there is no imbalance between this and busiest group
3113 * wrt to idle cpu's, it is balanced. 3160 * wrt to idle cpu's, it is balanced.
3114 */ 3161 */
3115 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && 3162 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
3116 sds.busiest_nr_running <= sds.busiest_group_weight) 3163 sds.busiest_nr_running <= sds.busiest_group_weight)
3117 goto out_balanced; 3164 goto out_balanced;
3165 } else {
3166 /*
3167 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
3168 * imbalance_pct to be conservative.
3169 */
3170 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3171 goto out_balanced;
3118 } 3172 }
3119 3173
3120force_balance: 3174force_balance:
@@ -3193,7 +3247,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3193/* Working cpumask for load_balance and load_balance_newidle. */ 3247/* Working cpumask for load_balance and load_balance_newidle. */
3194static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 3248static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3195 3249
3196static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, 3250static int need_active_balance(struct sched_domain *sd, int idle,
3197 int busiest_cpu, int this_cpu) 3251 int busiest_cpu, int this_cpu)
3198{ 3252{
3199 if (idle == CPU_NEWLY_IDLE) { 3253 if (idle == CPU_NEWLY_IDLE) {
@@ -3225,10 +3279,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
3225 * move_tasks() will succeed. ld_moved will be true and this 3279 * move_tasks() will succeed. ld_moved will be true and this
3226 * active balance code will not be triggered. 3280 * active balance code will not be triggered.
3227 */ 3281 */
3228 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3229 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3230 return 0;
3231
3232 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) 3282 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
3233 return 0; 3283 return 0;
3234 } 3284 }
@@ -3246,7 +3296,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3246 struct sched_domain *sd, enum cpu_idle_type idle, 3296 struct sched_domain *sd, enum cpu_idle_type idle,
3247 int *balance) 3297 int *balance)
3248{ 3298{
3249 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3299 int ld_moved, all_pinned = 0, active_balance = 0;
3250 struct sched_group *group; 3300 struct sched_group *group;
3251 unsigned long imbalance; 3301 unsigned long imbalance;
3252 struct rq *busiest; 3302 struct rq *busiest;
@@ -3255,20 +3305,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3255 3305
3256 cpumask_copy(cpus, cpu_active_mask); 3306 cpumask_copy(cpus, cpu_active_mask);
3257 3307
3258 /*
3259 * When power savings policy is enabled for the parent domain, idle
3260 * sibling can pick up load irrespective of busy siblings. In this case,
3261 * let the state of idle sibling percolate up as CPU_IDLE, instead of
3262 * portraying it as CPU_NOT_IDLE.
3263 */
3264 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3265 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3266 sd_idle = 1;
3267
3268 schedstat_inc(sd, lb_count[idle]); 3308 schedstat_inc(sd, lb_count[idle]);
3269 3309
3270redo: 3310redo:
3271 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3311 group = find_busiest_group(sd, this_cpu, &imbalance, idle,
3272 cpus, balance); 3312 cpus, balance);
3273 3313
3274 if (*balance == 0) 3314 if (*balance == 0)
@@ -3297,6 +3337,7 @@ redo:
3297 * still unbalanced. ld_moved simply stays zero, so it is 3337 * still unbalanced. ld_moved simply stays zero, so it is
3298 * correctly treated as an imbalance. 3338 * correctly treated as an imbalance.
3299 */ 3339 */
3340 all_pinned = 1;
3300 local_irq_save(flags); 3341 local_irq_save(flags);
3301 double_rq_lock(this_rq, busiest); 3342 double_rq_lock(this_rq, busiest);
3302 ld_moved = move_tasks(this_rq, this_cpu, busiest, 3343 ld_moved = move_tasks(this_rq, this_cpu, busiest,
@@ -3330,8 +3371,7 @@ redo:
3330 if (idle != CPU_NEWLY_IDLE) 3371 if (idle != CPU_NEWLY_IDLE)
3331 sd->nr_balance_failed++; 3372 sd->nr_balance_failed++;
3332 3373
3333 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), 3374 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
3334 this_cpu)) {
3335 raw_spin_lock_irqsave(&busiest->lock, flags); 3375 raw_spin_lock_irqsave(&busiest->lock, flags);
3336 3376
3337 /* don't kick the active_load_balance_cpu_stop, 3377 /* don't kick the active_load_balance_cpu_stop,
@@ -3386,10 +3426,6 @@ redo:
3386 sd->balance_interval *= 2; 3426 sd->balance_interval *= 2;
3387 } 3427 }
3388 3428
3389 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3390 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3391 ld_moved = -1;
3392
3393 goto out; 3429 goto out;
3394 3430
3395out_balanced: 3431out_balanced:
@@ -3403,11 +3439,7 @@ out_one_pinned:
3403 (sd->balance_interval < sd->max_interval)) 3439 (sd->balance_interval < sd->max_interval))
3404 sd->balance_interval *= 2; 3440 sd->balance_interval *= 2;
3405 3441
3406 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3442 ld_moved = 0;
3407 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3408 ld_moved = -1;
3409 else
3410 ld_moved = 0;
3411out: 3443out:
3412 return ld_moved; 3444 return ld_moved;
3413} 3445}
@@ -3786,6 +3818,17 @@ void select_nohz_load_balancer(int stop_tick)
3786 3818
3787static DEFINE_SPINLOCK(balancing); 3819static DEFINE_SPINLOCK(balancing);
3788 3820
3821static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3822
3823/*
3824 * Scale the max load_balance interval with the number of CPUs in the system.
3825 * This trades load-balance latency on larger machines for less cross talk.
3826 */
3827static void update_max_interval(void)
3828{
3829 max_load_balance_interval = HZ*num_online_cpus()/10;
3830}
3831
3789/* 3832/*
3790 * It checks each scheduling domain to see if it is due to be balanced, 3833 * It checks each scheduling domain to see if it is due to be balanced,
3791 * and initiates a balancing operation if so. 3834 * and initiates a balancing operation if so.
@@ -3815,10 +3858,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3815 3858
3816 /* scale ms to jiffies */ 3859 /* scale ms to jiffies */
3817 interval = msecs_to_jiffies(interval); 3860 interval = msecs_to_jiffies(interval);
3818 if (unlikely(!interval)) 3861 interval = clamp(interval, 1UL, max_load_balance_interval);
3819 interval = 1;
3820 if (interval > HZ*NR_CPUS/10)
3821 interval = HZ*NR_CPUS/10;
3822 3862
3823 need_serialize = sd->flags & SD_SERIALIZE; 3863 need_serialize = sd->flags & SD_SERIALIZE;
3824 3864
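
update_max_interval() above re-derives the balancing cap from num_online_cpus() (the sched.c hunk earlier wires it into the hotplug notifier), and rebalance_domains() now clamps the per-domain interval against it instead of the old NR_CPUS-based bound. Worked numbers, assuming HZ = 1000 purely for illustration:

/* Assumes HZ = 1000; all other numbers are invented. */
#include <stdio.h>

#define HZ 1000UL

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	unsigned long online = 8;			/* num_online_cpus() */
	unsigned long max_interval = HZ * online / 10;	/* 800 jiffies */
	unsigned long interval = 2000;			/* msecs_to_jiffies() result */

	/* mirrors: interval = clamp(interval, 1UL, max_load_balance_interval); */
	printf("%lu\n", clamp_ul(interval, 1UL, max_interval));	/* 800 */
	return 0;
}
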
@@ -3831,8 +3871,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3831 if (load_balance(cpu, rq, sd, idle, &balance)) { 3871 if (load_balance(cpu, rq, sd, idle, &balance)) {
3832 /* 3872 /*
3833 * We've pulled tasks over so either we're no 3873 * We've pulled tasks over so either we're no
3834 * longer idle, or one of our SMT siblings is 3874 * longer idle.
3835 * not idle.
3836 */ 3875 */
3837 idle = CPU_NOT_IDLE; 3876 idle = CPU_NOT_IDLE;
3838 } 3877 }
@@ -4079,33 +4118,62 @@ static void task_fork_fair(struct task_struct *p)
4079 * Priority of the task has changed. Check to see if we preempt 4118 * Priority of the task has changed. Check to see if we preempt
4080 * the current task. 4119 * the current task.
4081 */ 4120 */
4082static void prio_changed_fair(struct rq *rq, struct task_struct *p, 4121static void
4083 int oldprio, int running) 4122prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
4084{ 4123{
4124 if (!p->se.on_rq)
4125 return;
4126
4085 /* 4127 /*
4086 * Reschedule if we are currently running on this runqueue and 4128 * Reschedule if we are currently running on this runqueue and
4087 * our priority decreased, or if we are not currently running on 4129 * our priority decreased, or if we are not currently running on
4088 * this runqueue and our priority is higher than the current's 4130 * this runqueue and our priority is higher than the current's
4089 */ 4131 */
4090 if (running) { 4132 if (rq->curr == p) {
4091 if (p->prio > oldprio) 4133 if (p->prio > oldprio)
4092 resched_task(rq->curr); 4134 resched_task(rq->curr);
4093 } else 4135 } else
4094 check_preempt_curr(rq, p, 0); 4136 check_preempt_curr(rq, p, 0);
4095} 4137}
4096 4138
4139static void switched_from_fair(struct rq *rq, struct task_struct *p)
4140{
4141 struct sched_entity *se = &p->se;
4142 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4143
4144 /*
4145 * Ensure the task's vruntime is normalized, so that when its
4146 * switched back to the fair class the enqueue_entity(.flags=0) will
4147 * do the right thing.
4148 *
4149 * If it was on_rq, then the dequeue_entity(.flags=0) will already
4150 * have normalized the vruntime, if it was !on_rq, then only when
4151 * the task is sleeping will it still have non-normalized vruntime.
4152 */
4153 if (!se->on_rq && p->state != TASK_RUNNING) {
4154 /*
4155 * Fix up our vruntime so that the current sleep doesn't
4156 * cause 'unlimited' sleep bonus.
4157 */
4158 place_entity(cfs_rq, se, 0);
4159 se->vruntime -= cfs_rq->min_vruntime;
4160 }
4161}
4162
4097/* 4163/*
4098 * We switched to the sched_fair class. 4164 * We switched to the sched_fair class.
4099 */ 4165 */
4100static void switched_to_fair(struct rq *rq, struct task_struct *p, 4166static void switched_to_fair(struct rq *rq, struct task_struct *p)
4101 int running)
4102{ 4167{
4168 if (!p->se.on_rq)
4169 return;
4170
4103 /* 4171 /*
4104 * We were most likely switched from sched_rt, so 4172 * We were most likely switched from sched_rt, so
4105 * kick off the schedule if running, otherwise just see 4173 * kick off the schedule if running, otherwise just see
4106 * if we can still preempt the current task. 4174 * if we can still preempt the current task.
4107 */ 4175 */
4108 if (running) 4176 if (rq->curr == p)
4109 resched_task(rq->curr); 4177 resched_task(rq->curr);
4110 else 4178 else
4111 check_preempt_curr(rq, p, 0); 4179 check_preempt_curr(rq, p, 0);
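
switched_from_fair() above stores a sleeping task's vruntime relative to the old cfs_rq's min_vruntime, so that a later enqueue can re-base it against whatever the minimum has grown to instead of granting an unbounded sleep bonus. A numeric illustration of that subtract/re-add pattern, with all values invented:

/* Invented values; only the normalization pattern mirrors the hunk above. */
#include <stdio.h>

int main(void)
{
	unsigned long long min_vruntime_then = 5000000;	/* cfs_rq->min_vruntime at switch */
	unsigned long long se_vruntime       = 5200000;	/* task's absolute vruntime */
	unsigned long long min_vruntime_now  = 9000000;	/* min_vruntime when it rejoins */

	/* switched_from_fair(): keep only the offset above the old minimum */
	unsigned long long rel = se_vruntime - min_vruntime_then;	/* 200000 */

	/* the later enqueue re-bases against the new minimum */
	printf("%llu\n", rel + min_vruntime_now);	/* 9200000, not 5200000 */
	return 0;
}
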
@@ -4171,6 +4239,7 @@ static const struct sched_class fair_sched_class = {
4171 .enqueue_task = enqueue_task_fair, 4239 .enqueue_task = enqueue_task_fair,
4172 .dequeue_task = dequeue_task_fair, 4240 .dequeue_task = dequeue_task_fair,
4173 .yield_task = yield_task_fair, 4241 .yield_task = yield_task_fair,
4242 .yield_to_task = yield_to_task_fair,
4174 4243
4175 .check_preempt_curr = check_preempt_wakeup, 4244 .check_preempt_curr = check_preempt_wakeup,
4176 4245
@@ -4191,6 +4260,7 @@ static const struct sched_class fair_sched_class = {
4191 .task_fork = task_fork_fair, 4260 .task_fork = task_fork_fair,
4192 4261
4193 .prio_changed = prio_changed_fair, 4262 .prio_changed = prio_changed_fair,
4263 .switched_from = switched_from_fair,
4194 .switched_to = switched_to_fair, 4264 .switched_to = switched_to_fair,
4195 4265
4196 .get_rr_interval = get_rr_interval_fair, 4266 .get_rr_interval = get_rr_interval_fair,
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f402c87c..a776a6396427 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq)
52{ 52{
53} 53}
54 54
55static void switched_to_idle(struct rq *rq, struct task_struct *p, 55static void switched_to_idle(struct rq *rq, struct task_struct *p)
56 int running)
57{ 56{
58 /* Can this actually happen?? */ 57 BUG();
59 if (running)
60 resched_task(rq->curr);
61 else
62 check_preempt_curr(rq, p, 0);
63} 58}
64 59
65static void prio_changed_idle(struct rq *rq, struct task_struct *p, 60static void
66 int oldprio, int running) 61prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
67{ 62{
68 /* This can happen for hot plug CPUS */ 63 BUG();
69
70 /*
71 * Reschedule if we are currently running on this runqueue and
72 * our priority decreased, or if we are not currently running on
73 * this runqueue and our priority is higher than the current's
74 */
75 if (running) {
76 if (p->prio > oldprio)
77 resched_task(rq->curr);
78 } else
79 check_preempt_curr(rq, p, 0);
80} 64}
81 65
82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 66static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
@@ -110,6 +94,4 @@ static const struct sched_class idle_sched_class = {
110 94
111 .prio_changed = prio_changed_idle, 95 .prio_changed = prio_changed_idle,
112 .switched_to = switched_to_idle, 96 .switched_to = switched_to_idle,
113
114 /* no .task_new for idle tasks */
115}; 97};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index ad6267714c84..e7cebdc65f82 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -210,11 +210,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
210 210
211static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 211static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212{ 212{
213 int this_cpu = smp_processor_id();
214 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 213 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
215 struct sched_rt_entity *rt_se; 214 struct sched_rt_entity *rt_se;
216 215
217 rt_se = rt_rq->tg->rt_se[this_cpu]; 216 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
217
218 rt_se = rt_rq->tg->rt_se[cpu];
218 219
219 if (rt_rq->rt_nr_running) { 220 if (rt_rq->rt_nr_running) {
220 if (rt_se && !on_rt_rq(rt_se)) 221 if (rt_se && !on_rt_rq(rt_se))
@@ -226,10 +227,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
226 227
227static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 228static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
228{ 229{
229 int this_cpu = smp_processor_id();
230 struct sched_rt_entity *rt_se; 230 struct sched_rt_entity *rt_se;
231 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
231 232
232 rt_se = rt_rq->tg->rt_se[this_cpu]; 233 rt_se = rt_rq->tg->rt_se[cpu];
233 234
234 if (rt_se && on_rt_rq(rt_se)) 235 if (rt_se && on_rt_rq(rt_se))
235 dequeue_rt_entity(rt_se); 236 dequeue_rt_entity(rt_se);
@@ -565,8 +566,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
565 if (rt_rq->rt_time || rt_rq->rt_nr_running) 566 if (rt_rq->rt_time || rt_rq->rt_nr_running)
566 idle = 0; 567 idle = 0;
567 raw_spin_unlock(&rt_rq->rt_runtime_lock); 568 raw_spin_unlock(&rt_rq->rt_runtime_lock);
568 } else if (rt_rq->rt_nr_running) 569 } else if (rt_rq->rt_nr_running) {
569 idle = 0; 570 idle = 0;
571 if (!rt_rq_throttled(rt_rq))
572 enqueue = 1;
573 }
570 574
571 if (enqueue) 575 if (enqueue)
572 sched_rt_rq_enqueue(rt_rq); 576 sched_rt_rq_enqueue(rt_rq);
@@ -1374,7 +1378,7 @@ retry:
1374 task = pick_next_pushable_task(rq); 1378 task = pick_next_pushable_task(rq);
1375 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1379 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1376 /* 1380 /*
1377 * If we get here, the task hasnt moved at all, but 1381 * If we get here, the task hasn't moved at all, but
1378 * it has failed to push. We will not try again, 1382 * it has failed to push. We will not try again,
1379 * since the other cpus will pull from us when they 1383 * since the other cpus will pull from us when they
1380 * are ready. 1384 * are ready.
@@ -1484,7 +1488,7 @@ static int pull_rt_task(struct rq *this_rq)
1484 /* 1488 /*
1485 * We continue with the search, just in 1489 * We continue with the search, just in
1486 * case there's an even higher prio task 1490 * case there's an even higher prio task
1487 * in another runqueue. (low likelyhood 1491 * in another runqueue. (low likelihood
1488 * but possible) 1492 * but possible)
1489 */ 1493 */
1490 } 1494 }
@@ -1595,8 +1599,7 @@ static void rq_offline_rt(struct rq *rq)
1595 * When switch from the rt queue, we bring ourselves to a position 1599 * When switch from the rt queue, we bring ourselves to a position
1596 * that we might want to pull RT tasks from other runqueues. 1600 * that we might want to pull RT tasks from other runqueues.
1597 */ 1601 */
1598static void switched_from_rt(struct rq *rq, struct task_struct *p, 1602static void switched_from_rt(struct rq *rq, struct task_struct *p)
1599 int running)
1600{ 1603{
1601 /* 1604 /*
1602 * If there are other RT tasks then we will reschedule 1605 * If there are other RT tasks then we will reschedule
@@ -1605,7 +1608,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
1605 * we may need to handle the pulling of RT tasks 1608 * we may need to handle the pulling of RT tasks
1606 * now. 1609 * now.
1607 */ 1610 */
1608 if (!rq->rt.rt_nr_running) 1611 if (p->se.on_rq && !rq->rt.rt_nr_running)
1609 pull_rt_task(rq); 1612 pull_rt_task(rq);
1610} 1613}
1611 1614
@@ -1624,8 +1627,7 @@ static inline void init_sched_rt_class(void)
1624 * with RT tasks. In this case we try to push them off to 1627 * with RT tasks. In this case we try to push them off to
1625 * other runqueues. 1628 * other runqueues.
1626 */ 1629 */
1627static void switched_to_rt(struct rq *rq, struct task_struct *p, 1630static void switched_to_rt(struct rq *rq, struct task_struct *p)
1628 int running)
1629{ 1631{
1630 int check_resched = 1; 1632 int check_resched = 1;
1631 1633
@@ -1636,7 +1638,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1636 * If that current running task is also an RT task 1638 * If that current running task is also an RT task
1637 * then see if we can move to another run queue. 1639 * then see if we can move to another run queue.
1638 */ 1640 */
1639 if (!running) { 1641 if (p->se.on_rq && rq->curr != p) {
1640#ifdef CONFIG_SMP 1642#ifdef CONFIG_SMP
1641 if (rq->rt.overloaded && push_rt_task(rq) && 1643 if (rq->rt.overloaded && push_rt_task(rq) &&
1642 /* Don't resched if we changed runqueues */ 1644 /* Don't resched if we changed runqueues */
@@ -1652,10 +1654,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1652 * Priority of the task has changed. This may cause 1654 * Priority of the task has changed. This may cause
1653 * us to initiate a push or pull. 1655 * us to initiate a push or pull.
1654 */ 1656 */
1655static void prio_changed_rt(struct rq *rq, struct task_struct *p, 1657static void
1656 int oldprio, int running) 1658prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1657{ 1659{
1658 if (running) { 1660 if (!p->se.on_rq)
1661 return;
1662
1663 if (rq->curr == p) {
1659#ifdef CONFIG_SMP 1664#ifdef CONFIG_SMP
1660 /* 1665 /*
1661 * If our priority decreases while running, we 1666 * If our priority decreases while running, we
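The sched_rt.c hunks above and the sched_stoptask.c hunks below are part of a tree-wide change to the sched_class callbacks: switched_from/switched_to/prio_changed lose their "running" argument, and each class now derives the same facts from p->se.on_rq (is the task queued?) and rq->curr == p (is it the task currently running?). A minimal sketch of the new callback shape, assuming the private struct rq/struct task_struct layout used by kernel/sched.c; it is illustrative only, not the scheduler's actual code:

/* Illustrative only: how a class callback recovers the old "running" flag. */
static void prio_changed_example(struct rq *rq, struct task_struct *p,
				 int oldprio)
{
	if (!p->se.on_rq)	/* not queued: nothing to adjust */
		return;

	if (rq->curr == p) {
		/* this is what the removed "running" parameter used to say:
		 * e.g. resched if our priority dropped below a queued task */
	} else {
		/* queued but not running: e.g. preempt curr if we now beat it */
	}
}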
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 2bf6b47058c1..1ba2bd40fdac 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq)
59{ 59{
60} 60}
61 61
62static void switched_to_stop(struct rq *rq, struct task_struct *p, 62static void switched_to_stop(struct rq *rq, struct task_struct *p)
63 int running)
64{ 63{
65 BUG(); /* its impossible to change to this class */ 64 BUG(); /* its impossible to change to this class */
66} 65}
67 66
68static void prio_changed_stop(struct rq *rq, struct task_struct *p, 67static void
69 int oldprio, int running) 68prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
70{ 69{
71 BUG(); /* how!?, what priority? */ 70 BUG(); /* how!?, what priority? */
72} 71}
@@ -103,6 +102,4 @@ static const struct sched_class stop_sched_class = {
103 102
104 .prio_changed = prio_changed_stop, 103 .prio_changed = prio_changed_stop,
105 .switched_to = switched_to_stop, 104 .switched_to = switched_to_stop,
106
107 /* no .task_new for stop tasks */
108}; 105};
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff10fdce..7165af5f1b11 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -226,7 +226,7 @@ static inline void print_dropped_signal(int sig)
226/* 226/*
227 * allocate a new signal queue record 227 * allocate a new signal queue record
228 * - this may be called without locks if and only if t == current, otherwise an 228 * - this may be called without locks if and only if t == current, otherwise an
229 * appopriate lock must be held to stop the target task from exiting 229 * appropriate lock must be held to stop the target task from exiting
230 */ 230 */
231static struct sigqueue * 231static struct sigqueue *
232__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) 232__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
@@ -375,15 +375,15 @@ int unhandled_signal(struct task_struct *tsk, int sig)
375 return !tracehook_consider_fatal_signal(tsk, sig); 375 return !tracehook_consider_fatal_signal(tsk, sig);
376} 376}
377 377
378 378/*
379/* Notify the system that a driver wants to block all signals for this 379 * Notify the system that a driver wants to block all signals for this
380 * process, and wants to be notified if any signals at all were to be 380 * process, and wants to be notified if any signals at all were to be
381 * sent/acted upon. If the notifier routine returns non-zero, then the 381 * sent/acted upon. If the notifier routine returns non-zero, then the
382 * signal will be acted upon after all. If the notifier routine returns 0, 382 * signal will be acted upon after all. If the notifier routine returns 0,
383 * then then signal will be blocked. Only one block per process is 383 * then then signal will be blocked. Only one block per process is
384 * allowed. priv is a pointer to private data that the notifier routine 384 * allowed. priv is a pointer to private data that the notifier routine
385 * can use to determine if the signal should be blocked or not. */ 385 * can use to determine if the signal should be blocked or not.
386 386 */
387void 387void
388block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) 388block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
389{ 389{
@@ -434,9 +434,10 @@ still_pending:
434 copy_siginfo(info, &first->info); 434 copy_siginfo(info, &first->info);
435 __sigqueue_free(first); 435 __sigqueue_free(first);
436 } else { 436 } else {
437 /* Ok, it wasn't in the queue. This must be 437 /*
438 a fast-pathed signal or we must have been 438 * Ok, it wasn't in the queue. This must be
439 out of queue space. So zero out the info. 439 * a fast-pathed signal or we must have been
440 * out of queue space. So zero out the info.
440 */ 441 */
441 info->si_signo = sig; 442 info->si_signo = sig;
442 info->si_errno = 0; 443 info->si_errno = 0;
@@ -468,7 +469,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
468} 469}
469 470
470/* 471/*
471 * Dequeue a signal and return the element to the caller, which is 472 * Dequeue a signal and return the element to the caller, which is
472 * expected to free it. 473 * expected to free it.
473 * 474 *
474 * All callers have to hold the siglock. 475 * All callers have to hold the siglock.
@@ -490,7 +491,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
490 * itimers are process shared and we restart periodic 491 * itimers are process shared and we restart periodic
491 * itimers in the signal delivery path to prevent DoS 492 * itimers in the signal delivery path to prevent DoS
492 * attacks in the high resolution timer case. This is 493 * attacks in the high resolution timer case. This is
493 * compliant with the old way of self restarting 494 * compliant with the old way of self-restarting
494 * itimers, as the SIGALRM is a legacy signal and only 495 * itimers, as the SIGALRM is a legacy signal and only
495 * queued once. Changing the restart behaviour to 496 * queued once. Changing the restart behaviour to
496 * restart the timer in the signal dequeue path is 497 * restart the timer in the signal dequeue path is
@@ -636,13 +637,33 @@ static inline bool si_fromuser(const struct siginfo *info)
636} 637}
637 638
638/* 639/*
640 * called with RCU read lock from check_kill_permission()
641 */
642static int kill_ok_by_cred(struct task_struct *t)
643{
644 const struct cred *cred = current_cred();
645 const struct cred *tcred = __task_cred(t);
646
647 if (cred->user->user_ns == tcred->user->user_ns &&
648 (cred->euid == tcred->suid ||
649 cred->euid == tcred->uid ||
650 cred->uid == tcred->suid ||
651 cred->uid == tcred->uid))
652 return 1;
653
654 if (ns_capable(tcred->user->user_ns, CAP_KILL))
655 return 1;
656
657 return 0;
658}
659
660/*
639 * Bad permissions for sending the signal 661 * Bad permissions for sending the signal
640 * - the caller must hold the RCU read lock 662 * - the caller must hold the RCU read lock
641 */ 663 */
642static int check_kill_permission(int sig, struct siginfo *info, 664static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 665 struct task_struct *t)
644{ 666{
645 const struct cred *cred, *tcred;
646 struct pid *sid; 667 struct pid *sid;
647 int error; 668 int error;
648 669
@@ -656,14 +677,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
656 if (error) 677 if (error)
657 return error; 678 return error;
658 679
659 cred = current_cred();
660 tcred = __task_cred(t);
661 if (!same_thread_group(current, t) && 680 if (!same_thread_group(current, t) &&
662 (cred->euid ^ tcred->suid) && 681 !kill_ok_by_cred(t)) {
663 (cred->euid ^ tcred->uid) &&
664 (cred->uid ^ tcred->suid) &&
665 (cred->uid ^ tcred->uid) &&
666 !capable(CAP_KILL)) {
667 switch (sig) { 682 switch (sig) {
668 case SIGCONT: 683 case SIGCONT:
669 sid = task_session(t); 684 sid = task_session(t);
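The permission hunk above folds the open-coded XOR chain into kill_ok_by_cred(), adds the requirement that sender and target share a user namespace, and switches the capability fallback from capable(CAP_KILL) to ns_capable() against the target's namespace. The XOR form denied when every id pair differed; the new form allows when any pair matches, which is the same predicate written positively. A small standalone check of that equivalence (plain userspace C, names invented for the test):

#include <assert.h>
#include <stdio.h>

/* Old kernel test: deny when *all* XORs are non-zero (no id pair matches). */
static int old_deny(unsigned euid, unsigned uid, unsigned tsuid, unsigned tuid)
{
	return (euid ^ tsuid) && (euid ^ tuid) &&
	       (uid  ^ tsuid) && (uid  ^ tuid);
}

/* New kernel test: allow when *any* id pair matches. */
static int new_allow(unsigned euid, unsigned uid, unsigned tsuid, unsigned tuid)
{
	return euid == tsuid || euid == tuid ||
	       uid  == tsuid || uid  == tuid;
}

int main(void)
{
	unsigned a, b, c, d;

	for (a = 0; a < 4; a++)
		for (b = 0; b < 4; b++)
			for (c = 0; c < 4; c++)
				for (d = 0; d < 4; d++)
					assert(old_deny(a, b, c, d) ==
					       !new_allow(a, b, c, d));
	printf("XOR-chain denial == negation of equality-chain allow\n");
	return 0;
}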
@@ -909,14 +924,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
909 if (info == SEND_SIG_FORCED) 924 if (info == SEND_SIG_FORCED)
910 goto out_set; 925 goto out_set;
911 926
912 /* Real-time signals must be queued if sent by sigqueue, or 927 /*
913 some other real-time mechanism. It is implementation 928 * Real-time signals must be queued if sent by sigqueue, or
914 defined whether kill() does so. We attempt to do so, on 929 * some other real-time mechanism. It is implementation
915 the principle of least surprise, but since kill is not 930 * defined whether kill() does so. We attempt to do so, on
916 allowed to fail with EAGAIN when low on memory we just 931 * the principle of least surprise, but since kill is not
917 make sure at least one signal gets delivered and don't 932 * allowed to fail with EAGAIN when low on memory we just
918 pass on the info struct. */ 933 * make sure at least one signal gets delivered and don't
919 934 * pass on the info struct.
935 */
920 if (sig < SIGRTMIN) 936 if (sig < SIGRTMIN)
921 override_rlimit = (is_si_special(info) || info->si_code >= 0); 937 override_rlimit = (is_si_special(info) || info->si_code >= 0);
922 else 938 else
@@ -1187,8 +1203,7 @@ retry:
1187 return error; 1203 return error;
1188} 1204}
1189 1205
1190int 1206int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1191kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1192{ 1207{
1193 int error; 1208 int error;
1194 rcu_read_lock(); 1209 rcu_read_lock();
@@ -1285,8 +1300,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1285 * These are for backward compatibility with the rest of the kernel source. 1300 * These are for backward compatibility with the rest of the kernel source.
1286 */ 1301 */
1287 1302
1288int 1303int send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1289send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1290{ 1304{
1291 /* 1305 /*
1292 * Make sure legacy kernel users don't send in bad values 1306 * Make sure legacy kernel users don't send in bad values
@@ -1354,7 +1368,7 @@ EXPORT_SYMBOL(kill_pid);
1354 * These functions support sending signals using preallocated sigqueue 1368 * These functions support sending signals using preallocated sigqueue
1355 * structures. This is needed "because realtime applications cannot 1369 * structures. This is needed "because realtime applications cannot
1356 * afford to lose notifications of asynchronous events, like timer 1370 * afford to lose notifications of asynchronous events, like timer
1357 * expirations or I/O completions". In the case of Posix Timers 1371 * expirations or I/O completions". In the case of POSIX Timers
1358 * we allocate the sigqueue structure from the timer_create. If this 1372 * we allocate the sigqueue structure from the timer_create. If this
1359 * allocation fails we are able to report the failure to the application 1373 * allocation fails we are able to report the failure to the application
1360 * with an EAGAIN error. 1374 * with an EAGAIN error.
@@ -1539,7 +1553,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1539 info.si_signo = SIGCHLD; 1553 info.si_signo = SIGCHLD;
1540 info.si_errno = 0; 1554 info.si_errno = 0;
1541 /* 1555 /*
1542 * see comment in do_notify_parent() abot the following 3 lines 1556 * see comment in do_notify_parent() about the following 4 lines
1543 */ 1557 */
1544 rcu_read_lock(); 1558 rcu_read_lock();
1545 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1559 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
@@ -1597,7 +1611,7 @@ static inline int may_ptrace_stop(void)
1597} 1611}
1598 1612
1599/* 1613/*
1600 * Return nonzero if there is a SIGKILL that should be waking us up. 1614 * Return non-zero if there is a SIGKILL that should be waking us up.
1601 * Called with the siglock held. 1615 * Called with the siglock held.
1602 */ 1616 */
1603static int sigkill_pending(struct task_struct *tsk) 1617static int sigkill_pending(struct task_struct *tsk)
@@ -1721,7 +1735,7 @@ void ptrace_notify(int exit_code)
1721/* 1735/*
1722 * This performs the stopping for SIGSTOP and other stop signals. 1736 * This performs the stopping for SIGSTOP and other stop signals.
1723 * We have to stop all threads in the thread group. 1737 * We have to stop all threads in the thread group.
1724 * Returns nonzero if we've actually stopped and released the siglock. 1738 * Returns non-zero if we've actually stopped and released the siglock.
1725 * Returns zero if we didn't stop and still hold the siglock. 1739 * Returns zero if we didn't stop and still hold the siglock.
1726 */ 1740 */
1727static int do_signal_stop(int signr) 1741static int do_signal_stop(int signr)
@@ -1809,10 +1823,12 @@ static int ptrace_signal(int signr, siginfo_t *info,
1809 1823
1810 current->exit_code = 0; 1824 current->exit_code = 0;
1811 1825
1812 /* Update the siginfo structure if the signal has 1826 /*
1813 changed. If the debugger wanted something 1827 * Update the siginfo structure if the signal has
1814 specific in the siginfo structure then it should 1828 * changed. If the debugger wanted something
1815 have updated *info via PTRACE_SETSIGINFO. */ 1829 * specific in the siginfo structure then it should
1830 * have updated *info via PTRACE_SETSIGINFO.
1831 */
1816 if (signr != info->si_signo) { 1832 if (signr != info->si_signo) {
1817 info->si_signo = signr; 1833 info->si_signo = signr;
1818 info->si_errno = 0; 1834 info->si_errno = 0;
@@ -1871,7 +1887,7 @@ relock:
1871 for (;;) { 1887 for (;;) {
1872 struct k_sigaction *ka; 1888 struct k_sigaction *ka;
1873 /* 1889 /*
1874 * Tracing can induce an artifical signal and choose sigaction. 1890 * Tracing can induce an artificial signal and choose sigaction.
1875 * The return value in @signr determines the default action, 1891 * The return value in @signr determines the default action,
1876 * but @info->si_signo is the signal number we will report. 1892 * but @info->si_signo is the signal number we will report.
1877 */ 1893 */
@@ -2020,7 +2036,8 @@ void exit_signals(struct task_struct *tsk)
2020 if (!signal_pending(tsk)) 2036 if (!signal_pending(tsk))
2021 goto out; 2037 goto out;
2022 2038
2023 /* It could be that __group_complete_signal() choose us to 2039 /*
2040 * It could be that __group_complete_signal() choose us to
2024 * notify about group-wide signal. Another thread should be 2041 * notify about group-wide signal. Another thread should be
2025 * woken now to take the signal since we will not. 2042 * woken now to take the signal since we will not.
2026 */ 2043 */
@@ -2058,6 +2075,9 @@ EXPORT_SYMBOL(unblock_all_signals);
2058 * System call entry points. 2075 * System call entry points.
2059 */ 2076 */
2060 2077
2078/**
2079 * sys_restart_syscall - restart a system call
2080 */
2061SYSCALL_DEFINE0(restart_syscall) 2081SYSCALL_DEFINE0(restart_syscall)
2062{ 2082{
2063 struct restart_block *restart = &current_thread_info()->restart_block; 2083 struct restart_block *restart = &current_thread_info()->restart_block;
@@ -2111,6 +2131,13 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2111 return error; 2131 return error;
2112} 2132}
2113 2133
2134/**
2135 * sys_rt_sigprocmask - change the list of currently blocked signals
2136 * @how: whether to add, remove, or set signals
2137 * @set: stores pending signals
2138 * @oset: previous value of signal mask if non-null
2139 * @sigsetsize: size of sigset_t type
2140 */
2114SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, 2141SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set,
2115 sigset_t __user *, oset, size_t, sigsetsize) 2142 sigset_t __user *, oset, size_t, sigsetsize)
2116{ 2143{
@@ -2169,8 +2196,14 @@ long do_sigpending(void __user *set, unsigned long sigsetsize)
2169 2196
2170out: 2197out:
2171 return error; 2198 return error;
2172} 2199}
2173 2200
2201/**
2202 * sys_rt_sigpending - examine a pending signal that has been raised
2203 * while blocked
2204 * @set: stores pending signals
2205 * @sigsetsize: size of sigset_t type or larger
2206 */
2174SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) 2207SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize)
2175{ 2208{
2176 return do_sigpending(set, sigsetsize); 2209 return do_sigpending(set, sigsetsize);
@@ -2219,9 +2252,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2219 err |= __put_user(from->si_trapno, &to->si_trapno); 2252 err |= __put_user(from->si_trapno, &to->si_trapno);
2220#endif 2253#endif
2221#ifdef BUS_MCEERR_AO 2254#ifdef BUS_MCEERR_AO
2222 /* 2255 /*
2223 * Other callers might not initialize the si_lsb field, 2256 * Other callers might not initialize the si_lsb field,
2224 * so check explicitely for the right codes here. 2257 * so check explicitly for the right codes here.
2225 */ 2258 */
2226 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) 2259 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
2227 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); 2260 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
@@ -2250,6 +2283,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2250 2283
2251#endif 2284#endif
2252 2285
2286/**
2287 * sys_rt_sigtimedwait - synchronously wait for queued signals specified
2288 * in @uthese
2289 * @uthese: queued signals to wait for
2290 * @uinfo: if non-null, the signal's siginfo is returned here
2291 * @uts: upper bound on process time suspension
2292 * @sigsetsize: size of sigset_t type
2293 */
2253SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, 2294SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2254 siginfo_t __user *, uinfo, const struct timespec __user *, uts, 2295 siginfo_t __user *, uinfo, const struct timespec __user *, uts,
2255 size_t, sigsetsize) 2296 size_t, sigsetsize)
@@ -2266,7 +2307,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2266 2307
2267 if (copy_from_user(&these, uthese, sizeof(these))) 2308 if (copy_from_user(&these, uthese, sizeof(these)))
2268 return -EFAULT; 2309 return -EFAULT;
2269 2310
2270 /* 2311 /*
2271 * Invert the set of allowed signals to get those we 2312 * Invert the set of allowed signals to get those we
2272 * want to block. 2313 * want to block.
@@ -2291,9 +2332,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2291 + (ts.tv_sec || ts.tv_nsec)); 2332 + (ts.tv_sec || ts.tv_nsec));
2292 2333
2293 if (timeout) { 2334 if (timeout) {
2294 /* None ready -- temporarily unblock those we're 2335 /*
2336 * None ready -- temporarily unblock those we're
2295 * interested while we are sleeping in so that we'll 2337 * interested while we are sleeping in so that we'll
2296 * be awakened when they arrive. */ 2338 * be awakened when they arrive.
2339 */
2297 current->real_blocked = current->blocked; 2340 current->real_blocked = current->blocked;
2298 sigandsets(&current->blocked, &current->blocked, &these); 2341 sigandsets(&current->blocked, &current->blocked, &these);
2299 recalc_sigpending(); 2342 recalc_sigpending();
@@ -2325,6 +2368,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2325 return ret; 2368 return ret;
2326} 2369}
2327 2370
2371/**
2372 * sys_kill - send a signal to a process
2373 * @pid: the PID of the process
2374 * @sig: signal to be sent
2375 */
2328SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) 2376SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2329{ 2377{
2330 struct siginfo info; 2378 struct siginfo info;
@@ -2400,7 +2448,11 @@ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
2400 return do_tkill(tgid, pid, sig); 2448 return do_tkill(tgid, pid, sig);
2401} 2449}
2402 2450
2403/* 2451/**
2452 * sys_tkill - send signal to one specific task
2453 * @pid: the PID of the task
2454 * @sig: signal to be sent
2455 *
2404 * Send a signal to only one task, even if it's a CLONE_THREAD task. 2456 * Send a signal to only one task, even if it's a CLONE_THREAD task.
2405 */ 2457 */
2406SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) 2458SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
@@ -2412,6 +2464,12 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
2412 return do_tkill(0, pid, sig); 2464 return do_tkill(0, pid, sig);
2413} 2465}
2414 2466
2467/**
2468 * sys_rt_sigqueueinfo - send signal information to a signal
2469 * @pid: the PID of the thread
2470 * @sig: signal to be sent
2471 * @uinfo: signal info to be sent
2472 */
2415SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, 2473SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2416 siginfo_t __user *, uinfo) 2474 siginfo_t __user *, uinfo)
2417{ 2475{
@@ -2421,9 +2479,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2421 return -EFAULT; 2479 return -EFAULT;
2422 2480
2423 /* Not even root can pretend to send signals from the kernel. 2481 /* Not even root can pretend to send signals from the kernel.
2424 Nor can they impersonate a kill(), which adds source info. */ 2482 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2425 if (info.si_code >= 0) 2483 */
2484 if (info.si_code >= 0 || info.si_code == SI_TKILL) {
2485 /* We used to allow any < 0 si_code */
2486 WARN_ON_ONCE(info.si_code < 0);
2426 return -EPERM; 2487 return -EPERM;
2488 }
2427 info.si_signo = sig; 2489 info.si_signo = sig;
2428 2490
2429 /* POSIX.1b doesn't mention process groups. */ 2491 /* POSIX.1b doesn't mention process groups. */
@@ -2437,9 +2499,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2437 return -EINVAL; 2499 return -EINVAL;
2438 2500
2439 /* Not even root can pretend to send signals from the kernel. 2501 /* Not even root can pretend to send signals from the kernel.
2440 Nor can they impersonate a kill(), which adds source info. */ 2502 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2441 if (info->si_code >= 0) 2503 */
2504 if (info->si_code >= 0 || info->si_code == SI_TKILL) {
2505 /* We used to allow any < 0 si_code */
2506 WARN_ON_ONCE(info->si_code < 0);
2442 return -EPERM; 2507 return -EPERM;
2508 }
2443 info->si_signo = sig; 2509 info->si_signo = sig;
2444 2510
2445 return do_send_specific(tgid, pid, sig, info); 2511 return do_send_specific(tgid, pid, sig, info);
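Both rt_sigqueueinfo paths above now treat SI_TKILL like the non-negative si_code values: a non-negative code means "generated by the kernel" and SI_TKILL is reserved for tkill()/tgkill(), so userspace can forge neither, and the WARN_ON_ONCE flags any caller that still relied on the old "any negative si_code" rule. A hedged userspace illustration of the visible effect (the syscall is made directly because glibc's sigqueue() always uses SI_QUEUE):

#define _GNU_SOURCE
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	siginfo_t info;

	signal(SIGUSR1, SIG_IGN);
	memset(&info, 0, sizeof(info));
	info.si_signo = SIGUSR1;
	info.si_pid   = getpid();
	info.si_uid   = getuid();

	info.si_code = SI_TKILL;	/* pretend to be tkill(): now rejected */
	if (syscall(SYS_rt_sigqueueinfo, getpid(), SIGUSR1, &info) < 0)
		printf("SI_TKILL: %s (expected EPERM)\n", strerror(errno));

	info.si_code = SI_QUEUE;	/* ordinary sigqueue()-style code: ok */
	if (syscall(SYS_rt_sigqueueinfo, getpid(), SIGUSR1, &info) == 0)
		printf("SI_QUEUE: accepted\n");
	return 0;
}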
@@ -2531,12 +2597,11 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2531 2597
2532 error = -EINVAL; 2598 error = -EINVAL;
2533 /* 2599 /*
2534 * 2600 * Note - this code used to test ss_flags incorrectly:
2535 * Note - this code used to test ss_flags incorrectly
2536 * old code may have been written using ss_flags==0 2601 * old code may have been written using ss_flags==0
2537 * to mean ss_flags==SS_ONSTACK (as this was the only 2602 * to mean ss_flags==SS_ONSTACK (as this was the only
2538 * way that worked) - this fix preserves that older 2603 * way that worked) - this fix preserves that older
2539 * mechanism 2604 * mechanism.
2540 */ 2605 */
2541 if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) 2606 if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
2542 goto out; 2607 goto out;
@@ -2570,6 +2635,10 @@ out:
2570 2635
2571#ifdef __ARCH_WANT_SYS_SIGPENDING 2636#ifdef __ARCH_WANT_SYS_SIGPENDING
2572 2637
2638/**
2639 * sys_sigpending - examine pending signals
2640 * @set: where mask of pending signal is returned
2641 */
2573SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) 2642SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
2574{ 2643{
2575 return do_sigpending(set, sizeof(*set)); 2644 return do_sigpending(set, sizeof(*set));
@@ -2578,8 +2647,15 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
2578#endif 2647#endif
2579 2648
2580#ifdef __ARCH_WANT_SYS_SIGPROCMASK 2649#ifdef __ARCH_WANT_SYS_SIGPROCMASK
2581/* Some platforms have their own version with special arguments others 2650/**
2582 support only sys_rt_sigprocmask. */ 2651 * sys_sigprocmask - examine and change blocked signals
2652 * @how: whether to add, remove, or set signals
2653 * @set: signals to add or remove (if non-null)
2654 * @oset: previous value of signal mask if non-null
2655 *
2656 * Some platforms have their own version with special arguments;
2657 * others support only sys_rt_sigprocmask.
2658 */
2583 2659
2584SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, 2660SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set,
2585 old_sigset_t __user *, oset) 2661 old_sigset_t __user *, oset)
@@ -2632,6 +2708,13 @@ out:
2632#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ 2708#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
2633 2709
2634#ifdef __ARCH_WANT_SYS_RT_SIGACTION 2710#ifdef __ARCH_WANT_SYS_RT_SIGACTION
2711/**
2712 * sys_rt_sigaction - alter an action taken by a process
2713 * @sig: signal to be sent
2714 * @act: new sigaction
2715 * @oact: used to save the previous sigaction
2716 * @sigsetsize: size of sigset_t type
2717 */
2635SYSCALL_DEFINE4(rt_sigaction, int, sig, 2718SYSCALL_DEFINE4(rt_sigaction, int, sig,
2636 const struct sigaction __user *, act, 2719 const struct sigaction __user *, act,
2637 struct sigaction __user *, oact, 2720 struct sigaction __user *, oact,
@@ -2718,6 +2801,12 @@ SYSCALL_DEFINE0(pause)
2718#endif 2801#endif
2719 2802
2720#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND 2803#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
2804/**
2805 * sys_rt_sigsuspend - replace the signal mask for a value with the
2806 * @unewset value until a signal is received
2807 * @unewset: new signal mask value
2808 * @sigsetsize: size of sigset_t type
2809 */
2721SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) 2810SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
2722{ 2811{
2723 sigset_t newset; 2812 sigset_t newset;
diff --git a/kernel/smp.c b/kernel/smp.c
index 9910744f0856..73a195193558 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -194,7 +194,7 @@ void generic_smp_call_function_interrupt(void)
194 */ 194 */
195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
196 int refs; 196 int refs;
197 void (*func) (void *info); 197 smp_call_func_t func;
198 198
199 /* 199 /*
200 * Since we walk the list without any locks, we might 200 * Since we walk the list without any locks, we might
@@ -214,17 +214,17 @@ void generic_smp_call_function_interrupt(void)
214 if (atomic_read(&data->refs) == 0) 214 if (atomic_read(&data->refs) == 0)
215 continue; 215 continue;
216 216
217 func = data->csd.func; /* for later warn */ 217 func = data->csd.func; /* save for later warn */
218 data->csd.func(data->csd.info); 218 func(data->csd.info);
219 219
220 /* 220 /*
221 * If the cpu mask is not still set then it enabled interrupts, 221 * If the cpu mask is not still set then func enabled
222 * we took another smp interrupt, and executed the function 222 * interrupts (BUG), and this cpu took another smp call
223 * twice on this cpu. In theory that copy decremented refs. 223 * function interrupt and executed func(info) twice
224 * on this cpu. That nested execution decremented refs.
224 */ 225 */
225 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { 226 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
226 WARN(1, "%pS enabled interrupts and double executed\n", 227 WARN(1, "%pf enabled interrupts and double executed\n", func);
227 func);
228 continue; 228 continue;
229 } 229 }
230 230
@@ -450,7 +450,7 @@ void smp_call_function_many(const struct cpumask *mask,
450{ 450{
451 struct call_function_data *data; 451 struct call_function_data *data;
452 unsigned long flags; 452 unsigned long flags;
453 int cpu, next_cpu, this_cpu = smp_processor_id(); 453 int refs, cpu, next_cpu, this_cpu = smp_processor_id();
454 454
455 /* 455 /*
456 * Can deadlock when called with interrupts disabled. 456 * Can deadlock when called with interrupts disabled.
@@ -461,7 +461,7 @@ void smp_call_function_many(const struct cpumask *mask,
461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
462 && !oops_in_progress && !early_boot_irqs_disabled); 462 && !oops_in_progress && !early_boot_irqs_disabled);
463 463
464 /* So, what's a CPU they want? Ignoring this one. */ 464 /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
465 cpu = cpumask_first_and(mask, cpu_online_mask); 465 cpu = cpumask_first_and(mask, cpu_online_mask);
466 if (cpu == this_cpu) 466 if (cpu == this_cpu)
467 cpu = cpumask_next_and(cpu, mask, cpu_online_mask); 467 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
@@ -483,22 +483,49 @@ void smp_call_function_many(const struct cpumask *mask,
483 483
484 data = &__get_cpu_var(cfd_data); 484 data = &__get_cpu_var(cfd_data);
485 csd_lock(&data->csd); 485 csd_lock(&data->csd);
486
487 /* This BUG_ON verifies our reuse assertions and can be removed */
486 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); 488 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
487 489
490 /*
491 * The global call function queue list add and delete are protected
492 * by a lock, but the list is traversed without any lock, relying
493 * on the rcu list add and delete to allow safe concurrent traversal.
494 * We reuse the call function data without waiting for any grace
495 * period after some other cpu removes it from the global queue.
496 * This means a cpu might find our data block as it is being
497 * filled out.
498 *
499 * We hold off the interrupt handler on the other cpu by
500 * ordering our writes to the cpu mask vs our setting of the
501 * refs counter. We assert only the cpu owning the data block
502 * will set a bit in cpumask, and each bit will only be cleared
503 * by the subject cpu. Each cpu must first find its bit is
504 * set and then check that refs is set indicating the element is
505 * ready to be processed, otherwise it must skip the entry.
506 *
507 * On the previous iteration refs was set to 0 by another cpu.
508 * To avoid the use of transitivity, set the counter to 0 here
509 * so the wmb will pair with the rmb in the interrupt handler.
510 */
511 atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
512
488 data->csd.func = func; 513 data->csd.func = func;
489 data->csd.info = info; 514 data->csd.info = info;
490 cpumask_and(data->cpumask, mask, cpu_online_mask);
491 cpumask_clear_cpu(this_cpu, data->cpumask);
492 515
493 /* 516 /* Ensure 0 refs is visible before mask. Also orders func and info */
494 * To ensure the interrupt handler gets an complete view
495 * we order the cpumask and refs writes and order the read
496 * of them in the interrupt handler. In addition we may
497 * only clear our own cpu bit from the mask.
498 */
499 smp_wmb(); 517 smp_wmb();
500 518
501 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 519 /* We rely on the "and" being processed before the store */
520 cpumask_and(data->cpumask, mask, cpu_online_mask);
521 cpumask_clear_cpu(this_cpu, data->cpumask);
522 refs = cpumask_weight(data->cpumask);
523
524 /* Some callers race with other cpus changing the passed mask */
525 if (unlikely(!refs)) {
526 csd_unlock(&data->csd);
527 return;
528 }
502 529
503 raw_spin_lock_irqsave(&call_function.lock, flags); 530 raw_spin_lock_irqsave(&call_function.lock, flags);
504 /* 531 /*
@@ -507,6 +534,12 @@ void smp_call_function_many(const struct cpumask *mask,
507 * will not miss any other list entries: 534 * will not miss any other list entries:
508 */ 535 */
509 list_add_rcu(&data->csd.list, &call_function.queue); 536 list_add_rcu(&data->csd.list, &call_function.queue);
537 /*
538 * We rely on the wmb() in list_add_rcu to complete our writes
539 * to the cpumask before this write to refs, which indicates
540 * data is on the list and is ready to be processed.
541 */
542 atomic_set(&data->refs, refs);
510 raw_spin_unlock_irqrestore(&call_function.lock, flags); 543 raw_spin_unlock_irqrestore(&call_function.lock, flags);
511 544
512 /* 545 /*
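The long comment added above documents the race being closed: the per-cpu call_function_data element is reused without waiting for an RCU grace period, so a cpu walking the queue can see it mid-update. The fix is a strict publication order: refs is forced to 0, func/info are written, smp_wmb() orders them before the cpumask store, list_add_rcu() provides the barrier before the final refs store, and only a non-zero refs marks the element ready; readers test their cpumask bit and then refs in the opposite order. A self-contained userspace sketch of the same "payload first, ready-flag last" idea using C11 atomics (the kernel code uses smp_wmb()/atomic_set() instead; the names here are invented):

#include <stdatomic.h>
#include <stdio.h>

struct element {
	void (*func)(void *info);
	void *info;
	atomic_uint refs;	/* 0 means "not published yet, skip me" */
};

static void publish(struct element *e, void (*func)(void *), void *info,
		    unsigned int nr_targets)
{
	atomic_store_explicit(&e->refs, 0, memory_order_relaxed);
	e->func = func;
	e->info = info;
	/* release: payload writes become visible before refs turns non-zero */
	atomic_store_explicit(&e->refs, nr_targets, memory_order_release);
}

static void consume(struct element *e)
{
	/* acquire pairs with the release store in publish() */
	if (!atomic_load_explicit(&e->refs, memory_order_acquire))
		return;		/* half-initialized element: skip it */
	e->func(e->info);
	atomic_fetch_sub_explicit(&e->refs, 1, memory_order_release);
}

static void hello(void *info)
{
	printf("called with \"%s\"\n", (const char *)info);
}

int main(void)
{
	struct element e = { 0 };

	publish(&e, hello, "payload", 1);
	consume(&e);
	return 0;
}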
@@ -571,6 +604,87 @@ void ipi_call_unlock_irq(void)
571} 604}
572#endif /* USE_GENERIC_SMP_HELPERS */ 605#endif /* USE_GENERIC_SMP_HELPERS */
573 606
607/* Setup configured maximum number of CPUs to activate */
608unsigned int setup_max_cpus = NR_CPUS;
609EXPORT_SYMBOL(setup_max_cpus);
610
611
612/*
613 * Setup routine for controlling SMP activation
614 *
615 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
616 * activation entirely (the MPS table probe still happens, though).
617 *
618 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
619 * greater than 0, limits the maximum number of CPUs activated in
620 * SMP mode to <NUM>.
621 */
622
623void __weak arch_disable_smp_support(void) { }
624
625static int __init nosmp(char *str)
626{
627 setup_max_cpus = 0;
628 arch_disable_smp_support();
629
630 return 0;
631}
632
633early_param("nosmp", nosmp);
634
635/* this is hard limit */
636static int __init nrcpus(char *str)
637{
638 int nr_cpus;
639
640 get_option(&str, &nr_cpus);
641 if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
642 nr_cpu_ids = nr_cpus;
643
644 return 0;
645}
646
647early_param("nr_cpus", nrcpus);
648
649static int __init maxcpus(char *str)
650{
651 get_option(&str, &setup_max_cpus);
652 if (setup_max_cpus == 0)
653 arch_disable_smp_support();
654
655 return 0;
656}
657
658early_param("maxcpus", maxcpus);
659
660/* Setup number of possible processor ids */
661int nr_cpu_ids __read_mostly = NR_CPUS;
662EXPORT_SYMBOL(nr_cpu_ids);
663
664/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
665void __init setup_nr_cpu_ids(void)
666{
667 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
668}
669
670/* Called by boot processor to activate the rest. */
671void __init smp_init(void)
672{
673 unsigned int cpu;
674
675 /* FIXME: This should be done in userspace --RR */
676 for_each_present_cpu(cpu) {
677 if (num_online_cpus() >= setup_max_cpus)
678 break;
679 if (!cpu_online(cpu))
680 cpu_up(cpu);
681 }
682
683 /* Any cleanup work */
684 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
685 smp_cpus_done(setup_max_cpus);
686}
687
574/* 688/*
575 * Call a function on all processors. May be used during early boot while 689 * Call a function on all processors. May be used during early boot while
576 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead 690 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 68eb5efec388..174f976c2874 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat);
54 54
55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; 55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
56 56
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
@@ -311,9 +311,21 @@ void irq_enter(void)
311} 311}
312 312
313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
314# define invoke_softirq() __do_softirq() 314static inline void invoke_softirq(void)
315{
316 if (!force_irqthreads)
317 __do_softirq();
318 else
319 wakeup_softirqd();
320}
315#else 321#else
316# define invoke_softirq() do_softirq() 322static inline void invoke_softirq(void)
323{
324 if (!force_irqthreads)
325 do_softirq();
326 else
327 wakeup_softirqd();
328}
317#endif 329#endif
318 330
319/* 331/*
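invoke_softirq() now consults force_irqthreads: with forced interrupt threading enabled (a switch introduced by the irq rework elsewhere in this merge and controlled from the boot command line), softirqs are no longer run inline on hardirq exit but handed to ksoftirqd, and run_ksoftirqd below is adjusted to call __do_softirq() with interrupts disabled. A rough sketch of how such a boot-time switch is typically wired up with early_param(); this is an assumption for illustration, not the irq core's actual definition:

#include <linux/cache.h>
#include <linux/init.h>
#include <linux/types.h>

/* Illustrative flag; the real one is defined by the irq core. */
bool force_irqthreads_example __read_mostly;

static int __init setup_forced_irqthreads_example(char *arg)
{
	force_irqthreads_example = true;
	return 0;
}
/* e.g. booting with "threadirqs_example" on the kernel command line */
early_param("threadirqs_example", setup_forced_irqthreads_example);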
@@ -555,7 +567,7 @@ static void __tasklet_hrtimer_trampoline(unsigned long data)
555/** 567/**
556 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks 568 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
557 * @ttimer: tasklet_hrtimer which is initialized 569 * @ttimer: tasklet_hrtimer which is initialized
558 * @function: hrtimer callback funtion which gets called from softirq context 570 * @function: hrtimer callback function which gets called from softirq context
559 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) 571 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
560 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) 572 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
561 */ 573 */
@@ -721,7 +733,6 @@ static int run_ksoftirqd(void * __bind_cpu)
721{ 733{
722 set_current_state(TASK_INTERRUPTIBLE); 734 set_current_state(TASK_INTERRUPTIBLE);
723 735
724 current->flags |= PF_KSOFTIRQD;
725 while (!kthread_should_stop()) { 736 while (!kthread_should_stop()) {
726 preempt_disable(); 737 preempt_disable();
727 if (!local_softirq_pending()) { 738 if (!local_softirq_pending()) {
@@ -738,7 +749,10 @@ static int run_ksoftirqd(void * __bind_cpu)
738 don't process */ 749 don't process */
739 if (cpu_is_offline((long)__bind_cpu)) 750 if (cpu_is_offline((long)__bind_cpu))
740 goto wait_to_die; 751 goto wait_to_die;
741 do_softirq(); 752 local_irq_disable();
753 if (local_softirq_pending())
754 __do_softirq();
755 local_irq_enable();
742 preempt_enable_no_resched(); 756 preempt_enable_no_resched();
743 cond_resched(); 757 cond_resched();
744 preempt_disable(); 758 preempt_disable();
@@ -831,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
831 switch (action) { 845 switch (action) {
832 case CPU_UP_PREPARE: 846 case CPU_UP_PREPARE:
833 case CPU_UP_PREPARE_FROZEN: 847 case CPU_UP_PREPARE_FROZEN:
834 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 848 p = kthread_create_on_node(run_ksoftirqd,
849 hcpu,
850 cpu_to_node(hotcpu),
851 "ksoftirqd/%d", hotcpu);
835 if (IS_ERR(p)) { 852 if (IS_ERR(p)) {
836 printk("ksoftirqd for %i failed\n", hotcpu); 853 printk("ksoftirqd for %i failed\n", hotcpu);
837 return notifier_from_errno(PTR_ERR(p)); 854 return notifier_from_errno(PTR_ERR(p));
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2df820b03beb..e3516b29076c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
301 case CPU_UP_PREPARE: 301 case CPU_UP_PREPARE:
302 BUG_ON(stopper->thread || stopper->enabled || 302 BUG_ON(stopper->thread || stopper->enabled ||
303 !list_empty(&stopper->works)); 303 !list_empty(&stopper->works));
304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", 304 p = kthread_create_on_node(cpu_stopper_thread,
305 cpu); 305 stopper,
306 cpu_to_node(cpu),
307 "migration/%d", cpu);
306 if (IS_ERR(p)) 308 if (IS_ERR(p))
307 return notifier_from_errno(PTR_ERR(p)); 309 return notifier_from_errno(PTR_ERR(p));
308 get_task_struct(p); 310 get_task_struct(p);
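This hunk and the ksoftirqd hunk above make the same change: per-cpu kernel threads are created with kthread_create_on_node(), so the thread's stack and task_struct are allocated on the memory node of the CPU it will serve, with cpu_to_node() supplying that node. A minimal sketch of creating, binding and starting such a per-cpu helper (hypothetical thread function and name, error handling trimmed):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/topology.h>

static int my_percpu_worker(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static struct task_struct *start_worker_for(unsigned int cpu)
{
	struct task_struct *p;

	p = kthread_create_on_node(my_percpu_worker, NULL,
				   cpu_to_node(cpu), "myworker/%u", cpu);
	if (IS_ERR(p))
		return p;

	kthread_bind(p, cpu);	/* run only on the cpu it serves */
	wake_up_process(p);
	return p;
}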
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702ec813..af468edf096a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -37,6 +37,7 @@
37#include <linux/ptrace.h> 37#include <linux/ptrace.h>
38#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
39#include <linux/gfp.h> 39#include <linux/gfp.h>
40#include <linux/syscore_ops.h>
40 41
41#include <linux/compat.h> 42#include <linux/compat.h>
42#include <linux/syscalls.h> 43#include <linux/syscalls.h>
@@ -119,16 +120,33 @@ EXPORT_SYMBOL(cad_pid);
119void (*pm_power_off_prepare)(void); 120void (*pm_power_off_prepare)(void);
120 121
121/* 122/*
123 * Returns true if current's euid is same as p's uid or euid,
124 * or has CAP_SYS_NICE to p's user_ns.
125 *
126 * Called with rcu_read_lock, creds are safe
127 */
128static bool set_one_prio_perm(struct task_struct *p)
129{
130 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
131
132 if (pcred->user->user_ns == cred->user->user_ns &&
133 (pcred->uid == cred->euid ||
134 pcred->euid == cred->euid))
135 return true;
136 if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
137 return true;
138 return false;
139}
140
141/*
122 * set the priority of a task 142 * set the priority of a task
123 * - the caller must hold the RCU read lock 143 * - the caller must hold the RCU read lock
124 */ 144 */
125static int set_one_prio(struct task_struct *p, int niceval, int error) 145static int set_one_prio(struct task_struct *p, int niceval, int error)
126{ 146{
127 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
128 int no_nice; 147 int no_nice;
129 148
130 if (pcred->uid != cred->euid && 149 if (!set_one_prio_perm(p)) {
131 pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
132 error = -EPERM; 150 error = -EPERM;
133 goto out; 151 goto out;
134 } 152 }
@@ -298,6 +316,7 @@ void kernel_restart_prepare(char *cmd)
298 system_state = SYSTEM_RESTART; 316 system_state = SYSTEM_RESTART;
299 device_shutdown(); 317 device_shutdown();
300 sysdev_shutdown(); 318 sysdev_shutdown();
319 syscore_shutdown();
301} 320}
302 321
303/** 322/**
@@ -336,6 +355,7 @@ void kernel_halt(void)
336{ 355{
337 kernel_shutdown_prepare(SYSTEM_HALT); 356 kernel_shutdown_prepare(SYSTEM_HALT);
338 sysdev_shutdown(); 357 sysdev_shutdown();
358 syscore_shutdown();
339 printk(KERN_EMERG "System halted.\n"); 359 printk(KERN_EMERG "System halted.\n");
340 kmsg_dump(KMSG_DUMP_HALT); 360 kmsg_dump(KMSG_DUMP_HALT);
341 machine_halt(); 361 machine_halt();
@@ -355,6 +375,7 @@ void kernel_power_off(void)
355 pm_power_off_prepare(); 375 pm_power_off_prepare();
356 disable_nonboot_cpus(); 376 disable_nonboot_cpus();
357 sysdev_shutdown(); 377 sysdev_shutdown();
378 syscore_shutdown();
358 printk(KERN_EMERG "Power down.\n"); 379 printk(KERN_EMERG "Power down.\n");
359 kmsg_dump(KMSG_DUMP_POWEROFF); 380 kmsg_dump(KMSG_DUMP_POWEROFF);
360 machine_power_off(); 381 machine_power_off();
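The restart/halt/power-off paths gain a syscore_shutdown() call right after sysdev_shutdown(): syscore_ops is the replacement for sysdev-class shutdown hooks, giving core code one last callback on the single remaining CPU. A hedged sketch of a hypothetical driver registering such a hook, so it runs from the paths touched above:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/syscore_ops.h>

static void mydev_syscore_shutdown(void)
{
	pr_info("mydev: quiescing hardware before power off\n");
}

static struct syscore_ops mydev_syscore_ops = {
	.shutdown = mydev_syscore_shutdown,
};

static int __init mydev_init(void)
{
	register_syscore_ops(&mydev_syscore_ops);
	return 0;
}
core_initcall(mydev_init);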
@@ -502,7 +523,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
502 if (rgid != (gid_t) -1) { 523 if (rgid != (gid_t) -1) {
503 if (old->gid == rgid || 524 if (old->gid == rgid ||
504 old->egid == rgid || 525 old->egid == rgid ||
505 capable(CAP_SETGID)) 526 nsown_capable(CAP_SETGID))
506 new->gid = rgid; 527 new->gid = rgid;
507 else 528 else
508 goto error; 529 goto error;
@@ -511,7 +532,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
511 if (old->gid == egid || 532 if (old->gid == egid ||
512 old->egid == egid || 533 old->egid == egid ||
513 old->sgid == egid || 534 old->sgid == egid ||
514 capable(CAP_SETGID)) 535 nsown_capable(CAP_SETGID))
515 new->egid = egid; 536 new->egid = egid;
516 else 537 else
517 goto error; 538 goto error;
@@ -546,7 +567,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
546 old = current_cred(); 567 old = current_cred();
547 568
548 retval = -EPERM; 569 retval = -EPERM;
549 if (capable(CAP_SETGID)) 570 if (nsown_capable(CAP_SETGID))
550 new->gid = new->egid = new->sgid = new->fsgid = gid; 571 new->gid = new->egid = new->sgid = new->fsgid = gid;
551 else if (gid == old->gid || gid == old->sgid) 572 else if (gid == old->gid || gid == old->sgid)
552 new->egid = new->fsgid = gid; 573 new->egid = new->fsgid = gid;
@@ -613,7 +634,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
613 new->uid = ruid; 634 new->uid = ruid;
614 if (old->uid != ruid && 635 if (old->uid != ruid &&
615 old->euid != ruid && 636 old->euid != ruid &&
616 !capable(CAP_SETUID)) 637 !nsown_capable(CAP_SETUID))
617 goto error; 638 goto error;
618 } 639 }
619 640
@@ -622,7 +643,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
622 if (old->uid != euid && 643 if (old->uid != euid &&
623 old->euid != euid && 644 old->euid != euid &&
624 old->suid != euid && 645 old->suid != euid &&
625 !capable(CAP_SETUID)) 646 !nsown_capable(CAP_SETUID))
626 goto error; 647 goto error;
627 } 648 }
628 649
@@ -670,7 +691,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
670 old = current_cred(); 691 old = current_cred();
671 692
672 retval = -EPERM; 693 retval = -EPERM;
673 if (capable(CAP_SETUID)) { 694 if (nsown_capable(CAP_SETUID)) {
674 new->suid = new->uid = uid; 695 new->suid = new->uid = uid;
675 if (uid != old->uid) { 696 if (uid != old->uid) {
676 retval = set_user(new); 697 retval = set_user(new);
@@ -712,7 +733,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
712 old = current_cred(); 733 old = current_cred();
713 734
714 retval = -EPERM; 735 retval = -EPERM;
715 if (!capable(CAP_SETUID)) { 736 if (!nsown_capable(CAP_SETUID)) {
716 if (ruid != (uid_t) -1 && ruid != old->uid && 737 if (ruid != (uid_t) -1 && ruid != old->uid &&
717 ruid != old->euid && ruid != old->suid) 738 ruid != old->euid && ruid != old->suid)
718 goto error; 739 goto error;
@@ -776,7 +797,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
776 old = current_cred(); 797 old = current_cred();
777 798
778 retval = -EPERM; 799 retval = -EPERM;
779 if (!capable(CAP_SETGID)) { 800 if (!nsown_capable(CAP_SETGID)) {
780 if (rgid != (gid_t) -1 && rgid != old->gid && 801 if (rgid != (gid_t) -1 && rgid != old->gid &&
781 rgid != old->egid && rgid != old->sgid) 802 rgid != old->egid && rgid != old->sgid)
782 goto error; 803 goto error;
@@ -836,7 +857,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
836 857
837 if (uid == old->uid || uid == old->euid || 858 if (uid == old->uid || uid == old->euid ||
838 uid == old->suid || uid == old->fsuid || 859 uid == old->suid || uid == old->fsuid ||
839 capable(CAP_SETUID)) { 860 nsown_capable(CAP_SETUID)) {
840 if (uid != old_fsuid) { 861 if (uid != old_fsuid) {
841 new->fsuid = uid; 862 new->fsuid = uid;
842 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 863 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -869,7 +890,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
869 890
870 if (gid == old->gid || gid == old->egid || 891 if (gid == old->gid || gid == old->egid ||
871 gid == old->sgid || gid == old->fsgid || 892 gid == old->sgid || gid == old->fsgid ||
872 capable(CAP_SETGID)) { 893 nsown_capable(CAP_SETGID)) {
873 if (gid != old_fsgid) { 894 if (gid != old_fsgid) {
874 new->fsgid = gid; 895 new->fsgid = gid;
875 goto change_okay; 896 goto change_okay;
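From setregid() through setfsgid() the privilege check changes from capable() to nsown_capable(), meaning the capability is evaluated against the caller's own user namespace rather than the initial one, so a namespace's "root" can change ids within its namespace without global privilege. Roughly, and hedged because the real helper lives in the capability code, it amounts to:

#include <linux/capability.h>
#include <linux/cred.h>

/* Rough equivalent of the helper used above, for illustration only. */
static inline bool nsown_capable_sketch(int cap)
{
	return ns_capable(current_user_ns(), cap);
}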
@@ -1177,8 +1198,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1177 int errno; 1198 int errno;
1178 char tmp[__NEW_UTS_LEN]; 1199 char tmp[__NEW_UTS_LEN];
1179 1200
1180 if (!capable(CAP_SYS_ADMIN)) 1201 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1181 return -EPERM; 1202 return -EPERM;
1203
1182 if (len < 0 || len > __NEW_UTS_LEN) 1204 if (len < 0 || len > __NEW_UTS_LEN)
1183 return -EINVAL; 1205 return -EINVAL;
1184 down_write(&uts_sem); 1206 down_write(&uts_sem);
@@ -1226,7 +1248,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1226 int errno; 1248 int errno;
1227 char tmp[__NEW_UTS_LEN]; 1249 char tmp[__NEW_UTS_LEN];
1228 1250
1229 if (!capable(CAP_SYS_ADMIN)) 1251 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1230 return -EPERM; 1252 return -EPERM;
1231 if (len < 0 || len > __NEW_UTS_LEN) 1253 if (len < 0 || len > __NEW_UTS_LEN)
1232 return -EINVAL; 1254 return -EINVAL;
@@ -1341,6 +1363,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
1341 rlim = tsk->signal->rlim + resource; 1363 rlim = tsk->signal->rlim + resource;
1342 task_lock(tsk->group_leader); 1364 task_lock(tsk->group_leader);
1343 if (new_rlim) { 1365 if (new_rlim) {
1366 /* Keep the capable check against init_user_ns until
1367 cgroups can contain all limits */
1344 if (new_rlim->rlim_max > rlim->rlim_max && 1368 if (new_rlim->rlim_max > rlim->rlim_max &&
1345 !capable(CAP_SYS_RESOURCE)) 1369 !capable(CAP_SYS_RESOURCE))
1346 retval = -EPERM; 1370 retval = -EPERM;
@@ -1384,19 +1408,22 @@ static int check_prlimit_permission(struct task_struct *task)
1384{ 1408{
1385 const struct cred *cred = current_cred(), *tcred; 1409 const struct cred *cred = current_cred(), *tcred;
1386 1410
1387 tcred = __task_cred(task); 1411 if (current == task)
1388 if (current != task && 1412 return 0;
1389 (cred->uid != tcred->euid ||
1390 cred->uid != tcred->suid ||
1391 cred->uid != tcred->uid ||
1392 cred->gid != tcred->egid ||
1393 cred->gid != tcred->sgid ||
1394 cred->gid != tcred->gid) &&
1395 !capable(CAP_SYS_RESOURCE)) {
1396 return -EPERM;
1397 }
1398 1413
1399 return 0; 1414 tcred = __task_cred(task);
1415 if (cred->user->user_ns == tcred->user->user_ns &&
1416 (cred->uid == tcred->euid &&
1417 cred->uid == tcred->suid &&
1418 cred->uid == tcred->uid &&
1419 cred->gid == tcred->egid &&
1420 cred->gid == tcred->sgid &&
1421 cred->gid == tcred->gid))
1422 return 0;
1423 if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
1424 return 0;
1425
1426 return -EPERM;
1400} 1427}
1401 1428
1402SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, 1429SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
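check_prlimit_permission() is rewritten from one negative compound condition into early returns: the caller itself is always allowed; a full uid/gid match is honored only when caller and target share a user namespace; otherwise CAP_SYS_RESOURCE is checked with ns_capable() in the target's namespace; anything else gets -EPERM. A userspace illustration of the caller-visible rule, assuming the glibc prlimit() wrapper (glibc 2.13 or later); on a foreign pid it fails unless the credential test above passes:

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/types.h>

int main(int argc, char **argv)
{
	pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : 1;	/* default: init */
	struct rlimit old;

	if (prlimit(pid, RLIMIT_NOFILE, NULL, &old) != 0) {
		/* e.g. EPERM when ids differ and CAP_SYS_RESOURCE is missing */
		fprintf(stderr, "prlimit(%d): %s\n", pid, strerror(errno));
		return 1;
	}
	printf("pid %d RLIMIT_NOFILE: cur=%llu max=%llu\n", (int)pid,
	       (unsigned long long)old.rlim_cur,
	       (unsigned long long)old.rlim_max);
	return 0;
}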
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c782fe9924c7..25cc41cd8f33 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -186,3 +186,8 @@ cond_syscall(sys_perf_event_open);
186/* fanotify! */ 186/* fanotify! */
187cond_syscall(sys_fanotify_init); 187cond_syscall(sys_fanotify_init);
188cond_syscall(sys_fanotify_mark); 188cond_syscall(sys_fanotify_mark);
189
190/* open by handle */
191cond_syscall(sys_name_to_handle_at);
192cond_syscall(sys_open_by_handle_at);
193cond_syscall(compat_sys_open_by_handle_at);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0f1bd83db985..c0bb32414b17 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -117,6 +117,7 @@ static int neg_one = -1;
117static int zero; 117static int zero;
118static int __maybe_unused one = 1; 118static int __maybe_unused one = 1;
119static int __maybe_unused two = 2; 119static int __maybe_unused two = 2;
120static int __maybe_unused three = 3;
120static unsigned long one_ul = 1; 121static unsigned long one_ul = 1;
121static int one_hundred = 100; 122static int one_hundred = 100;
122#ifdef CONFIG_PRINTK 123#ifdef CONFIG_PRINTK
@@ -169,6 +170,11 @@ static int proc_taint(struct ctl_table *table, int write,
169 void __user *buffer, size_t *lenp, loff_t *ppos); 170 void __user *buffer, size_t *lenp, loff_t *ppos);
170#endif 171#endif
171 172
173#ifdef CONFIG_PRINTK
174static int proc_dmesg_restrict(struct ctl_table *table, int write,
175 void __user *buffer, size_t *lenp, loff_t *ppos);
176#endif
177
172#ifdef CONFIG_MAGIC_SYSRQ 178#ifdef CONFIG_MAGIC_SYSRQ
173/* Note: sysrq code uses it's own private copy */ 179/* Note: sysrq code uses it's own private copy */
174static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 180static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -194,9 +200,9 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
194static struct ctl_table root_table[]; 200static struct ctl_table root_table[];
195static struct ctl_table_root sysctl_table_root; 201static struct ctl_table_root sysctl_table_root;
196static struct ctl_table_header root_table_header = { 202static struct ctl_table_header root_table_header = {
197 .count = 1, 203 {{.count = 1,
198 .ctl_table = root_table, 204 .ctl_table = root_table,
199 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), 205 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
200 .root = &sysctl_table_root, 206 .root = &sysctl_table_root,
201 .set = &sysctl_table_root.default_set, 207 .set = &sysctl_table_root.default_set,
202}; 208};
@@ -361,20 +367,13 @@ static struct ctl_table kern_table[] = {
361 .mode = 0644, 367 .mode = 0644,
362 .proc_handler = sched_rt_handler, 368 .proc_handler = sched_rt_handler,
363 }, 369 },
364 {
365 .procname = "sched_compat_yield",
366 .data = &sysctl_sched_compat_yield,
367 .maxlen = sizeof(unsigned int),
368 .mode = 0644,
369 .proc_handler = proc_dointvec,
370 },
371#ifdef CONFIG_SCHED_AUTOGROUP 370#ifdef CONFIG_SCHED_AUTOGROUP
372 { 371 {
373 .procname = "sched_autogroup_enabled", 372 .procname = "sched_autogroup_enabled",
374 .data = &sysctl_sched_autogroup_enabled, 373 .data = &sysctl_sched_autogroup_enabled,
375 .maxlen = sizeof(unsigned int), 374 .maxlen = sizeof(unsigned int),
376 .mode = 0644, 375 .mode = 0644,
377 .proc_handler = proc_dointvec, 376 .proc_handler = proc_dointvec_minmax,
378 .extra1 = &zero, 377 .extra1 = &zero,
379 .extra2 = &one, 378 .extra2 = &one,
380 }, 379 },
@@ -713,7 +712,7 @@ static struct ctl_table kern_table[] = {
713 .data = &kptr_restrict, 712 .data = &kptr_restrict,
714 .maxlen = sizeof(int), 713 .maxlen = sizeof(int),
715 .mode = 0644, 714 .mode = 0644,
716 .proc_handler = proc_dointvec_minmax, 715 .proc_handler = proc_dmesg_restrict,
717 .extra1 = &zero, 716 .extra1 = &zero,
718 .extra2 = &two, 717 .extra2 = &two,
719 }, 718 },
@@ -948,7 +947,7 @@ static struct ctl_table kern_table[] = {
948 .data = &sysctl_perf_event_sample_rate, 947 .data = &sysctl_perf_event_sample_rate,
949 .maxlen = sizeof(sysctl_perf_event_sample_rate), 948 .maxlen = sizeof(sysctl_perf_event_sample_rate),
950 .mode = 0644, 949 .mode = 0644,
951 .proc_handler = proc_dointvec, 950 .proc_handler = perf_proc_update_handler,
952 }, 951 },
953#endif 952#endif
954#ifdef CONFIG_KMEMCHECK 953#ifdef CONFIG_KMEMCHECK
@@ -978,14 +977,18 @@ static struct ctl_table vm_table[] = {
978 .data = &sysctl_overcommit_memory, 977 .data = &sysctl_overcommit_memory,
979 .maxlen = sizeof(sysctl_overcommit_memory), 978 .maxlen = sizeof(sysctl_overcommit_memory),
980 .mode = 0644, 979 .mode = 0644,
981 .proc_handler = proc_dointvec, 980 .proc_handler = proc_dointvec_minmax,
981 .extra1 = &zero,
982 .extra2 = &two,
982 }, 983 },
983 { 984 {
984 .procname = "panic_on_oom", 985 .procname = "panic_on_oom",
985 .data = &sysctl_panic_on_oom, 986 .data = &sysctl_panic_on_oom,
986 .maxlen = sizeof(sysctl_panic_on_oom), 987 .maxlen = sizeof(sysctl_panic_on_oom),
987 .mode = 0644, 988 .mode = 0644,
988 .proc_handler = proc_dointvec, 989 .proc_handler = proc_dointvec_minmax,
990 .extra1 = &zero,
991 .extra2 = &two,
989 }, 992 },
990 { 993 {
991 .procname = "oom_kill_allocating_task", 994 .procname = "oom_kill_allocating_task",
@@ -1013,7 +1016,8 @@ static struct ctl_table vm_table[] = {
1013 .data = &page_cluster, 1016 .data = &page_cluster,
1014 .maxlen = sizeof(int), 1017 .maxlen = sizeof(int),
1015 .mode = 0644, 1018 .mode = 0644,
1016 .proc_handler = proc_dointvec, 1019 .proc_handler = proc_dointvec_minmax,
1020 .extra1 = &zero,
1017 }, 1021 },
1018 { 1022 {
1019 .procname = "dirty_background_ratio", 1023 .procname = "dirty_background_ratio",
@@ -1061,7 +1065,8 @@ static struct ctl_table vm_table[] = {
1061 .data = &dirty_expire_interval, 1065 .data = &dirty_expire_interval,
1062 .maxlen = sizeof(dirty_expire_interval), 1066 .maxlen = sizeof(dirty_expire_interval),
1063 .mode = 0644, 1067 .mode = 0644,
1064 .proc_handler = proc_dointvec, 1068 .proc_handler = proc_dointvec_minmax,
1069 .extra1 = &zero,
1065 }, 1070 },
1066 { 1071 {
1067 .procname = "nr_pdflush_threads", 1072 .procname = "nr_pdflush_threads",
@@ -1137,6 +1142,8 @@ static struct ctl_table vm_table[] = {
1137 .maxlen = sizeof(int), 1142 .maxlen = sizeof(int),
1138 .mode = 0644, 1143 .mode = 0644,
1139 .proc_handler = drop_caches_sysctl_handler, 1144 .proc_handler = drop_caches_sysctl_handler,
1145 .extra1 = &one,
1146 .extra2 = &three,
1140 }, 1147 },
1141#ifdef CONFIG_COMPACTION 1148#ifdef CONFIG_COMPACTION
1142 { 1149 {
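A recurring change in this sysctl table is swapping proc_dointvec for proc_dointvec_minmax and supplying extra1/extra2 bounds, so out-of-range writes are rejected with -EINVAL instead of being stored silently; drop_caches, for example, is now limited to 1..3. A sketch of the same pattern for a hypothetical knob in a module's own ctl_table:

#include <linux/sysctl.h>

static int my_knob = 1;
static int my_knob_min = 1;
static int my_knob_max = 3;

static struct ctl_table my_table[] = {
	{
		.procname	= "my_knob",
		.data		= &my_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &my_knob_min,	/* writes below 1 -> -EINVAL */
		.extra2		= &my_knob_max,	/* writes above 3 -> -EINVAL */
	},
	{ }
};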
@@ -1567,11 +1574,16 @@ void sysctl_head_get(struct ctl_table_header *head)
1567 spin_unlock(&sysctl_lock); 1574 spin_unlock(&sysctl_lock);
1568} 1575}
1569 1576
1577static void free_head(struct rcu_head *rcu)
1578{
1579 kfree(container_of(rcu, struct ctl_table_header, rcu));
1580}
1581
1570void sysctl_head_put(struct ctl_table_header *head) 1582void sysctl_head_put(struct ctl_table_header *head)
1571{ 1583{
1572 spin_lock(&sysctl_lock); 1584 spin_lock(&sysctl_lock);
1573 if (!--head->count) 1585 if (!--head->count)
1574 kfree(head); 1586 call_rcu(&head->rcu, free_head);
1575 spin_unlock(&sysctl_lock); 1587 spin_unlock(&sysctl_lock);
1576} 1588}
1577 1589
@@ -1685,13 +1697,8 @@ static int test_perm(int mode, int op)
1685 1697
1686int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) 1698int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1687{ 1699{
1688 int error;
1689 int mode; 1700 int mode;
1690 1701
1691 error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
1692 if (error)
1693 return error;
1694
1695 if (root->permissions) 1702 if (root->permissions)
1696 mode = root->permissions(root, current->nsproxy, table); 1703 mode = root->permissions(root, current->nsproxy, table);
1697 else 1704 else
@@ -1948,10 +1955,10 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1948 start_unregistering(header); 1955 start_unregistering(header);
1949 if (!--header->parent->count) { 1956 if (!--header->parent->count) {
1950 WARN_ON(1); 1957 WARN_ON(1);
1951 kfree(header->parent); 1958 call_rcu(&header->parent->rcu, free_head);
1952 } 1959 }
1953 if (!--header->count) 1960 if (!--header->count)
1954 kfree(header); 1961 call_rcu(&header->rcu, free_head);
1955 spin_unlock(&sysctl_lock); 1962 spin_unlock(&sysctl_lock);
1956} 1963}
1957 1964
@@ -2392,6 +2399,17 @@ static int proc_taint(struct ctl_table *table, int write,
2392 return err; 2399 return err;
2393} 2400}
2394 2401
2402#ifdef CONFIG_PRINTK
2403static int proc_dmesg_restrict(struct ctl_table *table, int write,
2404 void __user *buffer, size_t *lenp, loff_t *ppos)
2405{
2406 if (write && !capable(CAP_SYS_ADMIN))
2407 return -EPERM;
2408
2409 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2410}
2411#endif
2412
2395struct do_proc_dointvec_minmax_conv_param { 2413struct do_proc_dointvec_minmax_conv_param {
2396 int *min; 2414 int *min;
2397 int *max; 2415 int *max;
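The sysctl.c hunks above stop kfree()ing ctl_table_header objects directly and instead defer the free past an RCU grace period (free_head() plus call_rcu()). A minimal sketch of that deferred-free idiom; the struct and function names here are made up, only the pattern mirrors the patch:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_obj {
        int value;
        struct rcu_head rcu;            /* embedded so call_rcu() can find us */
};

static void my_obj_free_rcu(struct rcu_head *rcu)
{
        /* recover the enclosing object from its embedded rcu_head */
        kfree(container_of(rcu, struct my_obj, rcu));
}

static void my_obj_release(struct my_obj *obj)
{
        /*
         * Readers still dereferencing obj under rcu_read_lock() stay
         * safe; the memory is only returned after a grace period.
         */
        call_rcu(&obj->rcu, my_obj_free_rcu);
}

The key point is that the rcu_head lives inside the object itself, so the callback can recover the enclosing structure with container_of() before freeing it, exactly as free_head() does for struct ctl_table_header.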
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index b875bedf7c9a..3b8e028b9601 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1321 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1321 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1322{ 1322{
1323 const struct bin_table *table = NULL; 1323 const struct bin_table *table = NULL;
1324 struct nameidata nd;
1325 struct vfsmount *mnt; 1324 struct vfsmount *mnt;
1326 struct file *file; 1325 struct file *file;
1327 ssize_t result; 1326 ssize_t result;
1328 char *pathname; 1327 char *pathname;
1329 int flags; 1328 int flags;
1330 int acc_mode;
1331 1329
1332 pathname = sysctl_getname(name, nlen, &table); 1330 pathname = sysctl_getname(name, nlen, &table);
1333 result = PTR_ERR(pathname); 1331 result = PTR_ERR(pathname);
@@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1337 /* How should the sysctl be accessed? */ 1335 /* How should the sysctl be accessed? */
1338 if (oldval && oldlen && newval && newlen) { 1336 if (oldval && oldlen && newval && newlen) {
1339 flags = O_RDWR; 1337 flags = O_RDWR;
1340 acc_mode = MAY_READ | MAY_WRITE;
1341 } else if (newval && newlen) { 1338 } else if (newval && newlen) {
1342 flags = O_WRONLY; 1339 flags = O_WRONLY;
1343 acc_mode = MAY_WRITE;
1344 } else if (oldval && oldlen) { 1340 } else if (oldval && oldlen) {
1345 flags = O_RDONLY; 1341 flags = O_RDONLY;
1346 acc_mode = MAY_READ;
1347 } else { 1342 } else {
1348 result = 0; 1343 result = 0;
1349 goto out_putname; 1344 goto out_putname;
1350 } 1345 }
1351 1346
1352 mnt = current->nsproxy->pid_ns->proc_mnt; 1347 mnt = current->nsproxy->pid_ns->proc_mnt;
1353 result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); 1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
1354 if (result)
1355 goto out_putname;
1356
1357 result = may_open(&nd.path, acc_mode, flags);
1358 if (result)
1359 goto out_putpath;
1360
1361 file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
1362 result = PTR_ERR(file); 1349 result = PTR_ERR(file);
1363 if (IS_ERR(file)) 1350 if (IS_ERR(file))
1364 goto out_putname; 1351 goto out_putname;
@@ -1370,10 +1357,6 @@ out_putname:
1370 putname(pathname); 1357 putname(pathname);
1371out: 1358out:
1372 return result; 1359 return result;
1373
1374out_putpath:
1375 path_put(&nd.path);
1376 goto out_putname;
1377} 1360}
1378 1361
1379 1362
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 10b90d8a03c4..4e4932a7b360 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
111 const char *fail = NULL; 111 const char *fail = NULL;
112 112
113 if (table->parent) { 113 if (table->parent) {
114 if (table->procname && !table->parent->procname) 114 if (!table->parent->procname)
115 set_fail(&fail, table, "Parent without procname"); 115 set_fail(&fail, table, "Parent without procname");
116 } 116 }
117 if (!table->procname)
118 set_fail(&fail, table, "No procname");
119 if (table->child) { 117 if (table->child) {
120 if (table->data) 118 if (table->data)
121 set_fail(&fail, table, "Directory with data?"); 119 set_fail(&fail, table, "Directory with data?");
@@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
144 set_fail(&fail, table, "No maxlen"); 142 set_fail(&fail, table, "No maxlen");
145 } 143 }
146#ifdef CONFIG_PROC_SYSCTL 144#ifdef CONFIG_PROC_SYSCTL
147 if (table->procname && !table->proc_handler) 145 if (!table->proc_handler)
148 set_fail(&fail, table, "No proc_handler"); 146 set_fail(&fail, table, "No proc_handler");
149#endif 147#endif
150#if 0
151 if (!table->procname && table->proc_handler)
152 set_fail(&fail, table, "proc_handler without procname");
153#endif
154 sysctl_check_leaf(namespaces, table, &fail); 148 sysctl_check_leaf(namespaces, table, &fail);
155 } 149 }
156 if (table->mode > 0777) 150 if (table->mode > 0777)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 3971c6b9d58d..9ffea360a778 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -685,7 +685,7 @@ static int __init taskstats_init(void)
685 goto err_cgroup_ops; 685 goto err_cgroup_ops;
686 686
687 family_registered = 1; 687 family_registered = 1;
688 printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 688 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
689 return 0; 689 return 0;
690err_cgroup_ops: 690err_cgroup_ops:
691 genl_unregister_ops(&family, &taskstats_ops); 691 genl_unregister_ops(&family, &taskstats_ops);
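The taskstats hunk swaps a bare printk() for pr_info(), which supplies the KERN_INFO log level implicitly. A small, hedged sketch of the idiom; the pr_fmt() prefix is optional extra context and not something this patch adds:

/* define before any include so the default pr_fmt() in printk.h is not used */
#define pr_fmt(fmt) "taskstats_example: " fmt

#include <linux/printk.h>

static void announce(int version)
{
        /* equivalent to printk(KERN_INFO "taskstats_example: registered ... %d\n", version) */
        pr_info("registered taskstats version %d\n", version);
}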
diff --git a/kernel/time.c b/kernel/time.c
index 32174359576f..8e8dc6d705c9 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -150,7 +150,7 @@ static inline void warp_clock(void)
150 * various programs will get confused when the clock gets warped. 150 * various programs will get confused when the clock gets warped.
151 */ 151 */
152 152
153int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) 153int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
154{ 154{
155 static int firsttime = 1; 155 static int firsttime = 1;
156 int error = 0; 156 int error = 0;
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x)
645} 645}
646 646
647/** 647/**
648 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies 648 * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
649 * 649 *
650 * @n: nsecs in u64 650 * @n: nsecs in u64
651 * 651 *
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x)
657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) 657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years 658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
659 */ 659 */
660unsigned long nsecs_to_jiffies(u64 n) 660u64 nsecs_to_jiffies64(u64 n)
661{ 661{
662#if (NSEC_PER_SEC % HZ) == 0 662#if (NSEC_PER_SEC % HZ) == 0
663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ 663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
@@ -674,22 +674,23 @@ unsigned long nsecs_to_jiffies(u64 n)
674#endif 674#endif
675} 675}
676 676
677#if (BITS_PER_LONG < 64) 677/**
678u64 get_jiffies_64(void) 678 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
679 *
680 * @n: nsecs in u64
681 *
682 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
683 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
684 * for scheduler, not for use in device drivers to calculate timeout value.
685 *
686 * note:
687 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
688 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
689 */
690unsigned long nsecs_to_jiffies(u64 n)
679{ 691{
680 unsigned long seq; 692 return (unsigned long)nsecs_to_jiffies64(n);
681 u64 ret;
682
683 do {
684 seq = read_seqbegin(&xtime_lock);
685 ret = jiffies_64;
686 } while (read_seqretry(&xtime_lock, seq));
687 return ret;
688} 693}
689EXPORT_SYMBOL(get_jiffies_64);
690#endif
691
692EXPORT_SYMBOL(jiffies);
693 694
694/* 695/*
695 * Add two timespec values and do a safety check for overflow. 696 * Add two timespec values and do a safety check for overflow.
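For the new nsecs_to_jiffies64()/nsecs_to_jiffies() pair above, the common-case arithmetic is a single 64-bit division. A hedged sketch assuming NSEC_PER_SEC is an exact multiple of HZ (the real function has further branches for other HZ values):

#include <linux/jiffies.h>
#include <linux/math64.h>
#include <linux/time.h>

/* common case only: NSEC_PER_SEC % HZ == 0 (HZ = 100, 250, 1000, ...) */
static u64 ns_to_jiffies64_sketch(u64 n)
{
        return div_u64(n, NSEC_PER_SEC / HZ);
}

At HZ=250, for example, one jiffy is 4 ms, so 10,000,000 ns maps to 2 jiffies; the remainder is truncated, and as the comment above notes the helper is meant for the scheduler, not for computing driver timeout values.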
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ee266620b06c..b0425991e9ac 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,5 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2obj-y += timeconv.o posix-clock.o
2 3
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d7395fdfb9f3..0d74b9ba90c8 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,7 +18,6 @@
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 5404a8456909..a470154e0408 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -22,8 +22,11 @@
22************************************************************************/ 22************************************************************************/
23#include <linux/clocksource.h> 23#include <linux/clocksource.h>
24#include <linux/jiffies.h> 24#include <linux/jiffies.h>
25#include <linux/module.h>
25#include <linux/init.h> 26#include <linux/init.h>
26 27
28#include "tick-internal.h"
29
27/* The Jiffies based clocksource is the lowest common 30/* The Jiffies based clocksource is the lowest common
28 * denominator clock source which should function on 31 * denominator clock source which should function on
29 * all systems. It has the same coarse resolution as 32 * all systems. It has the same coarse resolution as
@@ -31,7 +34,7 @@
31 * inaccuracies caused by missed or lost timer 34 * inaccuracies caused by missed or lost timer
32 * interrupts and the inability for the timer 35 * interrupts and the inability for the timer
33 * interrupt hardware to accuratly tick at the 36 * interrupt hardware to accuratly tick at the
34 * requested HZ value. It is also not reccomended 37 * requested HZ value. It is also not recommended
35 * for "tick-less" systems. 38 * for "tick-less" systems.
36 */ 39 */
37#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) 40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
@@ -64,6 +67,23 @@ struct clocksource clocksource_jiffies = {
64 .shift = JIFFIES_SHIFT, 67 .shift = JIFFIES_SHIFT,
65}; 68};
66 69
70#if (BITS_PER_LONG < 64)
71u64 get_jiffies_64(void)
72{
73 unsigned long seq;
74 u64 ret;
75
76 do {
77 seq = read_seqbegin(&xtime_lock);
78 ret = jiffies_64;
79 } while (read_seqretry(&xtime_lock, seq));
80 return ret;
81}
82EXPORT_SYMBOL(get_jiffies_64);
83#endif
84
85EXPORT_SYMBOL(jiffies);
86
67static int __init init_jiffies_clocksource(void) 87static int __init init_jiffies_clocksource(void)
68{ 88{
69 return clocksource_register(&clocksource_jiffies); 89 return clocksource_register(&clocksource_jiffies);
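get_jiffies_64(), moved into jiffies.c here, is a standard seqlock reader for a 64-bit value that 32-bit CPUs cannot read atomically. A self-contained sketch of both sides of that pairing, with illustrative names:

#include <linux/seqlock.h>
#include <linux/types.h>

static DEFINE_SEQLOCK(my_lock);
static u64 my_counter;                  /* too wide for an atomic read on 32-bit */

static void writer_add(unsigned long ticks)
{
        write_seqlock(&my_lock);
        my_counter += ticks;
        write_sequnlock(&my_lock);
}

static u64 reader_get(void)
{
        unsigned long seq;
        u64 val;

        do {                            /* retry if a writer ran while we sampled */
                seq = read_seqbegin(&my_lock);
                val = my_counter;
        } while (read_seqretry(&my_lock, seq));

        return val;
}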
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5c00242fa921..f6117a4c7cb8 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -16,6 +16,8 @@
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h> 17#include <linux/module.h>
18 18
19#include "tick-internal.h"
20
19/* 21/*
20 * NTP timekeeping variables: 22 * NTP timekeeping variables:
21 */ 23 */
@@ -646,6 +648,19 @@ int do_adjtimex(struct timex *txc)
646 hrtimer_cancel(&leap_timer); 648 hrtimer_cancel(&leap_timer);
647 } 649 }
648 650
651 if (txc->modes & ADJ_SETOFFSET) {
652 struct timespec delta;
653 delta.tv_sec = txc->time.tv_sec;
654 delta.tv_nsec = txc->time.tv_usec;
655 if (!capable(CAP_SYS_TIME))
656 return -EPERM;
657 if (!(txc->modes & ADJ_NANO))
658 delta.tv_nsec *= 1000;
659 result = timekeeping_inject_offset(&delta);
660 if (result)
661 return result;
662 }
663
649 getnstimeofday(&ts); 664 getnstimeofday(&ts);
650 665
651 write_seqlock_irq(&xtime_lock); 666 write_seqlock_irq(&xtime_lock);
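The ADJ_SETOFFSET branch added to do_adjtimex() lets a privileged caller step the clock by a fixed offset via timekeeping_inject_offset(). A hedged userspace sketch of exercising it (requires CAP_SYS_TIME and a kernel with this patch; the fallback #define is only for older headers that lack the constant):

#include <stdio.h>
#include <sys/timex.h>

#ifndef ADJ_SETOFFSET
#define ADJ_SETOFFSET 0x0100            /* value used by this series */
#endif

int main(void)
{
        struct timex tx = { 0 };

        /*
         * Step the clock forward by 0.5 s. With ADJ_NANO the offset in
         * time.tv_usec is interpreted as nanoseconds, matching the
         * "delta.tv_nsec *= 1000" fallback in the hunk above.
         */
        tx.modes = ADJ_SETOFFSET | ADJ_NANO;
        tx.time.tv_sec = 0;
        tx.time.tv_usec = 500000000;

        if (adjtimex(&tx) < 0)
                perror("adjtimex");     /* EPERM without CAP_SYS_TIME */
        return 0;
}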
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
new file mode 100644
index 000000000000..c340ca658f37
--- /dev/null
+++ b/kernel/time/posix-clock.c
@@ -0,0 +1,445 @@
1/*
2 * posix-clock.c - support for dynamic clock devices
3 *
4 * Copyright (C) 2010 OMICRON electronics GmbH
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20#include <linux/device.h>
21#include <linux/file.h>
22#include <linux/posix-clock.h>
23#include <linux/slab.h>
24#include <linux/syscalls.h>
25#include <linux/uaccess.h>
26
27static void delete_clock(struct kref *kref);
28
29/*
30 * Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
31 */
32static struct posix_clock *get_posix_clock(struct file *fp)
33{
34 struct posix_clock *clk = fp->private_data;
35
36 down_read(&clk->rwsem);
37
38 if (!clk->zombie)
39 return clk;
40
41 up_read(&clk->rwsem);
42
43 return NULL;
44}
45
46static void put_posix_clock(struct posix_clock *clk)
47{
48 up_read(&clk->rwsem);
49}
50
51static ssize_t posix_clock_read(struct file *fp, char __user *buf,
52 size_t count, loff_t *ppos)
53{
54 struct posix_clock *clk = get_posix_clock(fp);
55 int err = -EINVAL;
56
57 if (!clk)
58 return -ENODEV;
59
60 if (clk->ops.read)
61 err = clk->ops.read(clk, fp->f_flags, buf, count);
62
63 put_posix_clock(clk);
64
65 return err;
66}
67
68static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
69{
70 struct posix_clock *clk = get_posix_clock(fp);
71 int result = 0;
72
73 if (!clk)
74 return -ENODEV;
75
76 if (clk->ops.poll)
77 result = clk->ops.poll(clk, fp, wait);
78
79 put_posix_clock(clk);
80
81 return result;
82}
83
84static int posix_clock_fasync(int fd, struct file *fp, int on)
85{
86 struct posix_clock *clk = get_posix_clock(fp);
87 int err = 0;
88
89 if (!clk)
90 return -ENODEV;
91
92 if (clk->ops.fasync)
93 err = clk->ops.fasync(clk, fd, fp, on);
94
95 put_posix_clock(clk);
96
97 return err;
98}
99
100static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma)
101{
102 struct posix_clock *clk = get_posix_clock(fp);
103 int err = -ENODEV;
104
105 if (!clk)
106 return -ENODEV;
107
108 if (clk->ops.mmap)
109 err = clk->ops.mmap(clk, vma);
110
111 put_posix_clock(clk);
112
113 return err;
114}
115
116static long posix_clock_ioctl(struct file *fp,
117 unsigned int cmd, unsigned long arg)
118{
119 struct posix_clock *clk = get_posix_clock(fp);
120 int err = -ENOTTY;
121
122 if (!clk)
123 return -ENODEV;
124
125 if (clk->ops.ioctl)
126 err = clk->ops.ioctl(clk, cmd, arg);
127
128 put_posix_clock(clk);
129
130 return err;
131}
132
133#ifdef CONFIG_COMPAT
134static long posix_clock_compat_ioctl(struct file *fp,
135 unsigned int cmd, unsigned long arg)
136{
137 struct posix_clock *clk = get_posix_clock(fp);
138 int err = -ENOTTY;
139
140 if (!clk)
141 return -ENODEV;
142
143 if (clk->ops.ioctl)
144 err = clk->ops.ioctl(clk, cmd, arg);
145
146 put_posix_clock(clk);
147
148 return err;
149}
150#endif
151
152static int posix_clock_open(struct inode *inode, struct file *fp)
153{
154 int err;
155 struct posix_clock *clk =
156 container_of(inode->i_cdev, struct posix_clock, cdev);
157
158 down_read(&clk->rwsem);
159
160 if (clk->zombie) {
161 err = -ENODEV;
162 goto out;
163 }
164 if (clk->ops.open)
165 err = clk->ops.open(clk, fp->f_mode);
166 else
167 err = 0;
168
169 if (!err) {
170 kref_get(&clk->kref);
171 fp->private_data = clk;
172 }
173out:
174 up_read(&clk->rwsem);
175 return err;
176}
177
178static int posix_clock_release(struct inode *inode, struct file *fp)
179{
180 struct posix_clock *clk = fp->private_data;
181 int err = 0;
182
183 if (clk->ops.release)
184 err = clk->ops.release(clk);
185
186 kref_put(&clk->kref, delete_clock);
187
188 fp->private_data = NULL;
189
190 return err;
191}
192
193static const struct file_operations posix_clock_file_operations = {
194 .owner = THIS_MODULE,
195 .llseek = no_llseek,
196 .read = posix_clock_read,
197 .poll = posix_clock_poll,
198 .unlocked_ioctl = posix_clock_ioctl,
199 .open = posix_clock_open,
200 .release = posix_clock_release,
201 .fasync = posix_clock_fasync,
202 .mmap = posix_clock_mmap,
203#ifdef CONFIG_COMPAT
204 .compat_ioctl = posix_clock_compat_ioctl,
205#endif
206};
207
208int posix_clock_register(struct posix_clock *clk, dev_t devid)
209{
210 int err;
211
212 kref_init(&clk->kref);
213 init_rwsem(&clk->rwsem);
214
215 cdev_init(&clk->cdev, &posix_clock_file_operations);
216 clk->cdev.owner = clk->ops.owner;
217 err = cdev_add(&clk->cdev, devid, 1);
218
219 return err;
220}
221EXPORT_SYMBOL_GPL(posix_clock_register);
222
223static void delete_clock(struct kref *kref)
224{
225 struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
226
227 if (clk->release)
228 clk->release(clk);
229}
230
231void posix_clock_unregister(struct posix_clock *clk)
232{
233 cdev_del(&clk->cdev);
234
235 down_write(&clk->rwsem);
236 clk->zombie = true;
237 up_write(&clk->rwsem);
238
239 kref_put(&clk->kref, delete_clock);
240}
241EXPORT_SYMBOL_GPL(posix_clock_unregister);
242
243struct posix_clock_desc {
244 struct file *fp;
245 struct posix_clock *clk;
246};
247
248static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
249{
250 struct file *fp = fget(CLOCKID_TO_FD(id));
251 int err = -EINVAL;
252
253 if (!fp)
254 return err;
255
256 if (fp->f_op->open != posix_clock_open || !fp->private_data)
257 goto out;
258
259 cd->fp = fp;
260 cd->clk = get_posix_clock(fp);
261
262 err = cd->clk ? 0 : -ENODEV;
263out:
264 if (err)
265 fput(fp);
266 return err;
267}
268
269static void put_clock_desc(struct posix_clock_desc *cd)
270{
271 put_posix_clock(cd->clk);
272 fput(cd->fp);
273}
274
275static int pc_clock_adjtime(clockid_t id, struct timex *tx)
276{
277 struct posix_clock_desc cd;
278 int err;
279
280 err = get_clock_desc(id, &cd);
281 if (err)
282 return err;
283
284 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
285 err = -EACCES;
286 goto out;
287 }
288
289 if (cd.clk->ops.clock_adjtime)
290 err = cd.clk->ops.clock_adjtime(cd.clk, tx);
291 else
292 err = -EOPNOTSUPP;
293out:
294 put_clock_desc(&cd);
295
296 return err;
297}
298
299static int pc_clock_gettime(clockid_t id, struct timespec *ts)
300{
301 struct posix_clock_desc cd;
302 int err;
303
304 err = get_clock_desc(id, &cd);
305 if (err)
306 return err;
307
308 if (cd.clk->ops.clock_gettime)
309 err = cd.clk->ops.clock_gettime(cd.clk, ts);
310 else
311 err = -EOPNOTSUPP;
312
313 put_clock_desc(&cd);
314
315 return err;
316}
317
318static int pc_clock_getres(clockid_t id, struct timespec *ts)
319{
320 struct posix_clock_desc cd;
321 int err;
322
323 err = get_clock_desc(id, &cd);
324 if (err)
325 return err;
326
327 if (cd.clk->ops.clock_getres)
328 err = cd.clk->ops.clock_getres(cd.clk, ts);
329 else
330 err = -EOPNOTSUPP;
331
332 put_clock_desc(&cd);
333
334 return err;
335}
336
337static int pc_clock_settime(clockid_t id, const struct timespec *ts)
338{
339 struct posix_clock_desc cd;
340 int err;
341
342 err = get_clock_desc(id, &cd);
343 if (err)
344 return err;
345
346 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
347 err = -EACCES;
348 goto out;
349 }
350
351 if (cd.clk->ops.clock_settime)
352 err = cd.clk->ops.clock_settime(cd.clk, ts);
353 else
354 err = -EOPNOTSUPP;
355out:
356 put_clock_desc(&cd);
357
358 return err;
359}
360
361static int pc_timer_create(struct k_itimer *kit)
362{
363 clockid_t id = kit->it_clock;
364 struct posix_clock_desc cd;
365 int err;
366
367 err = get_clock_desc(id, &cd);
368 if (err)
369 return err;
370
371 if (cd.clk->ops.timer_create)
372 err = cd.clk->ops.timer_create(cd.clk, kit);
373 else
374 err = -EOPNOTSUPP;
375
376 put_clock_desc(&cd);
377
378 return err;
379}
380
381static int pc_timer_delete(struct k_itimer *kit)
382{
383 clockid_t id = kit->it_clock;
384 struct posix_clock_desc cd;
385 int err;
386
387 err = get_clock_desc(id, &cd);
388 if (err)
389 return err;
390
391 if (cd.clk->ops.timer_delete)
392 err = cd.clk->ops.timer_delete(cd.clk, kit);
393 else
394 err = -EOPNOTSUPP;
395
396 put_clock_desc(&cd);
397
398 return err;
399}
400
401static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
402{
403 clockid_t id = kit->it_clock;
404 struct posix_clock_desc cd;
405
406 if (get_clock_desc(id, &cd))
407 return;
408
409 if (cd.clk->ops.timer_gettime)
410 cd.clk->ops.timer_gettime(cd.clk, kit, ts);
411
412 put_clock_desc(&cd);
413}
414
415static int pc_timer_settime(struct k_itimer *kit, int flags,
416 struct itimerspec *ts, struct itimerspec *old)
417{
418 clockid_t id = kit->it_clock;
419 struct posix_clock_desc cd;
420 int err;
421
422 err = get_clock_desc(id, &cd);
423 if (err)
424 return err;
425
426 if (cd.clk->ops.timer_settime)
427 err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
428 else
429 err = -EOPNOTSUPP;
430
431 put_clock_desc(&cd);
432
433 return err;
434}
435
436struct k_clock clock_posix_dynamic = {
437 .clock_getres = pc_clock_getres,
438 .clock_set = pc_clock_settime,
439 .clock_get = pc_clock_gettime,
440 .clock_adj = pc_clock_adjtime,
441 .timer_create = pc_timer_create,
442 .timer_set = pc_timer_settime,
443 .timer_del = pc_timer_delete,
444 .timer_get = pc_timer_gettime,
445};
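posix-clock.c above is the kernel half of the dynamic clock-device API: a driver embeds a struct posix_clock, fills in its ops and registers a character device, and the pc_clock_*()/pc_timer_*() shims route the POSIX clock syscalls to it. A hedged driver-side sketch; the names, device-number handling and the current_kernel_time() stand-in are illustrative, not taken from a real driver:

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/posix-clock.h>
#include <linux/time.h>

static struct posix_clock my_clock;
static dev_t my_devt;

static int my_clock_gettime(struct posix_clock *pc, struct timespec *ts)
{
        /* a real driver would read its hardware clock here */
        *ts = current_kernel_time();
        return 0;
}

static struct posix_clock_operations my_clock_ops = {
        .owner          = THIS_MODULE,
        .clock_gettime  = my_clock_gettime,
};

static int __init my_clock_init(void)
{
        int err = alloc_chrdev_region(&my_devt, 0, 1, "myclock");

        if (err)
                return err;

        my_clock.ops = my_clock_ops;
        err = posix_clock_register(&my_clock, my_devt);
        if (err)
                unregister_chrdev_region(my_devt, 1);
        return err;
}

static void __exit my_clock_exit(void)
{
        posix_clock_unregister(&my_clock);
        unregister_chrdev_region(my_devt, 1);
}

module_init(my_clock_init);
module_exit(my_clock_exit);
MODULE_LICENSE("GPL");

Userspace then opens the character device and derives a clockid from the file descriptor (the inverse of the CLOCKID_TO_FD() used in get_clock_desc() above) to pass to clock_gettime() and friends.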
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 48b2761b5668..da800ffa810c 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
@@ -600,4 +599,14 @@ int tick_broadcast_oneshot_active(void)
600 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; 599 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
601} 600}
602 601
602/*
603 * Check whether the broadcast device supports oneshot.
604 */
605bool tick_broadcast_oneshot_available(void)
606{
607 struct clock_event_device *bc = tick_broadcast_device.evtdev;
608
609 return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
610}
611
603#endif 612#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 051bc80a0c43..119528de8235 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include <asm/irq_regs.h> 22#include <asm/irq_regs.h>
24 23
@@ -51,7 +50,11 @@ int tick_is_oneshot_available(void)
51{ 50{
52 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 51 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
53 52
54 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); 53 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT))
54 return 0;
55 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
56 return 1;
57 return tick_broadcast_oneshot_available();
55} 58}
56 59
57/* 60/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 290eefbc1f60..1009b06d6f89 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
1/* 1/*
2 * tick internal variable and functions used by low/high res code 2 * tick internal variable and functions used by low/high res code
3 */ 3 */
4#include <linux/hrtimer.h>
5#include <linux/tick.h>
6
7#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
4 8
5#define TICK_DO_TIMER_NONE -1 9#define TICK_DO_TIMER_NONE -1
6#define TICK_DO_TIMER_BOOT -2 10#define TICK_DO_TIMER_BOOT -2
@@ -36,6 +40,7 @@ extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
36extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 40extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
37extern int tick_broadcast_oneshot_active(void); 41extern int tick_broadcast_oneshot_active(void);
38extern void tick_check_oneshot_broadcast(int cpu); 42extern void tick_check_oneshot_broadcast(int cpu);
43bool tick_broadcast_oneshot_available(void);
39# else /* BROADCAST */ 44# else /* BROADCAST */
40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 45static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
41{ 46{
@@ -46,6 +51,7 @@ static inline void tick_broadcast_switch_to_oneshot(void) { }
46static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 51static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
47static inline int tick_broadcast_oneshot_active(void) { return 0; } 52static inline int tick_broadcast_oneshot_active(void) { return 0; }
48static inline void tick_check_oneshot_broadcast(int cpu) { } 53static inline void tick_check_oneshot_broadcast(int cpu) { }
54static inline bool tick_broadcast_oneshot_available(void) { return true; }
49# endif /* !BROADCAST */ 55# endif /* !BROADCAST */
50 56
51#else /* !ONESHOT */ 57#else /* !ONESHOT */
@@ -76,6 +82,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
76 return 0; 82 return 0;
77} 83}
78static inline int tick_broadcast_oneshot_active(void) { return 0; } 84static inline int tick_broadcast_oneshot_active(void) { return 0; }
85static inline bool tick_broadcast_oneshot_available(void) { return false; }
79#endif /* !TICK_ONESHOT */ 86#endif /* !TICK_ONESHOT */
80 87
81/* 88/*
@@ -132,3 +139,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
132{ 139{
133 return !(dev->features & CLOCK_EVT_FEAT_DUMMY); 140 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
134} 141}
142
143#endif
144
145extern void do_timer(unsigned long ticks);
146extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 5cbc101f908b..2d04411a5f05 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c55ea2433471..d5097c44b407 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -19,7 +19,6 @@
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/tick.h>
23#include <linux/module.h> 22#include <linux/module.h>
24 23
25#include <asm/irq_regs.h> 24#include <asm/irq_regs.h>
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d27c7562902c..8ad5d576755e 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -14,7 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/sysdev.h> 17#include <linux/syscore_ops.h>
18#include <linux/clocksource.h> 18#include <linux/clocksource.h>
19#include <linux/jiffies.h> 19#include <linux/jiffies.h>
20#include <linux/time.h> 20#include <linux/time.h>
@@ -353,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday);
353 * 353 *
354 * Sets the time of day to the new time and update NTP and notify hrtimers 354 * Sets the time of day to the new time and update NTP and notify hrtimers
355 */ 355 */
356int do_settimeofday(struct timespec *tv) 356int do_settimeofday(const struct timespec *tv)
357{ 357{
358 struct timespec ts_delta; 358 struct timespec ts_delta;
359 unsigned long flags; 359 unsigned long flags;
@@ -387,6 +387,42 @@ int do_settimeofday(struct timespec *tv)
387 387
388EXPORT_SYMBOL(do_settimeofday); 388EXPORT_SYMBOL(do_settimeofday);
389 389
390
391/**
392 * timekeeping_inject_offset - Adds or subtracts from the current time.
393 * @tv: pointer to the timespec variable containing the offset
394 *
395 * Adds or subtracts an offset value from the current time.
396 */
397int timekeeping_inject_offset(struct timespec *ts)
398{
399 unsigned long flags;
400
401 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
402 return -EINVAL;
403
404 write_seqlock_irqsave(&xtime_lock, flags);
405
406 timekeeping_forward_now();
407
408 xtime = timespec_add(xtime, *ts);
409 wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
410
411 timekeeper.ntp_error = 0;
412 ntp_clear();
413
414 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
415 timekeeper.mult);
416
417 write_sequnlock_irqrestore(&xtime_lock, flags);
418
419 /* signal hrtimers about time change */
420 clock_was_set();
421
422 return 0;
423}
424EXPORT_SYMBOL(timekeeping_inject_offset);
425
390/** 426/**
391 * change_clocksource - Swaps clocksources if a new one is available 427 * change_clocksource - Swaps clocksources if a new one is available
392 * 428 *
@@ -561,13 +597,12 @@ static struct timespec timekeeping_suspend_time;
561 597
562/** 598/**
563 * timekeeping_resume - Resumes the generic timekeeping subsystem. 599 * timekeeping_resume - Resumes the generic timekeeping subsystem.
564 * @dev: unused
565 * 600 *
566 * This is for the generic clocksource timekeeping. 601 * This is for the generic clocksource timekeeping.
567 * xtime/wall_to_monotonic/jiffies/etc are 602 * xtime/wall_to_monotonic/jiffies/etc are
568 * still managed by arch specific suspend/resume code. 603 * still managed by arch specific suspend/resume code.
569 */ 604 */
570static int timekeeping_resume(struct sys_device *dev) 605static void timekeeping_resume(void)
571{ 606{
572 unsigned long flags; 607 unsigned long flags;
573 struct timespec ts; 608 struct timespec ts;
@@ -596,11 +631,9 @@ static int timekeeping_resume(struct sys_device *dev)
596 631
597 /* Resume hrtimers */ 632 /* Resume hrtimers */
598 hres_timers_resume(); 633 hres_timers_resume();
599
600 return 0;
601} 634}
602 635
603static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) 636static int timekeeping_suspend(void)
604{ 637{
605 unsigned long flags; 638 unsigned long flags;
606 639
@@ -618,26 +651,18 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
618} 651}
619 652
620/* sysfs resume/suspend bits for timekeeping */ 653/* sysfs resume/suspend bits for timekeeping */
621static struct sysdev_class timekeeping_sysclass = { 654static struct syscore_ops timekeeping_syscore_ops = {
622 .name = "timekeeping",
623 .resume = timekeeping_resume, 655 .resume = timekeeping_resume,
624 .suspend = timekeeping_suspend, 656 .suspend = timekeeping_suspend,
625}; 657};
626 658
627static struct sys_device device_timer = { 659static int __init timekeeping_init_ops(void)
628 .id = 0,
629 .cls = &timekeeping_sysclass,
630};
631
632static int __init timekeeping_init_device(void)
633{ 660{
634 int error = sysdev_class_register(&timekeeping_sysclass); 661 register_syscore_ops(&timekeeping_syscore_ops);
635 if (!error) 662 return 0;
636 error = sysdev_register(&device_timer);
637 return error;
638} 663}
639 664
640device_initcall(timekeeping_init_device); 665device_initcall(timekeeping_init_ops);
641 666
642/* 667/*
643 * If the error is already larger, we look ahead even further 668 * If the error is already larger, we look ahead even further
@@ -779,7 +804,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
779 * 804 *
780 * Called from the timer interrupt, must hold a write on xtime_lock. 805 * Called from the timer interrupt, must hold a write on xtime_lock.
781 */ 806 */
782void update_wall_time(void) 807static void update_wall_time(void)
783{ 808{
784 struct clocksource *clock; 809 struct clocksource *clock;
785 cycle_t offset; 810 cycle_t offset;
@@ -871,7 +896,7 @@ void update_wall_time(void)
871 * getboottime - Return the real time of system boot. 896 * getboottime - Return the real time of system boot.
872 * @ts: pointer to the timespec to be set 897 * @ts: pointer to the timespec to be set
873 * 898 *
874 * Returns the time of day in a timespec. 899 * Returns the wall-time of boot in a timespec.
875 * 900 *
876 * This is based on the wall_to_monotonic offset and the total suspend 901 * This is based on the wall_to_monotonic offset and the total suspend
877 * time. Calls to settimeofday will affect the value returned (which 902 * time. Calls to settimeofday will affect the value returned (which
@@ -889,6 +914,55 @@ void getboottime(struct timespec *ts)
889} 914}
890EXPORT_SYMBOL_GPL(getboottime); 915EXPORT_SYMBOL_GPL(getboottime);
891 916
917
918/**
919 * get_monotonic_boottime - Returns monotonic time since boot
920 * @ts: pointer to the timespec to be set
921 *
922 * Returns the monotonic time since boot in a timespec.
923 *
924 * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also
925 * includes the time spent in suspend.
926 */
927void get_monotonic_boottime(struct timespec *ts)
928{
929 struct timespec tomono, sleep;
930 unsigned int seq;
931 s64 nsecs;
932
933 WARN_ON(timekeeping_suspended);
934
935 do {
936 seq = read_seqbegin(&xtime_lock);
937 *ts = xtime;
938 tomono = wall_to_monotonic;
939 sleep = total_sleep_time;
940 nsecs = timekeeping_get_ns();
941
942 } while (read_seqretry(&xtime_lock, seq));
943
944 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
945 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
946}
947EXPORT_SYMBOL_GPL(get_monotonic_boottime);
948
949/**
950 * ktime_get_boottime - Returns monotonic time since boot in a ktime
951 *
952 * Returns the monotonic time since boot in a ktime
953 *
954 * This is similar to CLOCK_MONTONIC/ktime_get, but also
955 * includes the time spent in suspend.
956 */
957ktime_t ktime_get_boottime(void)
958{
959 struct timespec ts;
960
961 get_monotonic_boottime(&ts);
962 return timespec_to_ktime(ts);
963}
964EXPORT_SYMBOL_GPL(ktime_get_boottime);
965
892/** 966/**
893 * monotonic_to_bootbased - Convert the monotonic time to boot based. 967 * monotonic_to_bootbased - Convert the monotonic time to boot based.
894 * @ts: pointer to the timespec to be converted 968 * @ts: pointer to the timespec to be converted
@@ -910,11 +984,6 @@ struct timespec __current_kernel_time(void)
910 return xtime; 984 return xtime;
911} 985}
912 986
913struct timespec __get_wall_to_monotonic(void)
914{
915 return wall_to_monotonic;
916}
917
918struct timespec current_kernel_time(void) 987struct timespec current_kernel_time(void)
919{ 988{
920 struct timespec now; 989 struct timespec now;
@@ -946,3 +1015,48 @@ struct timespec get_monotonic_coarse(void)
946 now.tv_nsec + mono.tv_nsec); 1015 now.tv_nsec + mono.tv_nsec);
947 return now; 1016 return now;
948} 1017}
1018
1019/*
1020 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1021 * without sampling the sequence number in xtime_lock.
1022 * jiffies is defined in the linker script...
1023 */
1024void do_timer(unsigned long ticks)
1025{
1026 jiffies_64 += ticks;
1027 update_wall_time();
1028 calc_global_load(ticks);
1029}
1030
1031/**
1032 * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
1033 * and sleep offsets.
1034 * @xtim: pointer to timespec to be set with xtime
1035 * @wtom: pointer to timespec to be set with wall_to_monotonic
1036 * @sleep: pointer to timespec to be set with time in suspend
1037 */
1038void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1039 struct timespec *wtom, struct timespec *sleep)
1040{
1041 unsigned long seq;
1042
1043 do {
1044 seq = read_seqbegin(&xtime_lock);
1045 *xtim = xtime;
1046 *wtom = wall_to_monotonic;
1047 *sleep = total_sleep_time;
1048 } while (read_seqretry(&xtime_lock, seq));
1049}
1050
1051/**
1052 * xtime_update() - advances the timekeeping infrastructure
1053 * @ticks: number of ticks, that have elapsed since the last call.
1054 *
1055 * Must be called with interrupts disabled.
1056 */
1057void xtime_update(unsigned long ticks)
1058{
1059 write_seqlock(&xtime_lock);
1060 do_timer(ticks);
1061 write_sequnlock(&xtime_lock);
1062}
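timekeeping.c drops its sysdev class/device pair in favour of syscore_ops, whose callbacks run late in suspend and early in resume with a single CPU online and interrupts disabled. A minimal sketch of that registration pattern, with illustrative names:

#include <linux/init.h>
#include <linux/syscore_ops.h>

static int my_suspend(void)
{
        /* save state; a non-zero return aborts the suspend */
        return 0;
}

static void my_resume(void)
{
        /* restore state */
}

static struct syscore_ops my_syscore_ops = {
        .suspend = my_suspend,
        .resume  = my_resume,
};

static int __init my_syscore_init(void)
{
        register_syscore_ops(&my_syscore_ops);
        return 0;
}
device_initcall(my_syscore_init);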
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 2f3b585b8d7d..a5d0a3a85dd8 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
236 unsigned int timer_flag) 236 unsigned int timer_flag)
237{ 237{
238 /* 238 /*
239 * It doesnt matter which lock we take: 239 * It doesn't matter which lock we take:
240 */ 240 */
241 raw_spinlock_t *lock; 241 raw_spinlock_t *lock;
242 struct entry *entry, input; 242 struct entry *entry, input;
diff --git a/kernel/timer.c b/kernel/timer.c
index d6459923d245..fd6198692b57 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -404,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {}
404 404
405static struct debug_obj_descr timer_debug_descr; 405static struct debug_obj_descr timer_debug_descr;
406 406
407static void *timer_debug_hint(void *addr)
408{
409 return ((struct timer_list *) addr)->function;
410}
411
407/* 412/*
408 * fixup_init is called when: 413 * fixup_init is called when:
409 * - an active object is initialized 414 * - an active object is initialized
@@ -477,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
477 482
478static struct debug_obj_descr timer_debug_descr = { 483static struct debug_obj_descr timer_debug_descr = {
479 .name = "timer_list", 484 .name = "timer_list",
485 .debug_hint = timer_debug_hint,
480 .fixup_init = timer_fixup_init, 486 .fixup_init = timer_fixup_init,
481 .fixup_activate = timer_fixup_activate, 487 .fixup_activate = timer_fixup_activate,
482 .fixup_free = timer_fixup_free, 488 .fixup_free = timer_fixup_free,
@@ -964,6 +970,25 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
964 * add_timer_on(). Upon exit the timer is not queued and the handler is 970 * add_timer_on(). Upon exit the timer is not queued and the handler is
965 * not running on any CPU. 971 * not running on any CPU.
966 * 972 *
973 * Note: You must not hold locks that are held in interrupt context
974 * while calling this function. Even if the lock has nothing to do
975 * with the timer in question. Here's why:
976 *
977 * CPU0 CPU1
978 * ---- ----
979 * <SOFTIRQ>
980 * call_timer_fn();
981 * base->running_timer = mytimer;
982 * spin_lock_irq(somelock);
983 * <IRQ>
984 * spin_lock(somelock);
985 * del_timer_sync(mytimer);
986 * while (base->running_timer == mytimer);
987 *
988 * Now del_timer_sync() will never return and never release somelock.
989 * The interrupt on the other CPU is waiting to grab somelock but
990 * it has interrupted the softirq that CPU0 is waiting to finish.
991 *
967 * The function returns whether it has deactivated a pending timer or not. 992 * The function returns whether it has deactivated a pending timer or not.
968 */ 993 */
969int del_timer_sync(struct timer_list *timer) 994int del_timer_sync(struct timer_list *timer)
@@ -971,6 +996,10 @@ int del_timer_sync(struct timer_list *timer)
971#ifdef CONFIG_LOCKDEP 996#ifdef CONFIG_LOCKDEP
972 unsigned long flags; 997 unsigned long flags;
973 998
999 /*
1000 * If lockdep gives a backtrace here, please reference
1001 * the synchronization rules above.
1002 */
974 local_irq_save(flags); 1003 local_irq_save(flags);
975 lock_map_acquire(&timer->lockdep_map); 1004 lock_map_acquire(&timer->lockdep_map);
976 lock_map_release(&timer->lockdep_map); 1005 lock_map_release(&timer->lockdep_map);
@@ -1295,19 +1324,6 @@ void run_local_timers(void)
1295 raise_softirq(TIMER_SOFTIRQ); 1324 raise_softirq(TIMER_SOFTIRQ);
1296} 1325}
1297 1326
1298/*
1299 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1300 * without sampling the sequence number in xtime_lock.
1301 * jiffies is defined in the linker script...
1302 */
1303
1304void do_timer(unsigned long ticks)
1305{
1306 jiffies_64 += ticks;
1307 update_wall_time();
1308 calc_global_load(ticks);
1309}
1310
1311#ifdef __ARCH_WANT_SYS_ALARM 1327#ifdef __ARCH_WANT_SYS_ALARM
1312 1328
1313/* 1329/*
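The new comment on del_timer_sync() documents a deadlock rule rather than changing behaviour: never call it while holding a lock that the timer callback, or an interrupt on another CPU, might also take. A hedged sketch of the safe teardown ordering; the timer and lock are assumed to be initialised elsewhere:

#include <linux/spinlock.h>
#include <linux/timer.h>

static DEFINE_SPINLOCK(somelock);       /* also taken from interrupt context */
static struct timer_list mytimer;       /* set up elsewhere */

static void teardown(void)
{
        unsigned long flags;

        spin_lock_irqsave(&somelock, flags);
        /* ... unhook the timer's data from shared structures ... */
        spin_unlock_irqrestore(&somelock, flags);

        /* only now, with no contended locks held, wait for the callback */
        del_timer_sync(&mytimer);
}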
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 14674dce77a6..2ad39e556cb4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -141,7 +141,7 @@ if FTRACE
141config FUNCTION_TRACER 141config FUNCTION_TRACER
142 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
143 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
144 select FRAME_POINTER if !ARM_UNWIND && !S390 144 select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE
145 select KALLSYMS 145 select KALLSYMS
146 select GENERIC_TRACER 146 select GENERIC_TRACER
147 select CONTEXT_SWITCH_TRACER 147 select CONTEXT_SWITCH_TRACER
@@ -275,7 +275,7 @@ config PROFILE_ANNOTATED_BRANCHES
275 This tracer profiles all the the likely and unlikely macros 275 This tracer profiles all the the likely and unlikely macros
276 in the kernel. It will display the results in: 276 in the kernel. It will display the results in:
277 277
278 /sys/kernel/debug/tracing/profile_annotated_branch 278 /sys/kernel/debug/tracing/trace_stat/branch_annotated
279 279
280 Note: this will add a significant overhead; only turn this 280 Note: this will add a significant overhead; only turn this
281 on if you need to profile the system's use of these macros. 281 on if you need to profile the system's use of these macros.
@@ -288,7 +288,7 @@ config PROFILE_ALL_BRANCHES
288 taken in the kernel is recorded whether it hit or miss. 288 taken in the kernel is recorded whether it hit or miss.
289 The results will be displayed in: 289 The results will be displayed in:
290 290
291 /sys/kernel/debug/tracing/profile_branch 291 /sys/kernel/debug/tracing/trace_stat/branch_all
292 292
293 This option also enables the likely/unlikely profiler. 293 This option also enables the likely/unlikely profiler.
294 294
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d95721f33702..6957aa298dfa 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -703,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q)
703 * 703 *
704 **/ 704 **/
705static void blk_add_trace_rq(struct request_queue *q, struct request *rq, 705static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
706 u32 what) 706 u32 what)
707{ 707{
708 struct blk_trace *bt = q->blk_trace; 708 struct blk_trace *bt = q->blk_trace;
709 int rw = rq->cmd_flags & 0x03;
710 709
711 if (likely(!bt)) 710 if (likely(!bt))
712 return; 711 return;
713 712
714 if (rq->cmd_flags & REQ_DISCARD)
715 rw |= REQ_DISCARD;
716
717 if (rq->cmd_flags & REQ_SECURE)
718 rw |= REQ_SECURE;
719
720 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 713 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
721 what |= BLK_TC_ACT(BLK_TC_PC); 714 what |= BLK_TC_ACT(BLK_TC_PC);
722 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, 715 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags,
723 what, rq->errors, rq->cmd_len, rq->cmd); 716 what, rq->errors, rq->cmd_len, rq->cmd);
724 } else { 717 } else {
725 what |= BLK_TC_ACT(BLK_TC_FS); 718 what |= BLK_TC_ACT(BLK_TC_FS);
726 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, 719 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
727 what, rq->errors, 0, NULL); 720 rq->cmd_flags, what, rq->errors, 0, NULL);
728 } 721 }
729} 722}
730 723
@@ -857,29 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
857 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 850 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
858} 851}
859 852
860static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) 853static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
854 unsigned int depth, bool explicit)
861{ 855{
862 struct blk_trace *bt = q->blk_trace; 856 struct blk_trace *bt = q->blk_trace;
863 857
864 if (bt) { 858 if (bt) {
865 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; 859 __be64 rpdu = cpu_to_be64(depth);
866 __be64 rpdu = cpu_to_be64(pdu); 860 u32 what;
867 861
868 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, 862 if (explicit)
869 sizeof(rpdu), &rpdu); 863 what = BLK_TA_UNPLUG_IO;
870 } 864 else
871} 865 what = BLK_TA_UNPLUG_TIMER;
872
873static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
874{
875 struct blk_trace *bt = q->blk_trace;
876
877 if (bt) {
878 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
879 __be64 rpdu = cpu_to_be64(pdu);
880 866
881 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, 867 __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
882 sizeof(rpdu), &rpdu);
883 } 868 }
884} 869}
885 870
@@ -1022,9 +1007,7 @@ static void blk_register_tracepoints(void)
1022 WARN_ON(ret); 1007 WARN_ON(ret);
1023 ret = register_trace_block_plug(blk_add_trace_plug, NULL); 1008 ret = register_trace_block_plug(blk_add_trace_plug, NULL);
1024 WARN_ON(ret); 1009 WARN_ON(ret);
1025 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); 1010 ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
1026 WARN_ON(ret);
1027 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
1028 WARN_ON(ret); 1011 WARN_ON(ret);
1029 ret = register_trace_block_split(blk_add_trace_split, NULL); 1012 ret = register_trace_block_split(blk_add_trace_split, NULL);
1030 WARN_ON(ret); 1013 WARN_ON(ret);
@@ -1039,8 +1022,7 @@ static void blk_unregister_tracepoints(void)
1039 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1022 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1040 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); 1023 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1041 unregister_trace_block_split(blk_add_trace_split, NULL); 1024 unregister_trace_block_split(blk_add_trace_split, NULL);
1042 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); 1025 unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
1043 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
1044 unregister_trace_block_plug(blk_add_trace_plug, NULL); 1026 unregister_trace_block_plug(blk_add_trace_plug, NULL);
1045 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); 1027 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
1046 unregister_trace_block_getrq(blk_add_trace_getrq, NULL); 1028 unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
@@ -1827,21 +1809,5 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1827 rwbs[i] = '\0'; 1809 rwbs[i] = '\0';
1828} 1810}
1829 1811
1830void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
1831{
1832 int rw = rq->cmd_flags & 0x03;
1833 int bytes;
1834
1835 if (rq->cmd_flags & REQ_DISCARD)
1836 rw |= REQ_DISCARD;
1837
1838 if (rq->cmd_flags & REQ_SECURE)
1839 rw |= REQ_SECURE;
1840
1841 bytes = blk_rq_bytes(rq);
1842
1843 blk_fill_rwbs(rwbs, rw, bytes);
1844}
1845
1846#endif /* CONFIG_EVENT_TRACING */ 1812#endif /* CONFIG_EVENT_TRACING */
1847 1813
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f3dadae83883..ee24fa1935ac 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1268,7 +1268,7 @@ static int ftrace_update_code(struct module *mod)
1268 p->flags = 0L; 1268 p->flags = 0L;
1269 1269
1270 /* 1270 /*
1271 * Do the initial record convertion from mcount jump 1271 * Do the initial record conversion from mcount jump
1272 * to the NOP instructions. 1272 * to the NOP instructions.
1273 */ 1273 */
1274 if (!ftrace_code_disable(mod, p)) { 1274 if (!ftrace_code_disable(mod, p)) {
@@ -1467,7 +1467,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1467 return t_hash_next(m, pos); 1467 return t_hash_next(m, pos);
1468 1468
1469 (*pos)++; 1469 (*pos)++;
1470 iter->pos = *pos; 1470 iter->pos = iter->func_pos = *pos;
1471 1471
1472 if (iter->flags & FTRACE_ITER_PRINTALL) 1472 if (iter->flags & FTRACE_ITER_PRINTALL)
1473 return t_hash_start(m, pos); 1473 return t_hash_start(m, pos);
@@ -1502,7 +1502,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1502 if (!rec) 1502 if (!rec)
1503 return t_hash_start(m, pos); 1503 return t_hash_start(m, pos);
1504 1504
1505 iter->func_pos = *pos;
1506 iter->func = rec; 1505 iter->func = rec;
1507 1506
1508 return iter; 1507 return iter;
@@ -3328,7 +3327,7 @@ static int start_graph_tracing(void)
3328 /* The cpu_boot init_task->ret_stack will never be freed */ 3327 /* The cpu_boot init_task->ret_stack will never be freed */
3329 for_each_online_cpu(cpu) { 3328 for_each_online_cpu(cpu) {
3330 if (!idle_task(cpu)->ret_stack) 3329 if (!idle_task(cpu)->ret_stack)
3331 ftrace_graph_init_task(idle_task(cpu)); 3330 ftrace_graph_init_idle_task(idle_task(cpu), cpu);
3332 } 3331 }
3333 3332
3334 do { 3333 do {
@@ -3418,6 +3417,49 @@ void unregister_ftrace_graph(void)
3418 mutex_unlock(&ftrace_lock); 3417 mutex_unlock(&ftrace_lock);
3419} 3418}
3420 3419
3420static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
3421
3422static void
3423graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
3424{
3425 atomic_set(&t->tracing_graph_pause, 0);
3426 atomic_set(&t->trace_overrun, 0);
3427 t->ftrace_timestamp = 0;
3428 /* make curr_ret_stack visible before we add the ret_stack */
3429 smp_wmb();
3430 t->ret_stack = ret_stack;
3431}
3432
3433/*
3434 * Allocate a return stack for the idle task. May be the first
3435 * time through, or it may be done by CPU hotplug online.
3436 */
3437void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
3438{
3439 t->curr_ret_stack = -1;
3440 /*
3441 * The idle task has no parent, it either has its own
3442 * stack or no stack at all.
3443 */
3444 if (t->ret_stack)
3445 WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
3446
3447 if (ftrace_graph_active) {
3448 struct ftrace_ret_stack *ret_stack;
3449
3450 ret_stack = per_cpu(idle_ret_stack, cpu);
3451 if (!ret_stack) {
3452 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
3453 * sizeof(struct ftrace_ret_stack),
3454 GFP_KERNEL);
3455 if (!ret_stack)
3456 return;
3457 per_cpu(idle_ret_stack, cpu) = ret_stack;
3458 }
3459 graph_init_task(t, ret_stack);
3460 }
3461}
3462
3421/* Allocate a return stack for newly created task */ 3463/* Allocate a return stack for newly created task */
3422void ftrace_graph_init_task(struct task_struct *t) 3464void ftrace_graph_init_task(struct task_struct *t)
3423{ 3465{
@@ -3433,12 +3475,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3433 GFP_KERNEL); 3475 GFP_KERNEL);
3434 if (!ret_stack) 3476 if (!ret_stack)
3435 return; 3477 return;
3436 atomic_set(&t->tracing_graph_pause, 0); 3478 graph_init_task(t, ret_stack);
3437 atomic_set(&t->trace_overrun, 0);
3438 t->ftrace_timestamp = 0;
3439 /* make curr_ret_stack visable before we add the ret_stack */
3440 smp_wmb();
3441 t->ret_stack = ret_stack;
3442 } 3479 }
3443} 3480}
3444 3481
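The ftrace_graph_init_idle_task() addition keeps one lazily allocated ret_stack per CPU so idle tasks survive repeated CPU hotplug without leaking or reallocating. A stripped-down sketch of that per-CPU lazy-allocation pattern, with made-up names and a plain int buffer standing in for the ret_stack array:

#include <linux/percpu.h>
#include <linux/slab.h>

static DEFINE_PER_CPU(int *, my_buf);   /* one buffer pointer per CPU */

static int *get_cpu_buf(int cpu)
{
        int *buf = per_cpu(my_buf, cpu);

        if (!buf) {
                /* first use on this CPU: allocate once, keep it forever */
                buf = kmalloc(64 * sizeof(int), GFP_KERNEL);
                if (!buf)
                        return NULL;
                per_cpu(my_buf, cpu) = buf;
        }
        return buf;
}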
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bd1c35a4fbcc..0ef7b4b2a1f7 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5,7 +5,6 @@
5 */ 5 */
6#include <linux/ring_buffer.h> 6#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h> 7#include <linux/trace_clock.h>
8#include <linux/ftrace_irq.h>
9#include <linux/spinlock.h> 8#include <linux/spinlock.h>
10#include <linux/debugfs.h> 9#include <linux/debugfs.h>
11#include <linux/uaccess.h> 10#include <linux/uaccess.h>
@@ -669,7 +668,7 @@ static struct list_head *rb_list_head(struct list_head *list)
669 * the reader page). But if the next page is a header page, 668 * the reader page). But if the next page is a header page,
670 * its flags will be non zero. 669 * its flags will be non zero.
671 */ 670 */
672static int inline 671static inline int
673rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, 672rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
674 struct buffer_page *page, struct list_head *list) 673 struct buffer_page *page, struct list_head *list)
675{ 674{
@@ -1429,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1429} 1428}
1430EXPORT_SYMBOL_GPL(ring_buffer_resize); 1429EXPORT_SYMBOL_GPL(ring_buffer_resize);
1431 1430
1431void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
1432{
1433 mutex_lock(&buffer->mutex);
1434 if (val)
1435 buffer->flags |= RB_FL_OVERWRITE;
1436 else
1437 buffer->flags &= ~RB_FL_OVERWRITE;
1438 mutex_unlock(&buffer->mutex);
1439}
1440EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
1441
1432static inline void * 1442static inline void *
1433__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) 1443__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
1434{ 1444{
@@ -1468,7 +1478,7 @@ static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1468 return local_read(&bpage->entries) & RB_WRITE_MASK; 1478 return local_read(&bpage->entries) & RB_WRITE_MASK;
1469} 1479}
1470 1480
1471/* Size is determined by what has been commited */ 1481/* Size is determined by what has been committed */
1472static inline unsigned rb_page_size(struct buffer_page *bpage) 1482static inline unsigned rb_page_size(struct buffer_page *bpage)
1473{ 1483{
1474 return rb_page_commit(bpage); 1484 return rb_page_commit(bpage);
@@ -2162,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2162 if (likely(ts >= cpu_buffer->write_stamp)) { 2172 if (likely(ts >= cpu_buffer->write_stamp)) {
2163 delta = diff; 2173 delta = diff;
2164 if (unlikely(test_time_stamp(delta))) { 2174 if (unlikely(test_time_stamp(delta))) {
2175 int local_clock_stable = 1;
2176#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2177 local_clock_stable = sched_clock_stable;
2178#endif
2165 WARN_ONCE(delta > (1ULL << 59), 2179 WARN_ONCE(delta > (1ULL << 59),
2166 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", 2180 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
2167 (unsigned long long)delta, 2181 (unsigned long long)delta,
2168 (unsigned long long)ts, 2182 (unsigned long long)ts,
2169 (unsigned long long)cpu_buffer->write_stamp); 2183 (unsigned long long)cpu_buffer->write_stamp,
2184 local_clock_stable ? "" :
2185 "If you just came from a suspend/resume,\n"
2186 "please switch to the trace global clock:\n"
2187 " echo global > /sys/kernel/debug/tracing/trace_clock\n");
2170 add_timestamp = 1; 2188 add_timestamp = 1;
2171 } 2189 }
2172 } 2190 }
@@ -2914,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2914 /* 2932 /*
2915 * cpu_buffer->pages just needs to point to the buffer, it 2933 * cpu_buffer->pages just needs to point to the buffer, it
2916 * has no specific buffer page to point to. Lets move it out 2934 * has no specific buffer page to point to. Lets move it out
2917 * of our way so we don't accidently swap it. 2935 * of our way so we don't accidentally swap it.
2918 */ 2936 */
2919 cpu_buffer->pages = reader->list.prev; 2937 cpu_buffer->pages = reader->list.prev;
2920 2938
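
The exported ring_buffer_change_overwrite() added above lets a buffer's overwrite policy be changed after allocation; it only sets or clears RB_FL_OVERWRITE under buffer->mutex, so it cannot race with a resize. A minimal sketch of a caller, hypothetical module code rather than anything in this patch:

#include <linux/errno.h>
#include <linux/ring_buffer.h>

static struct ring_buffer *example_buf;

static int example_setup(void)
{
	/* Start in "overwrite oldest events" mode. */
	example_buf = ring_buffer_alloc(64 * 1024, RB_FL_OVERWRITE);
	if (!example_buf)
		return -ENOMEM;
	return 0;
}

static void example_stop_when_full(void)
{
	/* Flip to "drop new events when full"; the helper takes buffer->mutex itself. */
	ring_buffer_change_overwrite(example_buf, 0);
}

trace.c (next file) is the first user: it forwards the new "overwrite" trace option to this helper.
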
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dc53ecb80589..1cb49be7c7fb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,8 +41,6 @@
41#include "trace.h" 41#include "trace.h"
42#include "trace_output.h" 42#include "trace_output.h"
43 43
44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
45
46/* 44/*
47 * On boot up, the ring buffer is set to the minimum size, so that 45 * On boot up, the ring buffer is set to the minimum size, so that
48 * we do not waste memory on systems that are not using tracing. 46 * we do not waste memory on systems that are not using tracing.
@@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
340/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
341unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
342 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
343 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
344 342
345static int trace_stop_count; 343static int trace_stop_count;
346static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
@@ -425,6 +423,7 @@ static const char *trace_options[] = {
425 "sleep-time", 423 "sleep-time",
426 "graph-time", 424 "graph-time",
427 "record-cmd", 425 "record-cmd",
426 "overwrite",
428 NULL 427 NULL
429}; 428};
430 429
@@ -780,6 +779,11 @@ __acquires(kernel_lock)
780 tracing_reset_online_cpus(tr); 779 tracing_reset_online_cpus(tr);
781 780
782 current_trace = type; 781 current_trace = type;
782
783 /* If we expanded the buffers, make sure the max is expanded too */
784 if (ring_buffer_expanded && type->use_max_tr)
785 ring_buffer_resize(max_tr.buffer, trace_buf_size);
786
783 /* the test is responsible for initializing and enabling */ 787 /* the test is responsible for initializing and enabling */
784 pr_info("Testing tracer %s: ", type->name); 788 pr_info("Testing tracer %s: ", type->name);
785 ret = type->selftest(type, tr); 789 ret = type->selftest(type, tr);
@@ -792,6 +796,10 @@ __acquires(kernel_lock)
792 /* Only reset on passing, to avoid touching corrupted buffers */ 796 /* Only reset on passing, to avoid touching corrupted buffers */
793 tracing_reset_online_cpus(tr); 797 tracing_reset_online_cpus(tr);
794 798
799 /* Shrink the max buffer again */
800 if (ring_buffer_expanded && type->use_max_tr)
801 ring_buffer_resize(max_tr.buffer, 1);
802
795 printk(KERN_CONT "PASSED\n"); 803 printk(KERN_CONT "PASSED\n");
796 } 804 }
797#endif 805#endif
@@ -1102,7 +1110,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1102 1110
1103 entry->preempt_count = pc & 0xff; 1111 entry->preempt_count = pc & 0xff;
1104 entry->pid = (tsk) ? tsk->pid : 0; 1112 entry->pid = (tsk) ? tsk->pid : 0;
1105 entry->lock_depth = (tsk) ? tsk->lock_depth : 0; 1113 entry->padding = 0;
1106 entry->flags = 1114 entry->flags =
1107#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1115#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1108 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1116 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1749,10 +1757,9 @@ static void print_lat_help_header(struct seq_file *m)
1749 seq_puts(m, "# | / _----=> need-resched \n"); 1757 seq_puts(m, "# | / _----=> need-resched \n");
1750 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1758 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1751 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1759 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1752 seq_puts(m, "# |||| /_--=> lock-depth \n"); 1760 seq_puts(m, "# |||| / delay \n");
1753 seq_puts(m, "# |||||/ delay \n"); 1761 seq_puts(m, "# cmd pid ||||| time | caller \n");
1754 seq_puts(m, "# cmd pid |||||| time | caller \n"); 1762 seq_puts(m, "# \\ / ||||| \\ | / \n");
1755 seq_puts(m, "# \\ / |||||| \\ | / \n");
1756} 1763}
1757 1764
1758static void print_func_help_header(struct seq_file *m) 1765static void print_func_help_header(struct seq_file *m)
@@ -2529,6 +2536,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2529 2536
2530 if (mask == TRACE_ITER_RECORD_CMD) 2537 if (mask == TRACE_ITER_RECORD_CMD)
2531 trace_event_enable_cmd_record(enabled); 2538 trace_event_enable_cmd_record(enabled);
2539
2540 if (mask == TRACE_ITER_OVERWRITE)
2541 ring_buffer_change_overwrite(global_trace.buffer, enabled);
2532} 2542}
2533 2543
2534static ssize_t 2544static ssize_t
@@ -2710,6 +2720,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2710 2720
2711 mutex_lock(&trace_types_lock); 2721 mutex_lock(&trace_types_lock);
2712 if (tracer_enabled ^ val) { 2722 if (tracer_enabled ^ val) {
2723
2724 /* Only need to warn if this is used to change the state */
2725 WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
2726
2713 if (val) { 2727 if (val) {
2714 tracer_enabled = 1; 2728 tracer_enabled = 1;
2715 if (current_trace->start) 2729 if (current_trace->start)
@@ -3226,7 +3240,7 @@ waitagain:
3226 trace_seq_init(&iter->seq); 3240 trace_seq_init(&iter->seq);
3227 3241
3228 /* 3242 /*
3229 * If there was nothing to send to user, inspite of consuming trace 3243 * If there was nothing to send to user, in spite of consuming trace
3230 * entries, go back to wait for more entries. 3244 * entries, go back to wait for more entries.
3231 */ 3245 */
3232 if (sret == -EBUSY) 3246 if (sret == -EBUSY)
@@ -4551,9 +4565,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4551__init static int tracer_alloc_buffers(void) 4565__init static int tracer_alloc_buffers(void)
4552{ 4566{
4553 int ring_buf_size; 4567 int ring_buf_size;
4568 enum ring_buffer_flags rb_flags;
4554 int i; 4569 int i;
4555 int ret = -ENOMEM; 4570 int ret = -ENOMEM;
4556 4571
4572
4557 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) 4573 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
4558 goto out; 4574 goto out;
4559 4575
@@ -4566,12 +4582,13 @@ __init static int tracer_alloc_buffers(void)
4566 else 4582 else
4567 ring_buf_size = 1; 4583 ring_buf_size = 1;
4568 4584
4585 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
4586
4569 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4587 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
4570 cpumask_copy(tracing_cpumask, cpu_all_mask); 4588 cpumask_copy(tracing_cpumask, cpu_all_mask);
4571 4589
4572 /* TODO: make the number of buffers hot pluggable with CPUS */ 4590 /* TODO: make the number of buffers hot pluggable with CPUS */
4573 global_trace.buffer = ring_buffer_alloc(ring_buf_size, 4591 global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
4574 TRACE_BUFFER_FLAGS);
4575 if (!global_trace.buffer) { 4592 if (!global_trace.buffer) {
4576 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 4593 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
4577 WARN_ON(1); 4594 WARN_ON(1);
@@ -4581,7 +4598,7 @@ __init static int tracer_alloc_buffers(void)
4581 4598
4582 4599
4583#ifdef CONFIG_TRACER_MAX_TRACE 4600#ifdef CONFIG_TRACER_MAX_TRACE
4584 max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); 4601 max_tr.buffer = ring_buffer_alloc(1, rb_flags);
4585 if (!max_tr.buffer) { 4602 if (!max_tr.buffer) {
4586 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4603 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
4587 WARN_ON(1); 4604 WARN_ON(1);
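
With TRACE_BUFFER_FLAGS gone, trace.c derives the ring-buffer flags from the new TRACE_ITER_OVERWRITE option bit (on by default in trace_flags above), and set_tracer_flags() forwards later toggles to ring_buffer_change_overwrite(). The mapping done in tracer_alloc_buffers() boils down to the following; RB_FL_OVERWRITE's value is assumed here for illustration, its real definition lives in include/linux/ring_buffer.h:

#define TRACE_ITER_OVERWRITE	0x200000	/* value added to trace.h below */
#define RB_FL_OVERWRITE		(1 << 0)	/* assumed, for illustration only */

static unsigned rb_flags_from_trace_flags(unsigned long trace_flags)
{
	return (trace_flags & TRACE_ITER_OVERWRITE) ? RB_FL_OVERWRITE : 0;
}

Because "overwrite" is also a trace_options[] entry, flipping it from user space (for example, echo 0 > /sys/kernel/debug/tracing/options/overwrite with the usual debugfs mount) ends up in the same helper.
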
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9021f8c0c0c3..5e9dfc6286dd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,8 +272,8 @@ struct tracer {
272 /* If you handled the flag setting, return 0 */ 272 /* If you handled the flag setting, return 0 */
273 int (*set_flag)(u32 old_flags, u32 bit, int set); 273 int (*set_flag)(u32 old_flags, u32 bit, int set);
274 struct tracer *next; 274 struct tracer *next;
275 int print_max;
276 struct tracer_flags *flags; 275 struct tracer_flags *flags;
276 int print_max;
277 int use_max_tr; 277 int use_max_tr;
278}; 278};
279 279
@@ -606,6 +606,7 @@ enum trace_iterator_flags {
606 TRACE_ITER_SLEEP_TIME = 0x40000, 606 TRACE_ITER_SLEEP_TIME = 0x40000,
607 TRACE_ITER_GRAPH_TIME = 0x80000, 607 TRACE_ITER_GRAPH_TIME = 0x80000,
608 TRACE_ITER_RECORD_CMD = 0x100000, 608 TRACE_ITER_RECORD_CMD = 0x100000,
609 TRACE_ITER_OVERWRITE = 0x200000,
609}; 610};
610 611
611/* 612/*
@@ -661,8 +662,10 @@ struct ftrace_event_field {
661}; 662};
662 663
663struct event_filter { 664struct event_filter {
664 int n_preds; 665 int n_preds; /* Number assigned */
665 struct filter_pred **preds; 666 int a_preds; /* allocated */
667 struct filter_pred *preds;
668 struct filter_pred *root;
666 char *filter_string; 669 char *filter_string;
667}; 670};
668 671
@@ -674,11 +677,23 @@ struct event_subsystem {
674 int nr_events; 677 int nr_events;
675}; 678};
676 679
680#define FILTER_PRED_INVALID ((unsigned short)-1)
681#define FILTER_PRED_IS_RIGHT (1 << 15)
682#define FILTER_PRED_FOLD (1 << 15)
683
684/*
685 * The max preds is the size of unsigned short with
686 * two flags at the MSBs. One bit is used for both the IS_RIGHT
687 * and FOLD flags. The other is reserved.
688 *
689 * 2^14 preds is way more than enough.
690 */
691#define MAX_FILTER_PRED 16384
692
677struct filter_pred; 693struct filter_pred;
678struct regex; 694struct regex;
679 695
680typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, 696typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
681 int val1, int val2);
682 697
683typedef int (*regex_match_func)(char *str, struct regex *r, int len); 698typedef int (*regex_match_func)(char *str, struct regex *r, int len);
684 699
@@ -700,11 +715,23 @@ struct filter_pred {
700 filter_pred_fn_t fn; 715 filter_pred_fn_t fn;
701 u64 val; 716 u64 val;
702 struct regex regex; 717 struct regex regex;
703 char *field_name; 718 /*
719 * Leaf nodes use field_name, ops is used by AND and OR
720 * nodes. The field_name is always freed when freeing a pred.
721 * We can overload field_name for ops and have it freed
722 * as well.
723 */
724 union {
725 char *field_name;
726 unsigned short *ops;
727 };
704 int offset; 728 int offset;
705 int not; 729 int not;
706 int op; 730 int op;
707 int pop_n; 731 unsigned short index;
732 unsigned short parent;
733 unsigned short left;
734 unsigned short right;
708}; 735};
709 736
710extern struct list_head ftrace_common_fields; 737extern struct list_head ftrace_common_fields;
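
The reworked struct filter_pred above drops child pointers: every predicate lives in the flat filter->preds array, and left, right and parent are unsigned short indices into it. Bit 15 does double duty (FILTER_PRED_IS_RIGHT when stored in parent, FILTER_PRED_FOLD when stored in index), FILTER_PRED_INVALID in left marks a leaf, and that is why MAX_FILTER_PRED stops at 2^14. A reduced stand-alone model of the layout, a field subset only and not the kernel definition:

#define FILTER_PRED_INVALID	((unsigned short)-1)
#define FILTER_PRED_IS_RIGHT	(1 << 15)

struct pred_node {			/* stand-in for struct filter_pred */
	unsigned short index;		/* my slot in the preds[] array */
	unsigned short parent;		/* parent slot, IS_RIGHT set if I am its right child */
	unsigned short left;		/* FILTER_PRED_INVALID for a leaf */
	unsigned short right;
};

/* Mirror of get_pred_parent(): strip the flag and report which side we came from. */
static unsigned short pred_parent(const struct pred_node *p, int *came_from_right)
{
	*came_from_right = !!(p->parent & FILTER_PRED_IS_RIGHT);
	return p->parent & ~FILTER_PRED_IS_RIGHT;
}
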
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 685a67d55db0..6302747a1398 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -46,7 +46,7 @@ u64 notrace trace_clock_local(void)
46} 46}
47 47
48/* 48/*
49 * trace_clock(): 'inbetween' trace clock. Not completely serialized, 49 * trace_clock(): 'between' trace clock. Not completely serialized,
50 * but not completely incorrect when crossing CPUs either. 50 * but not completely incorrect when crossing CPUs either.
51 * 51 *
52 * This is based on cpu_clock(), which will allow at most ~1 jiffy of 52 * This is based on cpu_clock(), which will allow at most ~1 jiffy of
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 6cf223764be8..e32744c84d94 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -27,7 +27,7 @@
27 * in the structure. 27 * in the structure.
28 * 28 *
29 * * for structures within structures, the format of the internal 29 * * for structures within structures, the format of the internal
30 * structure is layed out. This allows the internal structure 30 * structure is laid out. This allows the internal structure
31 * to be deciphered for the format file. Although these macros 31 * to be deciphered for the format file. Although these macros
32 * may become out of sync with the internal structure, they 32 * may become out of sync with the internal structure, they
33 * will create a compile error if it happens. Since the 33 * will create a compile error if it happens. Since the
@@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
109 */ 109 */
110#define FTRACE_CTX_FIELDS \ 110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \ 111 __field( unsigned int, prev_pid ) \
112 __field( unsigned int, next_pid ) \
113 __field( unsigned int, next_cpu ) \
112 __field( unsigned char, prev_prio ) \ 114 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \ 115 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \ 116 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \ 117 __field( unsigned char, next_state )
117 __field( unsigned int, next_cpu )
118 118
119FTRACE_ENTRY(context_switch, ctx_switch_entry, 119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120 120
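
The FTRACE_CTX_FIELDS shuffle above is purely a packing fix: grouping the three unsigned int fields ahead of the four unsigned char fields removes the alignment holes that the interleaved order produced. A stand-alone illustration (the real entry is additionally prefixed by the common trace_entry header):

#include <stdio.h>

struct ctx_old {
	unsigned int  prev_pid;
	unsigned char prev_prio, prev_state;
	unsigned int  next_pid;
	unsigned char next_prio, next_state;
	unsigned int  next_cpu;
};

struct ctx_new {
	unsigned int  prev_pid, next_pid, next_cpu;
	unsigned char prev_prio, prev_state, next_prio, next_state;
};

int main(void)
{
	/* Typically prints 20 and 16 with natural alignment. */
	printf("%zu %zu\n", sizeof(struct ctx_old), sizeof(struct ctx_new));
	return 0;
}
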
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5f499e0438a4..2fe110341359 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,7 @@ static int trace_define_common_fields(void)
116 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
117 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
118 __common_field(int, pid); 118 __common_field(int, pid);
119 __common_field(int, lock_depth); 119 __common_field(int, padding);
120 120
121 return ret; 121 return ret;
122} 122}
@@ -326,6 +326,7 @@ int trace_set_clr_event(const char *system, const char *event, int set)
326{ 326{
327 return __ftrace_set_clr_event(NULL, system, event, set); 327 return __ftrace_set_clr_event(NULL, system, event, set);
328} 328}
329EXPORT_SYMBOL_GPL(trace_set_clr_event);
329 330
330/* 128 should be much more than enough */ 331/* 128 should be much more than enough */
331#define EVENT_BUF_SIZE 127 332#define EVENT_BUF_SIZE 127
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 36d40104b17f..8008ddcfbf20 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -123,9 +123,13 @@ struct filter_parse_state {
123 } operand; 123 } operand;
124}; 124};
125 125
126struct pred_stack {
127 struct filter_pred **preds;
128 int index;
129};
130
126#define DEFINE_COMPARISON_PRED(type) \ 131#define DEFINE_COMPARISON_PRED(type) \
127static int filter_pred_##type(struct filter_pred *pred, void *event, \ 132static int filter_pred_##type(struct filter_pred *pred, void *event) \
128 int val1, int val2) \
129{ \ 133{ \
130 type *addr = (type *)(event + pred->offset); \ 134 type *addr = (type *)(event + pred->offset); \
131 type val = (type)pred->val; \ 135 type val = (type)pred->val; \
@@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \
152} 156}
153 157
154#define DEFINE_EQUALITY_PRED(size) \ 158#define DEFINE_EQUALITY_PRED(size) \
155static int filter_pred_##size(struct filter_pred *pred, void *event, \ 159static int filter_pred_##size(struct filter_pred *pred, void *event) \
156 int val1, int val2) \
157{ \ 160{ \
158 u##size *addr = (u##size *)(event + pred->offset); \ 161 u##size *addr = (u##size *)(event + pred->offset); \
159 u##size val = (u##size)pred->val; \ 162 u##size val = (u##size)pred->val; \
@@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32);
178DEFINE_EQUALITY_PRED(16); 181DEFINE_EQUALITY_PRED(16);
179DEFINE_EQUALITY_PRED(8); 182DEFINE_EQUALITY_PRED(8);
180 183
181static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
182 void *event __attribute((unused)),
183 int val1, int val2)
184{
185 return val1 && val2;
186}
187
188static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
189 void *event __attribute((unused)),
190 int val1, int val2)
191{
192 return val1 || val2;
193}
194
195/* Filter predicate for fixed sized arrays of characters */ 184/* Filter predicate for fixed sized arrays of characters */
196static int filter_pred_string(struct filter_pred *pred, void *event, 185static int filter_pred_string(struct filter_pred *pred, void *event)
197 int val1, int val2)
198{ 186{
199 char *addr = (char *)(event + pred->offset); 187 char *addr = (char *)(event + pred->offset);
200 int cmp, match; 188 int cmp, match;
@@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
207} 195}
208 196
209/* Filter predicate for char * pointers */ 197/* Filter predicate for char * pointers */
210static int filter_pred_pchar(struct filter_pred *pred, void *event, 198static int filter_pred_pchar(struct filter_pred *pred, void *event)
211 int val1, int val2)
212{ 199{
213 char **addr = (char **)(event + pred->offset); 200 char **addr = (char **)(event + pred->offset);
214 int cmp, match; 201 int cmp, match;
@@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
231 * and add it to the address of the entry, and at last we have 218 * and add it to the address of the entry, and at last we have
232 * the address of the string. 219 * the address of the string.
233 */ 220 */
234static int filter_pred_strloc(struct filter_pred *pred, void *event, 221static int filter_pred_strloc(struct filter_pred *pred, void *event)
235 int val1, int val2)
236{ 222{
237 u32 str_item = *(u32 *)(event + pred->offset); 223 u32 str_item = *(u32 *)(event + pred->offset);
238 int str_loc = str_item & 0xffff; 224 int str_loc = str_item & 0xffff;
@@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
247 return match; 233 return match;
248} 234}
249 235
250static int filter_pred_none(struct filter_pred *pred, void *event, 236static int filter_pred_none(struct filter_pred *pred, void *event)
251 int val1, int val2)
252{ 237{
253 return 0; 238 return 0;
254} 239}
@@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred)
377 pred->not ^= not; 362 pred->not ^= not;
378} 363}
379 364
365enum move_type {
366 MOVE_DOWN,
367 MOVE_UP_FROM_LEFT,
368 MOVE_UP_FROM_RIGHT
369};
370
371static struct filter_pred *
372get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
373 int index, enum move_type *move)
374{
375 if (pred->parent & FILTER_PRED_IS_RIGHT)
376 *move = MOVE_UP_FROM_RIGHT;
377 else
378 *move = MOVE_UP_FROM_LEFT;
379 pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
380
381 return pred;
382}
383
384/*
385 * A series of ANDs or ORs were found together. Instead of
386 * climbing up and down the tree branches, an array of the
387 * ops was made in order of checks. We can just move across
388 * the array and short circuit if needed.
389 */
390static int process_ops(struct filter_pred *preds,
391 struct filter_pred *op, void *rec)
392{
393 struct filter_pred *pred;
394 int match = 0;
395 int type;
396 int i;
397
398 /*
399 * Micro-optimization: We set type to true if op
400 * is an OR and false otherwise (AND). Then we
401 * just need to test if the match is equal to
402 * the type, and if it is, we can short circuit the
403 * rest of the checks:
404 *
405 * if ((match && op->op == OP_OR) ||
406 * (!match && op->op == OP_AND))
407 * return match;
408 */
409 type = op->op == OP_OR;
410
411 for (i = 0; i < op->val; i++) {
412 pred = &preds[op->ops[i]];
413 match = pred->fn(pred, rec);
414 if (!!match == type)
415 return match;
416 }
417 return match;
418}
419
380/* return 1 if event matches, 0 otherwise (discard) */ 420/* return 1 if event matches, 0 otherwise (discard) */
381int filter_match_preds(struct event_filter *filter, void *rec) 421int filter_match_preds(struct event_filter *filter, void *rec)
382{ 422{
383 int match, top = 0, val1 = 0, val2 = 0; 423 int match = -1;
384 int stack[MAX_FILTER_PRED]; 424 enum move_type move = MOVE_DOWN;
425 struct filter_pred *preds;
385 struct filter_pred *pred; 426 struct filter_pred *pred;
386 int i; 427 struct filter_pred *root;
428 int n_preds;
429 int done = 0;
430
431 /* no filter is considered a match */
432 if (!filter)
433 return 1;
434
435 n_preds = filter->n_preds;
436
437 if (!n_preds)
438 return 1;
439
440 /*
441 * n_preds, root and filter->preds are protected with preemption disabled.
442 */
443 preds = rcu_dereference_sched(filter->preds);
444 root = rcu_dereference_sched(filter->root);
445 if (!root)
446 return 1;
447
448 pred = root;
387 449
388 for (i = 0; i < filter->n_preds; i++) { 450 /* match is currently meaningless */
389 pred = filter->preds[i]; 451 match = -1;
390 if (!pred->pop_n) { 452
391 match = pred->fn(pred, rec, val1, val2); 453 do {
392 stack[top++] = match; 454 switch (move) {
455 case MOVE_DOWN:
456 /* only AND and OR have children */
457 if (pred->left != FILTER_PRED_INVALID) {
458 /* If ops is set, then it was folded. */
459 if (!pred->ops) {
460 /* keep going to down the left side */
461 pred = &preds[pred->left];
462 continue;
463 }
464 /* We can treat folded ops as a leaf node */
465 match = process_ops(preds, pred, rec);
466 } else
467 match = pred->fn(pred, rec);
468 /* If this pred is the only pred */
469 if (pred == root)
470 break;
471 pred = get_pred_parent(pred, preds,
472 pred->parent, &move);
473 continue;
474 case MOVE_UP_FROM_LEFT:
475 /*
476 * Check for short circuits.
477 *
478 * Optimization: !!match == (pred->op == OP_OR)
479 * is the same as:
480 * if ((match && pred->op == OP_OR) ||
481 * (!match && pred->op == OP_AND))
482 */
483 if (!!match == (pred->op == OP_OR)) {
484 if (pred == root)
485 break;
486 pred = get_pred_parent(pred, preds,
487 pred->parent, &move);
488 continue;
489 }
490 /* now go down the right side of the tree. */
491 pred = &preds[pred->right];
492 move = MOVE_DOWN;
493 continue;
494 case MOVE_UP_FROM_RIGHT:
495 /* We finished this equation. */
496 if (pred == root)
497 break;
498 pred = get_pred_parent(pred, preds,
499 pred->parent, &move);
393 continue; 500 continue;
394 } 501 }
395 if (pred->pop_n > top) { 502 done = 1;
396 WARN_ON_ONCE(1); 503 } while (!done);
397 return 0;
398 }
399 val1 = stack[--top];
400 val2 = stack[--top];
401 match = pred->fn(pred, rec, val1, val2);
402 stack[top++] = match;
403 }
404 504
405 return stack[--top]; 505 return match;
406} 506}
407EXPORT_SYMBOL_GPL(filter_match_preds); 507EXPORT_SYMBOL_GPL(filter_match_preds);
408 508
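
filter_match_preds() above replaces the old postfix evaluation (the removed stack[] loop on the left) with a non-recursive walk of the index-linked tree: descend the left side first, then on the way back up short-circuit an OR whose left branch was true or an AND whose left branch was false. The same control flow, reduced to a user-space sketch that compiles on its own; leaf values stand in for pred->fn(pred, rec) and folding is left out:

#include <stdio.h>

#define INVALID		((unsigned short)-1)
#define IS_RIGHT	(1 << 15)

enum { OP_AND, OP_OR };
enum move { MOVE_DOWN, MOVE_UP_FROM_LEFT, MOVE_UP_FROM_RIGHT };

struct node {
	int op;			/* OP_AND / OP_OR for inner nodes */
	int leaf_val;		/* "predicate result" for leaves */
	unsigned short parent;	/* parent index, IS_RIGHT if we are its right child */
	unsigned short left;	/* INVALID marks a leaf */
	unsigned short right;
};

static int eval(struct node *n, unsigned short root)
{
	enum move dir = MOVE_DOWN;
	unsigned short i = root;
	int match = 0;

	for (;;) {
		struct node *p = &n[i];

		switch (dir) {
		case MOVE_DOWN:
			if (p->left != INVALID) {	/* inner node: left side first */
				i = p->left;
				continue;
			}
			match = p->leaf_val;		/* leaf: run the "predicate" */
			break;
		case MOVE_UP_FROM_LEFT:
			/* Short circuit: OR with a true left, AND with a false left. */
			if (!!match == (p->op == OP_OR))
				break;
			i = p->right;			/* otherwise do the right side */
			dir = MOVE_DOWN;
			continue;
		case MOVE_UP_FROM_RIGHT:
			break;				/* both sides are done */
		}
		if (i == root)
			return match;
		dir = (n[i].parent & IS_RIGHT) ? MOVE_UP_FROM_RIGHT : MOVE_UP_FROM_LEFT;
		i = n[i].parent & ~IS_RIGHT;
	}
}

int main(void)
{
	/* (A || B) && C with A=0, B=1, C=1: prints 1 */
	struct node n[] = {
		[0] = { .op = OP_AND, .left = 1, .right = 4 },
		[1] = { .op = OP_OR,  .left = 2, .right = 3, .parent = 0 },
		[2] = { .leaf_val = 0, .left = INVALID, .parent = 1 },
		[3] = { .leaf_val = 1, .left = INVALID, .parent = 1 | IS_RIGHT },
		[4] = { .leaf_val = 1, .left = INVALID, .parent = 0 | IS_RIGHT },
	};

	printf("%d\n", eval(n, 0));
	return 0;
}
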
@@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos)
414 514
415static void remove_filter_string(struct event_filter *filter) 515static void remove_filter_string(struct event_filter *filter)
416{ 516{
517 if (!filter)
518 return;
519
417 kfree(filter->filter_string); 520 kfree(filter->filter_string);
418 filter->filter_string = NULL; 521 filter->filter_string = NULL;
419} 522}
@@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps,
473 576
474void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) 577void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
475{ 578{
476 struct event_filter *filter = call->filter; 579 struct event_filter *filter;
477 580
478 mutex_lock(&event_mutex); 581 mutex_lock(&event_mutex);
582 filter = call->filter;
479 if (filter && filter->filter_string) 583 if (filter && filter->filter_string)
480 trace_seq_printf(s, "%s\n", filter->filter_string); 584 trace_seq_printf(s, "%s\n", filter->filter_string);
481 else 585 else
@@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
486void print_subsystem_event_filter(struct event_subsystem *system, 590void print_subsystem_event_filter(struct event_subsystem *system,
487 struct trace_seq *s) 591 struct trace_seq *s)
488{ 592{
489 struct event_filter *filter = system->filter; 593 struct event_filter *filter;
490 594
491 mutex_lock(&event_mutex); 595 mutex_lock(&event_mutex);
596 filter = system->filter;
492 if (filter && filter->filter_string) 597 if (filter && filter->filter_string)
493 trace_seq_printf(s, "%s\n", filter->filter_string); 598 trace_seq_printf(s, "%s\n", filter->filter_string);
494 else 599 else
@@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred)
539 pred->regex.len = 0; 644 pred->regex.len = 0;
540} 645}
541 646
542static int filter_set_pred(struct filter_pred *dest, 647static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
648{
649 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
650 if (!stack->preds)
651 return -ENOMEM;
652 stack->index = n_preds;
653 return 0;
654}
655
656static void __free_pred_stack(struct pred_stack *stack)
657{
658 kfree(stack->preds);
659 stack->index = 0;
660}
661
662static int __push_pred_stack(struct pred_stack *stack,
663 struct filter_pred *pred)
664{
665 int index = stack->index;
666
667 if (WARN_ON(index == 0))
668 return -ENOSPC;
669
670 stack->preds[--index] = pred;
671 stack->index = index;
672 return 0;
673}
674
675static struct filter_pred *
676__pop_pred_stack(struct pred_stack *stack)
677{
678 struct filter_pred *pred;
679 int index = stack->index;
680
681 pred = stack->preds[index++];
682 if (!pred)
683 return NULL;
684
685 stack->index = index;
686 return pred;
687}
688
689static int filter_set_pred(struct event_filter *filter,
690 int idx,
691 struct pred_stack *stack,
543 struct filter_pred *src, 692 struct filter_pred *src,
544 filter_pred_fn_t fn) 693 filter_pred_fn_t fn)
545{ 694{
695 struct filter_pred *dest = &filter->preds[idx];
696 struct filter_pred *left;
697 struct filter_pred *right;
698
546 *dest = *src; 699 *dest = *src;
547 if (src->field_name) { 700 if (src->field_name) {
548 dest->field_name = kstrdup(src->field_name, GFP_KERNEL); 701 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
@@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest,
550 return -ENOMEM; 703 return -ENOMEM;
551 } 704 }
552 dest->fn = fn; 705 dest->fn = fn;
706 dest->index = idx;
553 707
554 return 0; 708 if (dest->op == OP_OR || dest->op == OP_AND) {
709 right = __pop_pred_stack(stack);
710 left = __pop_pred_stack(stack);
711 if (!left || !right)
712 return -EINVAL;
713 /*
714 * If both children can be folded
715 * and they are the same op as this op or a leaf,
716 * then this op can be folded.
717 */
718 if (left->index & FILTER_PRED_FOLD &&
719 (left->op == dest->op ||
720 left->left == FILTER_PRED_INVALID) &&
721 right->index & FILTER_PRED_FOLD &&
722 (right->op == dest->op ||
723 right->left == FILTER_PRED_INVALID))
724 dest->index |= FILTER_PRED_FOLD;
725
726 dest->left = left->index & ~FILTER_PRED_FOLD;
727 dest->right = right->index & ~FILTER_PRED_FOLD;
728 left->parent = dest->index & ~FILTER_PRED_FOLD;
729 right->parent = dest->index | FILTER_PRED_IS_RIGHT;
730 } else {
731 /*
732 * Make dest->left invalid to be used as a quick
733 * way to know this is a leaf node.
734 */
735 dest->left = FILTER_PRED_INVALID;
736
737 /* All leafs allow folding the parent ops. */
738 dest->index |= FILTER_PRED_FOLD;
739 }
740
741 return __push_pred_stack(stack, dest);
555} 742}
556 743
557static void filter_disable_preds(struct ftrace_event_call *call) 744static void __free_preds(struct event_filter *filter)
558{ 745{
559 struct event_filter *filter = call->filter;
560 int i; 746 int i;
561 747
562 call->flags &= ~TRACE_EVENT_FL_FILTERED; 748 if (filter->preds) {
749 for (i = 0; i < filter->a_preds; i++)
750 kfree(filter->preds[i].field_name);
751 kfree(filter->preds);
752 filter->preds = NULL;
753 }
754 filter->a_preds = 0;
563 filter->n_preds = 0; 755 filter->n_preds = 0;
564
565 for (i = 0; i < MAX_FILTER_PRED; i++)
566 filter->preds[i]->fn = filter_pred_none;
567} 756}
568 757
569static void __free_preds(struct event_filter *filter) 758static void filter_disable(struct ftrace_event_call *call)
570{ 759{
571 int i; 760 call->flags &= ~TRACE_EVENT_FL_FILTERED;
761}
572 762
763static void __free_filter(struct event_filter *filter)
764{
573 if (!filter) 765 if (!filter)
574 return; 766 return;
575 767
576 for (i = 0; i < MAX_FILTER_PRED; i++) { 768 __free_preds(filter);
577 if (filter->preds[i])
578 filter_free_pred(filter->preds[i]);
579 }
580 kfree(filter->preds);
581 kfree(filter->filter_string); 769 kfree(filter->filter_string);
582 kfree(filter); 770 kfree(filter);
583} 771}
584 772
773/*
774 * Called when destroying the ftrace_event_call.
775 * The call is being freed, so we do not need to worry about
776 * the call being currently used. This is for module code removing
777 * the tracepoints from within it.
778 */
585void destroy_preds(struct ftrace_event_call *call) 779void destroy_preds(struct ftrace_event_call *call)
586{ 780{
587 __free_preds(call->filter); 781 __free_filter(call->filter);
588 call->filter = NULL; 782 call->filter = NULL;
589 call->flags &= ~TRACE_EVENT_FL_FILTERED;
590} 783}
591 784
592static struct event_filter *__alloc_preds(void) 785static struct event_filter *__alloc_filter(void)
593{ 786{
594 struct event_filter *filter; 787 struct event_filter *filter;
788
789 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
790 return filter;
791}
792
793static int __alloc_preds(struct event_filter *filter, int n_preds)
794{
595 struct filter_pred *pred; 795 struct filter_pred *pred;
596 int i; 796 int i;
597 797
598 filter = kzalloc(sizeof(*filter), GFP_KERNEL); 798 if (filter->preds)
599 if (!filter) 799 __free_preds(filter);
600 return ERR_PTR(-ENOMEM);
601 800
602 filter->n_preds = 0; 801 filter->preds =
802 kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
603 803
604 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
605 if (!filter->preds) 804 if (!filter->preds)
606 goto oom; 805 return -ENOMEM;
607 806
608 for (i = 0; i < MAX_FILTER_PRED; i++) { 807 filter->a_preds = n_preds;
609 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 808 filter->n_preds = 0;
610 if (!pred) 809
611 goto oom; 810 for (i = 0; i < n_preds; i++) {
811 pred = &filter->preds[i];
612 pred->fn = filter_pred_none; 812 pred->fn = filter_pred_none;
613 filter->preds[i] = pred;
614 } 813 }
615 814
616 return filter;
617
618oom:
619 __free_preds(filter);
620 return ERR_PTR(-ENOMEM);
621}
622
623static int init_preds(struct ftrace_event_call *call)
624{
625 if (call->filter)
626 return 0;
627
628 call->flags &= ~TRACE_EVENT_FL_FILTERED;
629 call->filter = __alloc_preds();
630 if (IS_ERR(call->filter))
631 return PTR_ERR(call->filter);
632
633 return 0; 815 return 0;
634} 816}
635 817
636static int init_subsystem_preds(struct event_subsystem *system) 818static void filter_free_subsystem_preds(struct event_subsystem *system)
637{ 819{
638 struct ftrace_event_call *call; 820 struct ftrace_event_call *call;
639 int err;
640 821
641 list_for_each_entry(call, &ftrace_events, list) { 822 list_for_each_entry(call, &ftrace_events, list) {
642 if (strcmp(call->class->system, system->name) != 0) 823 if (strcmp(call->class->system, system->name) != 0)
643 continue; 824 continue;
644 825
645 err = init_preds(call); 826 filter_disable(call);
646 if (err) 827 remove_filter_string(call->filter);
647 return err;
648 } 828 }
649
650 return 0;
651} 829}
652 830
653static void filter_free_subsystem_preds(struct event_subsystem *system) 831static void filter_free_subsystem_filters(struct event_subsystem *system)
654{ 832{
655 struct ftrace_event_call *call; 833 struct ftrace_event_call *call;
656 834
657 list_for_each_entry(call, &ftrace_events, list) { 835 list_for_each_entry(call, &ftrace_events, list) {
658 if (strcmp(call->class->system, system->name) != 0) 836 if (strcmp(call->class->system, system->name) != 0)
659 continue; 837 continue;
660 838 __free_filter(call->filter);
661 filter_disable_preds(call); 839 call->filter = NULL;
662 remove_filter_string(call->filter);
663 } 840 }
664} 841}
665 842
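
filter_set_pred() above is also where the flat array gets its tree shape: leaves are pushed on the pred_stack, and each AND/OR pops its right and left children, wires up left/right/parent (and marks itself foldable when both children allow it), then pushes itself back. A worked example for (a || b) && c, whose postfix order is a b || c &&; the operand names are invented:

	push a                            stack: a
	push b                            stack: a, b
	|| : pop b, pop a, link, push     stack: OR(a, b)
	push c                            stack: OR(a, b), c
	&& : pop c, pop OR, link, push    stack: AND(OR(a, b), c)

replace_preds() later pops the single surviving entry as filter->root, and treats an empty stack, or one with more than a single entry left, as a malformed expression.
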
@@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
667 struct ftrace_event_call *call, 844 struct ftrace_event_call *call,
668 struct event_filter *filter, 845 struct event_filter *filter,
669 struct filter_pred *pred, 846 struct filter_pred *pred,
847 struct pred_stack *stack,
670 filter_pred_fn_t fn) 848 filter_pred_fn_t fn)
671{ 849{
672 int idx, err; 850 int idx, err;
673 851
674 if (filter->n_preds == MAX_FILTER_PRED) { 852 if (WARN_ON(filter->n_preds == filter->a_preds)) {
675 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 853 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
676 return -ENOSPC; 854 return -ENOSPC;
677 } 855 }
678 856
679 idx = filter->n_preds; 857 idx = filter->n_preds;
680 filter_clear_pred(filter->preds[idx]); 858 filter_clear_pred(&filter->preds[idx]);
681 err = filter_set_pred(filter->preds[idx], pred, fn); 859 err = filter_set_pred(filter, idx, stack, pred, fn);
682 if (err) 860 if (err)
683 return err; 861 return err;
684 862
@@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
763 struct ftrace_event_call *call, 941 struct ftrace_event_call *call,
764 struct event_filter *filter, 942 struct event_filter *filter,
765 struct filter_pred *pred, 943 struct filter_pred *pred,
944 struct pred_stack *stack,
766 bool dry_run) 945 bool dry_run)
767{ 946{
768 struct ftrace_event_field *field; 947 struct ftrace_event_field *field;
@@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
770 unsigned long long val; 949 unsigned long long val;
771 int ret; 950 int ret;
772 951
773 pred->fn = filter_pred_none; 952 fn = pred->fn = filter_pred_none;
774 953
775 if (pred->op == OP_AND) { 954 if (pred->op == OP_AND)
776 pred->pop_n = 2;
777 fn = filter_pred_and;
778 goto add_pred_fn; 955 goto add_pred_fn;
779 } else if (pred->op == OP_OR) { 956 else if (pred->op == OP_OR)
780 pred->pop_n = 2;
781 fn = filter_pred_or;
782 goto add_pred_fn; 957 goto add_pred_fn;
783 }
784 958
785 field = find_event_field(call, pred->field_name); 959 field = find_event_field(call, pred->field_name);
786 if (!field) { 960 if (!field) {
@@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
829 1003
830add_pred_fn: 1004add_pred_fn:
831 if (!dry_run) 1005 if (!dry_run)
832 return filter_add_pred_fn(ps, call, filter, pred, fn); 1006 return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
833 return 0; 1007 return 0;
834} 1008}
835 1009
@@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps)
1187 return 0; 1361 return 0;
1188} 1362}
1189 1363
1364static int count_preds(struct filter_parse_state *ps)
1365{
1366 struct postfix_elt *elt;
1367 int n_preds = 0;
1368
1369 list_for_each_entry(elt, &ps->postfix, list) {
1370 if (elt->op == OP_NONE)
1371 continue;
1372 n_preds++;
1373 }
1374
1375 return n_preds;
1376}
1377
1378/*
1379 * The tree is walked when filtering an event. If the tree is not correctly
1380 * built, it may cause an infinite loop. Check here that the tree does
1381 * indeed terminate.
1382 */
1383static int check_pred_tree(struct event_filter *filter,
1384 struct filter_pred *root)
1385{
1386 struct filter_pred *preds;
1387 struct filter_pred *pred;
1388 enum move_type move = MOVE_DOWN;
1389 int count = 0;
1390 int done = 0;
1391 int max;
1392
1393 /*
1394 * The maximum number of times we can hit a node is three.
1395 * Once going down, once coming up from left, and
1396 * once coming up from right. This is more than enough
1397 * since leafs are only hit a single time.
1398 */
1399 max = 3 * filter->n_preds;
1400
1401 preds = filter->preds;
1402 if (!preds)
1403 return -EINVAL;
1404 pred = root;
1405
1406 do {
1407 if (WARN_ON(count++ > max))
1408 return -EINVAL;
1409
1410 switch (move) {
1411 case MOVE_DOWN:
1412 if (pred->left != FILTER_PRED_INVALID) {
1413 pred = &preds[pred->left];
1414 continue;
1415 }
1416 /* A leaf at the root is just a leaf in the tree */
1417 if (pred == root)
1418 break;
1419 pred = get_pred_parent(pred, preds,
1420 pred->parent, &move);
1421 continue;
1422 case MOVE_UP_FROM_LEFT:
1423 pred = &preds[pred->right];
1424 move = MOVE_DOWN;
1425 continue;
1426 case MOVE_UP_FROM_RIGHT:
1427 if (pred == root)
1428 break;
1429 pred = get_pred_parent(pred, preds,
1430 pred->parent, &move);
1431 continue;
1432 }
1433 done = 1;
1434 } while (!done);
1435
1436 /* We are fine. */
1437 return 0;
1438}
1439
1440static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1441{
1442 struct filter_pred *pred;
1443 enum move_type move = MOVE_DOWN;
1444 int count = 0;
1445 int done = 0;
1446
1447 pred = root;
1448
1449 do {
1450 switch (move) {
1451 case MOVE_DOWN:
1452 if (pred->left != FILTER_PRED_INVALID) {
1453 pred = &preds[pred->left];
1454 continue;
1455 }
1456 /* A leaf at the root is just a leaf in the tree */
1457 if (pred == root)
1458 return 1;
1459 count++;
1460 pred = get_pred_parent(pred, preds,
1461 pred->parent, &move);
1462 continue;
1463 case MOVE_UP_FROM_LEFT:
1464 pred = &preds[pred->right];
1465 move = MOVE_DOWN;
1466 continue;
1467 case MOVE_UP_FROM_RIGHT:
1468 if (pred == root)
1469 break;
1470 pred = get_pred_parent(pred, preds,
1471 pred->parent, &move);
1472 continue;
1473 }
1474 done = 1;
1475 } while (!done);
1476
1477 return count;
1478}
1479
1480static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1481{
1482 struct filter_pred *pred;
1483 enum move_type move = MOVE_DOWN;
1484 int count = 0;
1485 int children;
1486 int done = 0;
1487
1488 /* No need to keep the fold flag */
1489 root->index &= ~FILTER_PRED_FOLD;
1490
1491 /* If the root is a leaf then do nothing */
1492 if (root->left == FILTER_PRED_INVALID)
1493 return 0;
1494
1495 /* count the children */
1496 children = count_leafs(preds, &preds[root->left]);
1497 children += count_leafs(preds, &preds[root->right]);
1498
1499 root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL);
1500 if (!root->ops)
1501 return -ENOMEM;
1502
1503 root->val = children;
1504
1505 pred = root;
1506 do {
1507 switch (move) {
1508 case MOVE_DOWN:
1509 if (pred->left != FILTER_PRED_INVALID) {
1510 pred = &preds[pred->left];
1511 continue;
1512 }
1513 if (WARN_ON(count == children))
1514 return -EINVAL;
1515 pred->index &= ~FILTER_PRED_FOLD;
1516 root->ops[count++] = pred->index;
1517 pred = get_pred_parent(pred, preds,
1518 pred->parent, &move);
1519 continue;
1520 case MOVE_UP_FROM_LEFT:
1521 pred = &preds[pred->right];
1522 move = MOVE_DOWN;
1523 continue;
1524 case MOVE_UP_FROM_RIGHT:
1525 if (pred == root)
1526 break;
1527 pred = get_pred_parent(pred, preds,
1528 pred->parent, &move);
1529 continue;
1530 }
1531 done = 1;
1532 } while (!done);
1533
1534 return 0;
1535}
1536
1537/*
1538 * To optimize the processing of the ops, if we have several "ors" or
1539 * "ands" together, we can put them in an array and process them all
1540 * together, speeding up the filter logic.
1541 */
1542static int fold_pred_tree(struct event_filter *filter,
1543 struct filter_pred *root)
1544{
1545 struct filter_pred *preds;
1546 struct filter_pred *pred;
1547 enum move_type move = MOVE_DOWN;
1548 int done = 0;
1549 int err;
1550
1551 preds = filter->preds;
1552 if (!preds)
1553 return -EINVAL;
1554 pred = root;
1555
1556 do {
1557 switch (move) {
1558 case MOVE_DOWN:
1559 if (pred->index & FILTER_PRED_FOLD) {
1560 err = fold_pred(preds, pred);
1561 if (err)
1562 return err;
1563 /* Folded nodes are like leafs */
1564 } else if (pred->left != FILTER_PRED_INVALID) {
1565 pred = &preds[pred->left];
1566 continue;
1567 }
1568
1569 /* A leaf at the root is just a leaf in the tree */
1570 if (pred == root)
1571 break;
1572 pred = get_pred_parent(pred, preds,
1573 pred->parent, &move);
1574 continue;
1575 case MOVE_UP_FROM_LEFT:
1576 pred = &preds[pred->right];
1577 move = MOVE_DOWN;
1578 continue;
1579 case MOVE_UP_FROM_RIGHT:
1580 if (pred == root)
1581 break;
1582 pred = get_pred_parent(pred, preds,
1583 pred->parent, &move);
1584 continue;
1585 }
1586 done = 1;
1587 } while (!done);
1588
1589 return 0;
1590}
1591
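
fold_pred() and fold_pred_tree() above implement the shortcut announced in the process_ops() comment earlier: a subtree whose inner nodes all share one operator is flattened into an ops[] array of leaf indices, so matching scans a short array with a single short-circuit test instead of bouncing through parent links. A self-contained sketch of that folded evaluation, with leaf[] standing in for the predicate results:

#include <stdio.h>

enum { OP_AND, OP_OR };

static int eval_folded(int op, const int *leaf, const unsigned short *ops, int nr)
{
	int type = (op == OP_OR);	/* OR stops on the first true, AND on the first false */
	int match = 0;
	int i;

	for (i = 0; i < nr; i++) {
		match = leaf[ops[i]];
		if (!!match == type)
			return match;
	}
	return match;
}

int main(void)
{
	int leaf[] = { 0, 1, 0 };
	unsigned short ops[] = { 0, 1, 2 };

	/* a || b || c with a=0, b=1, c=0: stops after b and prints 1 */
	printf("%d\n", eval_folded(OP_OR, leaf, ops, 3));
	return 0;
}
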
1190static int replace_preds(struct ftrace_event_call *call, 1592static int replace_preds(struct ftrace_event_call *call,
1191 struct event_filter *filter, 1593 struct event_filter *filter,
1192 struct filter_parse_state *ps, 1594 struct filter_parse_state *ps,
@@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call,
1195{ 1597{
1196 char *operand1 = NULL, *operand2 = NULL; 1598 char *operand1 = NULL, *operand2 = NULL;
1197 struct filter_pred *pred; 1599 struct filter_pred *pred;
1600 struct filter_pred *root;
1198 struct postfix_elt *elt; 1601 struct postfix_elt *elt;
1602 struct pred_stack stack = { }; /* init to NULL */
1199 int err; 1603 int err;
1200 int n_preds = 0; 1604 int n_preds = 0;
1201 1605
1606 n_preds = count_preds(ps);
1607 if (n_preds >= MAX_FILTER_PRED) {
1608 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1609 return -ENOSPC;
1610 }
1611
1202 err = check_preds(ps); 1612 err = check_preds(ps);
1203 if (err) 1613 if (err)
1204 return err; 1614 return err;
1205 1615
1616 if (!dry_run) {
1617 err = __alloc_pred_stack(&stack, n_preds);
1618 if (err)
1619 return err;
1620 err = __alloc_preds(filter, n_preds);
1621 if (err)
1622 goto fail;
1623 }
1624
1625 n_preds = 0;
1206 list_for_each_entry(elt, &ps->postfix, list) { 1626 list_for_each_entry(elt, &ps->postfix, list) {
1207 if (elt->op == OP_NONE) { 1627 if (elt->op == OP_NONE) {
1208 if (!operand1) 1628 if (!operand1)
@@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call,
1211 operand2 = elt->operand; 1631 operand2 = elt->operand;
1212 else { 1632 else {
1213 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); 1633 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
1214 return -EINVAL; 1634 err = -EINVAL;
1635 goto fail;
1215 } 1636 }
1216 continue; 1637 continue;
1217 } 1638 }
1218 1639
1219 if (n_preds++ == MAX_FILTER_PRED) { 1640 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
1220 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 1641 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1221 return -ENOSPC; 1642 err = -ENOSPC;
1643 goto fail;
1222 } 1644 }
1223 1645
1224 if (elt->op == OP_AND || elt->op == OP_OR) { 1646 if (elt->op == OP_AND || elt->op == OP_OR) {
@@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call,
1228 1650
1229 if (!operand1 || !operand2) { 1651 if (!operand1 || !operand2) {
1230 parse_error(ps, FILT_ERR_MISSING_FIELD, 0); 1652 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1231 return -EINVAL; 1653 err = -EINVAL;
1654 goto fail;
1232 } 1655 }
1233 1656
1234 pred = create_pred(elt->op, operand1, operand2); 1657 pred = create_pred(elt->op, operand1, operand2);
1235add_pred: 1658add_pred:
1236 if (!pred) 1659 if (!pred) {
1237 return -ENOMEM; 1660 err = -ENOMEM;
1238 err = filter_add_pred(ps, call, filter, pred, dry_run); 1661 goto fail;
1662 }
1663 err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
1239 filter_free_pred(pred); 1664 filter_free_pred(pred);
1240 if (err) 1665 if (err)
1241 return err; 1666 goto fail;
1242 1667
1243 operand1 = operand2 = NULL; 1668 operand1 = operand2 = NULL;
1244 } 1669 }
1245 1670
1246 return 0; 1671 if (!dry_run) {
1672 /* We should have one item left on the stack */
1673 pred = __pop_pred_stack(&stack);
1674 if (!pred)
1675 return -EINVAL;
1676 /* This item is where we start from in matching */
1677 root = pred;
1678 /* Make sure the stack is empty */
1679 pred = __pop_pred_stack(&stack);
1680 if (WARN_ON(pred)) {
1681 err = -EINVAL;
1682 filter->root = NULL;
1683 goto fail;
1684 }
1685 err = check_pred_tree(filter, root);
1686 if (err)
1687 goto fail;
1688
1689 /* Optimize the tree */
1690 err = fold_pred_tree(filter, root);
1691 if (err)
1692 goto fail;
1693
1694 /* We don't set root until we know it works */
1695 barrier();
1696 filter->root = root;
1697 }
1698
1699 err = 0;
1700fail:
1701 __free_pred_stack(&stack);
1702 return err;
1247} 1703}
1248 1704
1705struct filter_list {
1706 struct list_head list;
1707 struct event_filter *filter;
1708};
1709
1249static int replace_system_preds(struct event_subsystem *system, 1710static int replace_system_preds(struct event_subsystem *system,
1250 struct filter_parse_state *ps, 1711 struct filter_parse_state *ps,
1251 char *filter_string) 1712 char *filter_string)
1252{ 1713{
1253 struct ftrace_event_call *call; 1714 struct ftrace_event_call *call;
1715 struct filter_list *filter_item;
1716 struct filter_list *tmp;
1717 LIST_HEAD(filter_list);
1254 bool fail = true; 1718 bool fail = true;
1255 int err; 1719 int err;
1256 1720
1257 list_for_each_entry(call, &ftrace_events, list) { 1721 list_for_each_entry(call, &ftrace_events, list) {
1258 struct event_filter *filter = call->filter;
1259 1722
1260 if (strcmp(call->class->system, system->name) != 0) 1723 if (strcmp(call->class->system, system->name) != 0)
1261 continue; 1724 continue;
1262 1725
1263 /* try to see if the filter can be applied */ 1726 /*
1264 err = replace_preds(call, filter, ps, filter_string, true); 1727 * Try to see if the filter can be applied
1728 * (filter arg is ignored on dry_run)
1729 */
1730 err = replace_preds(call, NULL, ps, filter_string, true);
1265 if (err) 1731 if (err)
1732 goto fail;
1733 }
1734
1735 list_for_each_entry(call, &ftrace_events, list) {
1736 struct event_filter *filter;
1737
1738 if (strcmp(call->class->system, system->name) != 0)
1266 continue; 1739 continue;
1267 1740
1268 /* really apply the filter */ 1741 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
1269 filter_disable_preds(call); 1742 if (!filter_item)
1270 err = replace_preds(call, filter, ps, filter_string, false); 1743 goto fail_mem;
1744
1745 list_add_tail(&filter_item->list, &filter_list);
1746
1747 filter_item->filter = __alloc_filter();
1748 if (!filter_item->filter)
1749 goto fail_mem;
1750 filter = filter_item->filter;
1751
1752 /* Can only fail on no memory */
1753 err = replace_filter_string(filter, filter_string);
1271 if (err) 1754 if (err)
1272 filter_disable_preds(call); 1755 goto fail_mem;
1273 else { 1756
1757 err = replace_preds(call, filter, ps, filter_string, false);
1758 if (err) {
1759 filter_disable(call);
1760 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1761 append_filter_err(ps, filter);
1762 } else
1274 call->flags |= TRACE_EVENT_FL_FILTERED; 1763 call->flags |= TRACE_EVENT_FL_FILTERED;
1275 replace_filter_string(filter, filter_string); 1764 /*
1276 } 1765 * Regardless of if this returned an error, we still
1766 * replace the filter for the call.
1767 */
1768 filter = call->filter;
1769 call->filter = filter_item->filter;
1770 filter_item->filter = filter;
1771
1277 fail = false; 1772 fail = false;
1278 } 1773 }
1279 1774
1280 if (fail) { 1775 if (fail)
1281 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1776 goto fail;
1282 return -EINVAL; 1777
1778 /*
1779 * The calls can still be using the old filters.
1780 * Do a synchronize_sched() to ensure all calls are
1781 * done with them before we free them.
1782 */
1783 synchronize_sched();
1784 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1785 __free_filter(filter_item->filter);
1786 list_del(&filter_item->list);
1787 kfree(filter_item);
1283 } 1788 }
1284 return 0; 1789 return 0;
1790 fail:
1791 /* No call succeeded */
1792 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1793 list_del(&filter_item->list);
1794 kfree(filter_item);
1795 }
1796 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1797 return -EINVAL;
1798 fail_mem:
1799 /* If any call succeeded, we still need to sync */
1800 if (!fail)
1801 synchronize_sched();
1802 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1803 __free_filter(filter_item->filter);
1804 list_del(&filter_item->list);
1805 kfree(filter_item);
1806 }
1807 return -ENOMEM;
1285} 1808}
1286 1809
1287int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1810int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1288{ 1811{
1289 int err;
1290 struct filter_parse_state *ps; 1812 struct filter_parse_state *ps;
1813 struct event_filter *filter;
1814 struct event_filter *tmp;
1815 int err = 0;
1291 1816
1292 mutex_lock(&event_mutex); 1817 mutex_lock(&event_mutex);
1293 1818
1294 err = init_preds(call);
1295 if (err)
1296 goto out_unlock;
1297
1298 if (!strcmp(strstrip(filter_string), "0")) { 1819 if (!strcmp(strstrip(filter_string), "0")) {
1299 filter_disable_preds(call); 1820 filter_disable(call);
1300 remove_filter_string(call->filter); 1821 filter = call->filter;
1822 if (!filter)
1823 goto out_unlock;
1824 call->filter = NULL;
1825 /* Make sure the filter is not being used */
1826 synchronize_sched();
1827 __free_filter(filter);
1301 goto out_unlock; 1828 goto out_unlock;
1302 } 1829 }
1303 1830
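
replace_system_preds() above (and apply_event_filter() in the following hunk) now share one lifetime rule: build a complete new event_filter off to the side, publish it by swapping the call->filter pointer, and free the old one only after synchronize_sched(), since filter_match_preds() walks the filter with nothing more than preemption disabled. Reduced to its core, as a sketch of the pattern rather than a drop-in kernel function:

static void example_swap_filter(struct ftrace_event_call *call,
				struct event_filter *new_filter)
{
	struct event_filter *old = call->filter;

	call->filter = new_filter;	/* readers may still be walking 'old' */
	synchronize_sched();		/* wait out all preempt-disabled readers */
	__free_filter(old);		/* now nothing can reference it */
}
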
@@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1306 if (!ps) 1833 if (!ps)
1307 goto out_unlock; 1834 goto out_unlock;
1308 1835
1309 filter_disable_preds(call); 1836 filter = __alloc_filter();
1310 replace_filter_string(call->filter, filter_string); 1837 if (!filter) {
1838 kfree(ps);
1839 goto out_unlock;
1840 }
1841
1842 replace_filter_string(filter, filter_string);
1311 1843
1312 parse_init(ps, filter_ops, filter_string); 1844 parse_init(ps, filter_ops, filter_string);
1313 err = filter_parse(ps); 1845 err = filter_parse(ps);
1314 if (err) { 1846 if (err) {
1315 append_filter_err(ps, call->filter); 1847 append_filter_err(ps, filter);
1316 goto out; 1848 goto out;
1317 } 1849 }
1318 1850
1319 err = replace_preds(call, call->filter, ps, filter_string, false); 1851 err = replace_preds(call, filter, ps, filter_string, false);
1320 if (err) 1852 if (err) {
1321 append_filter_err(ps, call->filter); 1853 filter_disable(call);
1322 else 1854 append_filter_err(ps, filter);
1855 } else
1323 call->flags |= TRACE_EVENT_FL_FILTERED; 1856 call->flags |= TRACE_EVENT_FL_FILTERED;
1324out: 1857out:
1858 /*
1859 * Always swap the call filter with the new filter
1860 * even if there was an error. If there was an error
1861 * in the filter, we disable the filter and show the error
1862 * string
1863 */
1864 tmp = call->filter;
1865 call->filter = filter;
1866 if (tmp) {
1867 /* Make sure the call is done with the filter */
1868 synchronize_sched();
1869 __free_filter(tmp);
1870 }
1325 filter_opstack_clear(ps); 1871 filter_opstack_clear(ps);
1326 postfix_clear(ps); 1872 postfix_clear(ps);
1327 kfree(ps); 1873 kfree(ps);
@@ -1334,18 +1880,21 @@ out_unlock:
1334int apply_subsystem_event_filter(struct event_subsystem *system, 1880int apply_subsystem_event_filter(struct event_subsystem *system,
1335 char *filter_string) 1881 char *filter_string)
1336{ 1882{
1337 int err;
1338 struct filter_parse_state *ps; 1883 struct filter_parse_state *ps;
1884 struct event_filter *filter;
1885 int err = 0;
1339 1886
1340 mutex_lock(&event_mutex); 1887 mutex_lock(&event_mutex);
1341 1888
1342 err = init_subsystem_preds(system);
1343 if (err)
1344 goto out_unlock;
1345
1346 if (!strcmp(strstrip(filter_string), "0")) { 1889 if (!strcmp(strstrip(filter_string), "0")) {
1347 filter_free_subsystem_preds(system); 1890 filter_free_subsystem_preds(system);
1348 remove_filter_string(system->filter); 1891 remove_filter_string(system->filter);
1892 filter = system->filter;
1893 system->filter = NULL;
1894 /* Ensure all filters are no longer used */
1895 synchronize_sched();
1896 filter_free_subsystem_filters(system);
1897 __free_filter(filter);
1349 goto out_unlock; 1898 goto out_unlock;
1350 } 1899 }
1351 1900
@@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1354 if (!ps) 1903 if (!ps)
1355 goto out_unlock; 1904 goto out_unlock;
1356 1905
1357 replace_filter_string(system->filter, filter_string); 1906 filter = __alloc_filter();
1907 if (!filter)
1908 goto out;
1909
1910 replace_filter_string(filter, filter_string);
1911 /*
1912 * No event actually uses the system filter
1913 * we can free it without synchronize_sched().
1914 */
1915 __free_filter(system->filter);
1916 system->filter = filter;
1358 1917
1359 parse_init(ps, filter_ops, filter_string); 1918 parse_init(ps, filter_ops, filter_string);
1360 err = filter_parse(ps); 1919 err = filter_parse(ps);
@@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event)
1384 struct event_filter *filter = event->filter; 1943 struct event_filter *filter = event->filter;
1385 1944
1386 event->filter = NULL; 1945 event->filter = NULL;
1387 __free_preds(filter); 1946 __free_filter(filter);
1388} 1947}
1389 1948
1390int ftrace_profile_set_filter(struct perf_event *event, int event_id, 1949int ftrace_profile_set_filter(struct perf_event *event, int event_id,
@@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1410 if (event->filter) 1969 if (event->filter)
1411 goto out_unlock; 1970 goto out_unlock;
1412 1971
1413 filter = __alloc_preds(); 1972 filter = __alloc_filter();
1414 if (IS_ERR(filter)) { 1973 if (!filter) {
1415 err = PTR_ERR(filter); 1974 err = PTR_ERR(filter);
1416 goto out_unlock; 1975 goto out_unlock;
1417 } 1976 }
@@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1419 err = -ENOMEM; 1978 err = -ENOMEM;
1420 ps = kzalloc(sizeof(*ps), GFP_KERNEL); 1979 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1421 if (!ps) 1980 if (!ps)
1422 goto free_preds; 1981 goto free_filter;
1423 1982
1424 parse_init(ps, filter_ops, filter_str); 1983 parse_init(ps, filter_ops, filter_str);
1425 err = filter_parse(ps); 1984 err = filter_parse(ps);
@@ -1435,9 +1994,9 @@ free_ps:
1435 postfix_clear(ps); 1994 postfix_clear(ps);
1436 kfree(ps); 1995 kfree(ps);
1437 1996
1438free_preds: 1997free_filter:
1439 if (err) 1998 if (err)
1440 __free_preds(filter); 1999 __free_filter(filter);
1441 2000
1442out_unlock: 2001out_unlock:
1443 mutex_unlock(&event_mutex); 2002 mutex_unlock(&event_mutex);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 76b05980225c..962cdb24ed81 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -905,7 +905,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
905 * 905 *
906 * returns 1 if 906 * returns 1 if
907 * - we are inside irq code 907 * - we are inside irq code
908 * - we just extered irq code 908 * - we just entered irq code
909 * 909 *
910 * returns 0 if 910 * returns 0 if
911 * - funcgraph-interrupts option is set 911 * - funcgraph-interrupts option is set
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 92b6e1e12d98..a4969b47afc1 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -80,7 +80,7 @@ static struct tracer_flags tracer_flags = {
80 * skip the latency if the sequence has changed - some other section 80 * skip the latency if the sequence has changed - some other section
81 * did a maximum and could disturb our measurement with serial console 81 * did a maximum and could disturb our measurement with serial console
82 * printouts, etc. Truly coinciding maximum latencies should be rare 82 * printouts, etc. Truly coinciding maximum latencies should be rare
83 * and what happens together happens separately as well, so this doesnt 83 * and what happens together happens separately as well, so this doesn't
84 * decrease the validity of the maximum found: 84 * decrease the validity of the maximum found:
85 */ 85 */
86static __cacheline_aligned_in_smp unsigned long max_sequence; 86static __cacheline_aligned_in_smp unsigned long max_sequence;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2dec9bcde8b4..35d55a386145 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -353,6 +353,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
353 kfree(data); 353 kfree(data);
354} 354}
355 355
356/* Bitfield fetch function */
357struct bitfield_fetch_param {
358 struct fetch_param orig;
359 unsigned char hi_shift;
360 unsigned char low_shift;
361};
362
363#define DEFINE_FETCH_bitfield(type) \
364static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
365 void *data, void *dest) \
366{ \
367 struct bitfield_fetch_param *bprm = data; \
368 type buf = 0; \
369 call_fetch(&bprm->orig, regs, &buf); \
370 if (buf) { \
371 buf <<= bprm->hi_shift; \
372 buf >>= bprm->low_shift; \
373 } \
374 *(type *)dest = buf; \
375}
376DEFINE_BASIC_FETCH_FUNCS(bitfield)
377#define fetch_bitfield_string NULL
378#define fetch_bitfield_string_size NULL
379
380static __kprobes void
381free_bitfield_fetch_param(struct bitfield_fetch_param *data)
382{
383 /*
384 * Don't check the bitfield itself, because this must be the
385 * last fetch function.
386 */
387 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
388 free_deref_fetch_param(data->orig.data);
389 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
390 free_symbol_cache(data->orig.data);
391 kfree(data);
392}
356/* Default (unsigned long) fetch type */ 393/* Default (unsigned long) fetch type */
357#define __DEFAULT_FETCH_TYPE(t) u##t 394#define __DEFAULT_FETCH_TYPE(t) u##t
358#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 395#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
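
The bitfield fetch function added above extracts a field with two shifts: shift left to drop the bits above the field, then shift right to drop the bits below it. A stand-alone user-space sketch (not part of the patch; the values are chosen for illustration), worked for a 4-bit field at bit offset 8 of a 32-bit word:

#include <stdio.h>
#include <stdint.h>

/* Same two-shift trick as DEFINE_FETCH_bitfield(): for a field of width bw
 * at bit offset bo (counted from the LSB) in a 32-bit container,
 * hi_shift = 32 - (bw + bo) and low_shift = hi_shift + bo. */
static uint32_t extract_bitfield32(uint32_t val, unsigned bw, unsigned bo)
{
	unsigned hi_shift  = 32 - (bw + bo);   /* drop the bits above the field */
	unsigned low_shift = hi_shift + bo;    /* then the bits below it        */

	return (uint32_t)(val << hi_shift) >> low_shift;
}

int main(void)
{
	uint32_t word = 0x00000A00;            /* bits 8..11 hold the value 0xA */

	/* corresponds to a "b4@8/32" argument: 4 bits at offset 8 in 32 bits */
	printf("field = %#x\n", (unsigned)extract_bitfield32(word, 4, 8));
	return 0;
}
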
@@ -367,6 +404,7 @@ enum {
367 FETCH_MTD_memory, 404 FETCH_MTD_memory,
368 FETCH_MTD_symbol, 405 FETCH_MTD_symbol,
369 FETCH_MTD_deref, 406 FETCH_MTD_deref,
407 FETCH_MTD_bitfield,
370 FETCH_MTD_END, 408 FETCH_MTD_END,
371}; 409};
372 410
@@ -387,6 +425,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \
387ASSIGN_FETCH_FUNC(memory, ftype), \ 425ASSIGN_FETCH_FUNC(memory, ftype), \
388ASSIGN_FETCH_FUNC(symbol, ftype), \ 426ASSIGN_FETCH_FUNC(symbol, ftype), \
389ASSIGN_FETCH_FUNC(deref, ftype), \ 427ASSIGN_FETCH_FUNC(deref, ftype), \
428ASSIGN_FETCH_FUNC(bitfield, ftype), \
390 } \ 429 } \
391 } 430 }
392 431
@@ -430,9 +469,33 @@ static const struct fetch_type *find_fetch_type(const char *type)
430 if (!type) 469 if (!type)
431 type = DEFAULT_FETCH_TYPE_STR; 470 type = DEFAULT_FETCH_TYPE_STR;
432 471
472 /* Special case: bitfield */
473 if (*type == 'b') {
474 unsigned long bs;
475 type = strchr(type, '/');
476 if (!type)
477 goto fail;
478 type++;
479 if (strict_strtoul(type, 0, &bs))
480 goto fail;
481 switch (bs) {
482 case 8:
483 return find_fetch_type("u8");
484 case 16:
485 return find_fetch_type("u16");
486 case 32:
487 return find_fetch_type("u32");
488 case 64:
489 return find_fetch_type("u64");
490 default:
491 goto fail;
492 }
493 }
494
433 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) 495 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
434 if (strcmp(type, fetch_type_table[i].name) == 0) 496 if (strcmp(type, fetch_type_table[i].name) == 0)
435 return &fetch_type_table[i]; 497 return &fetch_type_table[i];
498fail:
436 return NULL; 499 return NULL;
437} 500}
438 501
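
In find_fetch_type(), the new special case only looks at the container size after the '/': that alone selects which unsigned fetch type backs the bitfield, while the width and offset are consumed later by __parse_bitfield_probe_arg(). A user-space sketch of the same resolution (not from the patch; the type string is an assumed example):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Map a bitfield type string such as "b4@8/32" to the name of the
 * underlying unsigned fetch type, mirroring the switch added above. */
static const char *bitfield_container_type(const char *type)
{
	const char *slash;
	unsigned long bs;

	if (*type != 'b')
		return NULL;
	slash = strchr(type, '/');
	if (!slash)
		return NULL;
	bs = strtoul(slash + 1, NULL, 0);
	switch (bs) {
	case 8:  return "u8";
	case 16: return "u16";
	case 32: return "u32";
	case 64: return "u64";
	default: return NULL;
	}
}

int main(void)
{
	printf("%s\n", bitfield_container_type("b4@8/32"));   /* prints u32 */
	return 0;
}
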
@@ -586,7 +649,9 @@ error:
586 649
587static void free_probe_arg(struct probe_arg *arg) 650static void free_probe_arg(struct probe_arg *arg)
588{ 651{
589 if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) 652 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
653 free_bitfield_fetch_param(arg->fetch.data);
654 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
590 free_deref_fetch_param(arg->fetch.data); 655 free_deref_fetch_param(arg->fetch.data);
591 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) 656 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
592 free_symbol_cache(arg->fetch.data); 657 free_symbol_cache(arg->fetch.data);
@@ -767,16 +832,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
767 } 832 }
768 break; 833 break;
769 case '+': /* deref memory */ 834 case '+': /* deref memory */
835 arg++; /* Skip '+', because strict_strtol() rejects it. */
770 case '-': 836 case '-':
771 tmp = strchr(arg, '('); 837 tmp = strchr(arg, '(');
772 if (!tmp) 838 if (!tmp)
773 break; 839 break;
774 *tmp = '\0'; 840 *tmp = '\0';
775 ret = strict_strtol(arg + 1, 0, &offset); 841 ret = strict_strtol(arg, 0, &offset);
776 if (ret) 842 if (ret)
777 break; 843 break;
778 if (arg[0] == '-')
779 offset = -offset;
780 arg = tmp + 1; 844 arg = tmp + 1;
781 tmp = strrchr(arg, ')'); 845 tmp = strrchr(arg, ')');
782 if (tmp) { 846 if (tmp) {
@@ -807,6 +871,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
807 return ret; 871 return ret;
808} 872}
809 873
874#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
875
876/* Bitfield type needs to be parsed into a fetch function */
877static int __parse_bitfield_probe_arg(const char *bf,
878 const struct fetch_type *t,
879 struct fetch_param *f)
880{
881 struct bitfield_fetch_param *bprm;
882 unsigned long bw, bo;
883 char *tail;
884
885 if (*bf != 'b')
886 return 0;
887
888 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
889 if (!bprm)
890 return -ENOMEM;
891 bprm->orig = *f;
892 f->fn = t->fetch[FETCH_MTD_bitfield];
893 f->data = (void *)bprm;
894
895 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
896 if (bw == 0 || *tail != '@')
897 return -EINVAL;
898
899 bf = tail + 1;
900 bo = simple_strtoul(bf, &tail, 0);
901 if (tail == bf || *tail != '/')
902 return -EINVAL;
903
904 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
905 bprm->low_shift = bprm->hi_shift + bo;
906 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
907}
908
810/* String length checking wrapper */ 909/* String length checking wrapper */
811static int parse_probe_arg(char *arg, struct trace_probe *tp, 910static int parse_probe_arg(char *arg, struct trace_probe *tp,
812 struct probe_arg *parg, int is_return) 911 struct probe_arg *parg, int is_return)
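
For reference, the arithmetic in __parse_bitfield_probe_arg() worked through in plain user-space C for an assumed spec "b4@8/32" (bit width 4 at bit offset 8 in a 32-bit container); strtoul() stands in for the kernel's simple_strtoul():

#include <stdio.h>
#include <stdlib.h>

/* Parse "b<bit-width>@<bit-offset>/<container-size>" and print the two
 * shift amounts the kernel would store, mirroring the checks above. */
int main(void)
{
	const char *spec = "b4@8/32";          /* assumed example spec */
	char *tail;
	unsigned long bw, bo, size;

	bw = strtoul(spec + 1, &tail, 0);      /* bit width  -> 4  */
	if (bw == 0 || *tail != '@')
		return 1;
	bo = strtoul(tail + 1, &tail, 0);      /* bit offset -> 8  */
	if (*tail != '/')
		return 1;
	size = strtoul(tail + 1, NULL, 0);     /* container  -> 32 */
	if (size < bw + bo)
		return 1;                      /* the field must fit */

	printf("hi_shift=%lu low_shift=%lu\n",
	       size - (bw + bo),               /* 32 - 12 = 20 */
	       size - (bw + bo) + bo);         /* 20 +  8 = 28 */
	return 0;
}
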
@@ -836,6 +935,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
836 parg->offset = tp->size; 935 parg->offset = tp->size;
837 tp->size += parg->type->size; 936 tp->size += parg->type->size;
838 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); 937 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
938 if (ret >= 0 && t != NULL)
939 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
839 if (ret >= 0) { 940 if (ret >= 0) {
840 parg->fetch_size.fn = get_fetch_size_function(parg->type, 941 parg->fetch_size.fn = get_fetch_size_function(parg->type,
841 parg->fetch.fn); 942 parg->fetch.fn);
@@ -1130,7 +1231,7 @@ static int command_trace_probe(const char *buf)
1130 return ret; 1231 return ret;
1131} 1232}
1132 1233
1133#define WRITE_BUFSIZE 128 1234#define WRITE_BUFSIZE 4096
1134 1235
1135static ssize_t probes_write(struct file *file, const char __user *buffer, 1236static ssize_t probes_write(struct file *file, const char __user *buffer,
1136 size_t count, loff_t *ppos) 1237 size_t count, loff_t *ppos)
@@ -1738,7 +1839,7 @@ static void unregister_probe_event(struct trace_probe *tp)
1738 kfree(tp->call.print_fmt); 1839 kfree(tp->call.print_fmt);
1739} 1840}
1740 1841
1741/* Make a debugfs interface for controling probe points */ 1842/* Make a debugfs interface for controlling probe points */
1742static __init int init_kprobe_trace(void) 1843static __init int init_kprobe_trace(void)
1743{ 1844{
1744 struct dentry *d_tracer; 1845 struct dentry *d_tracer;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 02272baa2206..456be9063c2d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -529,24 +529,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
529 * @entry: The trace entry field from the ring buffer 529 * @entry: The trace entry field from the ring buffer
530 * 530 *
531 * Prints the generic fields of irqs off, in hard or softirq, preempt 531 * Prints the generic fields of irqs off, in hard or softirq, preempt
532 * count and lock depth. 532 * count.
533 */ 533 */
534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) 534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
535{ 535{
536 int hardirq, softirq; 536 char hardsoft_irq;
537 char need_resched;
538 char irqs_off;
539 int hardirq;
540 int softirq;
537 int ret; 541 int ret;
538 542
539 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 543 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
540 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 544 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
541 545
546 irqs_off =
547 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
548 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
549 '.';
550 need_resched =
551 (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
552 hardsoft_irq =
553 (hardirq && softirq) ? 'H' :
554 hardirq ? 'h' :
555 softirq ? 's' :
556 '.';
557
542 if (!trace_seq_printf(s, "%c%c%c", 558 if (!trace_seq_printf(s, "%c%c%c",
543 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 559 irqs_off, need_resched, hardsoft_irq))
544 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
545 'X' : '.',
546 (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
547 'N' : '.',
548 (hardirq && softirq) ? 'H' :
549 hardirq ? 'h' : softirq ? 's' : '.'))
550 return 0; 560 return 0;
551 561
552 if (entry->preempt_count) 562 if (entry->preempt_count)
@@ -554,13 +564,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
554 else 564 else
555 ret = trace_seq_putc(s, '.'); 565 ret = trace_seq_putc(s, '.');
556 566
557 if (!ret) 567 return ret;
558 return 0;
559
560 if (entry->lock_depth < 0)
561 return trace_seq_putc(s, '.');
562
563 return trace_seq_printf(s, "%d", entry->lock_depth);
564} 568}
565 569
566static int 570static int
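
Not from the patch: a compact user-space rendering of the three latency columns that trace_print_lat_fmt() now builds in local variables. The flag bit values below are placeholders chosen for the demo, not the kernel's definitions:

#include <stdio.h>

/* Placeholder flag bits for the demo only. */
#define F_IRQS_OFF        0x01
#define F_IRQS_NOSUPPORT  0x02
#define F_NEED_RESCHED    0x04
#define F_HARDIRQ         0x08
#define F_SOFTIRQ         0x10

static void print_lat_fmt(unsigned flags)
{
	int hardirq = flags & F_HARDIRQ;
	int softirq = flags & F_SOFTIRQ;

	char irqs_off     = (flags & F_IRQS_OFF)       ? 'd' :
			    (flags & F_IRQS_NOSUPPORT) ? 'X' : '.';
	char need_resched = (flags & F_NEED_RESCHED)   ? 'N' : '.';
	char hardsoft_irq = (hardirq && softirq) ? 'H' :
			    hardirq ? 'h' : softirq ? 's' : '.';

	printf("%c%c%c\n", irqs_off, need_resched, hardsoft_irq);
}

int main(void)
{
	print_lat_fmt(F_IRQS_OFF | F_NEED_RESCHED | F_HARDIRQ);  /* prints dNh */
	print_lat_fmt(F_SOFTIRQ);                                /* prints ..s */
	return 0;
}
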
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8f758d070c43..7e62c0a18456 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
247 ctx_trace = tr; 247 ctx_trace = tr;
248} 248}
249 249
250static void stop_sched_trace(struct trace_array *tr)
251{
252 tracing_stop_sched_switch_record();
253}
254
255static int sched_switch_trace_init(struct trace_array *tr)
256{
257 ctx_trace = tr;
258 tracing_reset_online_cpus(tr);
259 tracing_start_sched_switch_record();
260 return 0;
261}
262
263static void sched_switch_trace_reset(struct trace_array *tr)
264{
265 if (sched_ref)
266 stop_sched_trace(tr);
267}
268
269static void sched_switch_trace_start(struct trace_array *tr)
270{
271 sched_stopped = 0;
272}
273
274static void sched_switch_trace_stop(struct trace_array *tr)
275{
276 sched_stopped = 1;
277}
278
279static struct tracer sched_switch_trace __read_mostly =
280{
281 .name = "sched_switch",
282 .init = sched_switch_trace_init,
283 .reset = sched_switch_trace_reset,
284 .start = sched_switch_trace_start,
285 .stop = sched_switch_trace_stop,
286 .wait_pipe = poll_wait_pipe,
287#ifdef CONFIG_FTRACE_SELFTEST
288 .selftest = trace_selftest_startup_sched_switch,
289#endif
290};
291
292__init static int init_sched_switch_trace(void)
293{
294 return register_tracer(&sched_switch_trace);
295}
296device_initcall(init_sched_switch_trace);
297
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5c9fe08d2093..ee7b5a0bb9f8 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[];
60 60
61static struct syscall_metadata **syscalls_metadata; 61static struct syscall_metadata **syscalls_metadata;
62 62
63#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
64static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
65{
66 /*
67 * Only compare after the "sys" prefix. Archs that use
68 * syscall wrappers may have syscalls symbols aliases prefixed
69 * with "SyS" instead of "sys", leading to an unwanted
70 * mismatch.
71 */
72 return !strcmp(sym + 3, name + 3);
73}
74#endif
75
63static __init struct syscall_metadata * 76static __init struct syscall_metadata *
64find_syscall_meta(unsigned long syscall) 77find_syscall_meta(unsigned long syscall)
65{ 78{
@@ -72,14 +85,11 @@ find_syscall_meta(unsigned long syscall)
72 stop = __stop_syscalls_metadata; 85 stop = __stop_syscalls_metadata;
73 kallsyms_lookup(syscall, NULL, NULL, NULL, str); 86 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
74 87
88 if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
89 return NULL;
90
75 for ( ; start < stop; start++) { 91 for ( ; start < stop; start++) {
76 /* 92 if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
77 * Only compare after the "sys" prefix. Archs that use
78 * syscall wrappers may have syscalls symbols aliases prefixed
79 * with "SyS" instead of "sys", leading to an unwanted
80 * mismatch.
81 */
82 if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
83 return *start; 93 return *start;
84 } 94 }
85 return NULL; 95 return NULL;
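
The default arch_syscall_match_sym_name() added above simply ignores the first three characters, so a wrapped symbol such as "SyS_read" still matches the metadata name "sys_read". A user-space illustration (the symbol names are examples only):

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

/* Default match: skip the "sys"/"SyS" prefix, compare the rest. */
static bool syscall_match_sym_name(const char *sym, const char *name)
{
	return strcmp(sym + 3, name + 3) == 0;
}

int main(void)
{
	printf("%d\n", syscall_match_sym_name("SyS_read", "sys_read"));   /* 1 */
	printf("%d\n", syscall_match_sym_name("sys_read", "sys_write"));  /* 0 */
	return 0;
}
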
@@ -359,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
359 int num; 369 int num;
360 370
361 num = ((struct syscall_metadata *)call->data)->syscall_nr; 371 num = ((struct syscall_metadata *)call->data)->syscall_nr;
362 if (num < 0 || num >= NR_syscalls) 372 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
363 return -ENOSYS; 373 return -ENOSYS;
364 mutex_lock(&syscall_trace_lock); 374 mutex_lock(&syscall_trace_lock);
365 if (!sys_refcount_enter) 375 if (!sys_refcount_enter)
@@ -377,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
377 int num; 387 int num;
378 388
379 num = ((struct syscall_metadata *)call->data)->syscall_nr; 389 num = ((struct syscall_metadata *)call->data)->syscall_nr;
380 if (num < 0 || num >= NR_syscalls) 390 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
381 return; 391 return;
382 mutex_lock(&syscall_trace_lock); 392 mutex_lock(&syscall_trace_lock);
383 sys_refcount_enter--; 393 sys_refcount_enter--;
@@ -393,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
393 int num; 403 int num;
394 404
395 num = ((struct syscall_metadata *)call->data)->syscall_nr; 405 num = ((struct syscall_metadata *)call->data)->syscall_nr;
396 if (num < 0 || num >= NR_syscalls) 406 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
397 return -ENOSYS; 407 return -ENOSYS;
398 mutex_lock(&syscall_trace_lock); 408 mutex_lock(&syscall_trace_lock);
399 if (!sys_refcount_exit) 409 if (!sys_refcount_exit)
@@ -411,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
411 int num; 421 int num;
412 422
413 num = ((struct syscall_metadata *)call->data)->syscall_nr; 423 num = ((struct syscall_metadata *)call->data)->syscall_nr;
414 if (num < 0 || num >= NR_syscalls) 424 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
415 return; 425 return;
416 mutex_lock(&syscall_trace_lock); 426 mutex_lock(&syscall_trace_lock);
417 sys_refcount_exit--; 427 sys_refcount_exit--;
@@ -424,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
424int init_syscall_trace(struct ftrace_event_call *call) 434int init_syscall_trace(struct ftrace_event_call *call)
425{ 435{
426 int id; 436 int id;
437 int num;
438
439 num = ((struct syscall_metadata *)call->data)->syscall_nr;
440 if (num < 0 || num >= NR_syscalls) {
441 pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
442 ((struct syscall_metadata *)call->data)->name);
443 return -ENOSYS;
444 }
427 445
428 if (set_syscall_print_fmt(call) < 0) 446 if (set_syscall_print_fmt(call) < 0)
429 return -ENOMEM; 447 return -ENOMEM;
@@ -438,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call)
438 return id; 456 return id;
439} 457}
440 458
441unsigned long __init arch_syscall_addr(int nr) 459unsigned long __init __weak arch_syscall_addr(int nr)
442{ 460{
443 return (unsigned long)sys_call_table[nr]; 461 return (unsigned long)sys_call_table[nr];
444} 462}
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 419209893d87..51c6e89e8619 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
189 struct group_info *group_info; 189 struct group_info *group_info;
190 int retval; 190 int retval;
191 191
192 if (!capable(CAP_SETGID)) 192 if (!nsown_capable(CAP_SETGID))
193 return -EPERM; 193 return -EPERM;
194 if ((unsigned)gidsetsize > NGROUPS_MAX) 194 if ((unsigned)gidsetsize > NGROUPS_MAX)
195 return -EINVAL; 195 return -EINVAL;
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index eb27fd3430a2..92cb706c7fc8 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -20,7 +20,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
20 20
21/* 21/*
22 * Removes a registered user return notifier. Must be called from atomic 22 * Removes a registered user return notifier. Must be called from atomic
23 * context, and from the same cpu registration occured in. 23 * context, and from the same cpu registration occurred in.
24 */ 24 */
25void user_return_notifier_unregister(struct user_return_notifier *urn) 25void user_return_notifier_unregister(struct user_return_notifier *urn)
26{ 26{
diff --git a/kernel/user.c b/kernel/user.c
index 5c598ca781df..9e03e9c1df8d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,9 +17,13 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19 19
20/*
21 * userns count is 1 for root user, 1 for init_uts_ns,
22 * and 1 for... ?
23 */
20struct user_namespace init_user_ns = { 24struct user_namespace init_user_ns = {
21 .kref = { 25 .kref = {
22 .refcount = ATOMIC_INIT(2), 26 .refcount = ATOMIC_INIT(3),
23 }, 27 },
24 .creator = &root_user, 28 .creator = &root_user,
25}; 29};
@@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep;
47 */ 51 */
48static DEFINE_SPINLOCK(uidhash_lock); 52static DEFINE_SPINLOCK(uidhash_lock);
49 53
50/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ 54/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
51struct user_struct root_user = { 55struct user_struct root_user = {
52 .__count = ATOMIC_INIT(2), 56 .__count = ATOMIC_INIT(2),
53 .processes = ATOMIC_INIT(1), 57 .processes = ATOMIC_INIT(1),
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 8a82b4b8ea52..44646179eaba 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/user_namespace.h>
17 18
18static struct uts_namespace *create_uts_ns(void) 19static struct uts_namespace *create_uts_ns(void)
19{ 20{
@@ -30,7 +31,8 @@ static struct uts_namespace *create_uts_ns(void)
30 * @old_ns: namespace to clone 31 * @old_ns: namespace to clone
31 * Return NULL on error (failure to kmalloc), new ns otherwise 32 * Return NULL on error (failure to kmalloc), new ns otherwise
32 */ 33 */
33static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) 34static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
35 struct uts_namespace *old_ns)
34{ 36{
35 struct uts_namespace *ns; 37 struct uts_namespace *ns;
36 38
@@ -40,6 +42,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
40 42
41 down_read(&uts_sem); 43 down_read(&uts_sem);
42 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 44 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
45 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
43 up_read(&uts_sem); 46 up_read(&uts_sem);
44 return ns; 47 return ns;
45} 48}
@@ -50,8 +53,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
50 * utsname of this process won't be seen by parent, and vice 53 * utsname of this process won't be seen by parent, and vice
51 * versa. 54 * versa.
52 */ 55 */
53struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) 56struct uts_namespace *copy_utsname(unsigned long flags,
57 struct task_struct *tsk)
54{ 58{
59 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
55 struct uts_namespace *new_ns; 60 struct uts_namespace *new_ns;
56 61
57 BUG_ON(!old_ns); 62 BUG_ON(!old_ns);
@@ -60,7 +65,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol
60 if (!(flags & CLONE_NEWUTS)) 65 if (!(flags & CLONE_NEWUTS))
61 return old_ns; 66 return old_ns;
62 67
63 new_ns = clone_uts_ns(old_ns); 68 new_ns = clone_uts_ns(tsk, old_ns);
64 69
65 put_uts_ns(old_ns); 70 put_uts_ns(old_ns);
66 return new_ns; 71 return new_ns;
@@ -71,5 +76,6 @@ void free_uts_ns(struct kref *kref)
71 struct uts_namespace *ns; 76 struct uts_namespace *ns;
72 77
73 ns = container_of(kref, struct uts_namespace, kref); 78 ns = container_of(kref, struct uts_namespace, kref);
79 put_user_ns(ns->user_ns);
74 kfree(ns); 80 kfree(ns);
75} 81}
diff --git a/kernel/wait.c b/kernel/wait.c
index b0310eb6cc1e..f45ea8d2a1ce 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -142,7 +142,7 @@ EXPORT_SYMBOL(finish_wait);
142 * woken up through the queue. 142 * woken up through the queue.
143 * 143 *
144 * This prevents waiter starvation where an exclusive waiter 144 * This prevents waiter starvation where an exclusive waiter
145 * aborts and is woken up concurrently and noone wakes up 145 * aborts and is woken up concurrently and no one wakes up
146 * the next waiter. 146 * the next waiter.
147 */ 147 */
148void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, 148void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 18bb15776c57..14733d4d156b 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
48 * Should we panic when a soft-lockup or hard-lockup occurs: 48 * Should we panic when a soft-lockup or hard-lockup occurs:
49 */ 49 */
50#ifdef CONFIG_HARDLOCKUP_DETECTOR 50#ifdef CONFIG_HARDLOCKUP_DETECTOR
51static int hardlockup_panic; 51static int hardlockup_panic =
52 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
52 53
53static int __init hardlockup_panic_setup(char *str) 54static int __init hardlockup_panic_setup(char *str)
54{ 55{
55 if (!strncmp(str, "panic", 5)) 56 if (!strncmp(str, "panic", 5))
56 hardlockup_panic = 1; 57 hardlockup_panic = 1;
58 else if (!strncmp(str, "nopanic", 7))
59 hardlockup_panic = 0;
57 else if (!strncmp(str, "0", 1)) 60 else if (!strncmp(str, "0", 1))
58 watchdog_enabled = 0; 61 watchdog_enabled = 0;
59 return 1; 62 return 1;
@@ -415,19 +418,25 @@ static int watchdog_prepare_cpu(int cpu)
415static int watchdog_enable(int cpu) 418static int watchdog_enable(int cpu)
416{ 419{
417 struct task_struct *p = per_cpu(softlockup_watchdog, cpu); 420 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
418 int err; 421 int err = 0;
419 422
420 /* enable the perf event */ 423 /* enable the perf event */
421 err = watchdog_nmi_enable(cpu); 424 err = watchdog_nmi_enable(cpu);
422 if (err) 425
423 return err; 426 /* Regardless of err above, fall through and start softlockup */
424 427
425 /* create the watchdog thread */ 428 /* create the watchdog thread */
426 if (!p) { 429 if (!p) {
427 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 430 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
428 if (IS_ERR(p)) { 431 if (IS_ERR(p)) {
429 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 432 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
430 return PTR_ERR(p); 433 if (!err) {
434 /* if hardlockup hasn't already set this */
435 err = PTR_ERR(p);
436 /* and disable the perf event */
437 watchdog_nmi_disable(cpu);
438 }
439 goto out;
431 } 440 }
432 kthread_bind(p, cpu); 441 kthread_bind(p, cpu);
433 per_cpu(watchdog_touch_ts, cpu) = 0; 442 per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -435,7 +444,8 @@ static int watchdog_enable(int cpu)
435 wake_up_process(p); 444 wake_up_process(p);
436 } 445 }
437 446
438 return 0; 447out:
448 return err;
439} 449}
440 450
441static void watchdog_disable(int cpu) 451static void watchdog_disable(int cpu)
@@ -547,7 +557,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
547 break; 557 break;
548#endif /* CONFIG_HOTPLUG_CPU */ 558#endif /* CONFIG_HOTPLUG_CPU */
549 } 559 }
550 return notifier_from_errno(err); 560
561 /*
562 * hardlockup and softlockup are not important enough
563 * to block cpu bring up. Just always succeed and
564 * rely on printk output to flag problems.
565 */
566 return NOTIFY_OK;
551} 567}
552 568
553static struct notifier_block __cpuinitdata cpu_nfb = { 569static struct notifier_block __cpuinitdata cpu_nfb = {
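
For reference, the extended boot-parameter handling above in stand-alone form: "panic" and the new "nopanic" flip the hard-lockup panic setting, while "0" still disables the watchdog. The harness and the initial values are illustrative, not the kernel's:

#include <stdio.h>
#include <string.h>

static int hardlockup_panic = 1;   /* stands in for the Kconfig default */
static int watchdog_enabled = 1;

static void parse_hardlockup_opt(const char *str)
{
	if (!strncmp(str, "panic", 5))
		hardlockup_panic = 1;
	else if (!strncmp(str, "nopanic", 7))
		hardlockup_panic = 0;
	else if (!strncmp(str, "0", 1))
		watchdog_enabled = 0;
}

int main(void)
{
	parse_hardlockup_opt("nopanic");
	printf("panic=%d enabled=%d\n", hardlockup_panic, watchdog_enabled); /* 0 1 */
	return 0;
}
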
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee6578b578ad..e3378e8d3a5c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -251,10 +251,12 @@ struct workqueue_struct *system_wq __read_mostly;
251struct workqueue_struct *system_long_wq __read_mostly; 251struct workqueue_struct *system_long_wq __read_mostly;
252struct workqueue_struct *system_nrt_wq __read_mostly; 252struct workqueue_struct *system_nrt_wq __read_mostly;
253struct workqueue_struct *system_unbound_wq __read_mostly; 253struct workqueue_struct *system_unbound_wq __read_mostly;
254struct workqueue_struct *system_freezable_wq __read_mostly;
254EXPORT_SYMBOL_GPL(system_wq); 255EXPORT_SYMBOL_GPL(system_wq);
255EXPORT_SYMBOL_GPL(system_long_wq); 256EXPORT_SYMBOL_GPL(system_long_wq);
256EXPORT_SYMBOL_GPL(system_nrt_wq); 257EXPORT_SYMBOL_GPL(system_nrt_wq);
257EXPORT_SYMBOL_GPL(system_unbound_wq); 258EXPORT_SYMBOL_GPL(system_unbound_wq);
259EXPORT_SYMBOL_GPL(system_freezable_wq);
258 260
259#define CREATE_TRACE_POINTS 261#define CREATE_TRACE_POINTS
260#include <trace/events/workqueue.h> 262#include <trace/events/workqueue.h>
@@ -316,6 +318,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
316 318
317static struct debug_obj_descr work_debug_descr; 319static struct debug_obj_descr work_debug_descr;
318 320
321static void *work_debug_hint(void *addr)
322{
323 return ((struct work_struct *) addr)->func;
324}
325
319/* 326/*
320 * fixup_init is called when: 327 * fixup_init is called when:
321 * - an active object is initialized 328 * - an active object is initialized
@@ -387,6 +394,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
387 394
388static struct debug_obj_descr work_debug_descr = { 395static struct debug_obj_descr work_debug_descr = {
389 .name = "work_struct", 396 .name = "work_struct",
397 .debug_hint = work_debug_hint,
390 .fixup_init = work_fixup_init, 398 .fixup_init = work_fixup_init,
391 .fixup_activate = work_fixup_activate, 399 .fixup_activate = work_fixup_activate,
392 .fixup_free = work_fixup_free, 400 .fixup_free = work_fixup_free,
@@ -1283,8 +1291,14 @@ __acquires(&gcwq->lock)
1283 return true; 1291 return true;
1284 spin_unlock_irq(&gcwq->lock); 1292 spin_unlock_irq(&gcwq->lock);
1285 1293
1286 /* CPU has come up inbetween, retry migration */ 1294 /*
1295 * We've raced with CPU hot[un]plug. Give it a breather
1296 * and retry migration. cond_resched() is required here;
1297 * otherwise, we might deadlock against cpu_stop trying to
1298 * bring down the CPU on non-preemptive kernel.
1299 */
1287 cpu_relax(); 1300 cpu_relax();
1301 cond_resched();
1288 } 1302 }
1289} 1303}
1290 1304
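
The comment added above describes a general retry idiom; a sketch of its shape follows, with gcwq_claimed() as a hypothetical predicate standing in for the real rebind check. This is a fragment for illustration, not a drop-in:

#include <linux/sched.h>            /* cond_resched() */
#include <asm/processor.h>          /* cpu_relax()    */

static bool gcwq_claimed(void);     /* hypothetical predicate, not in the patch */

static void wait_until_claimed(void)
{
	while (!gcwq_claimed()) {
		cpu_relax();        /* polite busy-wait hint to the CPU        */
		cond_resched();     /* lets cpu_stop run on a non-preemptive
				     * kernel, avoiding the deadlock above     */
	}
}
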
@@ -1358,8 +1372,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1358 worker->id = id; 1372 worker->id = id;
1359 1373
1360 if (!on_unbound_cpu) 1374 if (!on_unbound_cpu)
1361 worker->task = kthread_create(worker_thread, worker, 1375 worker->task = kthread_create_on_node(worker_thread,
1362 "kworker/%u:%d", gcwq->cpu, id); 1376 worker,
1377 cpu_to_node(gcwq->cpu),
1378 "kworker/%u:%d", gcwq->cpu, id);
1363 else 1379 else
1364 worker->task = kthread_create(worker_thread, worker, 1380 worker->task = kthread_create(worker_thread, worker,
1365 "kworker/u:%d", id); 1381 "kworker/u:%d", id);
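
create_worker() now uses kthread_create_on_node() so the worker's task_struct and stack are allocated on the memory node of the CPU it will serve. A hypothetical driver fragment (not from the patch) using the same call pattern:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>
#include <linux/topology.h>        /* cpu_to_node() */

/* Create a per-CPU thread whose stack lives on that CPU's node. */
static int start_percpu_thread(int (*fn)(void *), void *data, int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create_on_node(fn, data, cpu_to_node(cpu),
				     "mydrv/%d", cpu);
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	kthread_bind(tsk, cpu);    /* pin it to the CPU it was created for */
	wake_up_process(tsk);
	return 0;
}
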
@@ -3775,8 +3791,10 @@ static int __init init_workqueues(void)
3775 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); 3791 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3776 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, 3792 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3777 WQ_UNBOUND_MAX_ACTIVE); 3793 WQ_UNBOUND_MAX_ACTIVE);
3794 system_freezable_wq = alloc_workqueue("events_freezable",
3795 WQ_FREEZABLE, 0);
3778 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || 3796 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3779 !system_unbound_wq); 3797 !system_unbound_wq || !system_freezable_wq);
3780 return 0; 3798 return 0;
3781} 3799}
3782early_initcall(init_workqueues); 3800early_initcall(init_workqueues);
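
Finally, the new system_freezable_wq: work queued on it is held back while tasks are frozen for suspend/hibernate and runs after thaw. A hypothetical minimal module sketch (not part of the patch) showing the intended usage:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static void freezable_demo_fn(struct work_struct *work)
{
	pr_info("freezable demo work ran\n");
}
static DECLARE_WORK(freezable_demo_work, freezable_demo_fn);

static int __init freezable_demo_init(void)
{
	queue_work(system_freezable_wq, &freezable_demo_work);
	return 0;
}

static void __exit freezable_demo_exit(void)
{
	cancel_work_sync(&freezable_demo_work);
}

module_init(freezable_demo_init);
module_exit(freezable_demo_exit);
MODULE_LICENSE("GPL");
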