path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit.c | 8
-rw-r--r--  kernel/audit_watch.c | 85
-rw-r--r--  kernel/auditfilter.c | 10
-rw-r--r--  kernel/bounds.c | 2
-rw-r--r--  kernel/capability.c | 96
-rw-r--r--  kernel/cgroup.c | 68
-rw-r--r--  kernel/compat.c | 136
-rw-r--r--  kernel/cpu.c | 11
-rw-r--r--  kernel/cpuset.c | 80
-rw-r--r--  kernel/crash_dump.c | 34
-rw-r--r--  kernel/cred.c | 8
-rw-r--r--  kernel/debug/gdbstub.c | 30
-rw-r--r--  kernel/exit.c | 1
-rw-r--r--  kernel/fork.c | 155
-rw-r--r--  kernel/futex.c | 158
-rw-r--r--  kernel/futex_compat.c | 11
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/gcov/Makefile | 2
-rw-r--r--  kernel/groups.c | 2
-rw-r--r--  kernel/hrtimer.c | 90
-rw-r--r--  kernel/irq/Kconfig | 43
-rw-r--r--  kernel/irq/autoprobe.c | 54
-rw-r--r--  kernel/irq/chip.c | 483
-rw-r--r--  kernel/irq/compat.h | 72
-rw-r--r--  kernel/irq/debug.h | 40
-rw-r--r--  kernel/irq/handle.c | 144
-rw-r--r--  kernel/irq/internals.h | 167
-rw-r--r--  kernel/irq/irqdesc.c | 76
-rw-r--r--  kernel/irq/manage.c | 604
-rw-r--r--  kernel/irq/migration.c | 38
-rw-r--r--  kernel/irq/pm.c | 30
-rw-r--r--  kernel/irq/proc.c | 84
-rw-r--r--  kernel/irq/resend.c | 17
-rw-r--r--  kernel/irq/settings.h | 138
-rw-r--r--  kernel/irq/spurious.c | 163
-rw-r--r--  kernel/kallsyms.c | 58
-rw-r--r--  kernel/kthread.c | 31
-rw-r--r--  kernel/lockdep_proc.c | 9
-rw-r--r--  kernel/module.c | 4
-rw-r--r--  kernel/nsproxy.c | 4
-rw-r--r--  kernel/panic.c | 10
-rw-r--r--  kernel/perf_event.c | 1034
-rw-r--r--  kernel/pid.c | 2
-rw-r--r--  kernel/pid_namespace.c | 11
-rw-r--r--  kernel/pm_qos_params.c | 24
-rw-r--r--  kernel/posix-cpu-timers.c | 110
-rw-r--r--  kernel/posix-timers.c | 342
-rw-r--r--  kernel/power/Kconfig | 237
-rw-r--r--  kernel/power/Makefile | 3
-rw-r--r--  kernel/power/block_io.c | 2
-rw-r--r--  kernel/power/hibernate.c | 9
-rw-r--r--  kernel/power/main.c | 3
-rw-r--r--  kernel/power/snapshot.c | 8
-rw-r--r--  kernel/power/suspend.c | 4
-rw-r--r--  kernel/printk.c | 174
-rw-r--r--  kernel/ptrace.c | 27
-rw-r--r--  kernel/rcupdate.c | 10
-rw-r--r--  kernel/rcutiny_plugin.h | 2
-rw-r--r--  kernel/rcutorture.c | 1
-rw-r--r--  kernel/res_counter.c | 14
-rw-r--r--  kernel/rtmutex-debug.c | 1
-rw-r--r--  kernel/rtmutex-tester.c | 40
-rw-r--r--  kernel/rtmutex.c | 318
-rw-r--r--  kernel/rtmutex_common.h | 16
-rw-r--r--  kernel/sched.c | 376
-rw-r--r--  kernel/sched_autogroup.c | 15
-rw-r--r--  kernel/sched_autogroup.h | 5
-rw-r--r--  kernel/sched_debug.c | 2
-rw-r--r--  kernel/sched_fair.c | 397
-rw-r--r--  kernel/sched_idletask.c | 28
-rw-r--r--  kernel/sched_rt.c | 19
-rw-r--r--  kernel/sched_stoptask.c | 9
-rw-r--r--  kernel/signal.c | 46
-rw-r--r--  kernel/smp.c | 152
-rw-r--r--  kernel/softirq.c | 29
-rw-r--r--  kernel/stop_machine.c | 6
-rw-r--r--  kernel/sys.c | 81
-rw-r--r--  kernel/sys_ni.c | 5
-rw-r--r--  kernel/sysctl.c | 51
-rw-r--r--  kernel/sysctl_binary.c | 19
-rw-r--r--  kernel/sysctl_check.c | 10
-rw-r--r--  kernel/taskstats.c | 2
-rw-r--r--  kernel/time.c | 35
-rw-r--r--  kernel/time/Makefile | 3
-rw-r--r--  kernel/time/clockevents.c | 1
-rw-r--r--  kernel/time/jiffies.c | 20
-rw-r--r--  kernel/time/ntp.c | 13
-rw-r--r--  kernel/time/posix-clock.c | 451
-rw-r--r--  kernel/time/tick-broadcast.c | 1
-rw-r--r--  kernel/time/tick-common.c | 1
-rw-r--r--  kernel/time/tick-internal.h | 9
-rw-r--r--  kernel/time/tick-oneshot.c | 1
-rw-r--r--  kernel/time/tick-sched.c | 1
-rw-r--r--  kernel/time/timekeeping.c | 141
-rw-r--r--  kernel/timer.c | 42
-rw-r--r--  kernel/trace/Kconfig | 4
-rw-r--r--  kernel/trace/blktrace.c | 15
-rw-r--r--  kernel/trace/ftrace.c | 55
-rw-r--r--  kernel/trace/ring_buffer.c | 26
-rw-r--r--  kernel/trace/trace.c | 38
-rw-r--r--  kernel/trace/trace.h | 41
-rw-r--r--  kernel/trace/trace_entries.h | 6
-rw-r--r--  kernel/trace/trace_events.c | 2
-rw-r--r--  kernel/trace/trace_events_filter.c | 885
-rw-r--r--  kernel/trace/trace_kprobe.c | 111
-rw-r--r--  kernel/trace/trace_output.c | 36
-rw-r--r--  kernel/trace/trace_sched_switch.c | 48
-rw-r--r--  kernel/trace/trace_syscalls.c | 42
-rw-r--r--  kernel/uid16.c | 2
-rw-r--r--  kernel/user.c | 8
-rw-r--r--  kernel/utsname.c | 12
-rw-r--r--  kernel/watchdog.c | 27
-rw-r--r--  kernel/workqueue.c | 18
114 files changed, 6228 insertions, 2770 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 353d3fe8ba33..85cbfb31e73e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
107obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 107obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
108obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 108obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
109obj-$(CONFIG_PADATA) += padata.o 109obj-$(CONFIG_PADATA) += padata.o
110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
110 111
111ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 112ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
112# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 113# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit.c b/kernel/audit.c
index e4956244ae50..939500317066 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -74,6 +74,8 @@ static int audit_initialized;
74int audit_enabled; 74int audit_enabled;
75int audit_ever_enabled; 75int audit_ever_enabled;
76 76
77EXPORT_SYMBOL_GPL(audit_enabled);
78
77/* Default state when kernel boots without any parameters. */ 79/* Default state when kernel boots without any parameters. */
78static int audit_default; 80static int audit_default;
79 81
@@ -671,9 +673,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
671 673
672 pid = NETLINK_CREDS(skb)->pid; 674 pid = NETLINK_CREDS(skb)->pid;
673 uid = NETLINK_CREDS(skb)->uid; 675 uid = NETLINK_CREDS(skb)->uid;
674 loginuid = NETLINK_CB(skb).loginuid; 676 loginuid = audit_get_loginuid(current);
675 sessionid = NETLINK_CB(skb).sessionid; 677 sessionid = audit_get_sessionid(current);
676 sid = NETLINK_CB(skb).sid; 678 security_task_getsecid(current, &sid);
677 seq = nlh->nlmsg_seq; 679 seq = nlh->nlmsg_seq;
678 data = NLMSG_DATA(nlh); 680 data = NLMSG_DATA(nlh);
679 681
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index d2e3c7866460..e683869365d9 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
144} 144}
145 145
146/* Initialize a parent watch entry. */ 146/* Initialize a parent watch entry. */
147static struct audit_parent *audit_init_parent(struct nameidata *ndp) 147static struct audit_parent *audit_init_parent(struct path *path)
148{ 148{
149 struct inode *inode = ndp->path.dentry->d_inode; 149 struct inode *inode = path->dentry->d_inode;
150 struct audit_parent *parent; 150 struct audit_parent *parent;
151 int ret; 151 int ret;
152 152
@@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
353} 353}
354 354
355/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
356static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) 356static int audit_get_nd(struct audit_watch *watch, struct path *parent)
357{ 357{
358 struct nameidata *ndparent, *ndwatch; 358 struct nameidata nd;
359 struct dentry *d;
359 int err; 360 int err;
360 361
361 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); 362 err = kern_path_parent(watch->path, &nd);
362 if (unlikely(!ndparent)) 363 if (err)
363 return -ENOMEM; 364 return err;
364 365
365 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); 366 if (nd.last_type != LAST_NORM) {
366 if (unlikely(!ndwatch)) { 367 path_put(&nd.path);
367 kfree(ndparent); 368 return -EINVAL;
368 return -ENOMEM;
369 } 369 }
370 370
371 err = path_lookup(path, LOOKUP_PARENT, ndparent); 371 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
372 if (err) { 372 d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
373 kfree(ndparent); 373 if (IS_ERR(d)) {
374 kfree(ndwatch); 374 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
375 return err; 375 path_put(&nd.path);
376 return PTR_ERR(d);
376 } 377 }
377 378 if (d->d_inode) {
378 err = path_lookup(path, 0, ndwatch); 379 /* update watch filter fields */
379 if (err) { 380 watch->dev = d->d_inode->i_sb->s_dev;
380 kfree(ndwatch); 381 watch->ino = d->d_inode->i_ino;
381 ndwatch = NULL;
382 } 382 }
383 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
383 384
384 *ndp = ndparent; 385 *parent = nd.path;
385 *ndw = ndwatch; 386 dput(d);
386
387 return 0; 387 return 0;
388} 388}
389 389
390/* Release resources used for watch path information. */
391static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
392{
393 if (ndp) {
394 path_put(&ndp->path);
395 kfree(ndp);
396 }
397 if (ndw) {
398 path_put(&ndw->path);
399 kfree(ndw);
400 }
401}
402
403/* Associate the given rule with an existing parent. 390/* Associate the given rule with an existing parent.
404 * Caller must hold audit_filter_mutex. */ 391 * Caller must hold audit_filter_mutex. */
405static void audit_add_to_parent(struct audit_krule *krule, 392static void audit_add_to_parent(struct audit_krule *krule,
@@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
440{ 427{
441 struct audit_watch *watch = krule->watch; 428 struct audit_watch *watch = krule->watch;
442 struct audit_parent *parent; 429 struct audit_parent *parent;
443 struct nameidata *ndp = NULL, *ndw = NULL; 430 struct path parent_path;
444 int h, ret = 0; 431 int h, ret = 0;
445 432
446 mutex_unlock(&audit_filter_mutex); 433 mutex_unlock(&audit_filter_mutex);
447 434
448 /* Avoid calling path_lookup under audit_filter_mutex. */ 435 /* Avoid calling path_lookup under audit_filter_mutex. */
449 ret = audit_get_nd(watch->path, &ndp, &ndw); 436 ret = audit_get_nd(watch, &parent_path);
450 if (ret) {
451 /* caller expects mutex locked */
452 mutex_lock(&audit_filter_mutex);
453 goto error;
454 }
455 437
438 /* caller expects mutex locked */
456 mutex_lock(&audit_filter_mutex); 439 mutex_lock(&audit_filter_mutex);
457 440
458 /* update watch filter fields */ 441 if (ret)
459 if (ndw) { 442 return ret;
460 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
461 watch->ino = ndw->path.dentry->d_inode->i_ino;
462 }
463 443
464 /* either find an old parent or attach a new one */ 444 /* either find an old parent or attach a new one */
465 parent = audit_find_parent(ndp->path.dentry->d_inode); 445 parent = audit_find_parent(parent_path.dentry->d_inode);
466 if (!parent) { 446 if (!parent) {
467 parent = audit_init_parent(ndp); 447 parent = audit_init_parent(&parent_path);
468 if (IS_ERR(parent)) { 448 if (IS_ERR(parent)) {
469 ret = PTR_ERR(parent); 449 ret = PTR_ERR(parent);
470 goto error; 450 goto error;
@@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
479 h = audit_hash_ino((u32)watch->ino); 459 h = audit_hash_ino((u32)watch->ino);
480 *list = &audit_inode_hash[h]; 460 *list = &audit_inode_hash[h];
481error: 461error:
482 audit_put_nd(ndp, ndw); /* NULL args OK */ 462 path_put(&parent_path);
483 return ret; 463 return ret;
484
485} 464}
486 465
487void audit_remove_watch_rule(struct audit_krule *krule) 466void audit_remove_watch_rule(struct audit_krule *krule)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index add2819af71b..f8277c80d678 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1238,6 +1238,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1238 for (i = 0; i < rule->field_count; i++) { 1238 for (i = 0; i < rule->field_count; i++) {
1239 struct audit_field *f = &rule->fields[i]; 1239 struct audit_field *f = &rule->fields[i];
1240 int result = 0; 1240 int result = 0;
1241 u32 sid;
1241 1242
1242 switch (f->type) { 1243 switch (f->type) {
1243 case AUDIT_PID: 1244 case AUDIT_PID:
@@ -1250,19 +1251,22 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1250 result = audit_comparator(cb->creds.gid, f->op, f->val); 1251 result = audit_comparator(cb->creds.gid, f->op, f->val);
1251 break; 1252 break;
1252 case AUDIT_LOGINUID: 1253 case AUDIT_LOGINUID:
1253 result = audit_comparator(cb->loginuid, f->op, f->val); 1254 result = audit_comparator(audit_get_loginuid(current),
1255 f->op, f->val);
1254 break; 1256 break;
1255 case AUDIT_SUBJ_USER: 1257 case AUDIT_SUBJ_USER:
1256 case AUDIT_SUBJ_ROLE: 1258 case AUDIT_SUBJ_ROLE:
1257 case AUDIT_SUBJ_TYPE: 1259 case AUDIT_SUBJ_TYPE:
1258 case AUDIT_SUBJ_SEN: 1260 case AUDIT_SUBJ_SEN:
1259 case AUDIT_SUBJ_CLR: 1261 case AUDIT_SUBJ_CLR:
1260 if (f->lsm_rule) 1262 if (f->lsm_rule) {
1261 result = security_audit_rule_match(cb->sid, 1263 security_task_getsecid(current, &sid);
1264 result = security_audit_rule_match(sid,
1262 f->type, 1265 f->type,
1263 f->op, 1266 f->op,
1264 f->lsm_rule, 1267 f->lsm_rule,
1265 NULL); 1268 NULL);
1269 }
1266 break; 1270 break;
1267 } 1271 }
1268 1272
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 98a51f26c136..0c9b862292b2 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,11 +9,13 @@
9#include <linux/page-flags.h> 9#include <linux/page-flags.h>
10#include <linux/mmzone.h> 10#include <linux/mmzone.h>
11#include <linux/kbuild.h> 11#include <linux/kbuild.h>
12#include <linux/page_cgroup.h>
12 13
13void foo(void) 14void foo(void)
14{ 15{
15 /* The enum constants to put into include/generated/bounds.h */ 16 /* The enum constants to put into include/generated/bounds.h */
16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 17 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 18 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
19 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
18 /* End of constants */ 20 /* End of constants */
19} 21}
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e9385f132c8..bf0c734d0c12 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@
14#include <linux/security.h> 14#include <linux/security.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
17#include <linux/user_namespace.h>
17#include <asm/uaccess.h> 18#include <asm/uaccess.h>
18 19
19/* 20/*
@@ -290,6 +291,60 @@ error:
290} 291}
291 292
292/** 293/**
294 * has_capability - Does a task have a capability in init_user_ns
295 * @t: The task in question
296 * @cap: The capability to be tested for
297 *
298 * Return true if the specified task has the given superior capability
299 * currently in effect to the initial user namespace, false if not.
300 *
301 * Note that this does not set PF_SUPERPRIV on the task.
302 */
303bool has_capability(struct task_struct *t, int cap)
304{
305 int ret = security_real_capable(t, &init_user_ns, cap);
306
307 return (ret == 0);
308}
309
310/**
311 * has_capability - Does a task have a capability in a specific user ns
312 * @t: The task in question
313 * @ns: target user namespace
314 * @cap: The capability to be tested for
315 *
316 * Return true if the specified task has the given superior capability
317 * currently in effect to the specified user namespace, false if not.
318 *
319 * Note that this does not set PF_SUPERPRIV on the task.
320 */
321bool has_ns_capability(struct task_struct *t,
322 struct user_namespace *ns, int cap)
323{
324 int ret = security_real_capable(t, ns, cap);
325
326 return (ret == 0);
327}
328
329/**
330 * has_capability_noaudit - Does a task have a capability (unaudited)
331 * @t: The task in question
332 * @cap: The capability to be tested for
333 *
334 * Return true if the specified task has the given superior capability
335 * currently in effect to init_user_ns, false if not. Don't write an
336 * audit message for the check.
337 *
338 * Note that this does not set PF_SUPERPRIV on the task.
339 */
340bool has_capability_noaudit(struct task_struct *t, int cap)
341{
342 int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
343
344 return (ret == 0);
345}
346
347/**
293 * capable - Determine if the current task has a superior capability in effect 348 * capable - Determine if the current task has a superior capability in effect
294 * @cap: The capability to be tested for 349 * @cap: The capability to be tested for
295 * 350 *
@@ -299,17 +354,48 @@ error:
299 * This sets PF_SUPERPRIV on the task if the capability is available on the 354 * This sets PF_SUPERPRIV on the task if the capability is available on the
300 * assumption that it's about to be used. 355 * assumption that it's about to be used.
301 */ 356 */
302int capable(int cap) 357bool capable(int cap)
358{
359 return ns_capable(&init_user_ns, cap);
360}
361EXPORT_SYMBOL(capable);
362
363/**
364 * ns_capable - Determine if the current task has a superior capability in effect
365 * @ns: The usernamespace we want the capability in
366 * @cap: The capability to be tested for
367 *
368 * Return true if the current task has the given superior capability currently
369 * available for use, false if not.
370 *
371 * This sets PF_SUPERPRIV on the task if the capability is available on the
372 * assumption that it's about to be used.
373 */
374bool ns_capable(struct user_namespace *ns, int cap)
303{ 375{
304 if (unlikely(!cap_valid(cap))) { 376 if (unlikely(!cap_valid(cap))) {
305 printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); 377 printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
306 BUG(); 378 BUG();
307 } 379 }
308 380
309 if (security_capable(current_cred(), cap) == 0) { 381 if (security_capable(ns, current_cred(), cap) == 0) {
310 current->flags |= PF_SUPERPRIV; 382 current->flags |= PF_SUPERPRIV;
311 return 1; 383 return true;
312 } 384 }
313 return 0; 385 return false;
314} 386}
315EXPORT_SYMBOL(capable); 387EXPORT_SYMBOL(ns_capable);
388
389/**
390 * task_ns_capable - Determine whether current task has a superior
391 * capability targeted at a specific task's user namespace.
392 * @t: The task whose user namespace is targeted.
393 * @cap: The capability in question.
394 *
395 * Return true if it does, false otherwise.
396 */
397bool task_ns_capable(struct task_struct *t, int cap)
398{
399 return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
400}
401EXPORT_SYMBOL(task_ns_capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b24d7027b83c..e31b220a743d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1813,10 +1813,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1813 1813
1814 /* Update the css_set linked lists if we're using them */ 1814 /* Update the css_set linked lists if we're using them */
1815 write_lock(&css_set_lock); 1815 write_lock(&css_set_lock);
1816 if (!list_empty(&tsk->cg_list)) { 1816 if (!list_empty(&tsk->cg_list))
1817 list_del(&tsk->cg_list); 1817 list_move(&tsk->cg_list, &newcg->tasks);
1818 list_add(&tsk->cg_list, &newcg->tasks);
1819 }
1820 write_unlock(&css_set_lock); 1818 write_unlock(&css_set_lock);
1821 1819
1822 for_each_subsys(root, ss) { 1820 for_each_subsys(root, ss) {
@@ -3655,12 +3653,12 @@ again:
3655 spin_lock(&release_list_lock); 3653 spin_lock(&release_list_lock);
3656 set_bit(CGRP_REMOVED, &cgrp->flags); 3654 set_bit(CGRP_REMOVED, &cgrp->flags);
3657 if (!list_empty(&cgrp->release_list)) 3655 if (!list_empty(&cgrp->release_list))
3658 list_del(&cgrp->release_list); 3656 list_del_init(&cgrp->release_list);
3659 spin_unlock(&release_list_lock); 3657 spin_unlock(&release_list_lock);
3660 3658
3661 cgroup_lock_hierarchy(cgrp->root); 3659 cgroup_lock_hierarchy(cgrp->root);
3662 /* delete this cgroup from parent->children */ 3660 /* delete this cgroup from parent->children */
3663 list_del(&cgrp->sibling); 3661 list_del_init(&cgrp->sibling);
3664 cgroup_unlock_hierarchy(cgrp->root); 3662 cgroup_unlock_hierarchy(cgrp->root);
3665 3663
3666 d = dget(cgrp->dentry); 3664 d = dget(cgrp->dentry);
@@ -3879,7 +3877,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
3879 subsys[ss->subsys_id] = NULL; 3877 subsys[ss->subsys_id] = NULL;
3880 3878
3881 /* remove subsystem from rootnode's list of subsystems */ 3879 /* remove subsystem from rootnode's list of subsystems */
3882 list_del(&ss->sibling); 3880 list_del_init(&ss->sibling);
3883 3881
3884 /* 3882 /*
3885 * disentangle the css from all css_sets attached to the dummytop. as 3883 * disentangle the css from all css_sets attached to the dummytop. as
@@ -4230,20 +4228,8 @@ void cgroup_post_fork(struct task_struct *child)
4230 */ 4228 */
4231void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4229void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4232{ 4230{
4233 int i;
4234 struct css_set *cg; 4231 struct css_set *cg;
4235 4232 int i;
4236 if (run_callbacks && need_forkexit_callback) {
4237 /*
4238 * modular subsystems can't use callbacks, so no need to lock
4239 * the subsys array
4240 */
4241 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4242 struct cgroup_subsys *ss = subsys[i];
4243 if (ss->exit)
4244 ss->exit(ss, tsk);
4245 }
4246 }
4247 4233
4248 /* 4234 /*
4249 * Unlink from the css_set task list if necessary. 4235 * Unlink from the css_set task list if necessary.
@@ -4253,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4253 if (!list_empty(&tsk->cg_list)) { 4239 if (!list_empty(&tsk->cg_list)) {
4254 write_lock(&css_set_lock); 4240 write_lock(&css_set_lock);
4255 if (!list_empty(&tsk->cg_list)) 4241 if (!list_empty(&tsk->cg_list))
4256 list_del(&tsk->cg_list); 4242 list_del_init(&tsk->cg_list);
4257 write_unlock(&css_set_lock); 4243 write_unlock(&css_set_lock);
4258 } 4244 }
4259 4245
@@ -4261,7 +4247,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4261 task_lock(tsk); 4247 task_lock(tsk);
4262 cg = tsk->cgroups; 4248 cg = tsk->cgroups;
4263 tsk->cgroups = &init_css_set; 4249 tsk->cgroups = &init_css_set;
4250
4251 if (run_callbacks && need_forkexit_callback) {
4252 /*
4253 * modular subsystems can't use callbacks, so no need to lock
4254 * the subsys array
4255 */
4256 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4257 struct cgroup_subsys *ss = subsys[i];
4258 if (ss->exit) {
4259 struct cgroup *old_cgrp =
4260 rcu_dereference_raw(cg->subsys[i])->cgroup;
4261 struct cgroup *cgrp = task_cgroup(tsk, i);
4262 ss->exit(ss, cgrp, old_cgrp, tsk);
4263 }
4264 }
4265 }
4264 task_unlock(tsk); 4266 task_unlock(tsk);
4267
4265 if (cg) 4268 if (cg)
4266 put_css_set_taskexit(cg); 4269 put_css_set_taskexit(cg);
4267} 4270}
@@ -4813,6 +4816,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
4813 return ret; 4816 return ret;
4814} 4817}
4815 4818
4819/*
4820 * get corresponding css from file open on cgroupfs directory
4821 */
4822struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
4823{
4824 struct cgroup *cgrp;
4825 struct inode *inode;
4826 struct cgroup_subsys_state *css;
4827
4828 inode = f->f_dentry->d_inode;
4829 /* check in cgroup filesystem dir */
4830 if (inode->i_op != &cgroup_dir_inode_operations)
4831 return ERR_PTR(-EBADF);
4832
4833 if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
4834 return ERR_PTR(-EINVAL);
4835
4836 /* get cgroup */
4837 cgrp = __d_cgrp(f->f_dentry);
4838 css = cgrp->subsys[id];
4839 return css ? css : ERR_PTR(-ENOENT);
4840}
4841
4816#ifdef CONFIG_CGROUP_DEBUG 4842#ifdef CONFIG_CGROUP_DEBUG
4817static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 4843static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
4818 struct cgroup *cont) 4844 struct cgroup *cont)
diff --git a/kernel/compat.c b/kernel/compat.c
index c9e2ec0b34a8..38b1d2c1cbe8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o,
52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; 52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
53} 53}
54 54
55static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
56{
57 memset(txc, 0, sizeof(struct timex));
58
59 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
60 __get_user(txc->modes, &utp->modes) ||
61 __get_user(txc->offset, &utp->offset) ||
62 __get_user(txc->freq, &utp->freq) ||
63 __get_user(txc->maxerror, &utp->maxerror) ||
64 __get_user(txc->esterror, &utp->esterror) ||
65 __get_user(txc->status, &utp->status) ||
66 __get_user(txc->constant, &utp->constant) ||
67 __get_user(txc->precision, &utp->precision) ||
68 __get_user(txc->tolerance, &utp->tolerance) ||
69 __get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
70 __get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
71 __get_user(txc->tick, &utp->tick) ||
72 __get_user(txc->ppsfreq, &utp->ppsfreq) ||
73 __get_user(txc->jitter, &utp->jitter) ||
74 __get_user(txc->shift, &utp->shift) ||
75 __get_user(txc->stabil, &utp->stabil) ||
76 __get_user(txc->jitcnt, &utp->jitcnt) ||
77 __get_user(txc->calcnt, &utp->calcnt) ||
78 __get_user(txc->errcnt, &utp->errcnt) ||
79 __get_user(txc->stbcnt, &utp->stbcnt))
80 return -EFAULT;
81
82 return 0;
83}
84
85static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
86{
87 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
88 __put_user(txc->modes, &utp->modes) ||
89 __put_user(txc->offset, &utp->offset) ||
90 __put_user(txc->freq, &utp->freq) ||
91 __put_user(txc->maxerror, &utp->maxerror) ||
92 __put_user(txc->esterror, &utp->esterror) ||
93 __put_user(txc->status, &utp->status) ||
94 __put_user(txc->constant, &utp->constant) ||
95 __put_user(txc->precision, &utp->precision) ||
96 __put_user(txc->tolerance, &utp->tolerance) ||
97 __put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
98 __put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
99 __put_user(txc->tick, &utp->tick) ||
100 __put_user(txc->ppsfreq, &utp->ppsfreq) ||
101 __put_user(txc->jitter, &utp->jitter) ||
102 __put_user(txc->shift, &utp->shift) ||
103 __put_user(txc->stabil, &utp->stabil) ||
104 __put_user(txc->jitcnt, &utp->jitcnt) ||
105 __put_user(txc->calcnt, &utp->calcnt) ||
106 __put_user(txc->errcnt, &utp->errcnt) ||
107 __put_user(txc->stbcnt, &utp->stbcnt) ||
108 __put_user(txc->tai, &utp->tai))
109 return -EFAULT;
110 return 0;
111}
112
55asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, 113asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
56 struct timezone __user *tz) 114 struct timezone __user *tz)
57{ 115{
@@ -617,6 +675,29 @@ long compat_sys_clock_gettime(clockid_t which_clock,
617 return err; 675 return err;
618} 676}
619 677
678long compat_sys_clock_adjtime(clockid_t which_clock,
679 struct compat_timex __user *utp)
680{
681 struct timex txc;
682 mm_segment_t oldfs;
683 int err, ret;
684
685 err = compat_get_timex(&txc, utp);
686 if (err)
687 return err;
688
689 oldfs = get_fs();
690 set_fs(KERNEL_DS);
691 ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
692 set_fs(oldfs);
693
694 err = compat_put_timex(utp, &txc);
695 if (err)
696 return err;
697
698 return ret;
699}
700
620long compat_sys_clock_getres(clockid_t which_clock, 701long compat_sys_clock_getres(clockid_t which_clock,
621 struct compat_timespec __user *tp) 702 struct compat_timespec __user *tp)
622{ 703{
@@ -951,58 +1032,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
951asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) 1032asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
952{ 1033{
953 struct timex txc; 1034 struct timex txc;
954 int ret; 1035 int err, ret;
955
956 memset(&txc, 0, sizeof(struct timex));
957 1036
958 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || 1037 err = compat_get_timex(&txc, utp);
959 __get_user(txc.modes, &utp->modes) || 1038 if (err)
960 __get_user(txc.offset, &utp->offset) || 1039 return err;
961 __get_user(txc.freq, &utp->freq) ||
962 __get_user(txc.maxerror, &utp->maxerror) ||
963 __get_user(txc.esterror, &utp->esterror) ||
964 __get_user(txc.status, &utp->status) ||
965 __get_user(txc.constant, &utp->constant) ||
966 __get_user(txc.precision, &utp->precision) ||
967 __get_user(txc.tolerance, &utp->tolerance) ||
968 __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
969 __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
970 __get_user(txc.tick, &utp->tick) ||
971 __get_user(txc.ppsfreq, &utp->ppsfreq) ||
972 __get_user(txc.jitter, &utp->jitter) ||
973 __get_user(txc.shift, &utp->shift) ||
974 __get_user(txc.stabil, &utp->stabil) ||
975 __get_user(txc.jitcnt, &utp->jitcnt) ||
976 __get_user(txc.calcnt, &utp->calcnt) ||
977 __get_user(txc.errcnt, &utp->errcnt) ||
978 __get_user(txc.stbcnt, &utp->stbcnt))
979 return -EFAULT;
980 1040
981 ret = do_adjtimex(&txc); 1041 ret = do_adjtimex(&txc);
982 1042
983 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || 1043 err = compat_put_timex(utp, &txc);
984 __put_user(txc.modes, &utp->modes) || 1044 if (err)
985 __put_user(txc.offset, &utp->offset) || 1045 return err;
986 __put_user(txc.freq, &utp->freq) ||
987 __put_user(txc.maxerror, &utp->maxerror) ||
988 __put_user(txc.esterror, &utp->esterror) ||
989 __put_user(txc.status, &utp->status) ||
990 __put_user(txc.constant, &utp->constant) ||
991 __put_user(txc.precision, &utp->precision) ||
992 __put_user(txc.tolerance, &utp->tolerance) ||
993 __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
994 __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
995 __put_user(txc.tick, &utp->tick) ||
996 __put_user(txc.ppsfreq, &utp->ppsfreq) ||
997 __put_user(txc.jitter, &utp->jitter) ||
998 __put_user(txc.shift, &utp->shift) ||
999 __put_user(txc.stabil, &utp->stabil) ||
1000 __put_user(txc.jitcnt, &utp->jitcnt) ||
1001 __put_user(txc.calcnt, &utp->calcnt) ||
1002 __put_user(txc.errcnt, &utp->errcnt) ||
1003 __put_user(txc.stbcnt, &utp->stbcnt) ||
1004 __put_user(txc.tai, &utp->tai))
1005 ret = -EFAULT;
1006 1046
1007 return ret; 1047 return ret;
1008} 1048}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 156cc5556140..c95fc4df0faa 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v)
160{ 160{
161 BUG_ON(cpu_notify(val, v)); 161 BUG_ON(cpu_notify(val, v));
162} 162}
163
164EXPORT_SYMBOL(register_cpu_notifier); 163EXPORT_SYMBOL(register_cpu_notifier);
165 164
166void __ref unregister_cpu_notifier(struct notifier_block *nb) 165void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param)
205 return err; 204 return err;
206 205
207 cpu_notify(CPU_DYING | param->mod, param->hcpu); 206 cpu_notify(CPU_DYING | param->mod, param->hcpu);
208
209 return 0; 207 return 0;
210} 208}
211 209
@@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
227 return -EINVAL; 225 return -EINVAL;
228 226
229 cpu_hotplug_begin(); 227 cpu_hotplug_begin();
228
230 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); 229 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
231 if (err) { 230 if (err) {
232 nr_calls--; 231 nr_calls--;
@@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
304 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 303 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
305 if (ret) { 304 if (ret) {
306 nr_calls--; 305 nr_calls--;
307 printk("%s: attempt to bring up CPU %u failed\n", 306 printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
308 __func__, cpu); 307 __func__, cpu);
309 goto out_notify; 308 goto out_notify;
310 } 309 }
@@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void)
450 if (cpumask_empty(frozen_cpus)) 449 if (cpumask_empty(frozen_cpus))
451 goto out; 450 goto out;
452 451
453 printk("Enabling non-boot CPUs ...\n"); 452 printk(KERN_INFO "Enabling non-boot CPUs ...\n");
454 453
455 arch_enable_nonboot_cpus_begin(); 454 arch_enable_nonboot_cpus_begin();
456 455
457 for_each_cpu(cpu, frozen_cpus) { 456 for_each_cpu(cpu, frozen_cpus) {
458 error = _cpu_up(cpu, 1); 457 error = _cpu_up(cpu, 1);
459 if (!error) { 458 if (!error) {
460 printk("CPU%d is up\n", cpu); 459 printk(KERN_INFO "CPU%d is up\n", cpu);
461 continue; 460 continue;
462 } 461 }
463 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 462 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
@@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
509 */ 508 */
510 509
511/* cpu_bit_bitmap[0] is empty - so we can back into it */ 510/* cpu_bit_bitmap[0] is empty - so we can back into it */
512#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) 511#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x))
513#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) 512#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
514#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) 513#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
515#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) 514#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e92e98189032..33eee16addb8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
1015 struct cpuset *cs; 1015 struct cpuset *cs;
1016 int migrate; 1016 int migrate;
1017 const nodemask_t *oldmem = scan->data; 1017 const nodemask_t *oldmem = scan->data;
1018 NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); 1018 static nodemask_t newmems; /* protected by cgroup_mutex */
1019
1020 if (!newmems)
1021 return;
1022 1019
1023 cs = cgroup_cs(scan->cg); 1020 cs = cgroup_cs(scan->cg);
1024 guarantee_online_mems(cs, newmems); 1021 guarantee_online_mems(cs, &newmems);
1025
1026 cpuset_change_task_nodemask(p, newmems);
1027 1022
1028 NODEMASK_FREE(newmems); 1023 cpuset_change_task_nodemask(p, &newmems);
1029 1024
1030 mm = get_task_mm(p); 1025 mm = get_task_mm(p);
1031 if (!mm) 1026 if (!mm)
@@ -1438,44 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1438 struct mm_struct *mm; 1433 struct mm_struct *mm;
1439 struct cpuset *cs = cgroup_cs(cont); 1434 struct cpuset *cs = cgroup_cs(cont);
1440 struct cpuset *oldcs = cgroup_cs(oldcont); 1435 struct cpuset *oldcs = cgroup_cs(oldcont);
1441 NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); 1436 static nodemask_t to; /* protected by cgroup_mutex */
1442 NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
1443
1444 if (from == NULL || to == NULL)
1445 goto alloc_fail;
1446 1437
1447 if (cs == &top_cpuset) { 1438 if (cs == &top_cpuset) {
1448 cpumask_copy(cpus_attach, cpu_possible_mask); 1439 cpumask_copy(cpus_attach, cpu_possible_mask);
1449 } else { 1440 } else {
1450 guarantee_online_cpus(cs, cpus_attach); 1441 guarantee_online_cpus(cs, cpus_attach);
1451 } 1442 }
1452 guarantee_online_mems(cs, to); 1443 guarantee_online_mems(cs, &to);
1453 1444
1454 /* do per-task migration stuff possibly for each in the threadgroup */ 1445 /* do per-task migration stuff possibly for each in the threadgroup */
1455 cpuset_attach_task(tsk, to, cs); 1446 cpuset_attach_task(tsk, &to, cs);
1456 if (threadgroup) { 1447 if (threadgroup) {
1457 struct task_struct *c; 1448 struct task_struct *c;
1458 rcu_read_lock(); 1449 rcu_read_lock();
1459 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1450 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1460 cpuset_attach_task(c, to, cs); 1451 cpuset_attach_task(c, &to, cs);
1461 } 1452 }
1462 rcu_read_unlock(); 1453 rcu_read_unlock();
1463 } 1454 }
1464 1455
1465 /* change mm; only needs to be done once even if threadgroup */ 1456 /* change mm; only needs to be done once even if threadgroup */
1466 *from = oldcs->mems_allowed; 1457 to = cs->mems_allowed;
1467 *to = cs->mems_allowed;
1468 mm = get_task_mm(tsk); 1458 mm = get_task_mm(tsk);
1469 if (mm) { 1459 if (mm) {
1470 mpol_rebind_mm(mm, to); 1460 mpol_rebind_mm(mm, &to);
1471 if (is_memory_migrate(cs)) 1461 if (is_memory_migrate(cs))
1472 cpuset_migrate_mm(mm, from, to); 1462 cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
1473 mmput(mm); 1463 mmput(mm);
1474 } 1464 }
1475
1476alloc_fail:
1477 NODEMASK_FREE(from);
1478 NODEMASK_FREE(to);
1479} 1465}
1480 1466
1481/* The various types of files and directories in a cpuset file system */ 1467/* The various types of files and directories in a cpuset file system */
@@ -1610,34 +1596,26 @@ out:
1610 * across a page fault. 1596 * across a page fault.
1611 */ 1597 */
1612 1598
1613static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) 1599static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1614{ 1600{
1615 int ret; 1601 size_t count;
1616 1602
1617 mutex_lock(&callback_mutex); 1603 mutex_lock(&callback_mutex);
1618 ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); 1604 count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
1619 mutex_unlock(&callback_mutex); 1605 mutex_unlock(&callback_mutex);
1620 1606
1621 return ret; 1607 return count;
1622} 1608}
1623 1609
1624static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1610static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1625{ 1611{
1626 NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); 1612 size_t count;
1627 int retval;
1628
1629 if (mask == NULL)
1630 return -ENOMEM;
1631 1613
1632 mutex_lock(&callback_mutex); 1614 mutex_lock(&callback_mutex);
1633 *mask = cs->mems_allowed; 1615 count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
1634 mutex_unlock(&callback_mutex); 1616 mutex_unlock(&callback_mutex);
1635 1617
1636 retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); 1618 return count;
1637
1638 NODEMASK_FREE(mask);
1639
1640 return retval;
1641} 1619}
1642 1620
1643static ssize_t cpuset_common_file_read(struct cgroup *cont, 1621static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1862,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1862 cs = cgroup_cs(cgroup); 1840 cs = cgroup_cs(cgroup);
1863 parent_cs = cgroup_cs(parent); 1841 parent_cs = cgroup_cs(parent);
1864 1842
1843 mutex_lock(&callback_mutex);
1865 cs->mems_allowed = parent_cs->mems_allowed; 1844 cs->mems_allowed = parent_cs->mems_allowed;
1866 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); 1845 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1846 mutex_unlock(&callback_mutex);
1867 return; 1847 return;
1868} 1848}
1869 1849
@@ -2066,10 +2046,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2066 struct cpuset *cp; /* scans cpusets being updated */ 2046 struct cpuset *cp; /* scans cpusets being updated */
2067 struct cpuset *child; /* scans child cpusets of cp */ 2047 struct cpuset *child; /* scans child cpusets of cp */
2068 struct cgroup *cont; 2048 struct cgroup *cont;
2069 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); 2049 static nodemask_t oldmems; /* protected by cgroup_mutex */
2070
2071 if (oldmems == NULL)
2072 return;
2073 2050
2074 list_add_tail((struct list_head *)&root->stack_list, &queue); 2051 list_add_tail((struct list_head *)&root->stack_list, &queue);
2075 2052
@@ -2086,7 +2063,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2086 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2063 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2087 continue; 2064 continue;
2088 2065
2089 *oldmems = cp->mems_allowed; 2066 oldmems = cp->mems_allowed;
2090 2067
2091 /* Remove offline cpus and mems from this cpuset. */ 2068 /* Remove offline cpus and mems from this cpuset. */
2092 mutex_lock(&callback_mutex); 2069 mutex_lock(&callback_mutex);
@@ -2102,10 +2079,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2102 remove_tasks_in_empty_cpuset(cp); 2079 remove_tasks_in_empty_cpuset(cp);
2103 else { 2080 else {
2104 update_tasks_cpumask(cp, NULL); 2081 update_tasks_cpumask(cp, NULL);
2105 update_tasks_nodemask(cp, oldmems, NULL); 2082 update_tasks_nodemask(cp, &oldmems, NULL);
2106 } 2083 }
2107 } 2084 }
2108 NODEMASK_FREE(oldmems);
2109} 2085}
2110 2086
2111/* 2087/*
@@ -2147,19 +2123,16 @@ void cpuset_update_active_cpus(void)
2147static int cpuset_track_online_nodes(struct notifier_block *self, 2123static int cpuset_track_online_nodes(struct notifier_block *self,
2148 unsigned long action, void *arg) 2124 unsigned long action, void *arg)
2149{ 2125{
2150 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); 2126 static nodemask_t oldmems; /* protected by cgroup_mutex */
2151
2152 if (oldmems == NULL)
2153 return NOTIFY_DONE;
2154 2127
2155 cgroup_lock(); 2128 cgroup_lock();
2156 switch (action) { 2129 switch (action) {
2157 case MEM_ONLINE: 2130 case MEM_ONLINE:
2158 *oldmems = top_cpuset.mems_allowed; 2131 oldmems = top_cpuset.mems_allowed;
2159 mutex_lock(&callback_mutex); 2132 mutex_lock(&callback_mutex);
2160 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2133 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2161 mutex_unlock(&callback_mutex); 2134 mutex_unlock(&callback_mutex);
2162 update_tasks_nodemask(&top_cpuset, oldmems, NULL); 2135 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2163 break; 2136 break;
2164 case MEM_OFFLINE: 2137 case MEM_OFFLINE:
2165 /* 2138 /*
@@ -2173,7 +2146,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2173 } 2146 }
2174 cgroup_unlock(); 2147 cgroup_unlock();
2175 2148
2176 NODEMASK_FREE(oldmems);
2177 return NOTIFY_OK; 2149 return NOTIFY_OK;
2178} 2150}
2179#endif 2151#endif
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..5f85690285d4
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,34 @@
1#include <linux/kernel.h>
2#include <linux/crash_dump.h>
3#include <linux/init.h>
4#include <linux/errno.h>
5#include <linux/module.h>
6
7/*
8 * If we have booted due to a crash, max_pfn will be a very low value. We need
9 * to know the amount of memory that the previous kernel used.
10 */
11unsigned long saved_max_pfn;
12
13/*
14 * stores the physical address of elf header of crash image
15 *
16 * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
17 * is_kdump_kernel() to determine if we are booting after a panic. Hence put
18 * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
19 */
20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
21
22/*
23 * elfcorehdr= specifies the location of elf core header stored by the crashed
24 * kernel. This option will be passed by kexec loader to the capture kernel.
25 */
26static int __init setup_elfcorehdr(char *arg)
27{
28 char *end;
29 if (!arg)
30 return -EINVAL;
31 elfcorehdr_addr = memparse(arg, &end);
32 return end > arg ? 0 : -EINVAL;
33}
34early_param("elfcorehdr", setup_elfcorehdr);
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a9d6dd53a6c..5557b55048df 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar;
35static struct thread_group_cred init_tgcred = { 35static struct thread_group_cred init_tgcred = {
36 .usage = ATOMIC_INIT(2), 36 .usage = ATOMIC_INIT(2),
37 .tgid = 0, 37 .tgid = 0,
38 .lock = SPIN_LOCK_UNLOCKED, 38 .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
39}; 39};
40#endif 40#endif
41 41
@@ -741,6 +741,12 @@ int set_create_files_as(struct cred *new, struct inode *inode)
741} 741}
742EXPORT_SYMBOL(set_create_files_as); 742EXPORT_SYMBOL(set_create_files_as);
743 743
744struct user_namespace *current_user_ns(void)
745{
746 return _current_user_ns();
747}
748EXPORT_SYMBOL(current_user_ns);
749
744#ifdef CONFIG_DEBUG_CREDENTIALS 750#ifdef CONFIG_DEBUG_CREDENTIALS
745 751
746bool creds_are_invalid(const struct cred *cred) 752bool creds_are_invalid(const struct cred *cred)
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 481a7bd2dfe7..a11db956dd62 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
1093 put_packet(remcom_out_buffer); 1093 put_packet(remcom_out_buffer);
1094 return 0; 1094 return 0;
1095} 1095}
1096
1097/**
1098 * gdbstub_exit - Send an exit message to GDB
1099 * @status: The exit code to report.
1100 */
1101void gdbstub_exit(int status)
1102{
1103 unsigned char checksum, ch, buffer[3];
1104 int loop;
1105
1106 buffer[0] = 'W';
1107 buffer[1] = hex_asc_hi(status);
1108 buffer[2] = hex_asc_lo(status);
1109
1110 dbg_io_ops->write_char('$');
1111 checksum = 0;
1112
1113 for (loop = 0; loop < 3; loop++) {
1114 ch = buffer[loop];
1115 checksum += ch;
1116 dbg_io_ops->write_char(ch);
1117 }
1118
1119 dbg_io_ops->write_char('#');
1120 dbg_io_ops->write_char(hex_asc_hi(checksum));
1121 dbg_io_ops->write_char(hex_asc_lo(checksum));
1122
1123 /* make sure the output is flushed, lest the bootloader clobber it */
1124 dbg_io_ops->flush();
1125}
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45ebcc7b1..6a488ad2dce5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code)
908 profile_task_exit(tsk); 908 profile_task_exit(tsk);
909 909
910 WARN_ON(atomic_read(&tsk->fs_excl)); 910 WARN_ON(atomic_read(&tsk->fs_excl));
911 WARN_ON(blk_needs_flush_plug(tsk));
911 912
912 if (unlikely(in_interrupt())) 913 if (unlikely(in_interrupt()))
913 panic("Aiee, killing interrupt handler!"); 914 panic("Aiee, killing interrupt handler!");
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e429152ddc..e7548dee636b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
40#include <linux/tracehook.h> 40#include <linux/tracehook.h>
41#include <linux/futex.h> 41#include <linux/futex.h>
42#include <linux/compat.h> 42#include <linux/compat.h>
43#include <linux/kthread.h>
43#include <linux/task_io_accounting_ops.h> 44#include <linux/task_io_accounting_ops.h>
44#include <linux/rcupdate.h> 45#include <linux/rcupdate.h>
45#include <linux/ptrace.h> 46#include <linux/ptrace.h>
@@ -109,20 +110,25 @@ int nr_processes(void)
109} 110}
110 111
111#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 112#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
112# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) 113# define alloc_task_struct_node(node) \
113# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) 114 kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
115# define free_task_struct(tsk) \
116 kmem_cache_free(task_struct_cachep, (tsk))
114static struct kmem_cache *task_struct_cachep; 117static struct kmem_cache *task_struct_cachep;
115#endif 118#endif
116 119
117#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR 120#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
118static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) 121static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
122 int node)
119{ 123{
120#ifdef CONFIG_DEBUG_STACK_USAGE 124#ifdef CONFIG_DEBUG_STACK_USAGE
121 gfp_t mask = GFP_KERNEL | __GFP_ZERO; 125 gfp_t mask = GFP_KERNEL | __GFP_ZERO;
122#else 126#else
123 gfp_t mask = GFP_KERNEL; 127 gfp_t mask = GFP_KERNEL;
124#endif 128#endif
125 return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); 129 struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
130
131 return page ? page_address(page) : NULL;
126} 132}
127 133
128static inline void free_thread_info(struct thread_info *ti) 134static inline void free_thread_info(struct thread_info *ti)
@@ -193,6 +199,7 @@ void __put_task_struct(struct task_struct *tsk)
193 if (!profile_handoff_task(tsk)) 199 if (!profile_handoff_task(tsk))
194 free_task(tsk); 200 free_task(tsk);
195} 201}
202EXPORT_SYMBOL_GPL(__put_task_struct);
196 203
197/* 204/*
198 * macro override instead of weak attribute alias, to workaround 205 * macro override instead of weak attribute alias, to workaround
@@ -248,16 +255,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
248 struct task_struct *tsk; 255 struct task_struct *tsk;
249 struct thread_info *ti; 256 struct thread_info *ti;
250 unsigned long *stackend; 257 unsigned long *stackend;
251 258 int node = tsk_fork_get_node(orig);
252 int err; 259 int err;
253 260
254 prepare_to_copy(orig); 261 prepare_to_copy(orig);
255 262
256 tsk = alloc_task_struct(); 263 tsk = alloc_task_struct_node(node);
257 if (!tsk) 264 if (!tsk)
258 return NULL; 265 return NULL;
259 266
260 ti = alloc_thread_info(tsk); 267 ti = alloc_thread_info_node(tsk, node);
261 if (!ti) { 268 if (!ti) {
262 free_task_struct(tsk); 269 free_task_struct(tsk);
263 return NULL; 270 return NULL;
@@ -1180,12 +1187,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1180 pid = alloc_pid(p->nsproxy->pid_ns); 1187 pid = alloc_pid(p->nsproxy->pid_ns);
1181 if (!pid) 1188 if (!pid)
1182 goto bad_fork_cleanup_io; 1189 goto bad_fork_cleanup_io;
1183
1184 if (clone_flags & CLONE_NEWPID) {
1185 retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
1186 if (retval < 0)
1187 goto bad_fork_free_pid;
1188 }
1189 } 1190 }
1190 1191
1191 p->pid = pid_nr(pid); 1192 p->pid = pid_nr(pid);
@@ -1204,6 +1205,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1204 * Clear TID on mm_release()? 1205 * Clear TID on mm_release()?
1205 */ 1206 */
1206 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1207 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1208#ifdef CONFIG_BLOCK
1209 p->plug = NULL;
1210#endif
1207#ifdef CONFIG_FUTEX 1211#ifdef CONFIG_FUTEX
1208 p->robust_list = NULL; 1212 p->robust_list = NULL;
1209#ifdef CONFIG_COMPAT 1213#ifdef CONFIG_COMPAT
@@ -1289,7 +1293,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1289 tracehook_finish_clone(p, clone_flags, trace); 1293 tracehook_finish_clone(p, clone_flags, trace);
1290 1294
1291 if (thread_group_leader(p)) { 1295 if (thread_group_leader(p)) {
1292 if (clone_flags & CLONE_NEWPID) 1296 if (is_child_reaper(pid))
1293 p->nsproxy->pid_ns->child_reaper = p; 1297 p->nsproxy->pid_ns->child_reaper = p;
1294 1298
1295 p->signal->leader_pid = pid; 1299 p->signal->leader_pid = pid;
@@ -1512,38 +1516,24 @@ void __init proc_caches_init(void)
1512} 1516}
1513 1517
1514/* 1518/*
1515 * Check constraints on flags passed to the unshare system call and 1519 * Check constraints on flags passed to the unshare system call.
1516 * force unsharing of additional process context as appropriate.
1517 */ 1520 */
1518static void check_unshare_flags(unsigned long *flags_ptr) 1521static int check_unshare_flags(unsigned long unshare_flags)
1519{ 1522{
1523 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1524 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1525 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1526 return -EINVAL;
1520 /* 1527 /*
1521 * If unsharing a thread from a thread group, must also 1528 * Not implemented, but pretend it works if there is nothing to
1522 * unshare vm. 1529 * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
1523 */ 1530 * needs to unshare vm.
1524 if (*flags_ptr & CLONE_THREAD)
1525 *flags_ptr |= CLONE_VM;
1526
1527 /*
1528 * If unsharing vm, must also unshare signal handlers.
1529 */
1530 if (*flags_ptr & CLONE_VM)
1531 *flags_ptr |= CLONE_SIGHAND;
1532
1533 /*
1534 * If unsharing namespace, must also unshare filesystem information.
1535 */ 1531 */
1536 if (*flags_ptr & CLONE_NEWNS) 1532 if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
1537 *flags_ptr |= CLONE_FS; 1533 /* FIXME: get_task_mm() increments ->mm_users */
1538} 1534 if (atomic_read(&current->mm->mm_users) > 1)
1539 1535 return -EINVAL;
1540/* 1536 }
1541 * Unsharing of tasks created with CLONE_THREAD is not supported yet
1542 */
1543static int unshare_thread(unsigned long unshare_flags)
1544{
1545 if (unshare_flags & CLONE_THREAD)
1546 return -EINVAL;
1547 1537
1548 return 0; 1538 return 0;
1549} 1539}
@@ -1570,34 +1560,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1570} 1560}
1571 1561
1572/* 1562/*
1573 * Unsharing of sighand is not supported yet
1574 */
1575static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
1576{
1577 struct sighand_struct *sigh = current->sighand;
1578
1579 if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
1580 return -EINVAL;
1581 else
1582 return 0;
1583}
1584
1585/*
1586 * Unshare vm if it is being shared
1587 */
1588static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
1589{
1590 struct mm_struct *mm = current->mm;
1591
1592 if ((unshare_flags & CLONE_VM) &&
1593 (mm && atomic_read(&mm->mm_users) > 1)) {
1594 return -EINVAL;
1595 }
1596
1597 return 0;
1598}
1599
1600/*
1601 * Unshare file descriptor table if it is being shared 1563 * Unshare file descriptor table if it is being shared
1602 */ 1564 */
1603static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) 1565static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
@@ -1625,45 +1587,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
1625 */ 1587 */
1626SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) 1588SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1627{ 1589{
1628 int err = 0;
1629 struct fs_struct *fs, *new_fs = NULL; 1590 struct fs_struct *fs, *new_fs = NULL;
1630 struct sighand_struct *new_sigh = NULL;
1631 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1632 struct files_struct *fd, *new_fd = NULL; 1591 struct files_struct *fd, *new_fd = NULL;
1633 struct nsproxy *new_nsproxy = NULL; 1592 struct nsproxy *new_nsproxy = NULL;
1634 int do_sysvsem = 0; 1593 int do_sysvsem = 0;
1594 int err;
1635 1595
1636 check_unshare_flags(&unshare_flags); 1596 err = check_unshare_flags(unshare_flags);
1637 1597 if (err)
1638 /* Return -EINVAL for all unsupported flags */
1639 err = -EINVAL;
1640 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1641 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1642 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1643 goto bad_unshare_out; 1598 goto bad_unshare_out;
1644 1599
1645 /* 1600 /*
1601 * If unsharing namespace, must also unshare filesystem information.
1602 */
1603 if (unshare_flags & CLONE_NEWNS)
1604 unshare_flags |= CLONE_FS;
1605 /*
1646 * CLONE_NEWIPC must also detach from the undolist: after switching 1606 * CLONE_NEWIPC must also detach from the undolist: after switching
1647 * to a new ipc namespace, the semaphore arrays from the old 1607 * to a new ipc namespace, the semaphore arrays from the old
1648 * namespace are unreachable. 1608 * namespace are unreachable.
1649 */ 1609 */
1650 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) 1610 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1651 do_sysvsem = 1; 1611 do_sysvsem = 1;
1652 if ((err = unshare_thread(unshare_flags)))
1653 goto bad_unshare_out;
1654 if ((err = unshare_fs(unshare_flags, &new_fs))) 1612 if ((err = unshare_fs(unshare_flags, &new_fs)))
1655 goto bad_unshare_cleanup_thread; 1613 goto bad_unshare_out;
1656 if ((err = unshare_sighand(unshare_flags, &new_sigh)))
1657 goto bad_unshare_cleanup_fs;
1658 if ((err = unshare_vm(unshare_flags, &new_mm)))
1659 goto bad_unshare_cleanup_sigh;
1660 if ((err = unshare_fd(unshare_flags, &new_fd))) 1614 if ((err = unshare_fd(unshare_flags, &new_fd)))
1661 goto bad_unshare_cleanup_vm; 1615 goto bad_unshare_cleanup_fs;
1662 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1616 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1663 new_fs))) 1617 new_fs)))
1664 goto bad_unshare_cleanup_fd; 1618 goto bad_unshare_cleanup_fd;
1665 1619
1666 if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { 1620 if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
1667 if (do_sysvsem) { 1621 if (do_sysvsem) {
1668 /* 1622 /*
1669 * CLONE_SYSVSEM is equivalent to sys_exit(). 1623 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1689,19 +1643,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1689 spin_unlock(&fs->lock); 1643 spin_unlock(&fs->lock);
1690 } 1644 }
1691 1645
1692 if (new_mm) {
1693 mm = current->mm;
1694 active_mm = current->active_mm;
1695 current->mm = new_mm;
1696 current->active_mm = new_mm;
1697 if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
1698 atomic_dec(&mm->oom_disable_count);
1699 atomic_inc(&new_mm->oom_disable_count);
1700 }
1701 activate_mm(active_mm, new_mm);
1702 new_mm = mm;
1703 }
1704
1705 if (new_fd) { 1646 if (new_fd) {
1706 fd = current->files; 1647 fd = current->files;
1707 current->files = new_fd; 1648 current->files = new_fd;
@@ -1718,20 +1659,10 @@ bad_unshare_cleanup_fd:
1718 if (new_fd) 1659 if (new_fd)
1719 put_files_struct(new_fd); 1660 put_files_struct(new_fd);
1720 1661
1721bad_unshare_cleanup_vm:
1722 if (new_mm)
1723 mmput(new_mm);
1724
1725bad_unshare_cleanup_sigh:
1726 if (new_sigh)
1727 if (atomic_dec_and_test(&new_sigh->count))
1728 kmem_cache_free(sighand_cachep, new_sigh);
1729
1730bad_unshare_cleanup_fs: 1662bad_unshare_cleanup_fs:
1731 if (new_fs) 1663 if (new_fs)
1732 free_fs_struct(new_fs); 1664 free_fs_struct(new_fs);
1733 1665
1734bad_unshare_cleanup_thread:
1735bad_unshare_out: 1666bad_unshare_out:
1736 return err; 1667 return err;
1737} 1668}
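
The unshare() rework above folds the flag validation into check_unshare_flags(), makes CLONE_NEWNS imply CLONE_FS, and drops the thread/sighand/vm unshare paths along with the oom_disable_count juggling. A minimal userspace sketch (illustrative, not part of the patch) of the behaviour callers see: unsharing the mount namespace now always comes with a private fs_struct (root, cwd, umask), and still requires CAP_SYS_ADMIN.

/* Hedged userspace sketch: unshare(CLONE_NEWNS); per the change above the
 * kernel adds CLONE_FS implicitly.  Needs CAP_SYS_ADMIN. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* After this call the process has a private mount namespace and,
	 * because of the hunk above, a private fs_struct as well. */
	if (unshare(CLONE_NEWNS) == -1) {
		perror("unshare(CLONE_NEWNS)");
		return 1;
	}
	/* chdir() here no longer affects any process that shared the old
	 * fs_struct with us. */
	if (chdir("/") == -1)
		perror("chdir");
	return 0;
}
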
diff --git a/kernel/futex.c b/kernel/futex.c
index b766d28accd6..dfb924ffe65b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
381 return NULL; 381 return NULL;
382} 382}
383 383
384static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 384static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
385 u32 uval, u32 newval)
385{ 386{
386 u32 curval; 387 int ret;
387 388
388 pagefault_disable(); 389 pagefault_disable();
389 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 390 ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
390 pagefault_enable(); 391 pagefault_enable();
391 392
392 return curval; 393 return ret;
393} 394}
394 395
395static int get_futex_value_locked(u32 *dest, u32 __user *from) 396static int get_futex_value_locked(u32 *dest, u32 __user *from)
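
cmpxchg_futex_value_locked() now returns 0 or -EFAULT and hands the old futex word back through a pointer, so a fault can no longer be confused with a futex value that happens to equal (u32)-EFAULT. A sketch of the new calling convention as it would appear inside kernel/futex.c (the function and variable names below are illustrative, not from this diff):

/* Sketch of a caller using the new convention; "update_futex_word",
 * "expected" and "newval" are illustrative names. */
static int update_futex_word(u32 __user *uaddr, u32 expected, u32 newval)
{
	u32 curval;

	if (cmpxchg_futex_value_locked(&curval, uaddr, expected, newval))
		return -EFAULT;		/* page not resident or not writable */
	if (curval != expected)
		return -EAGAIN;		/* raced with another task */
	return 0;			/* *uaddr now holds newval */
}

Callers that can sleep typically handle -EFAULT by faulting the page in (fault_in_user_writeable()) and retrying, which is exactly the pattern used in the hunks that follow.
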
@@ -674,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
674 struct task_struct *task, int set_waiters) 675 struct task_struct *task, int set_waiters)
675{ 676{
676 int lock_taken, ret, ownerdied = 0; 677 int lock_taken, ret, ownerdied = 0;
677 u32 uval, newval, curval; 678 u32 uval, newval, curval, vpid = task_pid_vnr(task);
678 679
679retry: 680retry:
680 ret = lock_taken = 0; 681 ret = lock_taken = 0;
@@ -684,19 +685,17 @@ retry:
684 * (by doing a 0 -> TID atomic cmpxchg), while holding all 685 * (by doing a 0 -> TID atomic cmpxchg), while holding all
685 * the locks. It will most likely not succeed. 686 * the locks. It will most likely not succeed.
686 */ 687 */
687 newval = task_pid_vnr(task); 688 newval = vpid;
688 if (set_waiters) 689 if (set_waiters)
689 newval |= FUTEX_WAITERS; 690 newval |= FUTEX_WAITERS;
690 691
691 curval = cmpxchg_futex_value_locked(uaddr, 0, newval); 692 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
692
693 if (unlikely(curval == -EFAULT))
694 return -EFAULT; 693 return -EFAULT;
695 694
696 /* 695 /*
697 * Detect deadlocks. 696 * Detect deadlocks.
698 */ 697 */
699 if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) 698 if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
700 return -EDEADLK; 699 return -EDEADLK;
701 700
702 /* 701 /*
@@ -723,14 +722,12 @@ retry:
723 */ 722 */
724 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { 723 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
725 /* Keep the OWNER_DIED bit */ 724 /* Keep the OWNER_DIED bit */
726 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); 725 newval = (curval & ~FUTEX_TID_MASK) | vpid;
727 ownerdied = 0; 726 ownerdied = 0;
728 lock_taken = 1; 727 lock_taken = 1;
729 } 728 }
730 729
731 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 730 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
732
733 if (unlikely(curval == -EFAULT))
734 return -EFAULT; 731 return -EFAULT;
735 if (unlikely(curval != uval)) 732 if (unlikely(curval != uval))
736 goto retry; 733 goto retry;
@@ -775,6 +772,24 @@ retry:
775 return ret; 772 return ret;
776} 773}
777 774
775/**
776 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
777 * @q: The futex_q to unqueue
778 *
779 * The q->lock_ptr must not be NULL and must be held by the caller.
780 */
781static void __unqueue_futex(struct futex_q *q)
782{
783 struct futex_hash_bucket *hb;
784
785 if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
786 || WARN_ON(plist_node_empty(&q->list)))
787 return;
788
789 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
790 plist_del(&q->list, &hb->chain);
791}
792
778/* 793/*
779 * The hash bucket lock must be held when this is called. 794 * The hash bucket lock must be held when this is called.
780 * Afterwards, the futex_q must not be accessed. 795 * Afterwards, the futex_q must not be accessed.
@@ -792,7 +807,7 @@ static void wake_futex(struct futex_q *q)
792 */ 807 */
793 get_task_struct(p); 808 get_task_struct(p);
794 809
795 plist_del(&q->list, &q->list.plist); 810 __unqueue_futex(q);
796 /* 811 /*
797 * The waiting task can free the futex_q as soon as 812 * The waiting task can free the futex_q as soon as
798 * q->lock_ptr = NULL is written, without taking any locks. A 813 * q->lock_ptr = NULL is written, without taking any locks. A
@@ -843,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
843 858
844 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 859 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
845 860
846 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 861 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
847
848 if (curval == -EFAULT)
849 ret = -EFAULT; 862 ret = -EFAULT;
850 else if (curval != uval) 863 else if (curval != uval)
851 ret = -EINVAL; 864 ret = -EINVAL;
@@ -880,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
880 * There is no waiter, so we unlock the futex. The owner died 893 * There is no waiter, so we unlock the futex. The owner died
881 * bit has not to be preserved here. We are the owner: 894 * bit has not to be preserved here. We are the owner:
882 */ 895 */
883 oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); 896 if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
884 897 return -EFAULT;
885 if (oldval == -EFAULT)
886 return oldval;
887 if (oldval != uval) 898 if (oldval != uval)
888 return -EAGAIN; 899 return -EAGAIN;
889 900
@@ -1071,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1071 plist_del(&q->list, &hb1->chain); 1082 plist_del(&q->list, &hb1->chain);
1072 plist_add(&q->list, &hb2->chain); 1083 plist_add(&q->list, &hb2->chain);
1073 q->lock_ptr = &hb2->lock; 1084 q->lock_ptr = &hb2->lock;
1074#ifdef CONFIG_DEBUG_PI_LIST
1075 q->list.plist.spinlock = &hb2->lock;
1076#endif
1077 } 1085 }
1078 get_futex_key_refs(key2); 1086 get_futex_key_refs(key2);
1079 q->key = *key2; 1087 q->key = *key2;
@@ -1100,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1100 get_futex_key_refs(key); 1108 get_futex_key_refs(key);
1101 q->key = *key; 1109 q->key = *key;
1102 1110
1103 WARN_ON(plist_node_empty(&q->list)); 1111 __unqueue_futex(q);
1104 plist_del(&q->list, &q->list.plist);
1105 1112
1106 WARN_ON(!q->rt_waiter); 1113 WARN_ON(!q->rt_waiter);
1107 q->rt_waiter = NULL; 1114 q->rt_waiter = NULL;
1108 1115
1109 q->lock_ptr = &hb->lock; 1116 q->lock_ptr = &hb->lock;
1110#ifdef CONFIG_DEBUG_PI_LIST
1111 q->list.plist.spinlock = &hb->lock;
1112#endif
1113 1117
1114 wake_up_state(q->task, TASK_NORMAL); 1118 wake_up_state(q->task, TASK_NORMAL);
1115} 1119}
@@ -1457,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1457 prio = min(current->normal_prio, MAX_RT_PRIO); 1461 prio = min(current->normal_prio, MAX_RT_PRIO);
1458 1462
1459 plist_node_init(&q->list, prio); 1463 plist_node_init(&q->list, prio);
1460#ifdef CONFIG_DEBUG_PI_LIST
1461 q->list.plist.spinlock = &hb->lock;
1462#endif
1463 plist_add(&q->list, &hb->chain); 1464 plist_add(&q->list, &hb->chain);
1464 q->task = current; 1465 q->task = current;
1465 spin_unlock(&hb->lock); 1466 spin_unlock(&hb->lock);
@@ -1504,8 +1505,7 @@ retry:
1504 spin_unlock(lock_ptr); 1505 spin_unlock(lock_ptr);
1505 goto retry; 1506 goto retry;
1506 } 1507 }
1507 WARN_ON(plist_node_empty(&q->list)); 1508 __unqueue_futex(q);
1508 plist_del(&q->list, &q->list.plist);
1509 1509
1510 BUG_ON(q->pi_state); 1510 BUG_ON(q->pi_state);
1511 1511
@@ -1525,8 +1525,7 @@ retry:
1525static void unqueue_me_pi(struct futex_q *q) 1525static void unqueue_me_pi(struct futex_q *q)
1526 __releases(q->lock_ptr) 1526 __releases(q->lock_ptr)
1527{ 1527{
1528 WARN_ON(plist_node_empty(&q->list)); 1528 __unqueue_futex(q);
1529 plist_del(&q->list, &q->list.plist);
1530 1529
1531 BUG_ON(!q->pi_state); 1530 BUG_ON(!q->pi_state);
1532 free_pi_state(q->pi_state); 1531 free_pi_state(q->pi_state);
@@ -1556,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1556 1555
1557 /* 1556 /*
1558 * We are here either because we stole the rtmutex from the 1557 * We are here either because we stole the rtmutex from the
1559 * pending owner or we are the pending owner which failed to 1558 * previous highest priority waiter or we are the highest priority
1560 * get the rtmutex. We have to replace the pending owner TID 1559 * waiter but failed to get the rtmutex the first time.
1561 * in the user space variable. This must be atomic as we have 1560 * We have to replace the newowner TID in the user space variable.
1562 * to preserve the owner died bit here. 1561 * This must be atomic as we have to preserve the owner died bit here.
1563 * 1562 *
1564 * Note: We write the user space value _before_ changing the pi_state 1563 * Note: We write the user space value _before_ changing the pi_state
1565 * because we can fault here. Imagine swapped out pages or a fork 1564 * because we can fault here. Imagine swapped out pages or a fork
@@ -1578,9 +1577,7 @@ retry:
1578 while (1) { 1577 while (1) {
1579 newval = (uval & FUTEX_OWNER_DIED) | newtid; 1578 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1580 1579
1581 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 1580 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1582
1583 if (curval == -EFAULT)
1584 goto handle_fault; 1581 goto handle_fault;
1585 if (curval == uval) 1582 if (curval == uval)
1586 break; 1583 break;
@@ -1608,8 +1605,8 @@ retry:
1608 1605
1609 /* 1606 /*
1610 * To handle the page fault we need to drop the hash bucket 1607 * To handle the page fault we need to drop the hash bucket
1611 * lock here. That gives the other task (either the pending 1608 * lock here. That gives the other task (either the highest priority
1612 * owner itself or the task which stole the rtmutex) the 1609 * waiter itself or the task which stole the rtmutex) the
1613 * chance to try the fixup of the pi_state. So once we are 1610 * chance to try the fixup of the pi_state. So once we are
1614 * back from handling the fault we need to check the pi_state 1611 * back from handling the fault we need to check the pi_state
1615 * after reacquiring the hash bucket lock and before trying to 1612 * after reacquiring the hash bucket lock and before trying to
@@ -1685,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1685 /* 1682 /*
1686 * pi_state is incorrect, some other task did a lock steal and 1683 * pi_state is incorrect, some other task did a lock steal and
1687 * we returned due to timeout or signal without taking the 1684 * we returned due to timeout or signal without taking the
1688 * rt_mutex. Too late. We can access the rt_mutex_owner without 1685 * rt_mutex. Too late.
1689 * locking, as the other task is now blocked on the hash bucket
1690 * lock. Fix the state up.
1691 */ 1686 */
1687 raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1692 owner = rt_mutex_owner(&q->pi_state->pi_mutex); 1688 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1689 if (!owner)
1690 owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1691 raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1693 ret = fixup_pi_state_owner(uaddr, q, owner); 1692 ret = fixup_pi_state_owner(uaddr, q, owner);
1694 goto out; 1693 goto out;
1695 } 1694 }
1696 1695
1697 /* 1696 /*
1698 * Paranoia check. If we did not take the lock, then we should not be 1697 * Paranoia check. If we did not take the lock, then we should not be
1699 * the owner, nor the pending owner, of the rt_mutex. 1698 * the owner of the rt_mutex.
1700 */ 1699 */
1701 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) 1700 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1702 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " 1701 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
@@ -1781,13 +1780,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1781 * 1780 *
1782 * The basic logical guarantee of a futex is that it blocks ONLY 1781 * The basic logical guarantee of a futex is that it blocks ONLY
1783 * if cond(var) is known to be true at the time of blocking, for 1782 * if cond(var) is known to be true at the time of blocking, for
1784 * any cond. If we queued after testing *uaddr, that would open 1783 * any cond. If we locked the hash-bucket after testing *uaddr, that
1785 * a race condition where we could block indefinitely with 1784 * would open a race condition where we could block indefinitely with
1786 * cond(var) false, which would violate the guarantee. 1785 * cond(var) false, which would violate the guarantee.
1787 * 1786 *
1788 * A consequence is that futex_wait() can return zero and absorb 1787 * On the other hand, we insert q and release the hash-bucket only
1789 * a wakeup when *uaddr != val on entry to the syscall. This is 1788 * after testing *uaddr. This guarantees that futex_wait() will NOT
1790 * rare, but normal. 1789 * absorb a wakeup if *uaddr does not match the desired values
1790 * while the syscall executes.
1791 */ 1791 */
1792retry: 1792retry:
1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); 1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
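
The reworded comment documents the ordering futex_wait() relies on: the futex word is read and the waiter queued before the hash-bucket lock is dropped, so a wakeup cannot be absorbed once *uaddr already differs from the expected value. A hedged userspace sketch of the wait/wake pattern this guarantee exists for (raw futex(2) calls, since glibc provides no wrapper):

/* Hedged userspace sketch of the wait/wake pattern described above. */
#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static int futex_wait(int *uaddr, int expected)
{
	/* The kernel re-checks *uaddr == expected while queueing, so a
	 * waker that changes the word first cannot be missed. */
	return syscall(SYS_futex, uaddr, FUTEX_WAIT, expected, NULL, NULL, 0);
}

static int futex_wake(int *uaddr, int nwake)
{
	return syscall(SYS_futex, uaddr, FUTEX_WAKE, nwake, NULL, NULL, 0);
}

FUTEX_WAIT fails with EWOULDBLOCK when *uaddr no longer equals the expected value, which is the userspace-visible side of the check the comment describes.
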
@@ -2046,9 +2046,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2046{ 2046{
2047 struct futex_hash_bucket *hb; 2047 struct futex_hash_bucket *hb;
2048 struct futex_q *this, *next; 2048 struct futex_q *this, *next;
2049 u32 uval;
2050 struct plist_head *head; 2049 struct plist_head *head;
2051 union futex_key key = FUTEX_KEY_INIT; 2050 union futex_key key = FUTEX_KEY_INIT;
2051 u32 uval, vpid = task_pid_vnr(current);
2052 int ret; 2052 int ret;
2053 2053
2054retry: 2054retry:
@@ -2057,7 +2057,7 @@ retry:
2057 /* 2057 /*
2058 * We release only a lock we actually own: 2058 * We release only a lock we actually own:
2059 */ 2059 */
2060 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2060 if ((uval & FUTEX_TID_MASK) != vpid)
2061 return -EPERM; 2061 return -EPERM;
2062 2062
2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); 2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
@@ -2072,17 +2072,14 @@ retry:
2072 * again. If it succeeds then we can return without waking 2072 * again. If it succeeds then we can return without waking
2073 * anyone else up: 2073 * anyone else up:
2074 */ 2074 */
2075 if (!(uval & FUTEX_OWNER_DIED)) 2075 if (!(uval & FUTEX_OWNER_DIED) &&
2076 uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); 2076 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2077
2078
2079 if (unlikely(uval == -EFAULT))
2080 goto pi_faulted; 2077 goto pi_faulted;
2081 /* 2078 /*
2082 * Rare case: we managed to release the lock atomically, 2079 * Rare case: we managed to release the lock atomically,
2083 * no need to wake anyone else up: 2080 * no need to wake anyone else up:
2084 */ 2081 */
2085 if (unlikely(uval == task_pid_vnr(current))) 2082 if (unlikely(uval == vpid))
2086 goto out_unlock; 2083 goto out_unlock;
2087 2084
2088 /* 2085 /*
@@ -2167,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2167 * We were woken prior to requeue by a timeout or a signal. 2164 * We were woken prior to requeue by a timeout or a signal.
2168 * Unqueue the futex_q and determine which it was. 2165 * Unqueue the futex_q and determine which it was.
2169 */ 2166 */
2170 plist_del(&q->list, &q->list.plist); 2167 plist_del(&q->list, &hb->chain);
2171 2168
2172 /* Handle spurious wakeups gracefully */ 2169 /* Handle spurious wakeups gracefully */
2173 ret = -EWOULDBLOCK; 2170 ret = -EWOULDBLOCK;
@@ -2421,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
2421 goto err_unlock; 2418 goto err_unlock;
2422 ret = -EPERM; 2419 ret = -EPERM;
2423 pcred = __task_cred(p); 2420 pcred = __task_cred(p);
2421 /* If victim is in different user_ns, then uids are not
2422 comparable, so we must have CAP_SYS_PTRACE */
2423 if (cred->user->user_ns != pcred->user->user_ns) {
2424 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2425 goto err_unlock;
2426 goto ok;
2427 }
2428 /* If victim is in same user_ns, then uids are comparable */
2424 if (cred->euid != pcred->euid && 2429 if (cred->euid != pcred->euid &&
2425 cred->euid != pcred->uid && 2430 cred->euid != pcred->uid &&
2426 !capable(CAP_SYS_PTRACE)) 2431 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2427 goto err_unlock; 2432 goto err_unlock;
2433ok:
2428 head = p->robust_list; 2434 head = p->robust_list;
2429 rcu_read_unlock(); 2435 rcu_read_unlock();
2430 } 2436 }
@@ -2463,11 +2469,20 @@ retry:
2463 * userspace. 2469 * userspace.
2464 */ 2470 */
2465 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 2471 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2466 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); 2472 /*
2467 2473 * We are not holding a lock here, but we want to have
2468 if (nval == -EFAULT) 2474 * the pagefault_disable/enable() protection because
2469 return -1; 2475 * we want to handle the fault gracefully. If the
2470 2476 * access fails we try to fault in the futex with R/W
2477 * verification via get_user_pages. get_user() above
2478 * does not guarantee R/W access. If that fails we
2479 * give up and leave the futex locked.
2480 */
2481 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2482 if (fault_in_user_writeable(uaddr))
2483 return -1;
2484 goto retry;
2485 }
2471 if (nval != uval) 2486 if (nval != uval)
2472 goto retry; 2487 goto retry;
2473 2488
@@ -2678,8 +2693,7 @@ static int __init futex_init(void)
2678 * implementation, the non-functional ones will return 2693 * implementation, the non-functional ones will return
2679 * -ENOSYS. 2694 * -ENOSYS.
2680 */ 2695 */
2681 curval = cmpxchg_futex_value_locked(NULL, 0, 0); 2696 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2682 if (curval == -EFAULT)
2683 futex_cmpxchg_enabled = 1; 2697 futex_cmpxchg_enabled = 1;
2684 2698
2685 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2699 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
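
handle_futex_death() above now retries through fault_in_user_writeable() instead of giving up on the first fault while walking the exiting task's robust list. For context, a hedged sketch of what that list looks like from userspace; glibc normally registers one per thread, and the struct layout and offsets below are written from memory and should be checked against include/linux/futex.h:

/* Hedged sketch: the per-thread robust list that exit_robust_list() /
 * handle_futex_death() walk.  "my_robust_mutex" is illustrative. */
#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <stddef.h>
#include <unistd.h>

struct my_robust_mutex {
	struct robust_list link;	/* linkage the kernel follows */
	int owner_tid;			/* futex word: TID of the holder */
};

static struct robust_list_head head = {
	.list            = { .next = &head.list },	/* empty: next points at the head */
	.futex_offset    = offsetof(struct my_robust_mutex, owner_tid)
			   - offsetof(struct my_robust_mutex, link),
	.list_op_pending = NULL,
};

static int register_robust_list(void)
{
	/* On exit the kernel walks this list and, for each held lock, sets
	 * FUTEX_OWNER_DIED and wakes one waiter -- the code changed above. */
	return syscall(SYS_set_robust_list, &head, sizeof(head));
}
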
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index a7934ac75e5b..5f9e689dc8f0 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
153 goto err_unlock; 153 goto err_unlock;
154 ret = -EPERM; 154 ret = -EPERM;
155 pcred = __task_cred(p); 155 pcred = __task_cred(p);
156 /* If victim is in different user_ns, then uids are not
157 comparable, so we must have CAP_SYS_PTRACE */
158 if (cred->user->user_ns != pcred->user->user_ns) {
159 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
160 goto err_unlock;
161 goto ok;
162 }
163 /* If victim is in same user_ns, then uids are comparable */
156 if (cred->euid != pcred->euid && 164 if (cred->euid != pcred->euid &&
157 cred->euid != pcred->uid && 165 cred->euid != pcred->uid &&
158 !capable(CAP_SYS_PTRACE)) 166 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
159 goto err_unlock; 167 goto err_unlock;
168ok:
160 head = p->compat_robust_list; 169 head = p->compat_robust_list;
161 rcu_read_unlock(); 170 rcu_read_unlock();
162 } 171 }
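
Both get_robust_list() implementations now apply the same rule: if the target task lives in a different user namespace, uids are not comparable and CAP_SYS_PTRACE must be held towards the target's namespace; otherwise the usual euid/uid match or CAP_SYS_PTRACE check applies. Condensed into a single predicate (sketch only, mirroring the two hunks above):

/* Condensed form of the check added in both hunks above (sketch only). */
#include <linux/cred.h>
#include <linux/capability.h>

static bool may_inspect_robust_list(const struct cred *cred,
				    const struct cred *pcred)
{
	/* Different user namespace: uids are not comparable, require
	 * CAP_SYS_PTRACE towards the target's namespace. */
	if (cred->user->user_ns != pcred->user->user_ns)
		return ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE);

	/* Same namespace: owner match, or CAP_SYS_PTRACE in that namespace. */
	return cred->euid == pcred->euid ||
	       cred->euid == pcred->uid  ||
	       ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE);
}
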
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 70a298d6da71..b8cadf70b1fb 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
34config GCOV_PROFILE_ALL 34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel" 35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL 36 depends on GCOV_KERNEL
37 depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE 37 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
38 default n 38 default n
39 ---help--- 39 ---help---
40 This options activates profiling for the entire kernel. 40 This options activates profiling for the entire kernel.
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 3f761001d517..e97ca59e2520 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,3 @@
1EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' 1ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
2 2
3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o 3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/groups.c b/kernel/groups.c
index 253dc0f35cf4..1cc476d52dd3 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
233 struct group_info *group_info; 233 struct group_info *group_info;
234 int retval; 234 int retval;
235 235
236 if (!capable(CAP_SETGID)) 236 if (!nsown_capable(CAP_SETGID))
237 return -EPERM; 237 return -EPERM;
238 if ((unsigned)gidsetsize > NGROUPS_MAX) 238 if ((unsigned)gidsetsize > NGROUPS_MAX)
239 return -EINVAL; 239 return -EINVAL;
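
setgroups() now checks CAP_SETGID against the caller's own user namespace via nsown_capable() instead of the global capable(). The helper is roughly the following one-liner (stated as an assumption; see include/linux/capability.h for the authoritative definition):

/* Rough shape of the helper used above (assumption, not from this diff). */
static inline bool nsown_capable(int cap)
{
	return ns_capable(current_user_ns(), cap);
}
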
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0c8d7c048615..9017478c5d4c 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -53,11 +53,10 @@
53/* 53/*
54 * The timer bases: 54 * The timer bases:
55 * 55 *
56 * Note: If we want to add new timer bases, we have to skip the two 56 * There are more clockids then hrtimer bases. Thus, we index
57 * clock ids captured by the cpu-timers. We do this by holding empty 57 * into the timer bases by the hrtimer_base_type enum. When trying
58 * entries rather than doing math adjustment of the clock ids. 58 * to reach a base using a clockid, hrtimer_clockid_to_base()
59 * This ensures that we capture erroneous accesses to these clock ids 59 * is used to convert from clockid to the proper hrtimer_base_type.
60 * rather than moving them into the range of valid clock id's.
61 */ 60 */
62DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = 61DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
63{ 62{
@@ -74,30 +73,39 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
74 .get_time = &ktime_get, 73 .get_time = &ktime_get,
75 .resolution = KTIME_LOW_RES, 74 .resolution = KTIME_LOW_RES,
76 }, 75 },
76 {
77 .index = CLOCK_BOOTTIME,
78 .get_time = &ktime_get_boottime,
79 .resolution = KTIME_LOW_RES,
80 },
77 } 81 }
78}; 82};
79 83
84static int hrtimer_clock_to_base_table[MAX_CLOCKS];
85
86static inline int hrtimer_clockid_to_base(clockid_t clock_id)
87{
88 return hrtimer_clock_to_base_table[clock_id];
89}
90
91
80/* 92/*
81 * Get the coarse grained time at the softirq based on xtime and 93 * Get the coarse grained time at the softirq based on xtime and
82 * wall_to_monotonic. 94 * wall_to_monotonic.
83 */ 95 */
84static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) 96static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
85{ 97{
86 ktime_t xtim, tomono; 98 ktime_t xtim, mono, boot;
87 struct timespec xts, tom; 99 struct timespec xts, tom, slp;
88 unsigned long seq;
89 100
90 do { 101 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
91 seq = read_seqbegin(&xtime_lock);
92 xts = __current_kernel_time();
93 tom = __get_wall_to_monotonic();
94 } while (read_seqretry(&xtime_lock, seq));
95 102
96 xtim = timespec_to_ktime(xts); 103 xtim = timespec_to_ktime(xts);
97 tomono = timespec_to_ktime(tom); 104 mono = ktime_add(xtim, timespec_to_ktime(tom));
98 base->clock_base[CLOCK_REALTIME].softirq_time = xtim; 105 boot = ktime_add(mono, timespec_to_ktime(slp));
99 base->clock_base[CLOCK_MONOTONIC].softirq_time = 106 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
100 ktime_add(xtim, tomono); 107 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
108 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
101} 109}
102 110
103/* 111/*
@@ -184,10 +192,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
184 struct hrtimer_cpu_base *new_cpu_base; 192 struct hrtimer_cpu_base *new_cpu_base;
185 int this_cpu = smp_processor_id(); 193 int this_cpu = smp_processor_id();
186 int cpu = hrtimer_get_target(this_cpu, pinned); 194 int cpu = hrtimer_get_target(this_cpu, pinned);
195 int basenum = hrtimer_clockid_to_base(base->index);
187 196
188again: 197again:
189 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 198 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
190 new_base = &new_cpu_base->clock_base[base->index]; 199 new_base = &new_cpu_base->clock_base[basenum];
191 200
192 if (base != new_base) { 201 if (base != new_base) {
193 /* 202 /*
@@ -334,6 +343,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe);
334 343
335static struct debug_obj_descr hrtimer_debug_descr; 344static struct debug_obj_descr hrtimer_debug_descr;
336 345
346static void *hrtimer_debug_hint(void *addr)
347{
348 return ((struct hrtimer *) addr)->function;
349}
350
337/* 351/*
338 * fixup_init is called when: 352 * fixup_init is called when:
339 * - an active object is initialized 353 * - an active object is initialized
@@ -393,6 +407,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
393 407
394static struct debug_obj_descr hrtimer_debug_descr = { 408static struct debug_obj_descr hrtimer_debug_descr = {
395 .name = "hrtimer", 409 .name = "hrtimer",
410 .debug_hint = hrtimer_debug_hint,
396 .fixup_init = hrtimer_fixup_init, 411 .fixup_init = hrtimer_fixup_init,
397 .fixup_activate = hrtimer_fixup_activate, 412 .fixup_activate = hrtimer_fixup_activate,
398 .fixup_free = hrtimer_fixup_free, 413 .fixup_free = hrtimer_fixup_free,
@@ -611,24 +626,23 @@ static int hrtimer_reprogram(struct hrtimer *timer,
611static void retrigger_next_event(void *arg) 626static void retrigger_next_event(void *arg)
612{ 627{
613 struct hrtimer_cpu_base *base; 628 struct hrtimer_cpu_base *base;
614 struct timespec realtime_offset, wtm; 629 struct timespec realtime_offset, wtm, sleep;
615 unsigned long seq;
616 630
617 if (!hrtimer_hres_active()) 631 if (!hrtimer_hres_active())
618 return; 632 return;
619 633
620 do { 634 get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm,
621 seq = read_seqbegin(&xtime_lock); 635 &sleep);
622 wtm = __get_wall_to_monotonic();
623 } while (read_seqretry(&xtime_lock, seq));
624 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); 636 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
625 637
626 base = &__get_cpu_var(hrtimer_bases); 638 base = &__get_cpu_var(hrtimer_bases);
627 639
628 /* Adjust CLOCK_REALTIME offset */ 640 /* Adjust CLOCK_REALTIME offset */
629 raw_spin_lock(&base->lock); 641 raw_spin_lock(&base->lock);
630 base->clock_base[CLOCK_REALTIME].offset = 642 base->clock_base[HRTIMER_BASE_REALTIME].offset =
631 timespec_to_ktime(realtime_offset); 643 timespec_to_ktime(realtime_offset);
644 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
645 timespec_to_ktime(sleep);
632 646
633 hrtimer_force_reprogram(base, 0); 647 hrtimer_force_reprogram(base, 0);
634 raw_spin_unlock(&base->lock); 648 raw_spin_unlock(&base->lock);
@@ -673,14 +687,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
673} 687}
674 688
675/* 689/*
676 * Initialize the high resolution related parts of a hrtimer
677 */
678static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
679{
680}
681
682
683/*
684 * When High resolution timers are active, try to reprogram. Note, that in case 690 * When High resolution timers are active, try to reprogram. Note, that in case
685 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry 691 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
686 * check happens. The timer gets enqueued into the rbtree. The reprogramming 692 * check happens. The timer gets enqueued into the rbtree. The reprogramming
@@ -725,8 +731,9 @@ static int hrtimer_switch_to_hres(void)
725 return 0; 731 return 0;
726 } 732 }
727 base->hres_active = 1; 733 base->hres_active = 1;
728 base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; 734 base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES;
729 base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; 735 base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES;
736 base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES;
730 737
731 tick_setup_sched_timer(); 738 tick_setup_sched_timer();
732 739
@@ -750,7 +757,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
750 return 0; 757 return 0;
751} 758}
752static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 759static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
753static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
754 760
755#endif /* CONFIG_HIGH_RES_TIMERS */ 761#endif /* CONFIG_HIGH_RES_TIMERS */
756 762
@@ -1121,6 +1127,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1121 enum hrtimer_mode mode) 1127 enum hrtimer_mode mode)
1122{ 1128{
1123 struct hrtimer_cpu_base *cpu_base; 1129 struct hrtimer_cpu_base *cpu_base;
1130 int base;
1124 1131
1125 memset(timer, 0, sizeof(struct hrtimer)); 1132 memset(timer, 0, sizeof(struct hrtimer));
1126 1133
@@ -1129,8 +1136,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1129 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) 1136 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
1130 clock_id = CLOCK_MONOTONIC; 1137 clock_id = CLOCK_MONOTONIC;
1131 1138
1132 timer->base = &cpu_base->clock_base[clock_id]; 1139 base = hrtimer_clockid_to_base(clock_id);
1133 hrtimer_init_timer_hres(timer); 1140 timer->base = &cpu_base->clock_base[base];
1134 timerqueue_init(&timer->node); 1141 timerqueue_init(&timer->node);
1135 1142
1136#ifdef CONFIG_TIMER_STATS 1143#ifdef CONFIG_TIMER_STATS
@@ -1165,9 +1172,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
1165int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) 1172int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1166{ 1173{
1167 struct hrtimer_cpu_base *cpu_base; 1174 struct hrtimer_cpu_base *cpu_base;
1175 int base = hrtimer_clockid_to_base(which_clock);
1168 1176
1169 cpu_base = &__raw_get_cpu_var(hrtimer_bases); 1177 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
1170 *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); 1178 *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
1171 1179
1172 return 0; 1180 return 0;
1173} 1181}
@@ -1714,6 +1722,10 @@ static struct notifier_block __cpuinitdata hrtimers_nb = {
1714 1722
1715void __init hrtimers_init(void) 1723void __init hrtimers_init(void)
1716{ 1724{
1725 hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME;
1726 hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC;
1727 hrtimer_clock_to_base_table[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME;
1728
1717 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1729 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
1718 (void *)(long)smp_processor_id()); 1730 (void *)(long)smp_processor_id());
1719 register_cpu_notifier(&hrtimers_nb); 1731 register_cpu_notifier(&hrtimers_nb);
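
With the clockid-to-base table in place, hrtimers gain an HRTIMER_BASE_BOOTTIME base next to REALTIME and MONOTONIC, and __hrtimer_init() now indexes bases through hrtimer_clockid_to_base(). A hedged driver-style sketch of arming a timer on the new clock (my_timer and my_fn are placeholder names, not from the patch):

/* Hedged sketch: arming an hrtimer on the new CLOCK_BOOTTIME base. */
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer my_timer;

static enum hrtimer_restart my_fn(struct hrtimer *t)
{
	/* runs in interrupt context */
	return HRTIMER_NORESTART;
}

static void my_timer_setup(void)
{
	/* hrtimer_clockid_to_base() maps CLOCK_BOOTTIME to
	 * HRTIMER_BASE_BOOTTIME inside __hrtimer_init(). */
	hrtimer_init(&my_timer, CLOCK_BOOTTIME, HRTIMER_MODE_REL);
	my_timer.function = my_fn;
	hrtimer_start(&my_timer, ktime_set(0, 100 * NSEC_PER_MSEC),
		      HRTIMER_MODE_REL);
}
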
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 8e42fec7686d..00f2c037267a 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,5 +1,6 @@
1# Select this to activate the generic irq options below
1config HAVE_GENERIC_HARDIRQS 2config HAVE_GENERIC_HARDIRQS
2 def_bool n 3 bool
3 4
4if HAVE_GENERIC_HARDIRQS 5if HAVE_GENERIC_HARDIRQS
5menu "IRQ subsystem" 6menu "IRQ subsystem"
@@ -11,26 +12,48 @@ config GENERIC_HARDIRQS
11 12
12# Select this to disable the deprecated stuff 13# Select this to disable the deprecated stuff
13config GENERIC_HARDIRQS_NO_DEPRECATED 14config GENERIC_HARDIRQS_NO_DEPRECATED
14 def_bool n 15 bool
16
17config GENERIC_HARDIRQS_NO_COMPAT
18 bool
15 19
16# Options selectable by the architecture code 20# Options selectable by the architecture code
21
22# Make sparse irq Kconfig switch below available
17config HAVE_SPARSE_IRQ 23config HAVE_SPARSE_IRQ
18 def_bool n 24 bool
19 25
26# Enable the generic irq autoprobe mechanism
20config GENERIC_IRQ_PROBE 27config GENERIC_IRQ_PROBE
21 def_bool n 28 bool
29
30# Use the generic /proc/interrupts implementation
31config GENERIC_IRQ_SHOW
32 bool
33
34# Print level/edge extra information
35config GENERIC_IRQ_SHOW_LEVEL
36 bool
22 37
38# Support for delayed migration from interrupt context
23config GENERIC_PENDING_IRQ 39config GENERIC_PENDING_IRQ
24 def_bool n 40 bool
25 41
42# Alpha specific irq affinity mechanism
26config AUTO_IRQ_AFFINITY 43config AUTO_IRQ_AFFINITY
27 def_bool n 44 bool
28
29config IRQ_PER_CPU
30 def_bool n
31 45
46# Tasklet based software resend for pending interrupts on enable_irq()
32config HARDIRQS_SW_RESEND 47config HARDIRQS_SW_RESEND
33 def_bool n 48 bool
49
50# Preflow handler support for fasteoi (sparc64)
51config IRQ_PREFLOW_FASTEOI
52 bool
53
54# Support forced irq threading
55config IRQ_FORCED_THREADING
56 bool
34 57
35config SPARSE_IRQ 58config SPARSE_IRQ
36 bool "Support sparse irq numbering" 59 bool "Support sparse irq numbering"
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 505798f86c36..394784c57060 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -17,7 +17,7 @@
17/* 17/*
18 * Autodetection depends on the fact that any interrupt that 18 * Autodetection depends on the fact that any interrupt that
19 * comes in on to an unassigned handler will get stuck with 19 * comes in on to an unassigned handler will get stuck with
20 * "IRQ_WAITING" cleared and the interrupt disabled. 20 * "IRQS_WAITING" cleared and the interrupt disabled.
21 */ 21 */
22static DEFINE_MUTEX(probing_active); 22static DEFINE_MUTEX(probing_active);
23 23
@@ -32,7 +32,6 @@ unsigned long probe_irq_on(void)
32{ 32{
33 struct irq_desc *desc; 33 struct irq_desc *desc;
34 unsigned long mask = 0; 34 unsigned long mask = 0;
35 unsigned int status;
36 int i; 35 int i;
37 36
38 /* 37 /*
@@ -46,13 +45,7 @@ unsigned long probe_irq_on(void)
46 */ 45 */
47 for_each_irq_desc_reverse(i, desc) { 46 for_each_irq_desc_reverse(i, desc) {
48 raw_spin_lock_irq(&desc->lock); 47 raw_spin_lock_irq(&desc->lock);
49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 48 if (!desc->action && irq_settings_can_probe(desc)) {
50 /*
51 * An old-style architecture might still have
52 * the handle_bad_irq handler there:
53 */
54 compat_irq_chip_set_default_handler(desc);
55
56 /* 49 /*
57 * Some chips need to know about probing in 50 * Some chips need to know about probing in
58 * progress: 51 * progress:
@@ -60,7 +53,7 @@ unsigned long probe_irq_on(void)
60 if (desc->irq_data.chip->irq_set_type) 53 if (desc->irq_data.chip->irq_set_type)
61 desc->irq_data.chip->irq_set_type(&desc->irq_data, 54 desc->irq_data.chip->irq_set_type(&desc->irq_data,
62 IRQ_TYPE_PROBE); 55 IRQ_TYPE_PROBE);
63 desc->irq_data.chip->irq_startup(&desc->irq_data); 56 irq_startup(desc);
64 } 57 }
65 raw_spin_unlock_irq(&desc->lock); 58 raw_spin_unlock_irq(&desc->lock);
66 } 59 }
@@ -75,10 +68,12 @@ unsigned long probe_irq_on(void)
75 */ 68 */
76 for_each_irq_desc_reverse(i, desc) { 69 for_each_irq_desc_reverse(i, desc) {
77 raw_spin_lock_irq(&desc->lock); 70 raw_spin_lock_irq(&desc->lock);
78 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 71 if (!desc->action && irq_settings_can_probe(desc)) {
79 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
80 if (desc->irq_data.chip->irq_startup(&desc->irq_data)) 73 if (irq_startup(desc)) {
81 desc->status |= IRQ_PENDING; 74 irq_compat_set_pending(desc);
75 desc->istate |= IRQS_PENDING;
76 }
82 } 77 }
83 raw_spin_unlock_irq(&desc->lock); 78 raw_spin_unlock_irq(&desc->lock);
84 } 79 }
@@ -93,13 +88,12 @@ unsigned long probe_irq_on(void)
93 */ 88 */
94 for_each_irq_desc(i, desc) { 89 for_each_irq_desc(i, desc) {
95 raw_spin_lock_irq(&desc->lock); 90 raw_spin_lock_irq(&desc->lock);
96 status = desc->status;
97 91
98 if (status & IRQ_AUTODETECT) { 92 if (desc->istate & IRQS_AUTODETECT) {
99 /* It triggered already - consider it spurious. */ 93 /* It triggered already - consider it spurious. */
100 if (!(status & IRQ_WAITING)) { 94 if (!(desc->istate & IRQS_WAITING)) {
101 desc->status = status & ~IRQ_AUTODETECT; 95 desc->istate &= ~IRQS_AUTODETECT;
102 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 96 irq_shutdown(desc);
103 } else 97 } else
104 if (i < 32) 98 if (i < 32)
105 mask |= 1 << i; 99 mask |= 1 << i;
@@ -125,20 +119,18 @@ EXPORT_SYMBOL(probe_irq_on);
125 */ 119 */
126unsigned int probe_irq_mask(unsigned long val) 120unsigned int probe_irq_mask(unsigned long val)
127{ 121{
128 unsigned int status, mask = 0; 122 unsigned int mask = 0;
129 struct irq_desc *desc; 123 struct irq_desc *desc;
130 int i; 124 int i;
131 125
132 for_each_irq_desc(i, desc) { 126 for_each_irq_desc(i, desc) {
133 raw_spin_lock_irq(&desc->lock); 127 raw_spin_lock_irq(&desc->lock);
134 status = desc->status; 128 if (desc->istate & IRQS_AUTODETECT) {
135 129 if (i < 16 && !(desc->istate & IRQS_WAITING))
136 if (status & IRQ_AUTODETECT) {
137 if (i < 16 && !(status & IRQ_WAITING))
138 mask |= 1 << i; 130 mask |= 1 << i;
139 131
140 desc->status = status & ~IRQ_AUTODETECT; 132 desc->istate &= ~IRQS_AUTODETECT;
141 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 133 irq_shutdown(desc);
142 } 134 }
143 raw_spin_unlock_irq(&desc->lock); 135 raw_spin_unlock_irq(&desc->lock);
144 } 136 }
@@ -169,20 +161,18 @@ int probe_irq_off(unsigned long val)
169{ 161{
170 int i, irq_found = 0, nr_of_irqs = 0; 162 int i, irq_found = 0, nr_of_irqs = 0;
171 struct irq_desc *desc; 163 struct irq_desc *desc;
172 unsigned int status;
173 164
174 for_each_irq_desc(i, desc) { 165 for_each_irq_desc(i, desc) {
175 raw_spin_lock_irq(&desc->lock); 166 raw_spin_lock_irq(&desc->lock);
176 status = desc->status;
177 167
178 if (status & IRQ_AUTODETECT) { 168 if (desc->istate & IRQS_AUTODETECT) {
179 if (!(status & IRQ_WAITING)) { 169 if (!(desc->istate & IRQS_WAITING)) {
180 if (!nr_of_irqs) 170 if (!nr_of_irqs)
181 irq_found = i; 171 irq_found = i;
182 nr_of_irqs++; 172 nr_of_irqs++;
183 } 173 }
184 desc->status = status & ~IRQ_AUTODETECT; 174 desc->istate &= ~IRQS_AUTODETECT;
185 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 175 irq_shutdown(desc);
186 } 176 }
187 raw_spin_unlock_irq(&desc->lock); 177 raw_spin_unlock_irq(&desc->lock);
188 } 178 }
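
The autoprobe code now tracks IRQS_AUTODETECT/IRQS_WAITING in desc->istate and goes through the common irq_startup()/irq_shutdown() helpers, but the driver-facing probe API is unchanged. As a reminder, the usual calling pattern (the device-poking step is hardware specific and left abstract here):

/* Hedged sketch of the unchanged driver-side autoprobe API. */
#include <linux/interrupt.h>
#include <linux/delay.h>

static int my_probe_irq(void)
{
	unsigned long mask;
	int irq;

	mask = probe_irq_on();		/* arm all unassigned IRQ lines */

	/* ... poke the hardware so it raises exactly one interrupt ... */
	mdelay(10);			/* give the interrupt time to arrive */

	irq = probe_irq_off(mask);	/* >0: the line found, 0: none, <0: several fired */
	return irq;
}
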
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index baa5c4acad83..c9c0601f0615 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -19,140 +19,110 @@
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21/**
22 * set_irq_chip - set the irq chip for an irq 22 * irq_set_chip - set the irq chip for an irq
23 * @irq: irq number 23 * @irq: irq number
24 * @chip: pointer to irq chip description structure 24 * @chip: pointer to irq chip description structure
25 */ 25 */
26int set_irq_chip(unsigned int irq, struct irq_chip *chip) 26int irq_set_chip(unsigned int irq, struct irq_chip *chip)
27{ 27{
28 struct irq_desc *desc = irq_to_desc(irq);
29 unsigned long flags; 28 unsigned long flags;
29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
30 30
31 if (!desc) { 31 if (!desc)
32 WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
33 return -EINVAL; 32 return -EINVAL;
34 }
35 33
36 if (!chip) 34 if (!chip)
37 chip = &no_irq_chip; 35 chip = &no_irq_chip;
38 36
39 raw_spin_lock_irqsave(&desc->lock, flags);
40 irq_chip_set_defaults(chip); 37 irq_chip_set_defaults(chip);
41 desc->irq_data.chip = chip; 38 desc->irq_data.chip = chip;
42 raw_spin_unlock_irqrestore(&desc->lock, flags); 39 irq_put_desc_unlock(desc, flags);
43
44 return 0; 40 return 0;
45} 41}
46EXPORT_SYMBOL(set_irq_chip); 42EXPORT_SYMBOL(irq_set_chip);
47 43
48/** 44/**
49 * set_irq_type - set the irq trigger type for an irq 45 * irq_set_type - set the irq trigger type for an irq
50 * @irq: irq number 46 * @irq: irq number
51 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h 47 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
52 */ 48 */
53int set_irq_type(unsigned int irq, unsigned int type) 49int irq_set_irq_type(unsigned int irq, unsigned int type)
54{ 50{
55 struct irq_desc *desc = irq_to_desc(irq);
56 unsigned long flags; 51 unsigned long flags;
57 int ret = -ENXIO; 52 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
53 int ret = 0;
58 54
59 if (!desc) { 55 if (!desc)
60 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); 56 return -EINVAL;
61 return -ENODEV;
62 }
63 57
64 type &= IRQ_TYPE_SENSE_MASK; 58 type &= IRQ_TYPE_SENSE_MASK;
65 if (type == IRQ_TYPE_NONE) 59 if (type != IRQ_TYPE_NONE)
66 return 0; 60 ret = __irq_set_trigger(desc, irq, type);
67 61 irq_put_desc_busunlock(desc, flags);
68 raw_spin_lock_irqsave(&desc->lock, flags);
69 ret = __irq_set_trigger(desc, irq, type);
70 raw_spin_unlock_irqrestore(&desc->lock, flags);
71 return ret; 62 return ret;
72} 63}
73EXPORT_SYMBOL(set_irq_type); 64EXPORT_SYMBOL(irq_set_irq_type);
74 65
75/** 66/**
76 * set_irq_data - set irq type data for an irq 67 * irq_set_handler_data - set irq handler data for an irq
77 * @irq: Interrupt number 68 * @irq: Interrupt number
78 * @data: Pointer to interrupt specific data 69 * @data: Pointer to interrupt specific data
79 * 70 *
80 * Set the hardware irq controller data for an irq 71 * Set the hardware irq controller data for an irq
81 */ 72 */
82int set_irq_data(unsigned int irq, void *data) 73int irq_set_handler_data(unsigned int irq, void *data)
83{ 74{
84 struct irq_desc *desc = irq_to_desc(irq);
85 unsigned long flags; 75 unsigned long flags;
76 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
86 77
87 if (!desc) { 78 if (!desc)
88 printk(KERN_ERR
89 "Trying to install controller data for IRQ%d\n", irq);
90 return -EINVAL; 79 return -EINVAL;
91 }
92
93 raw_spin_lock_irqsave(&desc->lock, flags);
94 desc->irq_data.handler_data = data; 80 desc->irq_data.handler_data = data;
95 raw_spin_unlock_irqrestore(&desc->lock, flags); 81 irq_put_desc_unlock(desc, flags);
96 return 0; 82 return 0;
97} 83}
98EXPORT_SYMBOL(set_irq_data); 84EXPORT_SYMBOL(irq_set_handler_data);
99 85
100/** 86/**
101 * set_irq_msi - set MSI descriptor data for an irq 87 * irq_set_msi_desc - set MSI descriptor data for an irq
102 * @irq: Interrupt number 88 * @irq: Interrupt number
103 * @entry: Pointer to MSI descriptor data 89 * @entry: Pointer to MSI descriptor data
104 * 90 *
105 * Set the MSI descriptor entry for an irq 91 * Set the MSI descriptor entry for an irq
106 */ 92 */
107int set_irq_msi(unsigned int irq, struct msi_desc *entry) 93int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
108{ 94{
109 struct irq_desc *desc = irq_to_desc(irq);
110 unsigned long flags; 95 unsigned long flags;
96 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
111 97
112 if (!desc) { 98 if (!desc)
113 printk(KERN_ERR
114 "Trying to install msi data for IRQ%d\n", irq);
115 return -EINVAL; 99 return -EINVAL;
116 }
117
118 raw_spin_lock_irqsave(&desc->lock, flags);
119 desc->irq_data.msi_desc = entry; 100 desc->irq_data.msi_desc = entry;
120 if (entry) 101 if (entry)
121 entry->irq = irq; 102 entry->irq = irq;
122 raw_spin_unlock_irqrestore(&desc->lock, flags); 103 irq_put_desc_unlock(desc, flags);
123 return 0; 104 return 0;
124} 105}
125 106
126/** 107/**
127 * set_irq_chip_data - set irq chip data for an irq 108 * irq_set_chip_data - set irq chip data for an irq
128 * @irq: Interrupt number 109 * @irq: Interrupt number
129 * @data: Pointer to chip specific data 110 * @data: Pointer to chip specific data
130 * 111 *
131 * Set the hardware irq chip data for an irq 112 * Set the hardware irq chip data for an irq
132 */ 113 */
133int set_irq_chip_data(unsigned int irq, void *data) 114int irq_set_chip_data(unsigned int irq, void *data)
134{ 115{
135 struct irq_desc *desc = irq_to_desc(irq);
136 unsigned long flags; 116 unsigned long flags;
117 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
137 118
138 if (!desc) { 119 if (!desc)
139 printk(KERN_ERR
140 "Trying to install chip data for IRQ%d\n", irq);
141 return -EINVAL;
142 }
143
144 if (!desc->irq_data.chip) {
145 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
146 return -EINVAL; 120 return -EINVAL;
147 }
148
149 raw_spin_lock_irqsave(&desc->lock, flags);
150 desc->irq_data.chip_data = data; 121 desc->irq_data.chip_data = data;
151 raw_spin_unlock_irqrestore(&desc->lock, flags); 122 irq_put_desc_unlock(desc, flags);
152
153 return 0; 123 return 0;
154} 124}
155EXPORT_SYMBOL(set_irq_chip_data); 125EXPORT_SYMBOL(irq_set_chip_data);
156 126
157struct irq_data *irq_get_irq_data(unsigned int irq) 127struct irq_data *irq_get_irq_data(unsigned int irq)
158{ 128{
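
The accessor family is renamed from set_irq_*() to irq_set_*() and switched to the irq_get_desc_lock()/irq_put_desc_unlock() helpers internally, so callers migrate mechanically. A hedged sketch of platform code using the new names (my_chip and my_priv are hypothetical placeholders):

/* Hedged sketch of platform code migrating to the renamed accessors. */
#include <linux/irq.h>

static struct irq_chip my_chip;		/* .irq_mask etc. filled in elsewhere */

static void my_controller_map_irq(unsigned int irq, void *my_priv)
{
	irq_set_chip(irq, &my_chip);		    /* was set_irq_chip()       */
	irq_set_chip_data(irq, my_priv);	    /* was set_irq_chip_data()  */
	irq_set_handler_data(irq, my_priv);	    /* was set_irq_data()       */
	irq_set_irq_type(irq, IRQ_TYPE_LEVEL_HIGH); /* was set_irq_type()       */
}
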
@@ -162,72 +132,75 @@ struct irq_data *irq_get_irq_data(unsigned int irq)
162} 132}
163EXPORT_SYMBOL_GPL(irq_get_irq_data); 133EXPORT_SYMBOL_GPL(irq_get_irq_data);
164 134
165/** 135static void irq_state_clr_disabled(struct irq_desc *desc)
166 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
167 *
168 * @irq: Interrupt number
169 * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
170 *
171 * The IRQ_NESTED_THREAD flag indicates that on
172 * request_threaded_irq() no separate interrupt thread should be
173 * created for the irq as the handler are called nested in the
174 * context of a demultiplexing interrupt handler thread.
175 */
176void set_irq_nested_thread(unsigned int irq, int nest)
177{ 136{
178 struct irq_desc *desc = irq_to_desc(irq); 137 desc->istate &= ~IRQS_DISABLED;
179 unsigned long flags; 138 irq_compat_clr_disabled(desc);
180
181 if (!desc)
182 return;
183
184 raw_spin_lock_irqsave(&desc->lock, flags);
185 if (nest)
186 desc->status |= IRQ_NESTED_THREAD;
187 else
188 desc->status &= ~IRQ_NESTED_THREAD;
189 raw_spin_unlock_irqrestore(&desc->lock, flags);
190} 139}
191EXPORT_SYMBOL_GPL(set_irq_nested_thread);
192 140
193/* 141static void irq_state_set_disabled(struct irq_desc *desc)
194 * default enable function
195 */
196static void default_enable(struct irq_data *data)
197{ 142{
198 struct irq_desc *desc = irq_data_to_desc(data); 143 desc->istate |= IRQS_DISABLED;
144 irq_compat_set_disabled(desc);
145}
199 146
200 desc->irq_data.chip->irq_unmask(&desc->irq_data); 147static void irq_state_clr_masked(struct irq_desc *desc)
201 desc->status &= ~IRQ_MASKED; 148{
149 desc->istate &= ~IRQS_MASKED;
150 irq_compat_clr_masked(desc);
202} 151}
203 152
204/* 153static void irq_state_set_masked(struct irq_desc *desc)
205 * default disable function
206 */
207static void default_disable(struct irq_data *data)
208{ 154{
155 desc->istate |= IRQS_MASKED;
156 irq_compat_set_masked(desc);
209} 157}
210 158
211/* 159int irq_startup(struct irq_desc *desc)
212 * default startup function
213 */
214static unsigned int default_startup(struct irq_data *data)
215{ 160{
216 struct irq_desc *desc = irq_data_to_desc(data); 161 irq_state_clr_disabled(desc);
162 desc->depth = 0;
163
164 if (desc->irq_data.chip->irq_startup) {
165 int ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
166 irq_state_clr_masked(desc);
167 return ret;
168 }
217 169
218 desc->irq_data.chip->irq_enable(data); 170 irq_enable(desc);
219 return 0; 171 return 0;
220} 172}
221 173
222/* 174void irq_shutdown(struct irq_desc *desc)
223 * default shutdown function
224 */
225static void default_shutdown(struct irq_data *data)
226{ 175{
227 struct irq_desc *desc = irq_data_to_desc(data); 176 irq_state_set_disabled(desc);
177 desc->depth = 1;
178 if (desc->irq_data.chip->irq_shutdown)
179 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
180 if (desc->irq_data.chip->irq_disable)
181 desc->irq_data.chip->irq_disable(&desc->irq_data);
182 else
183 desc->irq_data.chip->irq_mask(&desc->irq_data);
184 irq_state_set_masked(desc);
185}
228 186
229 desc->irq_data.chip->irq_mask(&desc->irq_data); 187void irq_enable(struct irq_desc *desc)
230 desc->status |= IRQ_MASKED; 188{
189 irq_state_clr_disabled(desc);
190 if (desc->irq_data.chip->irq_enable)
191 desc->irq_data.chip->irq_enable(&desc->irq_data);
192 else
193 desc->irq_data.chip->irq_unmask(&desc->irq_data);
194 irq_state_clr_masked(desc);
195}
196
197void irq_disable(struct irq_desc *desc)
198{
199 irq_state_set_disabled(desc);
200 if (desc->irq_data.chip->irq_disable) {
201 desc->irq_data.chip->irq_disable(&desc->irq_data);
202 irq_state_set_masked(desc);
203 }
231} 204}
232 205
233#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED 206#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
@@ -315,10 +288,6 @@ static void compat_bus_sync_unlock(struct irq_data *data)
315void irq_chip_set_defaults(struct irq_chip *chip) 288void irq_chip_set_defaults(struct irq_chip *chip)
316{ 289{
317#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED 290#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
318 /*
319 * Compat fixup functions need to be before we set the
320 * defaults for enable/disable/startup/shutdown
321 */
322 if (chip->enable) 291 if (chip->enable)
323 chip->irq_enable = compat_irq_enable; 292 chip->irq_enable = compat_irq_enable;
324 if (chip->disable) 293 if (chip->disable)
@@ -327,33 +296,8 @@ void irq_chip_set_defaults(struct irq_chip *chip)
327 chip->irq_shutdown = compat_irq_shutdown; 296 chip->irq_shutdown = compat_irq_shutdown;
328 if (chip->startup) 297 if (chip->startup)
329 chip->irq_startup = compat_irq_startup; 298 chip->irq_startup = compat_irq_startup;
330#endif
331 /*
332 * The real defaults
333 */
334 if (!chip->irq_enable)
335 chip->irq_enable = default_enable;
336 if (!chip->irq_disable)
337 chip->irq_disable = default_disable;
338 if (!chip->irq_startup)
339 chip->irq_startup = default_startup;
340 /*
341 * We use chip->irq_disable, when the user provided its own. When
342 * we have default_disable set for chip->irq_disable, then we need
343 * to use default_shutdown, otherwise the irq line is not
344 * disabled on free_irq():
345 */
346 if (!chip->irq_shutdown)
347 chip->irq_shutdown = chip->irq_disable != default_disable ?
348 chip->irq_disable : default_shutdown;
349
350#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
351 if (!chip->end) 299 if (!chip->end)
352 chip->end = dummy_irq_chip.end; 300 chip->end = dummy_irq_chip.end;
353
354 /*
355 * Now fix up the remaining compat handlers
356 */
357 if (chip->bus_lock) 301 if (chip->bus_lock)
358 chip->irq_bus_lock = compat_bus_lock; 302 chip->irq_bus_lock = compat_bus_lock;
359 if (chip->bus_sync_unlock) 303 if (chip->bus_sync_unlock)
@@ -388,22 +332,22 @@ static inline void mask_ack_irq(struct irq_desc *desc)
388 if (desc->irq_data.chip->irq_ack) 332 if (desc->irq_data.chip->irq_ack)
389 desc->irq_data.chip->irq_ack(&desc->irq_data); 333 desc->irq_data.chip->irq_ack(&desc->irq_data);
390 } 334 }
391 desc->status |= IRQ_MASKED; 335 irq_state_set_masked(desc);
392} 336}
393 337
394static inline void mask_irq(struct irq_desc *desc) 338void mask_irq(struct irq_desc *desc)
395{ 339{
396 if (desc->irq_data.chip->irq_mask) { 340 if (desc->irq_data.chip->irq_mask) {
397 desc->irq_data.chip->irq_mask(&desc->irq_data); 341 desc->irq_data.chip->irq_mask(&desc->irq_data);
398 desc->status |= IRQ_MASKED; 342 irq_state_set_masked(desc);
399 } 343 }
400} 344}
401 345
402static inline void unmask_irq(struct irq_desc *desc) 346void unmask_irq(struct irq_desc *desc)
403{ 347{
404 if (desc->irq_data.chip->irq_unmask) { 348 if (desc->irq_data.chip->irq_unmask) {
405 desc->irq_data.chip->irq_unmask(&desc->irq_data); 349 desc->irq_data.chip->irq_unmask(&desc->irq_data);
406 desc->status &= ~IRQ_MASKED; 350 irq_state_clr_masked(desc);
407 } 351 }
408} 352}
409 353
@@ -428,10 +372,11 @@ void handle_nested_irq(unsigned int irq)
428 kstat_incr_irqs_this_cpu(irq, desc); 372 kstat_incr_irqs_this_cpu(irq, desc);
429 373
430 action = desc->action; 374 action = desc->action;
431 if (unlikely(!action || (desc->status & IRQ_DISABLED))) 375 if (unlikely(!action || (desc->istate & IRQS_DISABLED)))
432 goto out_unlock; 376 goto out_unlock;
433 377
434 desc->status |= IRQ_INPROGRESS; 378 irq_compat_set_progress(desc);
379 desc->istate |= IRQS_INPROGRESS;
435 raw_spin_unlock_irq(&desc->lock); 380 raw_spin_unlock_irq(&desc->lock);
436 381
437 action_ret = action->thread_fn(action->irq, action->dev_id); 382 action_ret = action->thread_fn(action->irq, action->dev_id);
@@ -439,13 +384,21 @@ void handle_nested_irq(unsigned int irq)
439 note_interrupt(irq, desc, action_ret); 384 note_interrupt(irq, desc, action_ret);
440 385
441 raw_spin_lock_irq(&desc->lock); 386 raw_spin_lock_irq(&desc->lock);
442 desc->status &= ~IRQ_INPROGRESS; 387 desc->istate &= ~IRQS_INPROGRESS;
388 irq_compat_clr_progress(desc);
443 389
444out_unlock: 390out_unlock:
445 raw_spin_unlock_irq(&desc->lock); 391 raw_spin_unlock_irq(&desc->lock);
446} 392}
447EXPORT_SYMBOL_GPL(handle_nested_irq); 393EXPORT_SYMBOL_GPL(handle_nested_irq);
448 394
395static bool irq_check_poll(struct irq_desc *desc)
396{
397 if (!(desc->istate & IRQS_POLL_INPROGRESS))
398 return false;
399 return irq_wait_for_poll(desc);
400}
401
449/** 402/**
450 * handle_simple_irq - Simple and software-decoded IRQs. 403 * handle_simple_irq - Simple and software-decoded IRQs.
451 * @irq: the interrupt number 404 * @irq: the interrupt number
@@ -461,29 +414,20 @@ EXPORT_SYMBOL_GPL(handle_nested_irq);
461void 414void
462handle_simple_irq(unsigned int irq, struct irq_desc *desc) 415handle_simple_irq(unsigned int irq, struct irq_desc *desc)
463{ 416{
464 struct irqaction *action;
465 irqreturn_t action_ret;
466
467 raw_spin_lock(&desc->lock); 417 raw_spin_lock(&desc->lock);
468 418
469 if (unlikely(desc->status & IRQ_INPROGRESS)) 419 if (unlikely(desc->istate & IRQS_INPROGRESS))
470 goto out_unlock; 420 if (!irq_check_poll(desc))
471 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 421 goto out_unlock;
422
423 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
472 kstat_incr_irqs_this_cpu(irq, desc); 424 kstat_incr_irqs_this_cpu(irq, desc);
473 425
474 action = desc->action; 426 if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED)))
475 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
476 goto out_unlock; 427 goto out_unlock;
477 428
478 desc->status |= IRQ_INPROGRESS; 429 handle_irq_event(desc);
479 raw_spin_unlock(&desc->lock);
480 430
481 action_ret = handle_IRQ_event(irq, action);
482 if (!noirqdebug)
483 note_interrupt(irq, desc, action_ret);
484
485 raw_spin_lock(&desc->lock);
486 desc->status &= ~IRQ_INPROGRESS;
487out_unlock: 431out_unlock:
488 raw_spin_unlock(&desc->lock); 432 raw_spin_unlock(&desc->lock);
489} 433}
@@ -501,42 +445,42 @@ out_unlock:
501void 445void
502handle_level_irq(unsigned int irq, struct irq_desc *desc) 446handle_level_irq(unsigned int irq, struct irq_desc *desc)
503{ 447{
504 struct irqaction *action;
505 irqreturn_t action_ret;
506
507 raw_spin_lock(&desc->lock); 448 raw_spin_lock(&desc->lock);
508 mask_ack_irq(desc); 449 mask_ack_irq(desc);
509 450
510 if (unlikely(desc->status & IRQ_INPROGRESS)) 451 if (unlikely(desc->istate & IRQS_INPROGRESS))
511 goto out_unlock; 452 if (!irq_check_poll(desc))
512 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 453 goto out_unlock;
454
455 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
513 kstat_incr_irqs_this_cpu(irq, desc); 456 kstat_incr_irqs_this_cpu(irq, desc);
514 457
515 /* 458 /*
516 * If its disabled or no action available 459 * If its disabled or no action available
517 * keep it masked and get out of here 460 * keep it masked and get out of here
518 */ 461 */
519 action = desc->action; 462 if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED)))
520 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
521 goto out_unlock; 463 goto out_unlock;
522 464
523 desc->status |= IRQ_INPROGRESS; 465 handle_irq_event(desc);
524 raw_spin_unlock(&desc->lock);
525
526 action_ret = handle_IRQ_event(irq, action);
527 if (!noirqdebug)
528 note_interrupt(irq, desc, action_ret);
529 466
530 raw_spin_lock(&desc->lock); 467 if (!(desc->istate & (IRQS_DISABLED | IRQS_ONESHOT)))
531 desc->status &= ~IRQ_INPROGRESS;
532
533 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
534 unmask_irq(desc); 468 unmask_irq(desc);
535out_unlock: 469out_unlock:
536 raw_spin_unlock(&desc->lock); 470 raw_spin_unlock(&desc->lock);
537} 471}
538EXPORT_SYMBOL_GPL(handle_level_irq); 472EXPORT_SYMBOL_GPL(handle_level_irq);
539 473
474#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
475static inline void preflow_handler(struct irq_desc *desc)
476{
477 if (desc->preflow_handler)
478 desc->preflow_handler(&desc->irq_data);
479}
480#else
481static inline void preflow_handler(struct irq_desc *desc) { }
482#endif
483
540/** 484/**
541 * handle_fasteoi_irq - irq handler for transparent controllers 485 * handle_fasteoi_irq - irq handler for transparent controllers
542 * @irq: the interrupt number 486 * @irq: the interrupt number
@@ -550,42 +494,41 @@ EXPORT_SYMBOL_GPL(handle_level_irq);
550void 494void
551handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) 495handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
552{ 496{
553 struct irqaction *action;
554 irqreturn_t action_ret;
555
556 raw_spin_lock(&desc->lock); 497 raw_spin_lock(&desc->lock);
557 498
558 if (unlikely(desc->status & IRQ_INPROGRESS)) 499 if (unlikely(desc->istate & IRQS_INPROGRESS))
559 goto out; 500 if (!irq_check_poll(desc))
501 goto out;
560 502
561 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 503 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
562 kstat_incr_irqs_this_cpu(irq, desc); 504 kstat_incr_irqs_this_cpu(irq, desc);
563 505
564 /* 506 /*
565 * If its disabled or no action available 507 * If its disabled or no action available
566 * then mask it and get out of here: 508 * then mask it and get out of here:
567 */ 509 */
568 action = desc->action; 510 if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) {
569 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 511 irq_compat_set_pending(desc);
570 desc->status |= IRQ_PENDING; 512 desc->istate |= IRQS_PENDING;
571 mask_irq(desc); 513 mask_irq(desc);
572 goto out; 514 goto out;
573 } 515 }
574 516
575 desc->status |= IRQ_INPROGRESS; 517 if (desc->istate & IRQS_ONESHOT)
576 desc->status &= ~IRQ_PENDING; 518 mask_irq(desc);
577 raw_spin_unlock(&desc->lock);
578 519
579 action_ret = handle_IRQ_event(irq, action); 520 preflow_handler(desc);
580 if (!noirqdebug) 521 handle_irq_event(desc);
581 note_interrupt(irq, desc, action_ret);
582 522
583 raw_spin_lock(&desc->lock); 523out_eoi:
584 desc->status &= ~IRQ_INPROGRESS;
585out:
586 desc->irq_data.chip->irq_eoi(&desc->irq_data); 524 desc->irq_data.chip->irq_eoi(&desc->irq_data);
587 525out_unlock:
588 raw_spin_unlock(&desc->lock); 526 raw_spin_unlock(&desc->lock);
527 return;
528out:
529 if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED))
530 goto out_eoi;
531 goto out_unlock;
589} 532}
590 533
591/** 534/**
@@ -609,32 +552,28 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
609{ 552{
610 raw_spin_lock(&desc->lock); 553 raw_spin_lock(&desc->lock);
611 554
612 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 555 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
613
614 /* 556 /*
615 * If we're currently running this IRQ, or its disabled, 557 * If we're currently running this IRQ, or its disabled,
616 * we shouldn't process the IRQ. Mark it pending, handle 558 * we shouldn't process the IRQ. Mark it pending, handle
617 * the necessary masking and go out 559 * the necessary masking and go out
618 */ 560 */
619 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || 561 if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) ||
620 !desc->action)) { 562 !desc->action))) {
621 desc->status |= (IRQ_PENDING | IRQ_MASKED); 563 if (!irq_check_poll(desc)) {
622 mask_ack_irq(desc); 564 irq_compat_set_pending(desc);
623 goto out_unlock; 565 desc->istate |= IRQS_PENDING;
566 mask_ack_irq(desc);
567 goto out_unlock;
568 }
624 } 569 }
625 kstat_incr_irqs_this_cpu(irq, desc); 570 kstat_incr_irqs_this_cpu(irq, desc);
626 571
627 /* Start handling the irq */ 572 /* Start handling the irq */
628 desc->irq_data.chip->irq_ack(&desc->irq_data); 573 desc->irq_data.chip->irq_ack(&desc->irq_data);
629 574
630 /* Mark the IRQ currently in progress.*/
631 desc->status |= IRQ_INPROGRESS;
632
633 do { 575 do {
634 struct irqaction *action = desc->action; 576 if (unlikely(!desc->action)) {
635 irqreturn_t action_ret;
636
637 if (unlikely(!action)) {
638 mask_irq(desc); 577 mask_irq(desc);
639 goto out_unlock; 578 goto out_unlock;
640 } 579 }
@@ -644,22 +583,17 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
644 * one, we could have masked the irq. 583 * one, we could have masked the irq.
645 * Renable it, if it was not disabled in meantime. 584 * Renable it, if it was not disabled in meantime.
646 */ 585 */
647 if (unlikely((desc->status & 586 if (unlikely(desc->istate & IRQS_PENDING)) {
648 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 587 if (!(desc->istate & IRQS_DISABLED) &&
649 (IRQ_PENDING | IRQ_MASKED))) { 588 (desc->istate & IRQS_MASKED))
650 unmask_irq(desc); 589 unmask_irq(desc);
651 } 590 }
652 591
653 desc->status &= ~IRQ_PENDING; 592 handle_irq_event(desc);
654 raw_spin_unlock(&desc->lock);
655 action_ret = handle_IRQ_event(irq, action);
656 if (!noirqdebug)
657 note_interrupt(irq, desc, action_ret);
658 raw_spin_lock(&desc->lock);
659 593
660 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); 594 } while ((desc->istate & IRQS_PENDING) &&
595 !(desc->istate & IRQS_DISABLED));
661 596
662 desc->status &= ~IRQ_INPROGRESS;
663out_unlock: 597out_unlock:
664 raw_spin_unlock(&desc->lock); 598 raw_spin_unlock(&desc->lock);
665} 599}
@@ -674,103 +608,84 @@ out_unlock:
674void 608void
675handle_percpu_irq(unsigned int irq, struct irq_desc *desc) 609handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
676{ 610{
677 irqreturn_t action_ret; 611 struct irq_chip *chip = irq_desc_get_chip(desc);
678 612
679 kstat_incr_irqs_this_cpu(irq, desc); 613 kstat_incr_irqs_this_cpu(irq, desc);
680 614
681 if (desc->irq_data.chip->irq_ack) 615 if (chip->irq_ack)
682 desc->irq_data.chip->irq_ack(&desc->irq_data); 616 chip->irq_ack(&desc->irq_data);
683 617
684 action_ret = handle_IRQ_event(irq, desc->action); 618 handle_irq_event_percpu(desc, desc->action);
685 if (!noirqdebug)
686 note_interrupt(irq, desc, action_ret);
687 619
688 if (desc->irq_data.chip->irq_eoi) 620 if (chip->irq_eoi)
689 desc->irq_data.chip->irq_eoi(&desc->irq_data); 621 chip->irq_eoi(&desc->irq_data);
690} 622}
691 623
692void 624void
693__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 625__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
694 const char *name) 626 const char *name)
695{ 627{
696 struct irq_desc *desc = irq_to_desc(irq);
697 unsigned long flags; 628 unsigned long flags;
629 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
698 630
699 if (!desc) { 631 if (!desc)
700 printk(KERN_ERR
701 "Trying to install type control for IRQ%d\n", irq);
702 return; 632 return;
703 }
704 633
705 if (!handle) 634 if (!handle) {
706 handle = handle_bad_irq; 635 handle = handle_bad_irq;
707 else if (desc->irq_data.chip == &no_irq_chip) { 636 } else {
708 printk(KERN_WARNING "Trying to install %sinterrupt handler " 637 if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
709 "for IRQ%d\n", is_chained ? "chained " : "", irq); 638 goto out;
710 /*
711 * Some ARM implementations install a handler for really dumb
712 * interrupt hardware without setting an irq_chip. This worked
713 * with the ARM no_irq_chip but the check in setup_irq would
714 * prevent us to setup the interrupt at all. Switch it to
715 * dummy_irq_chip for easy transition.
716 */
717 desc->irq_data.chip = &dummy_irq_chip;
718 } 639 }
719 640
720 chip_bus_lock(desc);
721 raw_spin_lock_irqsave(&desc->lock, flags);
722
723 /* Uninstall? */ 641 /* Uninstall? */
724 if (handle == handle_bad_irq) { 642 if (handle == handle_bad_irq) {
725 if (desc->irq_data.chip != &no_irq_chip) 643 if (desc->irq_data.chip != &no_irq_chip)
726 mask_ack_irq(desc); 644 mask_ack_irq(desc);
727 desc->status |= IRQ_DISABLED; 645 irq_compat_set_disabled(desc);
646 desc->istate |= IRQS_DISABLED;
728 desc->depth = 1; 647 desc->depth = 1;
729 } 648 }
730 desc->handle_irq = handle; 649 desc->handle_irq = handle;
731 desc->name = name; 650 desc->name = name;
732 651
733 if (handle != handle_bad_irq && is_chained) { 652 if (handle != handle_bad_irq && is_chained) {
734 desc->status &= ~IRQ_DISABLED; 653 irq_settings_set_noprobe(desc);
735 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; 654 irq_settings_set_norequest(desc);
736 desc->depth = 0; 655 irq_startup(desc);
737 desc->irq_data.chip->irq_startup(&desc->irq_data);
738 } 656 }
739 raw_spin_unlock_irqrestore(&desc->lock, flags); 657out:
740 chip_bus_sync_unlock(desc); 658 irq_put_desc_busunlock(desc, flags);
741}
742EXPORT_SYMBOL_GPL(__set_irq_handler);
743
744void
745set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
746 irq_flow_handler_t handle)
747{
748 set_irq_chip(irq, chip);
749 __set_irq_handler(irq, handle, 0, NULL);
750} 659}
660EXPORT_SYMBOL_GPL(__irq_set_handler);
751 661
752void 662void
753set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, 663irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
754 irq_flow_handler_t handle, const char *name) 664 irq_flow_handler_t handle, const char *name)
755{ 665{
756 set_irq_chip(irq, chip); 666 irq_set_chip(irq, chip);
757 __set_irq_handler(irq, handle, 0, name); 667 __irq_set_handler(irq, handle, 0, name);
758} 668}
759 669
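A usage sketch of the renamed installer; the irq number, the chip pointer and the choice of handle_level_irq are placeholders, not taken from this series:

#include <linux/irq.h>

/* Hypothetical platform init: bind a line to its chip and flow handler. */
static void example_init_irq_line(unsigned int irq, struct irq_chip *chip)
{
	irq_set_chip_and_handler_name(irq, chip, handle_level_irq, "level");
}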
760void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) 670void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
761{ 671{
762 struct irq_desc *desc = irq_to_desc(irq);
763 unsigned long flags; 672 unsigned long flags;
673 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
764 674
765 if (!desc) 675 if (!desc)
766 return; 676 return;
677 irq_settings_clr_and_set(desc, clr, set);
678
679 irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
680 IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
681 if (irq_settings_has_no_balance_set(desc))
682 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
683 if (irq_settings_is_per_cpu(desc))
684 irqd_set(&desc->irq_data, IRQD_PER_CPU);
685 if (irq_settings_can_move_pcntxt(desc))
686 irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
767 687
768 /* Sanitize flags */ 688 irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
769 set &= IRQF_MODIFY_MASK;
770 clr &= IRQF_MODIFY_MASK;
771 689
772 raw_spin_lock_irqsave(&desc->lock, flags); 690 irq_put_desc_unlock(desc, flags);
773 desc->status &= ~clr;
774 desc->status |= set;
775 raw_spin_unlock_irqrestore(&desc->lock, flags);
776} 691}
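For reference, callers of irq_modify_status() keep passing the existing IRQ_* flag constants; the split between irq_settings and the irqd state bits stays internal. A minimal, hypothetical caller:

#include <linux/irq.h>

/* Hypothetical: allow a line to be requested by drivers again. */
static void example_make_requestable(unsigned int irq)
{
	irq_modify_status(irq, IRQ_NOREQUEST, 0);
}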
diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h
new file mode 100644
index 000000000000..6bbaf66aca85
--- /dev/null
+++ b/kernel/irq/compat.h
@@ -0,0 +1,72 @@
1/*
2 * Compat layer for transition period
3 */
4#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
5static inline void irq_compat_set_progress(struct irq_desc *desc)
6{
7 desc->status |= IRQ_INPROGRESS;
8}
9
10static inline void irq_compat_clr_progress(struct irq_desc *desc)
11{
12 desc->status &= ~IRQ_INPROGRESS;
13}
14static inline void irq_compat_set_disabled(struct irq_desc *desc)
15{
16 desc->status |= IRQ_DISABLED;
17}
18static inline void irq_compat_clr_disabled(struct irq_desc *desc)
19{
20 desc->status &= ~IRQ_DISABLED;
21}
22static inline void irq_compat_set_pending(struct irq_desc *desc)
23{
24 desc->status |= IRQ_PENDING;
25}
26
27static inline void irq_compat_clr_pending(struct irq_desc *desc)
28{
29 desc->status &= ~IRQ_PENDING;
30}
31static inline void irq_compat_set_masked(struct irq_desc *desc)
32{
33 desc->status |= IRQ_MASKED;
34}
35
36static inline void irq_compat_clr_masked(struct irq_desc *desc)
37{
38 desc->status &= ~IRQ_MASKED;
39}
40static inline void irq_compat_set_move_pending(struct irq_desc *desc)
41{
42 desc->status |= IRQ_MOVE_PENDING;
43}
44
45static inline void irq_compat_clr_move_pending(struct irq_desc *desc)
46{
47 desc->status &= ~IRQ_MOVE_PENDING;
48}
49static inline void irq_compat_set_affinity(struct irq_desc *desc)
50{
51 desc->status |= IRQ_AFFINITY_SET;
52}
53
54static inline void irq_compat_clr_affinity(struct irq_desc *desc)
55{
56 desc->status &= ~IRQ_AFFINITY_SET;
57}
58#else
59static inline void irq_compat_set_progress(struct irq_desc *desc) { }
60static inline void irq_compat_clr_progress(struct irq_desc *desc) { }
61static inline void irq_compat_set_disabled(struct irq_desc *desc) { }
62static inline void irq_compat_clr_disabled(struct irq_desc *desc) { }
63static inline void irq_compat_set_pending(struct irq_desc *desc) { }
64static inline void irq_compat_clr_pending(struct irq_desc *desc) { }
65static inline void irq_compat_set_masked(struct irq_desc *desc) { }
66static inline void irq_compat_clr_masked(struct irq_desc *desc) { }
67static inline void irq_compat_set_move_pending(struct irq_desc *desc) { }
68static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { }
69static inline void irq_compat_set_affinity(struct irq_desc *desc) { }
70static inline void irq_compat_clr_affinity(struct irq_desc *desc) { }
71#endif
72
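These helpers let flow-handler code keep the legacy desc->status bits coherent while istate takes over. A representative pattern, written here as a hypothetical helper assumed to live inside kernel/irq/ (the chip.c hunks above do the same thing inline):

static inline void example_mark_pending(struct irq_desc *desc)
{
	irq_compat_set_pending(desc);	/* legacy desc->status view */
	desc->istate |= IRQS_PENDING;	/* new core-internal state */
}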
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
new file mode 100644
index 000000000000..d1a33b7fa61d
--- /dev/null
+++ b/kernel/irq/debug.h
@@ -0,0 +1,40 @@
1/*
2 * Debugging printout:
3 */
4
5#include <linux/kallsyms.h>
6
7#define P(f) if (desc->status & f) printk("%14s set\n", #f)
8#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
9
10static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
11{
12 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
13 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
14 printk("->handle_irq(): %p, ", desc->handle_irq);
15 print_symbol("%s\n", (unsigned long)desc->handle_irq);
16 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
17 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
18 printk("->action(): %p\n", desc->action);
19 if (desc->action) {
20 printk("->action->handler(): %p, ", desc->action->handler);
21 print_symbol("%s\n", (unsigned long)desc->action->handler);
22 }
23
24 P(IRQ_LEVEL);
25 P(IRQ_PER_CPU);
26 P(IRQ_NOPROBE);
27 P(IRQ_NOREQUEST);
28 P(IRQ_NOAUTOEN);
29
30 PS(IRQS_AUTODETECT);
31 PS(IRQS_INPROGRESS);
32 PS(IRQS_REPLAY);
33 PS(IRQS_WAITING);
34 PS(IRQS_DISABLED);
35 PS(IRQS_PENDING);
36 PS(IRQS_MASKED);
37}
38
39#undef P
40#undef PS
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3540a7190122..517561fc7317 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -51,30 +51,92 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
51 "but no thread function available.", irq, action->name); 51 "but no thread function available.", irq, action->name);
52} 52}
53 53
54/** 54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
55 * handle_IRQ_event - irq action chain handler 55{
56 * @irq: the interrupt number 56 /*
57 * @action: the interrupt action chain for this irq 57 * Wake up the handler thread for this action. In case the
58 * 58 * thread crashed and was killed we just pretend that we
59 * Handles the action chain of an irq event 59 * handled the interrupt. The hardirq handler has disabled the
60 */ 60 * device interrupt, so no irq storm is lurking. If the
61irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) 61 * RUNTHREAD bit is already set, nothing to do.
62 */
63 if (test_bit(IRQTF_DIED, &action->thread_flags) ||
64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
65 return;
66
67 /*
68 * It's safe to OR the mask lockless here. We have only two
69 * places which write to threads_oneshot: This code and the
70 * irq thread.
71 *
72 * This code is the hard irq context and can never run on two
73 * cpus in parallel. If it ever does we have more serious
74 * problems than this bitmask.
75 *
76 * The irq threads of this irq which clear their "running" bit
77 * in threads_oneshot are serialized via desc->lock against
78 * each other and they are serialized against this code by
79 * IRQS_INPROGRESS.
80 *
81 * Hard irq handler:
82 *
83 * spin_lock(desc->lock);
84 * desc->state |= IRQS_INPROGRESS;
85 * spin_unlock(desc->lock);
86 * set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
87 * desc->threads_oneshot |= mask;
88 * spin_lock(desc->lock);
89 * desc->state &= ~IRQS_INPROGRESS;
90 * spin_unlock(desc->lock);
91 *
92 * irq thread:
93 *
94 * again:
95 * spin_lock(desc->lock);
96 * if (desc->state & IRQS_INPROGRESS) {
97 * spin_unlock(desc->lock);
98 * while(desc->state & IRQS_INPROGRESS)
99 * cpu_relax();
100 * goto again;
101 * }
102 * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
103 * desc->threads_oneshot &= ~mask;
104 * spin_unlock(desc->lock);
105 *
106 * So either the thread waits for us to clear IRQS_INPROGRESS
107 * or we are waiting in the flow handler for desc->lock to be
108 * released before we reach this point. The thread also checks
109 * IRQTF_RUNTHREAD under desc->lock. If set it leaves
110 * threads_oneshot untouched and runs the thread another time.
111 */
112 desc->threads_oneshot |= action->thread_mask;
113 wake_up_process(action->thread);
114}
115
116irqreturn_t
117handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
62{ 118{
63 irqreturn_t ret, retval = IRQ_NONE; 119 irqreturn_t retval = IRQ_NONE;
64 unsigned int status = 0; 120 unsigned int random = 0, irq = desc->irq_data.irq;
65 121
66 do { 122 do {
123 irqreturn_t res;
124
67 trace_irq_handler_entry(irq, action); 125 trace_irq_handler_entry(irq, action);
68 ret = action->handler(irq, action->dev_id); 126 res = action->handler(irq, action->dev_id);
69 trace_irq_handler_exit(irq, action, ret); 127 trace_irq_handler_exit(irq, action, res);
70 128
71 switch (ret) { 129 if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
130 irq, action->handler))
131 local_irq_disable();
132
133 switch (res) {
72 case IRQ_WAKE_THREAD: 134 case IRQ_WAKE_THREAD:
73 /* 135 /*
74 * Set result to handled so the spurious check 136 * Set result to handled so the spurious check
75 * does not trigger. 137 * does not trigger.
76 */ 138 */
77 ret = IRQ_HANDLED; 139 res = IRQ_HANDLED;
78 140
79 /* 141 /*
80 * Catch drivers which return WAKE_THREAD but 142 * Catch drivers which return WAKE_THREAD but
@@ -85,36 +147,56 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
85 break; 147 break;
86 } 148 }
87 149
88 /* 150 irq_wake_thread(desc, action);
89 * Wake up the handler thread for this
90 * action. In case the thread crashed and was
91 * killed we just pretend that we handled the
92 * interrupt. The hardirq handler above has
93 * disabled the device interrupt, so no irq
94 * storm is lurking.
95 */
96 if (likely(!test_bit(IRQTF_DIED,
97 &action->thread_flags))) {
98 set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
99 wake_up_process(action->thread);
100 }
101 151
102 /* Fall through to add to randomness */ 152 /* Fall through to add to randomness */
103 case IRQ_HANDLED: 153 case IRQ_HANDLED:
104 status |= action->flags; 154 random |= action->flags;
105 break; 155 break;
106 156
107 default: 157 default:
108 break; 158 break;
109 } 159 }
110 160
111 retval |= ret; 161 retval |= res;
112 action = action->next; 162 action = action->next;
113 } while (action); 163 } while (action);
114 164
115 if (status & IRQF_SAMPLE_RANDOM) 165 if (random & IRQF_SAMPLE_RANDOM)
116 add_interrupt_randomness(irq); 166 add_interrupt_randomness(irq);
117 local_irq_disable();
118 167
168 if (!noirqdebug)
169 note_interrupt(irq, desc, retval);
119 return retval; 170 return retval;
120} 171}
172
173irqreturn_t handle_irq_event(struct irq_desc *desc)
174{
175 struct irqaction *action = desc->action;
176 irqreturn_t ret;
177
178 irq_compat_clr_pending(desc);
179 desc->istate &= ~IRQS_PENDING;
180 irq_compat_set_progress(desc);
181 desc->istate |= IRQS_INPROGRESS;
182 raw_spin_unlock(&desc->lock);
183
184 ret = handle_irq_event_percpu(desc, action);
185
186 raw_spin_lock(&desc->lock);
187 desc->istate &= ~IRQS_INPROGRESS;
188 irq_compat_clr_progress(desc);
189 return ret;
190}
191
192/**
193 * handle_IRQ_event - irq action chain handler
194 * @irq: the interrupt number
195 * @action: the interrupt action chain for this irq
196 *
197 * Handles the action chain of an irq event
198 */
199irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
200{
201 return handle_irq_event_percpu(irq_to_desc(irq), action);
202}
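The IRQ_WAKE_THREAD case handled above is what a driver's primary handler returns to defer work to its irq thread, which irq_wake_thread() then wakes. A minimal driver-side sketch; the device, names and flags are illustrative only:

#include <linux/interrupt.h>

static irqreturn_t example_primary(int irq, void *dev_id)
{
	/* quick check in hard irq context */
	return IRQ_WAKE_THREAD;		/* ends up in irq_wake_thread() above */
}

static irqreturn_t example_thread(int irq, void *dev_id)
{
	/* slow or sleeping work runs here in process context */
	return IRQ_HANDLED;
}

static int example_request(unsigned int irq, void *dev)
{
	return request_threaded_irq(irq, example_primary, example_thread,
				    IRQF_ONESHOT, "example", dev);
}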
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 99c3bc8a6fb4..6c6ec9a49027 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,5 +1,9 @@
1/* 1/*
2 * IRQ subsystem internal functions and variables: 2 * IRQ subsystem internal functions and variables:
3 *
4 * Do not ever include this file from anything else than
5 * kernel/irq/. Do not even think about using any information outside
6 * of this file for your non core code.
3 */ 7 */
4#include <linux/irqdesc.h> 8#include <linux/irqdesc.h>
5 9
@@ -9,25 +13,89 @@
9# define IRQ_BITMAP_BITS NR_IRQS 13# define IRQ_BITMAP_BITS NR_IRQS
10#endif 14#endif
11 15
16#define istate core_internal_state__do_not_mess_with_it
17
18#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
19# define status status_use_accessors
20#endif
21
12extern int noirqdebug; 22extern int noirqdebug;
13 23
24/*
25 * Bits used by threaded handlers:
26 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
27 * IRQTF_DIED - handler thread died
28 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
29 * IRQTF_AFFINITY - irq thread is requested to adjust affinity
30 * IRQTF_FORCED_THREAD - irq action is force threaded
31 */
32enum {
33 IRQTF_RUNTHREAD,
34 IRQTF_DIED,
35 IRQTF_WARNED,
36 IRQTF_AFFINITY,
37 IRQTF_FORCED_THREAD,
38};
39
40/*
41 * Bit masks for desc->state
42 *
43 * IRQS_AUTODETECT - autodetection in progress
44 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
45 * detection
46 * IRQS_POLL_INPROGRESS - polling in progress
47 * IRQS_INPROGRESS - Interrupt in progress
48 * IRQS_ONESHOT - irq is not unmasked in primary handler
49 * IRQS_REPLAY - irq is replayed
50 * IRQS_WAITING - irq is waiting
51 * IRQS_DISABLED - irq is disabled
52 * IRQS_PENDING - irq is pending and replayed later
53 * IRQS_MASKED - irq is masked
54 * IRQS_SUSPENDED - irq is suspended
55 */
56enum {
57 IRQS_AUTODETECT = 0x00000001,
58 IRQS_SPURIOUS_DISABLED = 0x00000002,
59 IRQS_POLL_INPROGRESS = 0x00000008,
60 IRQS_INPROGRESS = 0x00000010,
61 IRQS_ONESHOT = 0x00000020,
62 IRQS_REPLAY = 0x00000040,
63 IRQS_WAITING = 0x00000080,
64 IRQS_DISABLED = 0x00000100,
65 IRQS_PENDING = 0x00000200,
66 IRQS_MASKED = 0x00000400,
67 IRQS_SUSPENDED = 0x00000800,
68};
69
70#include "compat.h"
71#include "debug.h"
72#include "settings.h"
73
14#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) 74#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
15 75
16/* Set default functions for irq_chip structures: */ 76/* Set default functions for irq_chip structures: */
17extern void irq_chip_set_defaults(struct irq_chip *chip); 77extern void irq_chip_set_defaults(struct irq_chip *chip);
18 78
19/* Set default handler: */
20extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
21
22extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 79extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
23 unsigned long flags); 80 unsigned long flags);
24extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 81extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
25extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 82extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
26 83
84extern int irq_startup(struct irq_desc *desc);
85extern void irq_shutdown(struct irq_desc *desc);
86extern void irq_enable(struct irq_desc *desc);
87extern void irq_disable(struct irq_desc *desc);
88extern void mask_irq(struct irq_desc *desc);
89extern void unmask_irq(struct irq_desc *desc);
90
27extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 91extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
28 92
93irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
94irqreturn_t handle_irq_event(struct irq_desc *desc);
95
29/* Resending of interrupts :*/ 96/* Resending of interrupts :*/
30void check_irq_resend(struct irq_desc *desc, unsigned int irq); 97void check_irq_resend(struct irq_desc *desc, unsigned int irq);
98bool irq_wait_for_poll(struct irq_desc *desc);
31 99
32#ifdef CONFIG_PROC_FS 100#ifdef CONFIG_PROC_FS
33extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 101extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
@@ -43,20 +111,10 @@ static inline void unregister_handler_proc(unsigned int irq,
43 struct irqaction *action) { } 111 struct irqaction *action) { }
44#endif 112#endif
45 113
46extern int irq_select_affinity_usr(unsigned int irq); 114extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
47 115
48extern void irq_set_thread_affinity(struct irq_desc *desc); 116extern void irq_set_thread_affinity(struct irq_desc *desc);
49 117
50#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
51static inline void irq_end(unsigned int irq, struct irq_desc *desc)
52{
53 if (desc->irq_data.chip && desc->irq_data.chip->end)
54 desc->irq_data.chip->end(irq);
55}
56#else
57static inline void irq_end(unsigned int irq, struct irq_desc *desc) { }
58#endif
59
60/* Inline functions for support of irq chips on slow busses */ 118/* Inline functions for support of irq chips on slow busses */
61static inline void chip_bus_lock(struct irq_desc *desc) 119static inline void chip_bus_lock(struct irq_desc *desc)
62{ 120{
@@ -70,43 +128,60 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
70 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); 128 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
71} 129}
72 130
131struct irq_desc *
132__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus);
133void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
134
135static inline struct irq_desc *
136irq_get_desc_buslock(unsigned int irq, unsigned long *flags)
137{
138 return __irq_get_desc_lock(irq, flags, true);
139}
140
141static inline void
142irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
143{
144 __irq_put_desc_unlock(desc, flags, true);
145}
146
147static inline struct irq_desc *
148irq_get_desc_lock(unsigned int irq, unsigned long *flags)
149{
150 return __irq_get_desc_lock(irq, flags, false);
151}
152
153static inline void
154irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
155{
156 __irq_put_desc_unlock(desc, flags, false);
157}
158
73/* 159/*
74 * Debugging printout: 160 * Manipulation functions for irq_data.state
75 */ 161 */
162static inline void irqd_set_move_pending(struct irq_data *d)
163{
164 d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
165 irq_compat_set_move_pending(irq_data_to_desc(d));
166}
76 167
77#include <linux/kallsyms.h> 168static inline void irqd_clr_move_pending(struct irq_data *d)
78 169{
79#define P(f) if (desc->status & f) printk("%14s set\n", #f) 170 d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
171 irq_compat_clr_move_pending(irq_data_to_desc(d));
172}
80 173
81static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) 174static inline void irqd_clear(struct irq_data *d, unsigned int mask)
82{ 175{
83 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", 176 d->state_use_accessors &= ~mask;
84 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
85 printk("->handle_irq(): %p, ", desc->handle_irq);
86 print_symbol("%s\n", (unsigned long)desc->handle_irq);
87 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
88 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
89 printk("->action(): %p\n", desc->action);
90 if (desc->action) {
91 printk("->action->handler(): %p, ", desc->action->handler);
92 print_symbol("%s\n", (unsigned long)desc->action->handler);
93 }
94
95 P(IRQ_INPROGRESS);
96 P(IRQ_DISABLED);
97 P(IRQ_PENDING);
98 P(IRQ_REPLAY);
99 P(IRQ_AUTODETECT);
100 P(IRQ_WAITING);
101 P(IRQ_LEVEL);
102 P(IRQ_MASKED);
103#ifdef CONFIG_IRQ_PER_CPU
104 P(IRQ_PER_CPU);
105#endif
106 P(IRQ_NOPROBE);
107 P(IRQ_NOREQUEST);
108 P(IRQ_NOAUTOEN);
109} 177}
110 178
111#undef P 179static inline void irqd_set(struct irq_data *d, unsigned int mask)
180{
181 d->state_use_accessors |= mask;
182}
112 183
184static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
185{
186 return d->state_use_accessors & mask;
187}
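A short sketch of how core code is expected to use the new accessors; the helper itself is hypothetical, and IRQD_LEVEL is one of the bits mirrored by irq_modify_status() above:

static inline void example_sync_level(struct irq_data *d, bool level)
{
	if (level)
		irqd_set(d, IRQD_LEVEL);
	else
		irqd_clear(d, IRQD_LEVEL);
}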
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2039bea31bdf..6fb014f172f7 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -79,7 +79,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
79 desc->irq_data.chip_data = NULL; 79 desc->irq_data.chip_data = NULL;
80 desc->irq_data.handler_data = NULL; 80 desc->irq_data.handler_data = NULL;
81 desc->irq_data.msi_desc = NULL; 81 desc->irq_data.msi_desc = NULL;
82 desc->status = IRQ_DEFAULT_INIT_FLAGS; 82 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
83 desc->istate = IRQS_DISABLED;
83 desc->handle_irq = handle_bad_irq; 84 desc->handle_irq = handle_bad_irq;
84 desc->depth = 1; 85 desc->depth = 1;
85 desc->irq_count = 0; 86 desc->irq_count = 0;
@@ -197,13 +198,12 @@ err:
197 return -ENOMEM; 198 return -ENOMEM;
198} 199}
199 200
200struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) 201static int irq_expand_nr_irqs(unsigned int nr)
201{ 202{
202 int res = irq_alloc_descs(irq, irq, 1, node); 203 if (nr > IRQ_BITMAP_BITS)
203 204 return -ENOMEM;
204 if (res == -EEXIST || res == irq) 205 nr_irqs = nr;
205 return irq_to_desc(irq); 206 return 0;
206 return NULL;
207} 207}
208 208
209int __init early_irq_init(void) 209int __init early_irq_init(void)
@@ -238,7 +238,7 @@ int __init early_irq_init(void)
238 238
239struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { 239struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
240 [0 ... NR_IRQS-1] = { 240 [0 ... NR_IRQS-1] = {
241 .status = IRQ_DEFAULT_INIT_FLAGS, 241 .istate = IRQS_DISABLED,
242 .handle_irq = handle_bad_irq, 242 .handle_irq = handle_bad_irq,
243 .depth = 1, 243 .depth = 1,
244 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), 244 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
@@ -260,8 +260,8 @@ int __init early_irq_init(void)
260 for (i = 0; i < count; i++) { 260 for (i = 0; i < count; i++) {
261 desc[i].irq_data.irq = i; 261 desc[i].irq_data.irq = i;
262 desc[i].irq_data.chip = &no_irq_chip; 262 desc[i].irq_data.chip = &no_irq_chip;
263 /* TODO : do this allocation on-demand ... */
264 desc[i].kstat_irqs = alloc_percpu(unsigned int); 263 desc[i].kstat_irqs = alloc_percpu(unsigned int);
264 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
265 alloc_masks(desc + i, GFP_KERNEL, node); 265 alloc_masks(desc + i, GFP_KERNEL, node);
266 desc_smp_init(desc + i, node); 266 desc_smp_init(desc + i, node);
267 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 267 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -274,11 +274,6 @@ struct irq_desc *irq_to_desc(unsigned int irq)
274 return (irq < NR_IRQS) ? irq_desc + irq : NULL; 274 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
275} 275}
276 276
277struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
278{
279 return irq_to_desc(irq);
280}
281
282static void free_desc(unsigned int irq) 277static void free_desc(unsigned int irq)
283{ 278{
284 dynamic_irq_cleanup(irq); 279 dynamic_irq_cleanup(irq);
@@ -286,24 +281,14 @@ static void free_desc(unsigned int irq)
286 281
287static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 282static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
288{ 283{
289#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
290 struct irq_desc *desc;
291 unsigned int i;
292
293 for (i = 0; i < cnt; i++) {
294 desc = irq_to_desc(start + i);
295 if (desc && !desc->kstat_irqs) {
296 unsigned int __percpu *stats = alloc_percpu(unsigned int);
297
298 if (!stats)
299 return -1;
300 if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
301 free_percpu(stats);
302 }
303 }
304#endif
305 return start; 284 return start;
306} 285}
286
287static int irq_expand_nr_irqs(unsigned int nr)
288{
289 return -ENOMEM;
290}
291
307#endif /* !CONFIG_SPARSE_IRQ */ 292#endif /* !CONFIG_SPARSE_IRQ */
308 293
309/* Dynamic interrupt handling */ 294/* Dynamic interrupt handling */
@@ -347,14 +332,17 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
347 332
348 mutex_lock(&sparse_irq_lock); 333 mutex_lock(&sparse_irq_lock);
349 334
350 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); 335 start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
336 from, cnt, 0);
351 ret = -EEXIST; 337 ret = -EEXIST;
352 if (irq >=0 && start != irq) 338 if (irq >=0 && start != irq)
353 goto err; 339 goto err;
354 340
355 ret = -ENOMEM; 341 if (start + cnt > nr_irqs) {
356 if (start >= nr_irqs) 342 ret = irq_expand_nr_irqs(start + cnt);
357 goto err; 343 if (ret)
344 goto err;
345 }
358 346
359 bitmap_set(allocated_irqs, start, cnt); 347 bitmap_set(allocated_irqs, start, cnt);
360 mutex_unlock(&sparse_irq_lock); 348 mutex_unlock(&sparse_irq_lock);
@@ -401,6 +389,26 @@ unsigned int irq_get_next_irq(unsigned int offset)
401 return find_next_bit(allocated_irqs, nr_irqs, offset); 389 return find_next_bit(allocated_irqs, nr_irqs, offset);
402} 390}
403 391
392struct irq_desc *
393__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus)
394{
395 struct irq_desc *desc = irq_to_desc(irq);
396
397 if (desc) {
398 if (bus)
399 chip_bus_lock(desc);
400 raw_spin_lock_irqsave(&desc->lock, *flags);
401 }
402 return desc;
403}
404
405void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
406{
407 raw_spin_unlock_irqrestore(&desc->lock, flags);
408 if (bus)
409 chip_bus_sync_unlock(desc);
410}
411
404/** 412/**
405 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 413 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
406 * @irq: irq number to initialize 414 * @irq: irq number to initialize
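The lookup/lock helpers added here carry most of the simplification in manage.c below. The pattern, shown as a hypothetical kernel/irq-internal function:

static int example_with_desc_locked(unsigned int irq)
{
	unsigned long flags;
	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);

	if (!desc)
		return -EINVAL;
	/* ... operate on desc with desc->lock held and the chip bus locked ... */
	irq_put_desc_busunlock(desc, flags);
	return 0;
}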
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9033c1c70828..0a2aa73e536c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,17 @@
17 17
18#include "internals.h" 18#include "internals.h"
19 19
20#ifdef CONFIG_IRQ_FORCED_THREADING
21__read_mostly bool force_irqthreads;
22
23static int __init setup_forced_irqthreads(char *arg)
24{
25 force_irqthreads = true;
26 return 0;
27}
28early_param("threadirqs", setup_forced_irqthreads);
29#endif
30
20/** 31/**
21 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 32 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
22 * @irq: interrupt number to wait for 33 * @irq: interrupt number to wait for
@@ -30,7 +41,7 @@
30void synchronize_irq(unsigned int irq) 41void synchronize_irq(unsigned int irq)
31{ 42{
32 struct irq_desc *desc = irq_to_desc(irq); 43 struct irq_desc *desc = irq_to_desc(irq);
33 unsigned int status; 44 unsigned int state;
34 45
35 if (!desc) 46 if (!desc)
36 return; 47 return;
@@ -42,16 +53,16 @@ void synchronize_irq(unsigned int irq)
42 * Wait until we're out of the critical section. This might 53 * Wait until we're out of the critical section. This might
43 * give the wrong answer due to the lack of memory barriers. 54 * give the wrong answer due to the lack of memory barriers.
44 */ 55 */
45 while (desc->status & IRQ_INPROGRESS) 56 while (desc->istate & IRQS_INPROGRESS)
46 cpu_relax(); 57 cpu_relax();
47 58
48 /* Ok, that indicated we're done: double-check carefully. */ 59 /* Ok, that indicated we're done: double-check carefully. */
49 raw_spin_lock_irqsave(&desc->lock, flags); 60 raw_spin_lock_irqsave(&desc->lock, flags);
50 status = desc->status; 61 state = desc->istate;
51 raw_spin_unlock_irqrestore(&desc->lock, flags); 62 raw_spin_unlock_irqrestore(&desc->lock, flags);
52 63
53 /* Oops, that failed? */ 64 /* Oops, that failed? */
54 } while (status & IRQ_INPROGRESS); 65 } while (state & IRQS_INPROGRESS);
55 66
56 /* 67 /*
57 * We made sure that no hardirq handler is running. Now verify 68 * We made sure that no hardirq handler is running. Now verify
@@ -73,8 +84,8 @@ int irq_can_set_affinity(unsigned int irq)
73{ 84{
74 struct irq_desc *desc = irq_to_desc(irq); 85 struct irq_desc *desc = irq_to_desc(irq);
75 86
76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || 87 if (!desc || !irqd_can_balance(&desc->irq_data) ||
77 !desc->irq_data.chip->irq_set_affinity) 88 !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
78 return 0; 89 return 0;
79 90
80 return 1; 91 return 1;
@@ -100,67 +111,169 @@ void irq_set_thread_affinity(struct irq_desc *desc)
100 } 111 }
101} 112}
102 113
114#ifdef CONFIG_GENERIC_PENDING_IRQ
115static inline bool irq_can_move_pcntxt(struct irq_desc *desc)
116{
117 return irq_settings_can_move_pcntxt(desc);
118}
119static inline bool irq_move_pending(struct irq_desc *desc)
120{
121 return irqd_is_setaffinity_pending(&desc->irq_data);
122}
123static inline void
124irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
125{
126 cpumask_copy(desc->pending_mask, mask);
127}
128static inline void
129irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
130{
131 cpumask_copy(mask, desc->pending_mask);
132}
133#else
134static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { return true; }
135static inline bool irq_move_pending(struct irq_desc *desc) { return false; }
136static inline void
137irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
138static inline void
139irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
140#endif
141
103/** 142/**
104 * irq_set_affinity - Set the irq affinity of a given irq 143 * irq_set_affinity - Set the irq affinity of a given irq
105 * @irq: Interrupt to set affinity 144 * @irq: Interrupt to set affinity
106 * @cpumask: cpumask 145 * @cpumask: cpumask
107 * 146 *
108 */ 147 */
109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) 148int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
110{ 149{
111 struct irq_desc *desc = irq_to_desc(irq); 150 struct irq_desc *desc = irq_to_desc(irq);
112 struct irq_chip *chip = desc->irq_data.chip; 151 struct irq_chip *chip = desc->irq_data.chip;
113 unsigned long flags; 152 unsigned long flags;
153 int ret = 0;
114 154
115 if (!chip->irq_set_affinity) 155 if (!chip->irq_set_affinity)
116 return -EINVAL; 156 return -EINVAL;
117 157
118 raw_spin_lock_irqsave(&desc->lock, flags); 158 raw_spin_lock_irqsave(&desc->lock, flags);
119 159
120#ifdef CONFIG_GENERIC_PENDING_IRQ 160 if (irq_can_move_pcntxt(desc)) {
121 if (desc->status & IRQ_MOVE_PCNTXT) { 161 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
122 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { 162 switch (ret) {
123 cpumask_copy(desc->irq_data.affinity, cpumask); 163 case IRQ_SET_MASK_OK:
164 cpumask_copy(desc->irq_data.affinity, mask);
165 case IRQ_SET_MASK_OK_NOCOPY:
124 irq_set_thread_affinity(desc); 166 irq_set_thread_affinity(desc);
167 ret = 0;
125 } 168 }
169 } else {
170 irqd_set_move_pending(&desc->irq_data);
171 irq_copy_pending(desc, mask);
126 } 172 }
127 else { 173
128 desc->status |= IRQ_MOVE_PENDING; 174 if (desc->affinity_notify) {
129 cpumask_copy(desc->pending_mask, cpumask); 175 kref_get(&desc->affinity_notify->kref);
130 } 176 schedule_work(&desc->affinity_notify->work);
131#else
132 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
133 cpumask_copy(desc->irq_data.affinity, cpumask);
134 irq_set_thread_affinity(desc);
135 } 177 }
136#endif 178 irq_compat_set_affinity(desc);
137 desc->status |= IRQ_AFFINITY_SET; 179 irqd_set(&desc->irq_data, IRQD_AFFINITY_SET);
138 raw_spin_unlock_irqrestore(&desc->lock, flags); 180 raw_spin_unlock_irqrestore(&desc->lock, flags);
139 return 0; 181 return ret;
140} 182}
141 183
142int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) 184int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
143{ 185{
186 unsigned long flags;
187 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
188
189 if (!desc)
190 return -EINVAL;
191 desc->affinity_hint = m;
192 irq_put_desc_unlock(desc, flags);
193 return 0;
194}
195EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
196
197static void irq_affinity_notify(struct work_struct *work)
198{
199 struct irq_affinity_notify *notify =
200 container_of(work, struct irq_affinity_notify, work);
201 struct irq_desc *desc = irq_to_desc(notify->irq);
202 cpumask_var_t cpumask;
203 unsigned long flags;
204
205 if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
206 goto out;
207
208 raw_spin_lock_irqsave(&desc->lock, flags);
209 if (irq_move_pending(desc))
210 irq_get_pending(cpumask, desc);
211 else
212 cpumask_copy(cpumask, desc->irq_data.affinity);
213 raw_spin_unlock_irqrestore(&desc->lock, flags);
214
215 notify->notify(notify, cpumask);
216
217 free_cpumask_var(cpumask);
218out:
219 kref_put(&notify->kref, notify->release);
220}
221
222/**
223 * irq_set_affinity_notifier - control notification of IRQ affinity changes
224 * @irq: Interrupt for which to enable/disable notification
225 * @notify: Context for notification, or %NULL to disable
226 * notification. Function pointers must be initialised;
227 * the other fields will be initialised by this function.
228 *
229 * Must be called in process context. Notification may only be enabled
230 * after the IRQ is allocated and must be disabled before the IRQ is
231 * freed using free_irq().
232 */
233int
234irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
235{
144 struct irq_desc *desc = irq_to_desc(irq); 236 struct irq_desc *desc = irq_to_desc(irq);
237 struct irq_affinity_notify *old_notify;
145 unsigned long flags; 238 unsigned long flags;
146 239
240 /* The release function is promised process context */
241 might_sleep();
242
147 if (!desc) 243 if (!desc)
148 return -EINVAL; 244 return -EINVAL;
149 245
246 /* Complete initialisation of *notify */
247 if (notify) {
248 notify->irq = irq;
249 kref_init(&notify->kref);
250 INIT_WORK(&notify->work, irq_affinity_notify);
251 }
252
150 raw_spin_lock_irqsave(&desc->lock, flags); 253 raw_spin_lock_irqsave(&desc->lock, flags);
151 desc->affinity_hint = m; 254 old_notify = desc->affinity_notify;
255 desc->affinity_notify = notify;
152 raw_spin_unlock_irqrestore(&desc->lock, flags); 256 raw_spin_unlock_irqrestore(&desc->lock, flags);
153 257
258 if (old_notify)
259 kref_put(&old_notify->kref, old_notify->release);
260
154 return 0; 261 return 0;
155} 262}
156EXPORT_SYMBOL_GPL(irq_set_affinity_hint); 263EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
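A driver-side sketch of the new notifier interface; the callbacks and the static instance are hypothetical, only the API itself comes from this patch:

#include <linux/interrupt.h>
#include <linux/kref.h>

static void example_affinity_changed(struct irq_affinity_notify *notify,
				     const cpumask_t *mask)
{
	/* e.g. re-point per-queue resources at the new CPU set */
}

static void example_affinity_release(struct kref *ref)
{
	/* nothing to free here: the notify structure below is static */
}

static struct irq_affinity_notify example_notify = {
	.notify		= example_affinity_changed,
	.release	= example_affinity_release,
};

static int example_watch_affinity(unsigned int irq)
{
	return irq_set_affinity_notifier(irq, &example_notify);
}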
157 264
158#ifndef CONFIG_AUTO_IRQ_AFFINITY 265#ifndef CONFIG_AUTO_IRQ_AFFINITY
159/* 266/*
160 * Generic version of the affinity autoselector. 267 * Generic version of the affinity autoselector.
161 */ 268 */
162static int setup_affinity(unsigned int irq, struct irq_desc *desc) 269static int
270setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
163{ 271{
272 struct irq_chip *chip = irq_desc_get_chip(desc);
273 struct cpumask *set = irq_default_affinity;
274 int ret;
275
276 /* Excludes PER_CPU and NO_BALANCE interrupts */
164 if (!irq_can_set_affinity(irq)) 277 if (!irq_can_set_affinity(irq))
165 return 0; 278 return 0;
166 279
@@ -168,22 +281,29 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
168 * Preserve an userspace affinity setup, but make sure that 281 * Preserve an userspace affinity setup, but make sure that
169 * one of the targets is online. 282 * one of the targets is online.
170 */ 283 */
171 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 284 if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
172 if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) 285 if (cpumask_intersects(desc->irq_data.affinity,
173 < nr_cpu_ids) 286 cpu_online_mask))
174 goto set_affinity; 287 set = desc->irq_data.affinity;
175 else 288 else {
176 desc->status &= ~IRQ_AFFINITY_SET; 289 irq_compat_clr_affinity(desc);
290 irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
291 }
177 } 292 }
178 293
179 cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); 294 cpumask_and(mask, cpu_online_mask, set);
180set_affinity: 295 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
181 desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); 296 switch (ret) {
182 297 case IRQ_SET_MASK_OK:
298 cpumask_copy(desc->irq_data.affinity, mask);
299 case IRQ_SET_MASK_OK_NOCOPY:
300 irq_set_thread_affinity(desc);
301 }
183 return 0; 302 return 0;
184} 303}
185#else 304#else
186static inline int setup_affinity(unsigned int irq, struct irq_desc *d) 305static inline int
306setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask)
187{ 307{
188 return irq_select_affinity(irq); 308 return irq_select_affinity(irq);
189} 309}
@@ -192,23 +312,21 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
192/* 312/*
193 * Called when affinity is set via /proc/irq 313 * Called when affinity is set via /proc/irq
194 */ 314 */
195int irq_select_affinity_usr(unsigned int irq) 315int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
196{ 316{
197 struct irq_desc *desc = irq_to_desc(irq); 317 struct irq_desc *desc = irq_to_desc(irq);
198 unsigned long flags; 318 unsigned long flags;
199 int ret; 319 int ret;
200 320
201 raw_spin_lock_irqsave(&desc->lock, flags); 321 raw_spin_lock_irqsave(&desc->lock, flags);
202 ret = setup_affinity(irq, desc); 322 ret = setup_affinity(irq, desc, mask);
203 if (!ret)
204 irq_set_thread_affinity(desc);
205 raw_spin_unlock_irqrestore(&desc->lock, flags); 323 raw_spin_unlock_irqrestore(&desc->lock, flags);
206
207 return ret; 324 return ret;
208} 325}
209 326
210#else 327#else
211static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) 328static inline int
329setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
212{ 330{
213 return 0; 331 return 0;
214} 332}
@@ -219,13 +337,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
219 if (suspend) { 337 if (suspend) {
220 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) 338 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
221 return; 339 return;
222 desc->status |= IRQ_SUSPENDED; 340 desc->istate |= IRQS_SUSPENDED;
223 } 341 }
224 342
225 if (!desc->depth++) { 343 if (!desc->depth++)
226 desc->status |= IRQ_DISABLED; 344 irq_disable(desc);
227 desc->irq_data.chip->irq_disable(&desc->irq_data); 345}
228 } 346
347static int __disable_irq_nosync(unsigned int irq)
348{
349 unsigned long flags;
350 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
351
352 if (!desc)
353 return -EINVAL;
354 __disable_irq(desc, irq, false);
355 irq_put_desc_busunlock(desc, flags);
356 return 0;
229} 357}
230 358
231/** 359/**
@@ -241,17 +369,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
241 */ 369 */
242void disable_irq_nosync(unsigned int irq) 370void disable_irq_nosync(unsigned int irq)
243{ 371{
244 struct irq_desc *desc = irq_to_desc(irq); 372 __disable_irq_nosync(irq);
245 unsigned long flags;
246
247 if (!desc)
248 return;
249
250 chip_bus_lock(desc);
251 raw_spin_lock_irqsave(&desc->lock, flags);
252 __disable_irq(desc, irq, false);
253 raw_spin_unlock_irqrestore(&desc->lock, flags);
254 chip_bus_sync_unlock(desc);
255} 373}
256EXPORT_SYMBOL(disable_irq_nosync); 374EXPORT_SYMBOL(disable_irq_nosync);
257 375
@@ -269,21 +387,24 @@ EXPORT_SYMBOL(disable_irq_nosync);
269 */ 387 */
270void disable_irq(unsigned int irq) 388void disable_irq(unsigned int irq)
271{ 389{
272 struct irq_desc *desc = irq_to_desc(irq); 390 if (!__disable_irq_nosync(irq))
273
274 if (!desc)
275 return;
276
277 disable_irq_nosync(irq);
278 if (desc->action)
279 synchronize_irq(irq); 391 synchronize_irq(irq);
280} 392}
281EXPORT_SYMBOL(disable_irq); 393EXPORT_SYMBOL(disable_irq);
282 394
283void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) 395void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
284{ 396{
285 if (resume) 397 if (resume) {
286 desc->status &= ~IRQ_SUSPENDED; 398 if (!(desc->istate & IRQS_SUSPENDED)) {
399 if (!desc->action)
400 return;
401 if (!(desc->action->flags & IRQF_FORCE_RESUME))
402 return;
403 /* Pretend that it got disabled ! */
404 desc->depth++;
405 }
406 desc->istate &= ~IRQS_SUSPENDED;
407 }
287 408
288 switch (desc->depth) { 409 switch (desc->depth) {
289 case 0: 410 case 0:
@@ -291,12 +412,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
291 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); 412 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
292 break; 413 break;
293 case 1: { 414 case 1: {
294 unsigned int status = desc->status & ~IRQ_DISABLED; 415 if (desc->istate & IRQS_SUSPENDED)
295
296 if (desc->status & IRQ_SUSPENDED)
297 goto err_out; 416 goto err_out;
298 /* Prevent probing on this irq: */ 417 /* Prevent probing on this irq: */
299 desc->status = status | IRQ_NOPROBE; 418 irq_settings_set_noprobe(desc);
419 irq_enable(desc);
300 check_irq_resend(desc, irq); 420 check_irq_resend(desc, irq);
301 /* fall-through */ 421 /* fall-through */
302 } 422 }
@@ -318,21 +438,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
318 */ 438 */
319void enable_irq(unsigned int irq) 439void enable_irq(unsigned int irq)
320{ 440{
321 struct irq_desc *desc = irq_to_desc(irq);
322 unsigned long flags; 441 unsigned long flags;
442 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
323 443
324 if (!desc) 444 if (!desc)
325 return; 445 return;
446 if (WARN(!desc->irq_data.chip,
447 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
448 goto out;
326 449
327 if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable,
328 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
329 return;
330
331 chip_bus_lock(desc);
332 raw_spin_lock_irqsave(&desc->lock, flags);
333 __enable_irq(desc, irq, false); 450 __enable_irq(desc, irq, false);
334 raw_spin_unlock_irqrestore(&desc->lock, flags); 451out:
335 chip_bus_sync_unlock(desc); 452 irq_put_desc_busunlock(desc, flags);
336} 453}
337EXPORT_SYMBOL(enable_irq); 454EXPORT_SYMBOL(enable_irq);
338 455
@@ -348,7 +465,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
348} 465}
349 466
350/** 467/**
351 * set_irq_wake - control irq power management wakeup 468 * irq_set_irq_wake - control irq power management wakeup
352 * @irq: interrupt to control 469 * @irq: interrupt to control
353 * @on: enable/disable power management wakeup 470 * @on: enable/disable power management wakeup
354 * 471 *
@@ -359,23 +476,22 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
359 * Wakeup mode lets this IRQ wake the system from sleep 476 * Wakeup mode lets this IRQ wake the system from sleep
360 * states like "suspend to RAM". 477 * states like "suspend to RAM".
361 */ 478 */
362int set_irq_wake(unsigned int irq, unsigned int on) 479int irq_set_irq_wake(unsigned int irq, unsigned int on)
363{ 480{
364 struct irq_desc *desc = irq_to_desc(irq);
365 unsigned long flags; 481 unsigned long flags;
482 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
366 int ret = 0; 483 int ret = 0;
367 484
368 /* wakeup-capable irqs can be shared between drivers that 485 /* wakeup-capable irqs can be shared between drivers that
369 * don't need to have the same sleep mode behaviors. 486 * don't need to have the same sleep mode behaviors.
370 */ 487 */
371 raw_spin_lock_irqsave(&desc->lock, flags);
372 if (on) { 488 if (on) {
373 if (desc->wake_depth++ == 0) { 489 if (desc->wake_depth++ == 0) {
374 ret = set_irq_wake_real(irq, on); 490 ret = set_irq_wake_real(irq, on);
375 if (ret) 491 if (ret)
376 desc->wake_depth = 0; 492 desc->wake_depth = 0;
377 else 493 else
378 desc->status |= IRQ_WAKEUP; 494 irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
379 } 495 }
380 } else { 496 } else {
381 if (desc->wake_depth == 0) { 497 if (desc->wake_depth == 0) {
@@ -385,14 +501,13 @@ int set_irq_wake(unsigned int irq, unsigned int on)
385 if (ret) 501 if (ret)
386 desc->wake_depth = 1; 502 desc->wake_depth = 1;
387 else 503 else
388 desc->status &= ~IRQ_WAKEUP; 504 irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
389 } 505 }
390 } 506 }
391 507 irq_put_desc_busunlock(desc, flags);
392 raw_spin_unlock_irqrestore(&desc->lock, flags);
393 return ret; 508 return ret;
394} 509}
395EXPORT_SYMBOL(set_irq_wake); 510EXPORT_SYMBOL(irq_set_irq_wake);
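Callers use the irq_ prefixed name from here on; a hypothetical suspend/resume pair (the irq number and function names are placeholders):

#include <linux/interrupt.h>

static int example_suspend(unsigned int wake_irq)
{
	return irq_set_irq_wake(wake_irq, 1);	/* arm the line as a wakeup source */
}

static int example_resume(unsigned int wake_irq)
{
	return irq_set_irq_wake(wake_irq, 0);	/* balance the wake_depth count */
}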
396 511
397/* 512/*
398 * Internal function that tells the architecture code whether a 513 * Internal function that tells the architecture code whether a
@@ -401,43 +516,27 @@ EXPORT_SYMBOL(set_irq_wake);
401 */ 516 */
402int can_request_irq(unsigned int irq, unsigned long irqflags) 517int can_request_irq(unsigned int irq, unsigned long irqflags)
403{ 518{
404 struct irq_desc *desc = irq_to_desc(irq);
405 struct irqaction *action;
406 unsigned long flags; 519 unsigned long flags;
520 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
521 int canrequest = 0;
407 522
408 if (!desc) 523 if (!desc)
409 return 0; 524 return 0;
410 525
411 if (desc->status & IRQ_NOREQUEST) 526 if (irq_settings_can_request(desc)) {
412 return 0; 527 if (desc->action)
413 528 if (irqflags & desc->action->flags & IRQF_SHARED)
 414 raw_spin_lock_irqsave(&desc->lock, flags); 529 canrequest = 1;
415 action = desc->action; 530 }
416 if (action) 531 irq_put_desc_unlock(desc, flags);
417 if (irqflags & action->flags & IRQF_SHARED) 532 return canrequest;
418 action = NULL;
419
420 raw_spin_unlock_irqrestore(&desc->lock, flags);
421
422 return !action;
423}
424
425void compat_irq_chip_set_default_handler(struct irq_desc *desc)
426{
427 /*
428 * If the architecture still has not overriden
429 * the flow handler then zap the default. This
430 * should catch incorrect flow-type setting.
431 */
432 if (desc->handle_irq == &handle_bad_irq)
433 desc->handle_irq = NULL;
434} 533}
435 534
436int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 535int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
437 unsigned long flags) 536 unsigned long flags)
438{ 537{
439 int ret;
440 struct irq_chip *chip = desc->irq_data.chip; 538 struct irq_chip *chip = desc->irq_data.chip;
539 int ret, unmask = 0;
441 540
442 if (!chip || !chip->irq_set_type) { 541 if (!chip || !chip->irq_set_type) {
443 /* 542 /*
@@ -449,23 +548,43 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
449 return 0; 548 return 0;
450 } 549 }
451 550
551 flags &= IRQ_TYPE_SENSE_MASK;
552
553 if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
554 if (!(desc->istate & IRQS_MASKED))
555 mask_irq(desc);
556 if (!(desc->istate & IRQS_DISABLED))
557 unmask = 1;
558 }
559
452 /* caller masked out all except trigger mode flags */ 560 /* caller masked out all except trigger mode flags */
453 ret = chip->irq_set_type(&desc->irq_data, flags); 561 ret = chip->irq_set_type(&desc->irq_data, flags);
454 562
455 if (ret) 563 switch (ret) {
456 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", 564 case IRQ_SET_MASK_OK:
457 flags, irq, chip->irq_set_type); 565 irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
458 else { 566 irqd_set(&desc->irq_data, flags);
459 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) 567
460 flags |= IRQ_LEVEL; 568 case IRQ_SET_MASK_OK_NOCOPY:
461 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ 569 flags = irqd_get_trigger_type(&desc->irq_data);
462 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); 570 irq_settings_set_trigger_mask(desc, flags);
463 desc->status |= flags; 571 irqd_clear(&desc->irq_data, IRQD_LEVEL);
572 irq_settings_clr_level(desc);
573 if (flags & IRQ_TYPE_LEVEL_MASK) {
574 irq_settings_set_level(desc);
575 irqd_set(&desc->irq_data, IRQD_LEVEL);
576 }
464 577
465 if (chip != desc->irq_data.chip) 578 if (chip != desc->irq_data.chip)
466 irq_chip_set_defaults(desc->irq_data.chip); 579 irq_chip_set_defaults(desc->irq_data.chip);
580 ret = 0;
581 break;
582 default:
583 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
584 flags, irq, chip->irq_set_type);
467 } 585 }
468 586 if (unmask)
587 unmask_irq(desc);
469 return ret; 588 return ret;
470} 589}
471 590
@@ -509,8 +628,11 @@ static int irq_wait_for_interrupt(struct irqaction *action)
509 * handler finished. unmask if the interrupt has not been disabled and 628 * handler finished. unmask if the interrupt has not been disabled and
510 * is marked MASKED. 629 * is marked MASKED.
511 */ 630 */
512static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 631static void irq_finalize_oneshot(struct irq_desc *desc,
632 struct irqaction *action, bool force)
513{ 633{
634 if (!(desc->istate & IRQS_ONESHOT))
635 return;
514again: 636again:
515 chip_bus_lock(desc); 637 chip_bus_lock(desc);
516 raw_spin_lock_irq(&desc->lock); 638 raw_spin_lock_irq(&desc->lock);
@@ -522,26 +644,44 @@ again:
522 * The thread is faster done than the hard interrupt handler 644 * The thread is faster done than the hard interrupt handler
523 * on the other CPU. If we unmask the irq line then the 645 * on the other CPU. If we unmask the irq line then the
524 * interrupt can come in again and masks the line, leaves due 646 * interrupt can come in again and masks the line, leaves due
525 * to IRQ_INPROGRESS and the irq line is masked forever. 647 * to IRQS_INPROGRESS and the irq line is masked forever.
648 *
649 * This also serializes the state of shared oneshot handlers
 650 * versus "desc->threads_oneshot |= action->thread_mask;" in
651 * irq_wake_thread(). See the comment there which explains the
652 * serialization.
526 */ 653 */
527 if (unlikely(desc->status & IRQ_INPROGRESS)) { 654 if (unlikely(desc->istate & IRQS_INPROGRESS)) {
528 raw_spin_unlock_irq(&desc->lock); 655 raw_spin_unlock_irq(&desc->lock);
529 chip_bus_sync_unlock(desc); 656 chip_bus_sync_unlock(desc);
530 cpu_relax(); 657 cpu_relax();
531 goto again; 658 goto again;
532 } 659 }
533 660
534 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 661 /*
535 desc->status &= ~IRQ_MASKED; 662 * Now check again, whether the thread should run. Otherwise
663 * we would clear the threads_oneshot bit of this thread which
664 * was just set.
665 */
666 if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
667 goto out_unlock;
668
669 desc->threads_oneshot &= ~action->thread_mask;
670
671 if (!desc->threads_oneshot && !(desc->istate & IRQS_DISABLED) &&
672 (desc->istate & IRQS_MASKED)) {
673 irq_compat_clr_masked(desc);
674 desc->istate &= ~IRQS_MASKED;
536 desc->irq_data.chip->irq_unmask(&desc->irq_data); 675 desc->irq_data.chip->irq_unmask(&desc->irq_data);
537 } 676 }
677out_unlock:
538 raw_spin_unlock_irq(&desc->lock); 678 raw_spin_unlock_irq(&desc->lock);
539 chip_bus_sync_unlock(desc); 679 chip_bus_sync_unlock(desc);
540} 680}
541 681
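Editor's sketch: irq_finalize_oneshot() above gives each threaded handler on a shared ONESHOT line one bit in desc->threads_oneshot; the line is unmasked only once the last thread has cleared its bit and the line is neither disabled nor already unmasked. A small stand-alone sketch of that bookkeeping, with illustrative names only:

#include <stdio.h>

/* one bit per threaded handler sharing the line (hypothetical state) */
static unsigned long threads_oneshot;
static int line_masked, line_disabled;

static void hard_handler(unsigned long thread_mask)
{
	line_masked = 1;                 /* keep the line quiet ... */
	threads_oneshot |= thread_mask;  /* ... until this thread finishes */
}

static void thread_finalize(unsigned long thread_mask)
{
	threads_oneshot &= ~thread_mask;
	/* unmask only when the last oneshot thread is done */
	if (!threads_oneshot && !line_disabled && line_masked) {
		line_masked = 0;
		printf("line unmasked\n");
	}
}

int main(void)
{
	hard_handler(1UL << 0);
	hard_handler(1UL << 1);
	thread_finalize(1UL << 0);  /* still masked: bit 1 outstanding */
	thread_finalize(1UL << 1);  /* now unmasked */
	return 0;
}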
542#ifdef CONFIG_SMP 682#ifdef CONFIG_SMP
543/* 683/*
 544 * Check whether we need to change the affinity of the interrupt thread. 684
545 */ 685 */
546static void 686static void
547irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) 687irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
@@ -573,6 +713,32 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
573#endif 713#endif
574 714
575/* 715/*
 716 * Interrupts which are not explicitly requested as threaded
717 * interrupts rely on the implicit bh/preempt disable of the hard irq
718 * context. So we need to disable bh here to avoid deadlocks and other
719 * side effects.
720 */
721static void
722irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
723{
724 local_bh_disable();
725 action->thread_fn(action->irq, action->dev_id);
726 irq_finalize_oneshot(desc, action, false);
727 local_bh_enable();
728}
729
730/*
 731 * Interrupts explicitly requested as threaded interrupts want to be
 732 * preemptible - many of them need to sleep and wait for slow busses to
733 * complete.
734 */
735static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
736{
737 action->thread_fn(action->irq, action->dev_id);
738 irq_finalize_oneshot(desc, action, false);
739}
740
741/*
576 * Interrupt handler thread 742 * Interrupt handler thread
577 */ 743 */
578static int irq_thread(void *data) 744static int irq_thread(void *data)
@@ -582,7 +748,14 @@ static int irq_thread(void *data)
582 }; 748 };
583 struct irqaction *action = data; 749 struct irqaction *action = data;
584 struct irq_desc *desc = irq_to_desc(action->irq); 750 struct irq_desc *desc = irq_to_desc(action->irq);
585 int wake, oneshot = desc->status & IRQ_ONESHOT; 751 void (*handler_fn)(struct irq_desc *desc, struct irqaction *action);
752 int wake;
753
 754 if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
755 &action->thread_flags))
756 handler_fn = irq_forced_thread_fn;
757 else
758 handler_fn = irq_thread_fn;
586 759
587 sched_setscheduler(current, SCHED_FIFO, &param); 760 sched_setscheduler(current, SCHED_FIFO, &param);
588 current->irqaction = action; 761 current->irqaction = action;
@@ -594,23 +767,20 @@ static int irq_thread(void *data)
594 atomic_inc(&desc->threads_active); 767 atomic_inc(&desc->threads_active);
595 768
596 raw_spin_lock_irq(&desc->lock); 769 raw_spin_lock_irq(&desc->lock);
597 if (unlikely(desc->status & IRQ_DISABLED)) { 770 if (unlikely(desc->istate & IRQS_DISABLED)) {
598 /* 771 /*
599 * CHECKME: We might need a dedicated 772 * CHECKME: We might need a dedicated
600 * IRQ_THREAD_PENDING flag here, which 773 * IRQ_THREAD_PENDING flag here, which
601 * retriggers the thread in check_irq_resend() 774 * retriggers the thread in check_irq_resend()
602 * but AFAICT IRQ_PENDING should be fine as it 775 * but AFAICT IRQS_PENDING should be fine as it
603 * retriggers the interrupt itself --- tglx 776 * retriggers the interrupt itself --- tglx
604 */ 777 */
605 desc->status |= IRQ_PENDING; 778 irq_compat_set_pending(desc);
779 desc->istate |= IRQS_PENDING;
606 raw_spin_unlock_irq(&desc->lock); 780 raw_spin_unlock_irq(&desc->lock);
607 } else { 781 } else {
608 raw_spin_unlock_irq(&desc->lock); 782 raw_spin_unlock_irq(&desc->lock);
609 783 handler_fn(desc, action);
610 action->thread_fn(action->irq, action->dev_id);
611
612 if (oneshot)
613 irq_finalize_oneshot(action->irq, desc);
614 } 784 }
615 785
616 wake = atomic_dec_and_test(&desc->threads_active); 786 wake = atomic_dec_and_test(&desc->threads_active);
@@ -619,6 +789,9 @@ static int irq_thread(void *data)
619 wake_up(&desc->wait_for_threads); 789 wake_up(&desc->wait_for_threads);
620 } 790 }
621 791
792 /* Prevent a stale desc->threads_oneshot */
793 irq_finalize_oneshot(desc, action, true);
794
622 /* 795 /*
623 * Clear irqaction. Otherwise exit_irq_thread() would make 796 * Clear irqaction. Otherwise exit_irq_thread() would make
624 * fuzz about an active irq thread going into nirvana. 797 * fuzz about an active irq thread going into nirvana.
@@ -633,6 +806,7 @@ static int irq_thread(void *data)
633void exit_irq_thread(void) 806void exit_irq_thread(void)
634{ 807{
635 struct task_struct *tsk = current; 808 struct task_struct *tsk = current;
809 struct irq_desc *desc;
636 810
637 if (!tsk->irqaction) 811 if (!tsk->irqaction)
638 return; 812 return;
@@ -641,6 +815,14 @@ void exit_irq_thread(void)
641 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 815 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
642 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); 816 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
643 817
818 desc = irq_to_desc(tsk->irqaction->irq);
819
820 /*
821 * Prevent a stale desc->threads_oneshot. Must be called
822 * before setting the IRQTF_DIED flag.
823 */
824 irq_finalize_oneshot(desc, tsk->irqaction, true);
825
644 /* 826 /*
645 * Set the THREAD DIED flag to prevent further wakeups of the 827 * Set the THREAD DIED flag to prevent further wakeups of the
646 * soon to be gone threaded handler. 828 * soon to be gone threaded handler.
@@ -648,6 +830,22 @@ void exit_irq_thread(void)
648 set_bit(IRQTF_DIED, &tsk->irqaction->flags); 830 set_bit(IRQTF_DIED, &tsk->irqaction->flags);
649} 831}
650 832
833static void irq_setup_forced_threading(struct irqaction *new)
834{
835 if (!force_irqthreads)
836 return;
837 if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
838 return;
839
840 new->flags |= IRQF_ONESHOT;
841
842 if (!new->thread_fn) {
843 set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
844 new->thread_fn = new->handler;
845 new->handler = irq_default_primary_handler;
846 }
847}
848
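Editor's sketch: irq_setup_forced_threading() moves a non-threaded primary handler into a thread and forces IRQF_ONESHOT, so the line stays masked until the thread has run. The driver-visible equivalent, when a handler wants to be threaded explicitly, is request_threaded_irq() with a NULL primary handler; a hedged sketch follows, with made-up device and handler names:

#include <linux/interrupt.h>

/* runs in process context, may sleep and talk to slow buses */
static irqreturn_t mydev_thread_fn(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int mydev_request_irq(unsigned int irq, void *dev)
{
	/*
	 * NULL primary handler: the core installs a default one.
	 * IRQF_ONESHOT keeps a level-triggered line masked until the
	 * thread finishes, matching the forced-threading behaviour
	 * set up above.
	 */
	return request_threaded_irq(irq, NULL, mydev_thread_fn,
				    IRQF_ONESHOT, "mydev", dev);
}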
651/* 849/*
652 * Internal function to register an irqaction - typically used to 850 * Internal function to register an irqaction - typically used to
653 * allocate special interrupts that are part of the architecture. 851 * allocate special interrupts that are part of the architecture.
@@ -657,9 +855,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
657{ 855{
658 struct irqaction *old, **old_ptr; 856 struct irqaction *old, **old_ptr;
659 const char *old_name = NULL; 857 const char *old_name = NULL;
660 unsigned long flags; 858 unsigned long flags, thread_mask = 0;
661 int nested, shared = 0; 859 int ret, nested, shared = 0;
662 int ret; 860 cpumask_var_t mask;
663 861
664 if (!desc) 862 if (!desc)
665 return -EINVAL; 863 return -EINVAL;
@@ -683,15 +881,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
683 rand_initialize_irq(irq); 881 rand_initialize_irq(irq);
684 } 882 }
685 883
686 /* Oneshot interrupts are not allowed with shared */
687 if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
688 return -EINVAL;
689
690 /* 884 /*
691 * Check whether the interrupt nests into another interrupt 885 * Check whether the interrupt nests into another interrupt
692 * thread. 886 * thread.
693 */ 887 */
694 nested = desc->status & IRQ_NESTED_THREAD; 888 nested = irq_settings_is_nested_thread(desc);
695 if (nested) { 889 if (nested) {
696 if (!new->thread_fn) 890 if (!new->thread_fn)
697 return -EINVAL; 891 return -EINVAL;
@@ -701,6 +895,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
701 * dummy function which warns when called. 895 * dummy function which warns when called.
702 */ 896 */
703 new->handler = irq_nested_primary_handler; 897 new->handler = irq_nested_primary_handler;
898 } else {
899 irq_setup_forced_threading(new);
704 } 900 }
705 901
706 /* 902 /*
@@ -724,6 +920,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
724 new->thread = t; 920 new->thread = t;
725 } 921 }
726 922
923 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
924 ret = -ENOMEM;
925 goto out_thread;
926 }
927
727 /* 928 /*
728 * The following block of code has to be executed atomically 929 * The following block of code has to be executed atomically
729 */ 930 */
@@ -735,29 +936,40 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
735 * Can't share interrupts unless both agree to and are 936 * Can't share interrupts unless both agree to and are
736 * the same type (level, edge, polarity). So both flag 937 * the same type (level, edge, polarity). So both flag
737 * fields must have IRQF_SHARED set and the bits which 938 * fields must have IRQF_SHARED set and the bits which
738 * set the trigger type must match. 939 * set the trigger type must match. Also all must
940 * agree on ONESHOT.
739 */ 941 */
740 if (!((old->flags & new->flags) & IRQF_SHARED) || 942 if (!((old->flags & new->flags) & IRQF_SHARED) ||
741 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { 943 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
944 ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
742 old_name = old->name; 945 old_name = old->name;
743 goto mismatch; 946 goto mismatch;
744 } 947 }
745 948
746#if defined(CONFIG_IRQ_PER_CPU)
747 /* All handlers must agree on per-cpuness */ 949 /* All handlers must agree on per-cpuness */
748 if ((old->flags & IRQF_PERCPU) != 950 if ((old->flags & IRQF_PERCPU) !=
749 (new->flags & IRQF_PERCPU)) 951 (new->flags & IRQF_PERCPU))
750 goto mismatch; 952 goto mismatch;
751#endif
752 953
753 /* add new interrupt at end of irq queue */ 954 /* add new interrupt at end of irq queue */
754 do { 955 do {
956 thread_mask |= old->thread_mask;
755 old_ptr = &old->next; 957 old_ptr = &old->next;
756 old = *old_ptr; 958 old = *old_ptr;
757 } while (old); 959 } while (old);
758 shared = 1; 960 shared = 1;
759 } 961 }
760 962
963 /*
964 * Setup the thread mask for this irqaction. Unlikely to have
 965 * 32 or 64 irqs sharing one line, but who knows.
966 */
967 if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) {
968 ret = -EBUSY;
969 goto out_mask;
970 }
971 new->thread_mask = 1 << ffz(thread_mask);
972
761 if (!shared) { 973 if (!shared) {
762 irq_chip_set_defaults(desc->irq_data.chip); 974 irq_chip_set_defaults(desc->irq_data.chip);
763 975
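Editor's sketch: each irqaction on a shared ONESHOT line gets a private bit in thread_mask, allocated with ffz() over the OR of the bits already taken, so at most BITS_PER_LONG handlers can share one line. A user-space sketch of the allocation, with ffz emulated via a GCC builtin:

#include <stdio.h>

/* ffz(x): index of the first zero bit (assumes x != ~0UL) */
static unsigned int ffz_ul(unsigned long x)
{
	return __builtin_ctzl(~x);
}

int main(void)
{
	unsigned long thread_mask = 0;
	int i;

	/* give each of four shared handlers its own bit */
	for (i = 0; i < 4; i++) {
		unsigned long bit = 1UL << ffz_ul(thread_mask);

		thread_mask |= bit;
		printf("handler %d -> mask %#lx\n", i, bit);
	}
	return 0;
}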
@@ -769,42 +981,44 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
769 new->flags & IRQF_TRIGGER_MASK); 981 new->flags & IRQF_TRIGGER_MASK);
770 982
771 if (ret) 983 if (ret)
772 goto out_thread; 984 goto out_mask;
773 } else 985 }
774 compat_irq_chip_set_default_handler(desc); 986
775#if defined(CONFIG_IRQ_PER_CPU) 987 desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
776 if (new->flags & IRQF_PERCPU) 988 IRQS_INPROGRESS | IRQS_ONESHOT | \
777 desc->status |= IRQ_PER_CPU; 989 IRQS_WAITING);
778#endif
779 990
780 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | 991 if (new->flags & IRQF_PERCPU) {
781 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 992 irqd_set(&desc->irq_data, IRQD_PER_CPU);
993 irq_settings_set_per_cpu(desc);
994 }
782 995
783 if (new->flags & IRQF_ONESHOT) 996 if (new->flags & IRQF_ONESHOT)
784 desc->status |= IRQ_ONESHOT; 997 desc->istate |= IRQS_ONESHOT;
785 998
786 if (!(desc->status & IRQ_NOAUTOEN)) { 999 if (irq_settings_can_autoenable(desc))
787 desc->depth = 0; 1000 irq_startup(desc);
788 desc->status &= ~IRQ_DISABLED; 1001 else
789 desc->irq_data.chip->irq_startup(&desc->irq_data);
790 } else
791 /* Undo nested disables: */ 1002 /* Undo nested disables: */
792 desc->depth = 1; 1003 desc->depth = 1;
793 1004
794 /* Exclude IRQ from balancing if requested */ 1005 /* Exclude IRQ from balancing if requested */
795 if (new->flags & IRQF_NOBALANCING) 1006 if (new->flags & IRQF_NOBALANCING) {
796 desc->status |= IRQ_NO_BALANCING; 1007 irq_settings_set_no_balancing(desc);
1008 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
1009 }
797 1010
798 /* Set default affinity mask once everything is setup */ 1011 /* Set default affinity mask once everything is setup */
799 setup_affinity(irq, desc); 1012 setup_affinity(irq, desc, mask);
800 1013
801 } else if ((new->flags & IRQF_TRIGGER_MASK) 1014 } else if (new->flags & IRQF_TRIGGER_MASK) {
802 && (new->flags & IRQF_TRIGGER_MASK) 1015 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
803 != (desc->status & IRQ_TYPE_SENSE_MASK)) { 1016 unsigned int omsk = irq_settings_get_trigger_mask(desc);
804 /* hope the handler works with the actual trigger mode... */ 1017
805 pr_warning("IRQ %d uses trigger mode %d; requested %d\n", 1018 if (nmsk != omsk)
806 irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), 1019 /* hope the handler works with current trigger mode */
807 (int)(new->flags & IRQF_TRIGGER_MASK)); 1020 pr_warning("IRQ %d uses trigger mode %u; requested %u\n",
1021 irq, nmsk, omsk);
808 } 1022 }
809 1023
810 new->irq = irq; 1024 new->irq = irq;
@@ -818,8 +1032,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
818 * Check whether we disabled the irq via the spurious handler 1032 * Check whether we disabled the irq via the spurious handler
819 * before. Reenable it and give it another chance. 1033 * before. Reenable it and give it another chance.
820 */ 1034 */
821 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { 1035 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
822 desc->status &= ~IRQ_SPURIOUS_DISABLED; 1036 desc->istate &= ~IRQS_SPURIOUS_DISABLED;
823 __enable_irq(desc, irq, false); 1037 __enable_irq(desc, irq, false);
824 } 1038 }
825 1039
@@ -849,8 +1063,11 @@ mismatch:
849#endif 1063#endif
850 ret = -EBUSY; 1064 ret = -EBUSY;
851 1065
852out_thread: 1066out_mask:
853 raw_spin_unlock_irqrestore(&desc->lock, flags); 1067 raw_spin_unlock_irqrestore(&desc->lock, flags);
1068 free_cpumask_var(mask);
1069
1070out_thread:
854 if (new->thread) { 1071 if (new->thread) {
855 struct task_struct *t = new->thread; 1072 struct task_struct *t = new->thread;
856 1073
@@ -871,9 +1088,14 @@ out_thread:
871 */ 1088 */
872int setup_irq(unsigned int irq, struct irqaction *act) 1089int setup_irq(unsigned int irq, struct irqaction *act)
873{ 1090{
1091 int retval;
874 struct irq_desc *desc = irq_to_desc(irq); 1092 struct irq_desc *desc = irq_to_desc(irq);
875 1093
876 return __setup_irq(irq, desc, act); 1094 chip_bus_lock(desc);
1095 retval = __setup_irq(irq, desc, act);
1096 chip_bus_sync_unlock(desc);
1097
1098 return retval;
877} 1099}
878EXPORT_SYMBOL_GPL(setup_irq); 1100EXPORT_SYMBOL_GPL(setup_irq);
879 1101
@@ -924,13 +1146,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
924#endif 1146#endif
925 1147
926 /* If this was the last handler, shut down the IRQ line: */ 1148 /* If this was the last handler, shut down the IRQ line: */
927 if (!desc->action) { 1149 if (!desc->action)
928 desc->status |= IRQ_DISABLED; 1150 irq_shutdown(desc);
929 if (desc->irq_data.chip->irq_shutdown)
930 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
931 else
932 desc->irq_data.chip->irq_disable(&desc->irq_data);
933 }
934 1151
935#ifdef CONFIG_SMP 1152#ifdef CONFIG_SMP
936 /* make sure affinity_hint is cleaned up */ 1153 /* make sure affinity_hint is cleaned up */
@@ -1004,6 +1221,11 @@ void free_irq(unsigned int irq, void *dev_id)
1004 if (!desc) 1221 if (!desc)
1005 return; 1222 return;
1006 1223
1224#ifdef CONFIG_SMP
1225 if (WARN_ON(desc->affinity_notify))
1226 desc->affinity_notify = NULL;
1227#endif
1228
1007 chip_bus_lock(desc); 1229 chip_bus_lock(desc);
1008 kfree(__free_irq(irq, dev_id)); 1230 kfree(__free_irq(irq, dev_id));
1009 chip_bus_sync_unlock(desc); 1231 chip_bus_sync_unlock(desc);
@@ -1074,7 +1296,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1074 if (!desc) 1296 if (!desc)
1075 return -EINVAL; 1297 return -EINVAL;
1076 1298
1077 if (desc->status & IRQ_NOREQUEST) 1299 if (!irq_settings_can_request(desc))
1078 return -EINVAL; 1300 return -EINVAL;
1079 1301
1080 if (!handler) { 1302 if (!handler) {
@@ -1149,7 +1371,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1149 if (!desc) 1371 if (!desc)
1150 return -EINVAL; 1372 return -EINVAL;
1151 1373
1152 if (desc->status & IRQ_NESTED_THREAD) { 1374 if (irq_settings_is_nested_thread(desc)) {
1153 ret = request_threaded_irq(irq, NULL, handler, 1375 ret = request_threaded_irq(irq, NULL, handler,
1154 flags, name, dev_id); 1376 flags, name, dev_id);
1155 return !ret ? IRQC_IS_NESTED : ret; 1377 return !ret ? IRQC_IS_NESTED : ret;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 441fd629ff04..ec4806d4778b 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,23 +4,23 @@
4 4
5#include "internals.h" 5#include "internals.h"
6 6
7void move_masked_irq(int irq) 7void irq_move_masked_irq(struct irq_data *idata)
8{ 8{
9 struct irq_desc *desc = irq_to_desc(irq); 9 struct irq_desc *desc = irq_data_to_desc(idata);
10 struct irq_chip *chip = desc->irq_data.chip; 10 struct irq_chip *chip = idata->chip;
11 11
12 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 12 if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
13 return; 13 return;
14 14
15 /* 15 /*
16 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. 16 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
17 */ 17 */
18 if (CHECK_IRQ_PER_CPU(desc->status)) { 18 if (!irqd_can_balance(&desc->irq_data)) {
19 WARN_ON(1); 19 WARN_ON(1);
20 return; 20 return;
21 } 21 }
22 22
23 desc->status &= ~IRQ_MOVE_PENDING; 23 irqd_clr_move_pending(&desc->irq_data);
24 24
25 if (unlikely(cpumask_empty(desc->pending_mask))) 25 if (unlikely(cpumask_empty(desc->pending_mask)))
26 return; 26 return;
@@ -53,15 +53,20 @@ void move_masked_irq(int irq)
53 cpumask_clear(desc->pending_mask); 53 cpumask_clear(desc->pending_mask);
54} 54}
55 55
56void move_native_irq(int irq) 56void move_masked_irq(int irq)
57{
58 irq_move_masked_irq(irq_get_irq_data(irq));
59}
60
61void irq_move_irq(struct irq_data *idata)
57{ 62{
58 struct irq_desc *desc = irq_to_desc(irq); 63 struct irq_desc *desc = irq_data_to_desc(idata);
59 bool masked; 64 bool masked;
60 65
61 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 66 if (likely(!irqd_is_setaffinity_pending(idata)))
62 return; 67 return;
63 68
64 if (unlikely(desc->status & IRQ_DISABLED)) 69 if (unlikely(desc->istate & IRQS_DISABLED))
65 return; 70 return;
66 71
67 /* 72 /*
@@ -69,10 +74,15 @@ void move_native_irq(int irq)
69 * threaded interrupt with ONESHOT set, we can end up with an 74 * threaded interrupt with ONESHOT set, we can end up with an
70 * interrupt storm. 75 * interrupt storm.
71 */ 76 */
72 masked = desc->status & IRQ_MASKED; 77 masked = desc->istate & IRQS_MASKED;
73 if (!masked) 78 if (!masked)
74 desc->irq_data.chip->irq_mask(&desc->irq_data); 79 idata->chip->irq_mask(idata);
75 move_masked_irq(irq); 80 irq_move_masked_irq(idata);
76 if (!masked) 81 if (!masked)
77 desc->irq_data.chip->irq_unmask(&desc->irq_data); 82 idata->chip->irq_unmask(idata);
83}
84
85void move_native_irq(int irq)
86{
87 irq_move_irq(irq_get_irq_data(irq));
78} 88}
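Editor's sketch: irq_move_irq() quiesces the line before rewriting the routing and only undoes the masking it did itself, so a line that a ONESHOT thread still holds masked stays masked. The save/mask/move/restore shape in a plain C sketch, with hypothetical names:

#include <stdbool.h>

struct line_state { bool masked; };

static void reprogram_affinity(struct line_state *l)
{
	(void)l;                    /* stand-in for the real move */
}

static void move_pending(struct line_state *l)
{
	bool was_masked = l->masked;

	if (!was_masked)
		l->masked = true;   /* quiesce before rewriting routing */
	reprogram_affinity(l);
	if (!was_masked)
		l->masked = false;  /* restore only what we changed */
}

int main(void)
{
	struct line_state l = { .masked = true };

	move_pending(&l);
	return l.masked ? 0 : 1;    /* still masked: the owner unmasks later */
}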
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 0d4005d85b03..f76fc00c9877 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -18,7 +18,7 @@
18 * During system-wide suspend or hibernation device drivers need to be prevented 18 * During system-wide suspend or hibernation device drivers need to be prevented
19 * from receiving interrupts and this function is provided for this purpose. 19 * from receiving interrupts and this function is provided for this purpose.
20 * It marks all interrupt lines in use, except for the timer ones, as disabled 20 * It marks all interrupt lines in use, except for the timer ones, as disabled
21 * and sets the IRQ_SUSPENDED flag for each of them. 21 * and sets the IRQS_SUSPENDED flag for each of them.
22 */ 22 */
23void suspend_device_irqs(void) 23void suspend_device_irqs(void)
24{ 24{
@@ -34,7 +34,7 @@ void suspend_device_irqs(void)
34 } 34 }
35 35
36 for_each_irq_desc(irq, desc) 36 for_each_irq_desc(irq, desc)
37 if (desc->status & IRQ_SUSPENDED) 37 if (desc->istate & IRQS_SUSPENDED)
38 synchronize_irq(irq); 38 synchronize_irq(irq);
39} 39}
40EXPORT_SYMBOL_GPL(suspend_device_irqs); 40EXPORT_SYMBOL_GPL(suspend_device_irqs);
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs);
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() 43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 * 44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that 45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQ_SUSPENDED flag set. 46 * have the IRQS_SUSPENDED flag set.
47 */ 47 */
48void resume_device_irqs(void) 48void resume_device_irqs(void)
49{ 49{
@@ -53,9 +53,6 @@ void resume_device_irqs(void)
53 for_each_irq_desc(irq, desc) { 53 for_each_irq_desc(irq, desc) {
54 unsigned long flags; 54 unsigned long flags;
55 55
56 if (!(desc->status & IRQ_SUSPENDED))
57 continue;
58
59 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true); 57 __enable_irq(desc, irq, true);
61 raw_spin_unlock_irqrestore(&desc->lock, flags); 58 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -71,9 +68,24 @@ int check_wakeup_irqs(void)
71 struct irq_desc *desc; 68 struct irq_desc *desc;
72 int irq; 69 int irq;
73 70
74 for_each_irq_desc(irq, desc) 71 for_each_irq_desc(irq, desc) {
75 if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) 72 if (irqd_is_wakeup_set(&desc->irq_data)) {
76 return -EBUSY; 73 if (desc->istate & IRQS_PENDING)
74 return -EBUSY;
75 continue;
76 }
77 /*
 78 * Check whether the non-wakeup interrupts need
 79 * to be masked before finally going into suspend
80 * state. That's for hardware which has no wakeup
81 * source configuration facility. The chip
82 * implementation indicates that with
83 * IRQCHIP_MASK_ON_SUSPEND.
84 */
85 if (desc->istate & IRQS_SUSPENDED &&
86 irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
87 mask_irq(desc);
88 }
77 89
78 return 0; 90 return 0;
79} 91}
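Editor's sketch: check_wakeup_irqs() now also masks suspended non-wakeup interrupts for chips that cannot configure wakeup sources in hardware. Such a chip opts in by setting IRQCHIP_MASK_ON_SUSPEND; a hedged sketch of what that looks like, register accesses elided and names invented:

#include <linux/irq.h>

static void demo_irq_mask(struct irq_data *d)
{
	/* set the per-line mask bit in the controller */
}

static void demo_irq_unmask(struct irq_data *d)
{
	/* clear the per-line mask bit in the controller */
}

static struct irq_chip demo_irq_chip = {
	.name		= "demo",
	.irq_mask	= demo_irq_mask,
	.irq_unmask	= demo_irq_unmask,
	/* no wakeup configuration in hardware: let the core mask us */
	.flags		= IRQCHIP_MASK_ON_SUSPEND,
};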
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6c8a2a9f8a7b..626d092eed9a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -11,6 +11,7 @@
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h>
14 15
15#include "internals.h" 16#include "internals.h"
16 17
@@ -24,7 +25,7 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
24 const struct cpumask *mask = desc->irq_data.affinity; 25 const struct cpumask *mask = desc->irq_data.affinity;
25 26
26#ifdef CONFIG_GENERIC_PENDING_IRQ 27#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING) 28 if (irqd_is_setaffinity_pending(&desc->irq_data))
28 mask = desc->pending_mask; 29 mask = desc->pending_mask;
29#endif 30#endif
30 seq_cpumask(m, mask); 31 seq_cpumask(m, mask);
@@ -65,8 +66,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
65 cpumask_var_t new_value; 66 cpumask_var_t new_value;
66 int err; 67 int err;
67 68
68 if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || 69 if (!irq_can_set_affinity(irq) || no_irq_affinity)
69 irq_balancing_disabled(irq))
70 return -EIO; 70 return -EIO;
71 71
72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
@@ -89,7 +89,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
89 if (!cpumask_intersects(new_value, cpu_online_mask)) { 89 if (!cpumask_intersects(new_value, cpu_online_mask)) {
90 /* Special case for empty set - allow the architecture 90 /* Special case for empty set - allow the architecture
91 code to set default SMP affinity. */ 91 code to set default SMP affinity. */
92 err = irq_select_affinity_usr(irq) ? -EINVAL : count; 92 err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count;
93 } else { 93 } else {
94 irq_set_affinity(irq, new_value); 94 irq_set_affinity(irq, new_value);
95 err = count; 95 err = count;
@@ -357,3 +357,79 @@ void init_irq_proc(void)
357 } 357 }
358} 358}
359 359
360#ifdef CONFIG_GENERIC_IRQ_SHOW
361
362int __weak arch_show_interrupts(struct seq_file *p, int prec)
363{
364 return 0;
365}
366
367int show_interrupts(struct seq_file *p, void *v)
368{
369 static int prec;
370
371 unsigned long flags, any_count = 0;
372 int i = *(loff_t *) v, j;
373 struct irqaction *action;
374 struct irq_desc *desc;
375
376 if (i > nr_irqs)
377 return 0;
378
379 if (i == nr_irqs)
380 return arch_show_interrupts(p, prec);
381
382 /* print header and calculate the width of the first column */
383 if (i == 0) {
384 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
385 j *= 10;
386
387 seq_printf(p, "%*s", prec + 8, "");
388 for_each_online_cpu(j)
389 seq_printf(p, "CPU%-8d", j);
390 seq_putc(p, '\n');
391 }
392
393 desc = irq_to_desc(i);
394 if (!desc)
395 return 0;
396
397 raw_spin_lock_irqsave(&desc->lock, flags);
398 for_each_online_cpu(j)
399 any_count |= kstat_irqs_cpu(i, j);
400 action = desc->action;
401 if (!action && !any_count)
402 goto out;
403
404 seq_printf(p, "%*d: ", prec, i);
405 for_each_online_cpu(j)
406 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
407
408 if (desc->irq_data.chip) {
409 if (desc->irq_data.chip->irq_print_chip)
410 desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
411 else if (desc->irq_data.chip->name)
412 seq_printf(p, " %8s", desc->irq_data.chip->name);
413 else
414 seq_printf(p, " %8s", "-");
415 } else {
416 seq_printf(p, " %8s", "None");
417 }
 418#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
419 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
420#endif
421 if (desc->name)
422 seq_printf(p, "-%-8s", desc->name);
423
424 if (action) {
425 seq_printf(p, " %s", action->name);
426 while ((action = action->next) != NULL)
427 seq_printf(p, ", %s", action->name);
428 }
429
430 seq_putc(p, '\n');
431out:
432 raw_spin_unlock_irqrestore(&desc->lock, flags);
433 return 0;
434}
435#endif
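Editor's sketch: the prec loop in show_interrupts() sizes the first column to the widest interrupt number, with a floor of 3 and a cap of 10 digits. The same rule in a stand-alone program:

#include <stdio.h>

static int irq_column_width(int nr_irqs)
{
	int prec, j;

	/* at least 3 columns, enough digits for nr_irqs, capped at 10 */
	for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
		j *= 10;
	return prec;
}

int main(void)
{
	printf("%d %d %d\n", irq_column_width(16),
	       irq_column_width(1024), irq_column_width(123456));
	return 0;
}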
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index dc49358b73fa..ad683a99b1ec 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -55,20 +55,19 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
55 */ 55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq) 56void check_irq_resend(struct irq_desc *desc, unsigned int irq)
57{ 57{
58 unsigned int status = desc->status;
59
60 /*
61 * Make sure the interrupt is enabled, before resending it:
62 */
63 desc->irq_data.chip->irq_enable(&desc->irq_data);
64
65 /* 58 /*
66 * We do not resend level type interrupts. Level type 59 * We do not resend level type interrupts. Level type
67 * interrupts are resent by hardware when they are still 60 * interrupts are resent by hardware when they are still
68 * active. 61 * active.
69 */ 62 */
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 63 if (irq_settings_is_level(desc))
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 64 return;
65 if (desc->istate & IRQS_REPLAY)
66 return;
67 if (desc->istate & IRQS_PENDING) {
68 irq_compat_clr_pending(desc);
69 desc->istate &= ~IRQS_PENDING;
70 desc->istate |= IRQS_REPLAY;
72 71
73 if (!desc->irq_data.chip->irq_retrigger || 72 if (!desc->irq_data.chip->irq_retrigger ||
74 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { 73 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
new file mode 100644
index 000000000000..0227ad358272
--- /dev/null
+++ b/kernel/irq/settings.h
@@ -0,0 +1,138 @@
1/*
2 * Internal header to deal with irq_desc->status which will be renamed
3 * to irq_desc->settings.
4 */
5enum {
6 _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS,
7 _IRQ_PER_CPU = IRQ_PER_CPU,
8 _IRQ_LEVEL = IRQ_LEVEL,
9 _IRQ_NOPROBE = IRQ_NOPROBE,
10 _IRQ_NOREQUEST = IRQ_NOREQUEST,
11 _IRQ_NOAUTOEN = IRQ_NOAUTOEN,
12 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
13 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
14 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
15 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
16};
17
18#define IRQ_INPROGRESS GOT_YOU_MORON
19#define IRQ_REPLAY GOT_YOU_MORON
20#define IRQ_WAITING GOT_YOU_MORON
21#define IRQ_DISABLED GOT_YOU_MORON
22#define IRQ_PENDING GOT_YOU_MORON
23#define IRQ_MASKED GOT_YOU_MORON
24#define IRQ_WAKEUP GOT_YOU_MORON
25#define IRQ_MOVE_PENDING GOT_YOU_MORON
26#define IRQ_PER_CPU GOT_YOU_MORON
27#define IRQ_NO_BALANCING GOT_YOU_MORON
28#define IRQ_AFFINITY_SET GOT_YOU_MORON
29#define IRQ_LEVEL GOT_YOU_MORON
30#define IRQ_NOPROBE GOT_YOU_MORON
31#define IRQ_NOREQUEST GOT_YOU_MORON
32#define IRQ_NOAUTOEN GOT_YOU_MORON
33#define IRQ_NESTED_THREAD GOT_YOU_MORON
34#undef IRQF_MODIFY_MASK
35#define IRQF_MODIFY_MASK GOT_YOU_MORON
36
37static inline void
38irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
39{
40 desc->status &= ~(clr & _IRQF_MODIFY_MASK);
41 desc->status |= (set & _IRQF_MODIFY_MASK);
42}
43
44static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
45{
46 return desc->status & _IRQ_PER_CPU;
47}
48
49static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
50{
51 desc->status |= _IRQ_PER_CPU;
52}
53
54static inline void irq_settings_set_no_balancing(struct irq_desc *desc)
55{
56 desc->status |= _IRQ_NO_BALANCING;
57}
58
59static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
60{
61 return desc->status & _IRQ_NO_BALANCING;
62}
63
64static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc)
65{
66 return desc->status & IRQ_TYPE_SENSE_MASK;
67}
68
69static inline void
70irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask)
71{
72 desc->status &= ~IRQ_TYPE_SENSE_MASK;
73 desc->status |= mask & IRQ_TYPE_SENSE_MASK;
74}
75
76static inline bool irq_settings_is_level(struct irq_desc *desc)
77{
78 return desc->status & _IRQ_LEVEL;
79}
80
81static inline void irq_settings_clr_level(struct irq_desc *desc)
82{
83 desc->status &= ~_IRQ_LEVEL;
84}
85
86static inline void irq_settings_set_level(struct irq_desc *desc)
87{
88 desc->status |= _IRQ_LEVEL;
89}
90
91static inline bool irq_settings_can_request(struct irq_desc *desc)
92{
93 return !(desc->status & _IRQ_NOREQUEST);
94}
95
96static inline void irq_settings_clr_norequest(struct irq_desc *desc)
97{
98 desc->status &= ~_IRQ_NOREQUEST;
99}
100
101static inline void irq_settings_set_norequest(struct irq_desc *desc)
102{
103 desc->status |= _IRQ_NOREQUEST;
104}
105
106static inline bool irq_settings_can_probe(struct irq_desc *desc)
107{
108 return !(desc->status & _IRQ_NOPROBE);
109}
110
111static inline void irq_settings_clr_noprobe(struct irq_desc *desc)
112{
113 desc->status &= ~_IRQ_NOPROBE;
114}
115
116static inline void irq_settings_set_noprobe(struct irq_desc *desc)
117{
118 desc->status |= _IRQ_NOPROBE;
119}
120
121static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
122{
123 return desc->status & _IRQ_MOVE_PCNTXT;
124}
125
126static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
127{
128 return !(desc->status & _IRQ_NOAUTOEN);
129}
130
131static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
132{
133 return desc->status & _IRQ_NESTED_THREAD;
134}
135
136/* Nothing should touch desc->status from now on */
137#undef status
138#define status USE_THE_PROPER_WRAPPERS_YOU_MORON
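Editor's sketch: the header wraps every remaining use of desc->status in an accessor and then poisons the raw names so that any leftover direct access fails to compile. The same trick on a toy structure, with illustrative names:

struct widget {
	unsigned int status;        /* to be accessed via helpers only */
};

static inline int widget_is_ready(struct widget *w)
{
	return w->status & 0x1;
}

static inline void widget_set_ready(struct widget *w)
{
	w->status |= 0x1;
}

/* Nothing should touch widget::status directly from now on */
#define status USE_THE_ACCESSORS_INSTEAD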
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 3089d3b9d5f3..dd586ebf9c8c 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -21,70 +21,94 @@ static int irqfixup __read_mostly;
21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) 21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
22static void poll_spurious_irqs(unsigned long dummy); 22static void poll_spurious_irqs(unsigned long dummy);
23static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); 23static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
24static int irq_poll_cpu;
25static atomic_t irq_poll_active;
26
27/*
28 * We wait here for a poller to finish.
29 *
30 * If the poll runs on this CPU, then we yell loudly and return
31 * false. That will leave the interrupt line disabled in the worst
32 * case, but it should never happen.
33 *
34 * We wait until the poller is done and then recheck disabled and
35 * action (about to be disabled). Only if it's still active, we return
36 * true and let the handler run.
37 */
38bool irq_wait_for_poll(struct irq_desc *desc)
39{
40 if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
41 "irq poll in progress on cpu %d for irq %d\n",
42 smp_processor_id(), desc->irq_data.irq))
43 return false;
44
45#ifdef CONFIG_SMP
46 do {
47 raw_spin_unlock(&desc->lock);
48 while (desc->istate & IRQS_INPROGRESS)
49 cpu_relax();
50 raw_spin_lock(&desc->lock);
51 } while (desc->istate & IRQS_INPROGRESS);
52 /* Might have been disabled in meantime */
53 return !(desc->istate & IRQS_DISABLED) && desc->action;
54#else
55 return false;
56#endif
57}
58
24 59
25/* 60/*
26 * Recovery handler for misrouted interrupts. 61 * Recovery handler for misrouted interrupts.
27 */ 62 */
28static int try_one_irq(int irq, struct irq_desc *desc) 63static int try_one_irq(int irq, struct irq_desc *desc, bool force)
29{ 64{
65 irqreturn_t ret = IRQ_NONE;
30 struct irqaction *action; 66 struct irqaction *action;
31 int ok = 0, work = 0;
32 67
33 raw_spin_lock(&desc->lock); 68 raw_spin_lock(&desc->lock);
34 /* Already running on another processor */
35 if (desc->status & IRQ_INPROGRESS) {
36 /*
37 * Already running: If it is shared get the other
38 * CPU to go looking for our mystery interrupt too
39 */
40 if (desc->action && (desc->action->flags & IRQF_SHARED))
41 desc->status |= IRQ_PENDING;
42 raw_spin_unlock(&desc->lock);
43 return ok;
44 }
45 /* Honour the normal IRQ locking */
46 desc->status |= IRQ_INPROGRESS;
47 action = desc->action;
48 raw_spin_unlock(&desc->lock);
49 69
50 while (action) { 70 /* PER_CPU and nested thread interrupts are never polled */
51 /* Only shared IRQ handlers are safe to call */ 71 if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc))
52 if (action->flags & IRQF_SHARED) { 72 goto out;
53 if (action->handler(irq, action->dev_id) ==
54 IRQ_HANDLED)
55 ok = 1;
56 }
57 action = action->next;
58 }
59 local_irq_disable();
60 /* Now clean up the flags */
61 raw_spin_lock(&desc->lock);
62 action = desc->action;
63 73
64 /* 74 /*
65 * While we were looking for a fixup someone queued a real 75 * Do not poll disabled interrupts unless the spurious
 66 * IRQ clashing with our walk: 76 * disabled poller asks explicitly.
67 */ 77 */
68 while ((desc->status & IRQ_PENDING) && action) { 78 if ((desc->istate & IRQS_DISABLED) && !force)
79 goto out;
80
81 /*
82 * All handlers must agree on IRQF_SHARED, so we test just the
83 * first. Check for action->next as well.
84 */
85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || !action->next)
88 goto out;
89
90 /* Already running on another processor */
91 if (desc->istate & IRQS_INPROGRESS) {
69 /* 92 /*
70 * Perform real IRQ processing for the IRQ we deferred 93 * Already running: If it is shared get the other
94 * CPU to go looking for our mystery interrupt too
71 */ 95 */
72 work = 1; 96 irq_compat_set_pending(desc);
73 raw_spin_unlock(&desc->lock); 97 desc->istate |= IRQS_PENDING;
74 handle_IRQ_event(irq, action); 98 goto out;
75 raw_spin_lock(&desc->lock);
76 desc->status &= ~IRQ_PENDING;
77 } 99 }
78 desc->status &= ~IRQ_INPROGRESS;
79 /*
80 * If we did actual work for the real IRQ line we must let the
81 * IRQ controller clean up too
82 */
83 if (work)
84 irq_end(irq, desc);
85 raw_spin_unlock(&desc->lock);
86 100
87 return ok; 101 /* Mark it poll in progress */
102 desc->istate |= IRQS_POLL_INPROGRESS;
103 do {
104 if (handle_irq_event(desc) == IRQ_HANDLED)
105 ret = IRQ_HANDLED;
106 action = desc->action;
107 } while ((desc->istate & IRQS_PENDING) && action);
108 desc->istate &= ~IRQS_POLL_INPROGRESS;
109out:
110 raw_spin_unlock(&desc->lock);
111 return ret == IRQ_HANDLED;
88} 112}
89 113
90static int misrouted_irq(int irq) 114static int misrouted_irq(int irq)
@@ -92,6 +116,11 @@ static int misrouted_irq(int irq)
92 struct irq_desc *desc; 116 struct irq_desc *desc;
93 int i, ok = 0; 117 int i, ok = 0;
94 118
119 if (atomic_inc_return(&irq_poll_active) == 1)
120 goto out;
121
122 irq_poll_cpu = smp_processor_id();
123
95 for_each_irq_desc(i, desc) { 124 for_each_irq_desc(i, desc) {
96 if (!i) 125 if (!i)
97 continue; 126 continue;
@@ -99,9 +128,11 @@ static int misrouted_irq(int irq)
99 if (i == irq) /* Already tried */ 128 if (i == irq) /* Already tried */
100 continue; 129 continue;
101 130
102 if (try_one_irq(i, desc)) 131 if (try_one_irq(i, desc, false))
103 ok = 1; 132 ok = 1;
104 } 133 }
134out:
135 atomic_dec(&irq_poll_active);
105 /* So the caller can adjust the irq error counts */ 136 /* So the caller can adjust the irq error counts */
106 return ok; 137 return ok;
107} 138}
@@ -111,23 +142,28 @@ static void poll_spurious_irqs(unsigned long dummy)
111 struct irq_desc *desc; 142 struct irq_desc *desc;
112 int i; 143 int i;
113 144
145 if (atomic_inc_return(&irq_poll_active) != 1)
146 goto out;
147 irq_poll_cpu = smp_processor_id();
148
114 for_each_irq_desc(i, desc) { 149 for_each_irq_desc(i, desc) {
115 unsigned int status; 150 unsigned int state;
116 151
117 if (!i) 152 if (!i)
118 continue; 153 continue;
119 154
120 /* Racy but it doesn't matter */ 155 /* Racy but it doesn't matter */
121 status = desc->status; 156 state = desc->istate;
122 barrier(); 157 barrier();
123 if (!(status & IRQ_SPURIOUS_DISABLED)) 158 if (!(state & IRQS_SPURIOUS_DISABLED))
124 continue; 159 continue;
125 160
126 local_irq_disable(); 161 local_irq_disable();
127 try_one_irq(i, desc); 162 try_one_irq(i, desc, true);
128 local_irq_enable(); 163 local_irq_enable();
129 } 164 }
130 165out:
166 atomic_dec(&irq_poll_active);
131 mod_timer(&poll_spurious_irq_timer, 167 mod_timer(&poll_spurious_irq_timer,
132 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 168 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
133} 169}
@@ -139,15 +175,13 @@ static void poll_spurious_irqs(unsigned long dummy)
139 * 175 *
140 * (The other 100-of-100,000 interrupts may have been a correctly 176 * (The other 100-of-100,000 interrupts may have been a correctly
141 * functioning device sharing an IRQ with the failing one) 177 * functioning device sharing an IRQ with the failing one)
142 *
143 * Called under desc->lock
144 */ 178 */
145
146static void 179static void
147__report_bad_irq(unsigned int irq, struct irq_desc *desc, 180__report_bad_irq(unsigned int irq, struct irq_desc *desc,
148 irqreturn_t action_ret) 181 irqreturn_t action_ret)
149{ 182{
150 struct irqaction *action; 183 struct irqaction *action;
184 unsigned long flags;
151 185
152 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { 186 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
153 printk(KERN_ERR "irq event %d: bogus return value %x\n", 187 printk(KERN_ERR "irq event %d: bogus return value %x\n",
@@ -159,6 +193,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
159 dump_stack(); 193 dump_stack();
160 printk(KERN_ERR "handlers:\n"); 194 printk(KERN_ERR "handlers:\n");
161 195
196 /*
197 * We need to take desc->lock here. note_interrupt() is called
198 * w/o desc->lock held, but IRQ_PROGRESS set. We might race
199 * with something else removing an action. It's ok to take
200 * desc->lock here. See synchronize_irq().
201 */
202 raw_spin_lock_irqsave(&desc->lock, flags);
162 action = desc->action; 203 action = desc->action;
163 while (action) { 204 while (action) {
164 printk(KERN_ERR "[<%p>]", action->handler); 205 printk(KERN_ERR "[<%p>]", action->handler);
@@ -167,6 +208,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
167 printk("\n"); 208 printk("\n");
168 action = action->next; 209 action = action->next;
169 } 210 }
211 raw_spin_unlock_irqrestore(&desc->lock, flags);
170} 212}
171 213
172static void 214static void
@@ -218,6 +260,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
218void note_interrupt(unsigned int irq, struct irq_desc *desc, 260void note_interrupt(unsigned int irq, struct irq_desc *desc,
219 irqreturn_t action_ret) 261 irqreturn_t action_ret)
220{ 262{
263 if (desc->istate & IRQS_POLL_INPROGRESS)
264 return;
265
221 if (unlikely(action_ret != IRQ_HANDLED)) { 266 if (unlikely(action_ret != IRQ_HANDLED)) {
222 /* 267 /*
223 * If we are seeing only the odd spurious IRQ caused by 268 * If we are seeing only the odd spurious IRQ caused by
@@ -254,9 +299,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
254 * Now kill the IRQ 299 * Now kill the IRQ
255 */ 300 */
256 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 301 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
257 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; 302 desc->istate |= IRQS_SPURIOUS_DISABLED;
258 desc->depth++; 303 desc->depth++;
259 desc->irq_data.chip->irq_disable(&desc->irq_data); 304 irq_disable(desc);
260 305
261 mod_timer(&poll_spurious_irq_timer, 306 mod_timer(&poll_spurious_irq_timer,
262 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 307 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
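Editor's sketch: both pollers now serialize on irq_poll_active: only the caller whose atomic_inc_return() observes 1 becomes the poller, and irq_poll_cpu lets irq_wait_for_poll() detect that the poll is running on the local CPU. The exclusion part in a small C11 sketch:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int poll_active;

/* only the first caller becomes the poller; everyone else backs off */
static int try_become_poller(void)
{
	if (atomic_fetch_add(&poll_active, 1) + 1 != 1) {
		atomic_fetch_sub(&poll_active, 1);
		return 0;
	}
	return 1;
}

static void done_polling(void)
{
	atomic_fetch_sub(&poll_active, 1);
}

int main(void)
{
	if (try_become_poller()) {
		puts("polling");
		done_polling();
	}
	return 0;
}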
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6f6d091b5757..079f1d39a8b8 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr)
64 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || 64 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
65 arch_is_kernel_text(addr)) 65 arch_is_kernel_text(addr))
66 return 1; 66 return 1;
67 return in_gate_area_no_task(addr); 67 return in_gate_area_no_mm(addr);
68} 68}
69 69
70static inline int is_kernel(unsigned long addr) 70static inline int is_kernel(unsigned long addr)
71{ 71{
72 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) 72 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
73 return 1; 73 return 1;
74 return in_gate_area_no_task(addr); 74 return in_gate_area_no_mm(addr);
75} 75}
76 76
77static int is_ksym_addr(unsigned long addr) 77static int is_ksym_addr(unsigned long addr)
@@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
342} 342}
343 343
344/* Look up a kernel symbol and return it in a text buffer. */ 344/* Look up a kernel symbol and return it in a text buffer. */
345int sprint_symbol(char *buffer, unsigned long address) 345static int __sprint_symbol(char *buffer, unsigned long address,
346 int symbol_offset)
346{ 347{
347 char *modname; 348 char *modname;
348 const char *name; 349 const char *name;
349 unsigned long offset, size; 350 unsigned long offset, size;
350 int len; 351 int len;
351 352
353 address += symbol_offset;
352 name = kallsyms_lookup(address, &size, &offset, &modname, buffer); 354 name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
353 if (!name) 355 if (!name)
354 return sprintf(buffer, "0x%lx", address); 356 return sprintf(buffer, "0x%lx", address);
@@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address)
357 strcpy(buffer, name); 359 strcpy(buffer, name);
358 len = strlen(buffer); 360 len = strlen(buffer);
359 buffer += len; 361 buffer += len;
362 offset -= symbol_offset;
360 363
361 if (modname) 364 if (modname)
362 len += sprintf(buffer, "+%#lx/%#lx [%s]", 365 len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname);
363 offset, size, modname);
364 else 366 else
365 len += sprintf(buffer, "+%#lx/%#lx", offset, size); 367 len += sprintf(buffer, "+%#lx/%#lx", offset, size);
366 368
367 return len; 369 return len;
368} 370}
371
372/**
373 * sprint_symbol - Look up a kernel symbol and return it in a text buffer
374 * @buffer: buffer to be stored
375 * @address: address to lookup
376 *
377 * This function looks up a kernel symbol with @address and stores its name,
378 * offset, size and module name to @buffer if possible. If no symbol was found,
379 * just saves its @address as is.
380 *
381 * This function returns the number of bytes stored in @buffer.
382 */
383int sprint_symbol(char *buffer, unsigned long address)
384{
385 return __sprint_symbol(buffer, address, 0);
386}
387
369EXPORT_SYMBOL_GPL(sprint_symbol); 388EXPORT_SYMBOL_GPL(sprint_symbol);
370 389
390/**
391 * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
392 * @buffer: buffer to be stored
393 * @address: address to lookup
394 *
395 * This function is for stack backtrace and does the same thing as
396 * sprint_symbol() but with modified/decreased @address. If there is a
 397 * tail-call to a function marked "noreturn", gcc optimizes out the code after
398 * the call so that the stack-saved return address could point outside of the
399 * caller. This function ensures that kallsyms will find the original caller
400 * by decreasing @address.
401 *
402 * This function returns the number of bytes stored in @buffer.
403 */
404int sprint_backtrace(char *buffer, unsigned long address)
405{
406 return __sprint_symbol(buffer, address, -1);
407}
408
371/* Look up a kernel symbol and print it to the kernel messages. */ 409/* Look up a kernel symbol and print it to the kernel messages. */
372void __print_symbol(const char *fmt, unsigned long address) 410void __print_symbol(const char *fmt, unsigned long address)
373{ 411{
@@ -477,13 +515,11 @@ static int s_show(struct seq_file *m, void *p)
477 */ 515 */
478 type = iter->exported ? toupper(iter->type) : 516 type = iter->exported ? toupper(iter->type) :
479 tolower(iter->type); 517 tolower(iter->type);
480 seq_printf(m, "%0*lx %c %s\t[%s]\n", 518 seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
481 (int)(2 * sizeof(void *)), 519 type, iter->name, iter->module_name);
482 iter->value, type, iter->name, iter->module_name);
483 } else 520 } else
484 seq_printf(m, "%0*lx %c %s\n", 521 seq_printf(m, "%pK %c %s\n", (void *)iter->value,
485 (int)(2 * sizeof(void *)), 522 iter->type, iter->name);
486 iter->value, iter->type, iter->name);
487 return 0; 523 return 0;
488} 524}
489 525
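Editor's sketch: sprint_backtrace() subtracts one from the stack-saved return address before the symbol lookup, because that address points just past the call and, after a tail call to a noreturn function, may already belong to the next symbol. A toy lookup showing the effect, with invented addresses and names:

#include <stdio.h>

struct sym { const char *name; unsigned long start, end; };

static const struct sym syms[] = {
	{ "caller_that_never_returns", 0x1000, 0x1020 },
	{ "next_function",             0x1020, 0x1080 },
};

static const char *lookup(unsigned long addr)
{
	for (unsigned int i = 0; i < sizeof(syms) / sizeof(syms[0]); i++)
		if (addr >= syms[i].start && addr < syms[i].end)
			return syms[i].name;
	return "?";
}

int main(void)
{
	unsigned long ret_addr = 0x1020; /* saved return address, just past the call */

	printf("raw     : %s\n", lookup(ret_addr));      /* wrong: next_function */
	printf("adjusted: %s\n", lookup(ret_addr - 1));  /* right: the caller */
	return 0;
}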
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c55afba990a3..684ab3f7dd72 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@ struct kthread_create_info
27 /* Information passed to kthread() from kthreadd. */ 27 /* Information passed to kthread() from kthreadd. */
28 int (*threadfn)(void *data); 28 int (*threadfn)(void *data);
29 void *data; 29 void *data;
30 int node;
30 31
31 /* Result passed back to kthread_create() from kthreadd. */ 32 /* Result passed back to kthread_create() from kthreadd. */
32 struct task_struct *result; 33 struct task_struct *result;
@@ -98,10 +99,23 @@ static int kthread(void *_create)
98 do_exit(ret); 99 do_exit(ret);
99} 100}
100 101
 102/* called from do_fork() to get node information for the task about to be created */
103int tsk_fork_get_node(struct task_struct *tsk)
104{
105#ifdef CONFIG_NUMA
106 if (tsk == kthreadd_task)
107 return tsk->pref_node_fork;
108#endif
109 return numa_node_id();
110}
111
101static void create_kthread(struct kthread_create_info *create) 112static void create_kthread(struct kthread_create_info *create)
102{ 113{
103 int pid; 114 int pid;
104 115
116#ifdef CONFIG_NUMA
117 current->pref_node_fork = create->node;
118#endif
105 /* We want our own signal handler (we take no signals by default). */ 119 /* We want our own signal handler (we take no signals by default). */
106 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); 120 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
107 if (pid < 0) { 121 if (pid < 0) {
@@ -111,15 +125,18 @@ static void create_kthread(struct kthread_create_info *create)
111} 125}
112 126
113/** 127/**
114 * kthread_create - create a kthread. 128 * kthread_create_on_node - create a kthread.
115 * @threadfn: the function to run until signal_pending(current). 129 * @threadfn: the function to run until signal_pending(current).
116 * @data: data ptr for @threadfn. 130 * @data: data ptr for @threadfn.
131 * @node: memory node number.
117 * @namefmt: printf-style name for the thread. 132 * @namefmt: printf-style name for the thread.
118 * 133 *
119 * Description: This helper function creates and names a kernel 134 * Description: This helper function creates and names a kernel
120 * thread. The thread will be stopped: use wake_up_process() to start 135 * thread. The thread will be stopped: use wake_up_process() to start
121 * it. See also kthread_run(). 136 * it. See also kthread_run().
122 * 137 *
138 * If thread is going to be bound on a particular cpu, give its node
139 * in @node, to get NUMA affinity for kthread stack, or else give -1.
123 * When woken, the thread will run @threadfn() with @data as its 140 * When woken, the thread will run @threadfn() with @data as its
124 * argument. @threadfn() can either call do_exit() directly if it is a 141 * argument. @threadfn() can either call do_exit() directly if it is a
 125 * standalone thread for which no one will call kthread_stop(), or 142
@@ -129,15 +146,17 @@ static void create_kthread(struct kthread_create_info *create)
129 * 146 *
130 * Returns a task_struct or ERR_PTR(-ENOMEM). 147 * Returns a task_struct or ERR_PTR(-ENOMEM).
131 */ 148 */
132struct task_struct *kthread_create(int (*threadfn)(void *data), 149struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
133 void *data, 150 void *data,
134 const char namefmt[], 151 int node,
135 ...) 152 const char namefmt[],
153 ...)
136{ 154{
137 struct kthread_create_info create; 155 struct kthread_create_info create;
138 156
139 create.threadfn = threadfn; 157 create.threadfn = threadfn;
140 create.data = data; 158 create.data = data;
159 create.node = node;
141 init_completion(&create.done); 160 init_completion(&create.done);
142 161
143 spin_lock(&kthread_create_lock); 162 spin_lock(&kthread_create_lock);
@@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
164 } 183 }
165 return create.result; 184 return create.result;
166} 185}
167EXPORT_SYMBOL(kthread_create); 186EXPORT_SYMBOL(kthread_create_on_node);
168 187
169/** 188/**
170 * kthread_bind - bind a just-created kthread to a cpu. 189 * kthread_bind - bind a just-created kthread to a cpu.
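Editor's sketch: kthread_create_on_node() takes the NUMA node the thread's stack should come from; callers that bind the thread to a CPU pass that CPU's node, everyone else passes -1. A hedged usage sketch, with a made-up thread function and name:

#include <linux/kthread.h>
#include <linux/sched.h>

static int demo_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	return 0;
}

static struct task_struct *start_per_cpu_worker(int cpu)
{
	struct task_struct *t;

	/* allocate the kthread stack on the node of the target CPU */
	t = kthread_create_on_node(demo_thread_fn, NULL, cpu_to_node(cpu),
				   "demo/%d", cpu);
	if (!IS_ERR(t)) {
		kthread_bind(t, cpu);
		wake_up_process(t);
	}
	return t;
}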
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 1969d2fc4b36..71edd2f60c02 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
225 nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, 225 nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
226 nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, 226 nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
227 nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, 227 nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
228 sum_forward_deps = 0, factor = 0; 228 sum_forward_deps = 0;
229 229
230 list_for_each_entry(class, &all_lock_classes, lock_entry) { 230 list_for_each_entry(class, &all_lock_classes, lock_entry) {
231 231
@@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
283 nr_hardirq_unsafe * nr_hardirq_safe + 283 nr_hardirq_unsafe * nr_hardirq_safe +
284 nr_list_entries); 284 nr_list_entries);
285 285
286 /*
287 * Estimated factor between direct and indirect
288 * dependencies:
289 */
290 if (nr_list_entries)
291 factor = sum_forward_deps / nr_list_entries;
292
293#ifdef CONFIG_PROVE_LOCKING 286#ifdef CONFIG_PROVE_LOCKING
294 seq_printf(m, " dependency chains: %11lu [max: %lu]\n", 287 seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
295 nr_lock_chains, MAX_LOCKDEP_CHAINS); 288 nr_lock_chains, MAX_LOCKDEP_CHAINS);
diff --git a/kernel/module.c b/kernel/module.c
index efa290ea94bf..1f9f7bc56ca1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1168,7 +1168,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
1168{ 1168{
1169 struct module_sect_attr *sattr = 1169 struct module_sect_attr *sattr =
1170 container_of(mattr, struct module_sect_attr, mattr); 1170 container_of(mattr, struct module_sect_attr, mattr);
1171 return sprintf(buf, "0x%lx\n", sattr->address); 1171 return sprintf(buf, "0x%pK\n", (void *)sattr->address);
1172} 1172}
1173 1173
1174static void free_sect_attrs(struct module_sect_attrs *sect_attrs) 1174static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -3224,7 +3224,7 @@ static int m_show(struct seq_file *m, void *p)
3224 mod->state == MODULE_STATE_COMING ? "Loading": 3224 mod->state == MODULE_STATE_COMING ? "Loading":
3225 "Live"); 3225 "Live");
3226 /* Used by oprofile and other similar tools. */ 3226 /* Used by oprofile and other similar tools. */
3227 seq_printf(m, " 0x%p", mod->module_core); 3227 seq_printf(m, " 0x%pK", mod->module_core);
3228 3228
3229 /* Taints info */ 3229 /* Taints info */
3230 if (mod->taints) 3230 if (mod->taints)
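Editor's sketch: the sysfs section addresses and /proc/modules addresses above switch to %pK, which honours the kernel.kptr_restrict sysctl and shows zeros to unprivileged readers. Illustrative use only, not taken from this patch:

#include <linux/kernel.h>

static void show_addr(const void *ptr)
{
	/* with kernel.kptr_restrict > 0, unprivileged readers see 0 */
	printk(KERN_INFO "object at %pK\n", ptr);
}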
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f74e6c00e26d..a05d191ffdd9 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -69,13 +69,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
69 goto out_ns; 69 goto out_ns;
70 } 70 }
71 71
72 new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); 72 new_nsp->uts_ns = copy_utsname(flags, tsk);
73 if (IS_ERR(new_nsp->uts_ns)) { 73 if (IS_ERR(new_nsp->uts_ns)) {
74 err = PTR_ERR(new_nsp->uts_ns); 74 err = PTR_ERR(new_nsp->uts_ns);
75 goto out_uts; 75 goto out_uts;
76 } 76 }
77 77
78 new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); 78 new_nsp->ipc_ns = copy_ipcs(flags, tsk);
79 if (IS_ERR(new_nsp->ipc_ns)) { 79 if (IS_ERR(new_nsp->ipc_ns)) {
80 err = PTR_ERR(new_nsp->ipc_ns); 80 err = PTR_ERR(new_nsp->ipc_ns);
81 goto out_ipc; 81 goto out_ipc;
diff --git a/kernel/panic.c b/kernel/panic.c
index 991bb87a1704..69231670eb95 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
433 433
434core_param(panic, panic_timeout, int, 0644); 434core_param(panic, panic_timeout, int, 0644);
435core_param(pause_on_oops, pause_on_oops, int, 0644); 435core_param(pause_on_oops, pause_on_oops, int, 0644);
436
437static int __init oops_setup(char *s)
438{
439 if (!s)
440 return -EINVAL;
441 if (!strcmp(s, "panic"))
442 panic_on_oops = 1;
443 return 0;
444}
445early_param("oops", oops_setup);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 656222fcf767..c75925c4d1e2 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -38,13 +38,96 @@
38 38
39#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
40 40
41struct remote_function_call {
42 struct task_struct *p;
43 int (*func)(void *info);
44 void *info;
45 int ret;
46};
47
48static void remote_function(void *data)
49{
50 struct remote_function_call *tfc = data;
51 struct task_struct *p = tfc->p;
52
53 if (p) {
54 tfc->ret = -EAGAIN;
55 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
56 return;
57 }
58
59 tfc->ret = tfc->func(tfc->info);
60}
61
62/**
63 * task_function_call - call a function on the cpu on which a task runs
64 * @p: the task to evaluate
65 * @func: the function to be called
66 * @info: the function call argument
67 *
68 * Calls the function @func when the task is currently running. This might
 69 * be on the current CPU, which just calls the function directly.
70 *
71 * returns: @func return value, or
72 * -ESRCH - when the process isn't running
73 * -EAGAIN - when the process moved away
74 */
75static int
76task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
77{
78 struct remote_function_call data = {
79 .p = p,
80 .func = func,
81 .info = info,
82 .ret = -ESRCH, /* No such (running) process */
83 };
84
85 if (task_curr(p))
86 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
87
88 return data.ret;
89}
90
91/**
92 * cpu_function_call - call a function on the cpu
93 * @func: the function to be called
94 * @info: the function call argument
95 *
96 * Calls the function @func on the remote cpu.
97 *
98 * returns: @func return value or -ENXIO when the cpu is offline
99 */
100static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
101{
102 struct remote_function_call data = {
103 .p = NULL,
104 .func = func,
105 .info = info,
106 .ret = -ENXIO, /* No such CPU */
107 };
108
109 smp_call_function_single(cpu, remote_function, &data, 1);
110
111 return data.ret;
112}
113
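A minimal caller-side sketch of the contract these helpers establish (illustrative only; the real call sites appear further down in perf_remove_from_context(), perf_event_disable() and friends): the work either runs on the CPU the task occupies, or the caller gets an error telling it to fall back to its own locking.

static int example_call_on_task(struct task_struct *p,
				int (*func)(void *info), void *info)
{
	int ret = task_function_call(p, func, info);

	if (ret == -ESRCH)		/* task is not running anywhere */
		return ret;		/* fall back to the ctx->lock path */
	if (ret == -EAGAIN)		/* task moved away before the IPI landed */
		return ret;		/* callers typically retry */
	return ret;			/* otherwise: func's own return value */
}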
114#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
115 PERF_FLAG_FD_OUTPUT |\
116 PERF_FLAG_PID_CGROUP)
117
41enum event_type_t { 118enum event_type_t {
42 EVENT_FLEXIBLE = 0x1, 119 EVENT_FLEXIBLE = 0x1,
43 EVENT_PINNED = 0x2, 120 EVENT_PINNED = 0x2,
44 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 121 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
45}; 122};
46 123
47atomic_t perf_task_events __read_mostly; 124/*
125 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */
128atomic_t perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
130
48static atomic_t nr_mmap_events __read_mostly; 131static atomic_t nr_mmap_events __read_mostly;
49static atomic_t nr_comm_events __read_mostly; 132static atomic_t nr_comm_events __read_mostly;
50static atomic_t nr_task_events __read_mostly; 133static atomic_t nr_task_events __read_mostly;
@@ -62,12 +145,30 @@ static struct srcu_struct pmus_srcu;
62 */ 145 */
63int sysctl_perf_event_paranoid __read_mostly = 1; 146int sysctl_perf_event_paranoid __read_mostly = 1;
64 147
65int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 148/* Minimum for 128 pages + 1 for the user control page */
149int sysctl_perf_event_mlock __read_mostly = 516; /* 'free' kb per user */
66 150
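The new default simply encodes the arithmetic spelled out in the comment; assuming 4 KiB pages (an assumption, PAGE_SIZE is architecture dependent), a standalone check:

#include <assert.h>

int main(void)
{
	const int page_kib   = 4;	/* assumed 4 KiB PAGE_SIZE         */
	const int data_pages = 128;	/* "Minimum for 128 pages"         */
	const int ctrl_pages = 1;	/* "+ 1 for the user control page" */

	assert((data_pages + ctrl_pages) * page_kib == 516);
	return 0;
}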
67/* 151/*
68 * max perf event sample rate 152 * max perf event sample rate
69 */ 153 */
70int sysctl_perf_event_sample_rate __read_mostly = 100000; 154#define DEFAULT_MAX_SAMPLE_RATE 100000
155int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
156static int max_samples_per_tick __read_mostly =
157 DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
158
159int perf_proc_update_handler(struct ctl_table *table, int write,
160 void __user *buffer, size_t *lenp,
161 loff_t *ppos)
162{
163 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
164
165 if (ret || !write)
166 return ret;
167
168 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
169
170 return 0;
171}
71 172
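The handler keeps max_samples_per_tick in step with the sysctl. As a worked example (HZ == 1000 is an assumption; it is configuration dependent), the default rate of 100000 samples/sec allows DIV_ROUND_UP(100000, 1000) == 100 samples per tick before the reworked throttle test in __perf_event_overflow(), later in this diff, kicks in.

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))	/* same as the kernel macro */

static int example_max_samples_per_tick(int sample_rate, int hz)
{
	return DIV_ROUND_UP(sample_rate, hz);	/* 100000, 1000 -> 100 */
}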
72static atomic64_t perf_event_id; 173static atomic64_t perf_event_id;
73 174
@@ -75,7 +176,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
75 enum event_type_t event_type); 176 enum event_type_t event_type);
76 177
77static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 178static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
78 enum event_type_t event_type); 179 enum event_type_t event_type,
180 struct task_struct *task);
181
182static void update_context_time(struct perf_event_context *ctx);
183static u64 perf_event_time(struct perf_event *event);
79 184
80void __weak perf_event_print_debug(void) { } 185void __weak perf_event_print_debug(void) { }
81 186
@@ -89,6 +194,360 @@ static inline u64 perf_clock(void)
89 return local_clock(); 194 return local_clock();
90} 195}
91 196
197static inline struct perf_cpu_context *
198__get_cpu_context(struct perf_event_context *ctx)
199{
200 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
201}
202
203#ifdef CONFIG_CGROUP_PERF
204
205/*
206 * Must ensure cgroup is pinned (css_get) before calling
207 * this function. In other words, we cannot call this function
208 * if there is no cgroup event for the current CPU context.
209 */
210static inline struct perf_cgroup *
211perf_cgroup_from_task(struct task_struct *task)
212{
213 return container_of(task_subsys_state(task, perf_subsys_id),
214 struct perf_cgroup, css);
215}
216
217static inline bool
218perf_cgroup_match(struct perf_event *event)
219{
220 struct perf_event_context *ctx = event->ctx;
221 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
222
223 return !event->cgrp || event->cgrp == cpuctx->cgrp;
224}
225
226static inline void perf_get_cgroup(struct perf_event *event)
227{
228 css_get(&event->cgrp->css);
229}
230
231static inline void perf_put_cgroup(struct perf_event *event)
232{
233 css_put(&event->cgrp->css);
234}
235
236static inline void perf_detach_cgroup(struct perf_event *event)
237{
238 perf_put_cgroup(event);
239 event->cgrp = NULL;
240}
241
242static inline int is_cgroup_event(struct perf_event *event)
243{
244 return event->cgrp != NULL;
245}
246
247static inline u64 perf_cgroup_event_time(struct perf_event *event)
248{
249 struct perf_cgroup_info *t;
250
251 t = per_cpu_ptr(event->cgrp->info, event->cpu);
252 return t->time;
253}
254
255static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
256{
257 struct perf_cgroup_info *info;
258 u64 now;
259
260 now = perf_clock();
261
262 info = this_cpu_ptr(cgrp->info);
263
264 info->time += now - info->timestamp;
265 info->timestamp = now;
266}
267
268static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
269{
270 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
271 if (cgrp_out)
272 __update_cgrp_time(cgrp_out);
273}
274
275static inline void update_cgrp_time_from_event(struct perf_event *event)
276{
277 struct perf_cgroup *cgrp;
278
279 /*
280 * ensure we access cgroup data only when needed and
281 * when we know the cgroup is pinned (css_get)
282 */
283 if (!is_cgroup_event(event))
284 return;
285
286 cgrp = perf_cgroup_from_task(current);
287 /*
288 * Do not update time when cgroup is not active
289 */
290 if (cgrp == event->cgrp)
291 __update_cgrp_time(event->cgrp);
292}
293
294static inline void
295perf_cgroup_set_timestamp(struct task_struct *task,
296 struct perf_event_context *ctx)
297{
298 struct perf_cgroup *cgrp;
299 struct perf_cgroup_info *info;
300
301 /*
302 * ctx->lock held by caller
303 * ensure we do not access cgroup data
304 * unless we have the cgroup pinned (css_get)
305 */
306 if (!task || !ctx->nr_cgroups)
307 return;
308
309 cgrp = perf_cgroup_from_task(task);
310 info = this_cpu_ptr(cgrp->info);
311 info->timestamp = ctx->timestamp;
312}
313
314#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
315#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
316
317/*
318 * reschedule events based on the cgroup constraint of task.
319 *
320 * mode SWOUT : schedule out everything
 321 * mode SWIN : schedule in based on cgroup for next task
322 */
323void perf_cgroup_switch(struct task_struct *task, int mode)
324{
325 struct perf_cpu_context *cpuctx;
326 struct pmu *pmu;
327 unsigned long flags;
328
329 /*
 330 * disable interrupts to avoid getting nr_cgroup
331 * changes via __perf_event_disable(). Also
332 * avoids preemption.
333 */
334 local_irq_save(flags);
335
336 /*
337 * we reschedule only in the presence of cgroup
338 * constrained events.
339 */
340 rcu_read_lock();
341
342 list_for_each_entry_rcu(pmu, &pmus, entry) {
343
344 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
345
346 perf_pmu_disable(cpuctx->ctx.pmu);
347
348 /*
349 * perf_cgroup_events says at least one
350 * context on this CPU has cgroup events.
351 *
352 * ctx->nr_cgroups reports the number of cgroup
353 * events for a context.
354 */
355 if (cpuctx->ctx.nr_cgroups > 0) {
356
357 if (mode & PERF_CGROUP_SWOUT) {
358 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
359 /*
360 * must not be done before ctxswout due
361 * to event_filter_match() in event_sched_out()
362 */
363 cpuctx->cgrp = NULL;
364 }
365
366 if (mode & PERF_CGROUP_SWIN) {
367 /* set cgrp before ctxsw in to
368 * allow event_filter_match() to not
369 * have to pass task around
370 */
371 cpuctx->cgrp = perf_cgroup_from_task(task);
372 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
373 }
374 }
375
376 perf_pmu_enable(cpuctx->ctx.pmu);
377 }
378
379 rcu_read_unlock();
380
381 local_irq_restore(flags);
382}
383
384static inline void perf_cgroup_sched_out(struct task_struct *task)
385{
386 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
387}
388
389static inline void perf_cgroup_sched_in(struct task_struct *task)
390{
391 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
392}
393
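A sketch of how the two wrappers above are meant to be driven from the context-switch path; the real hook-up appears later in this diff in __perf_event_task_sched_out()/__perf_event_task_sched_in(), and the per-cpu counter gate is what keeps the cost at zero when no cgroup events exist on the CPU:

static inline void example_cgroup_ctxsw(struct task_struct *prev,
					struct task_struct *next)
{
	/* only walk the PMUs when this CPU actually has cgroup events */
	if (atomic_read(&__get_cpu_var(perf_cgroup_events))) {
		perf_cgroup_sched_out(prev);	/* PERF_CGROUP_SWOUT */
		perf_cgroup_sched_in(next);	/* PERF_CGROUP_SWIN  */
	}
}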
394static inline int perf_cgroup_connect(int fd, struct perf_event *event,
395 struct perf_event_attr *attr,
396 struct perf_event *group_leader)
397{
398 struct perf_cgroup *cgrp;
399 struct cgroup_subsys_state *css;
400 struct file *file;
401 int ret = 0, fput_needed;
402
403 file = fget_light(fd, &fput_needed);
404 if (!file)
405 return -EBADF;
406
407 css = cgroup_css_from_dir(file, perf_subsys_id);
408 if (IS_ERR(css)) {
409 ret = PTR_ERR(css);
410 goto out;
411 }
412
413 cgrp = container_of(css, struct perf_cgroup, css);
414 event->cgrp = cgrp;
415
416 /* must be done before we fput() the file */
417 perf_get_cgroup(event);
418
419 /*
420 * all events in a group must monitor
421 * the same cgroup because a task belongs
422 * to only one perf cgroup at a time
423 */
424 if (group_leader && group_leader->cgrp != cgrp) {
425 perf_detach_cgroup(event);
426 ret = -EINVAL;
427 }
428out:
429 fput_light(file, fput_needed);
430 return ret;
431}
432
433static inline void
434perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
435{
436 struct perf_cgroup_info *t;
437 t = per_cpu_ptr(event->cgrp->info, event->cpu);
438 event->shadow_ctx_time = now - t->timestamp;
439}
440
441static inline void
442perf_cgroup_defer_enabled(struct perf_event *event)
443{
444 /*
445 * when the current task's perf cgroup does not match
446 * the event's, we need to remember to call the
447 * perf_mark_enable() function the first time a task with
448 * a matching perf cgroup is scheduled in.
449 */
450 if (is_cgroup_event(event) && !perf_cgroup_match(event))
451 event->cgrp_defer_enabled = 1;
452}
453
454static inline void
455perf_cgroup_mark_enabled(struct perf_event *event,
456 struct perf_event_context *ctx)
457{
458 struct perf_event *sub;
459 u64 tstamp = perf_event_time(event);
460
461 if (!event->cgrp_defer_enabled)
462 return;
463
464 event->cgrp_defer_enabled = 0;
465
466 event->tstamp_enabled = tstamp - event->total_time_enabled;
467 list_for_each_entry(sub, &event->sibling_list, group_entry) {
468 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
469 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
470 sub->cgrp_defer_enabled = 0;
471 }
472 }
473}
474#else /* !CONFIG_CGROUP_PERF */
475
476static inline bool
477perf_cgroup_match(struct perf_event *event)
478{
479 return true;
480}
481
482static inline void perf_detach_cgroup(struct perf_event *event)
483{}
484
485static inline int is_cgroup_event(struct perf_event *event)
486{
487 return 0;
488}
489
490static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
491{
492 return 0;
493}
494
495static inline void update_cgrp_time_from_event(struct perf_event *event)
496{
497}
498
499static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
500{
501}
502
503static inline void perf_cgroup_sched_out(struct task_struct *task)
504{
505}
506
507static inline void perf_cgroup_sched_in(struct task_struct *task)
508{
509}
510
511static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
512 struct perf_event_attr *attr,
513 struct perf_event *group_leader)
514{
515 return -EINVAL;
516}
517
518static inline void
519perf_cgroup_set_timestamp(struct task_struct *task,
520 struct perf_event_context *ctx)
521{
522}
523
524void
525perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
526{
527}
528
529static inline void
530perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
531{
532}
533
534static inline u64 perf_cgroup_event_time(struct perf_event *event)
535{
536 return 0;
537}
538
539static inline void
540perf_cgroup_defer_enabled(struct perf_event *event)
541{
542}
543
544static inline void
545perf_cgroup_mark_enabled(struct perf_event *event,
546 struct perf_event_context *ctx)
547{
548}
549#endif
550
92void perf_pmu_disable(struct pmu *pmu) 551void perf_pmu_disable(struct pmu *pmu)
93{ 552{
94 int *count = this_cpu_ptr(pmu->pmu_disable_count); 553 int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -254,7 +713,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
254 raw_spin_lock_irqsave(&ctx->lock, flags); 713 raw_spin_lock_irqsave(&ctx->lock, flags);
255 --ctx->pin_count; 714 --ctx->pin_count;
256 raw_spin_unlock_irqrestore(&ctx->lock, flags); 715 raw_spin_unlock_irqrestore(&ctx->lock, flags);
257 put_ctx(ctx);
258} 716}
259 717
260/* 718/*
@@ -271,6 +729,10 @@ static void update_context_time(struct perf_event_context *ctx)
271static u64 perf_event_time(struct perf_event *event) 729static u64 perf_event_time(struct perf_event *event)
272{ 730{
273 struct perf_event_context *ctx = event->ctx; 731 struct perf_event_context *ctx = event->ctx;
732
733 if (is_cgroup_event(event))
734 return perf_cgroup_event_time(event);
735
274 return ctx ? ctx->time : 0; 736 return ctx ? ctx->time : 0;
275} 737}
276 738
@@ -285,9 +747,20 @@ static void update_event_times(struct perf_event *event)
285 if (event->state < PERF_EVENT_STATE_INACTIVE || 747 if (event->state < PERF_EVENT_STATE_INACTIVE ||
286 event->group_leader->state < PERF_EVENT_STATE_INACTIVE) 748 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
287 return; 749 return;
288 750 /*
289 if (ctx->is_active) 751 * in cgroup mode, time_enabled represents
752 * the time the event was enabled AND active
753 * tasks were in the monitored cgroup. This is
754 * independent of the activity of the context as
755 * there may be a mix of cgroup and non-cgroup events.
756 *
757 * That is why we treat cgroup events differently
758 * here.
759 */
760 if (is_cgroup_event(event))
290 run_end = perf_event_time(event); 761 run_end = perf_event_time(event);
762 else if (ctx->is_active)
763 run_end = ctx->time;
291 else 764 else
292 run_end = event->tstamp_stopped; 765 run_end = event->tstamp_stopped;
293 766
@@ -299,6 +772,7 @@ static void update_event_times(struct perf_event *event)
299 run_end = perf_event_time(event); 772 run_end = perf_event_time(event);
300 773
301 event->total_time_running = run_end - event->tstamp_running; 774 event->total_time_running = run_end - event->tstamp_running;
775
302} 776}
303 777
304/* 778/*
@@ -347,6 +821,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
347 list_add_tail(&event->group_entry, list); 821 list_add_tail(&event->group_entry, list);
348 } 822 }
349 823
824 if (is_cgroup_event(event))
825 ctx->nr_cgroups++;
826
350 list_add_rcu(&event->event_entry, &ctx->event_list); 827 list_add_rcu(&event->event_entry, &ctx->event_list);
351 if (!ctx->nr_events) 828 if (!ctx->nr_events)
352 perf_pmu_rotate_start(ctx->pmu); 829 perf_pmu_rotate_start(ctx->pmu);
@@ -465,6 +942,7 @@ static void perf_group_attach(struct perf_event *event)
465static void 942static void
466list_del_event(struct perf_event *event, struct perf_event_context *ctx) 943list_del_event(struct perf_event *event, struct perf_event_context *ctx)
467{ 944{
945 struct perf_cpu_context *cpuctx;
468 /* 946 /*
469 * We can have double detach due to exit/hot-unplug + close. 947 * We can have double detach due to exit/hot-unplug + close.
470 */ 948 */
@@ -473,6 +951,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
473 951
474 event->attach_state &= ~PERF_ATTACH_CONTEXT; 952 event->attach_state &= ~PERF_ATTACH_CONTEXT;
475 953
954 if (is_cgroup_event(event)) {
955 ctx->nr_cgroups--;
956 cpuctx = __get_cpu_context(ctx);
957 /*
958 * if there are no more cgroup events
 959 * then clear cgrp to avoid stale pointer
960 * in update_cgrp_time_from_cpuctx()
961 */
962 if (!ctx->nr_cgroups)
963 cpuctx->cgrp = NULL;
964 }
965
476 ctx->nr_events--; 966 ctx->nr_events--;
477 if (event->attr.inherit_stat) 967 if (event->attr.inherit_stat)
478 ctx->nr_stat--; 968 ctx->nr_stat--;
@@ -544,7 +1034,8 @@ out:
544static inline int 1034static inline int
545event_filter_match(struct perf_event *event) 1035event_filter_match(struct perf_event *event)
546{ 1036{
547 return event->cpu == -1 || event->cpu == smp_processor_id(); 1037 return (event->cpu == -1 || event->cpu == smp_processor_id())
1038 && perf_cgroup_match(event);
548} 1039}
549 1040
550static void 1041static void
@@ -562,7 +1053,7 @@ event_sched_out(struct perf_event *event,
562 */ 1053 */
563 if (event->state == PERF_EVENT_STATE_INACTIVE 1054 if (event->state == PERF_EVENT_STATE_INACTIVE
564 && !event_filter_match(event)) { 1055 && !event_filter_match(event)) {
565 delta = ctx->time - event->tstamp_stopped; 1056 delta = tstamp - event->tstamp_stopped;
566 event->tstamp_running += delta; 1057 event->tstamp_running += delta;
567 event->tstamp_stopped = tstamp; 1058 event->tstamp_stopped = tstamp;
568 } 1059 }
@@ -606,47 +1097,30 @@ group_sched_out(struct perf_event *group_event,
606 cpuctx->exclusive = 0; 1097 cpuctx->exclusive = 0;
607} 1098}
608 1099
609static inline struct perf_cpu_context *
610__get_cpu_context(struct perf_event_context *ctx)
611{
612 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
613}
614
615/* 1100/*
616 * Cross CPU call to remove a performance event 1101 * Cross CPU call to remove a performance event
617 * 1102 *
618 * We disable the event on the hardware level first. After that we 1103 * We disable the event on the hardware level first. After that we
619 * remove it from the context list. 1104 * remove it from the context list.
620 */ 1105 */
621static void __perf_event_remove_from_context(void *info) 1106static int __perf_remove_from_context(void *info)
622{ 1107{
623 struct perf_event *event = info; 1108 struct perf_event *event = info;
624 struct perf_event_context *ctx = event->ctx; 1109 struct perf_event_context *ctx = event->ctx;
625 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1110 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
626 1111
627 /*
628 * If this is a task context, we need to check whether it is
629 * the current task context of this cpu. If not it has been
630 * scheduled out before the smp call arrived.
631 */
632 if (ctx->task && cpuctx->task_ctx != ctx)
633 return;
634
635 raw_spin_lock(&ctx->lock); 1112 raw_spin_lock(&ctx->lock);
636
637 event_sched_out(event, cpuctx, ctx); 1113 event_sched_out(event, cpuctx, ctx);
638
639 list_del_event(event, ctx); 1114 list_del_event(event, ctx);
640
641 raw_spin_unlock(&ctx->lock); 1115 raw_spin_unlock(&ctx->lock);
1116
1117 return 0;
642} 1118}
643 1119
644 1120
645/* 1121/*
646 * Remove the event from a task's (or a CPU's) list of events. 1122 * Remove the event from a task's (or a CPU's) list of events.
647 * 1123 *
648 * Must be called with ctx->mutex held.
649 *
650 * CPU events are removed with a smp call. For task events we only 1124 * CPU events are removed with a smp call. For task events we only
651 * call when the task is on a CPU. 1125 * call when the task is on a CPU.
652 * 1126 *
@@ -657,49 +1131,48 @@ static void __perf_event_remove_from_context(void *info)
657 * When called from perf_event_exit_task, it's OK because the 1131 * When called from perf_event_exit_task, it's OK because the
658 * context has been detached from its task. 1132 * context has been detached from its task.
659 */ 1133 */
660static void perf_event_remove_from_context(struct perf_event *event) 1134static void perf_remove_from_context(struct perf_event *event)
661{ 1135{
662 struct perf_event_context *ctx = event->ctx; 1136 struct perf_event_context *ctx = event->ctx;
663 struct task_struct *task = ctx->task; 1137 struct task_struct *task = ctx->task;
664 1138
1139 lockdep_assert_held(&ctx->mutex);
1140
665 if (!task) { 1141 if (!task) {
666 /* 1142 /*
667 * Per cpu events are removed via an smp call and 1143 * Per cpu events are removed via an smp call and
668 * the removal is always successful. 1144 * the removal is always successful.
669 */ 1145 */
670 smp_call_function_single(event->cpu, 1146 cpu_function_call(event->cpu, __perf_remove_from_context, event);
671 __perf_event_remove_from_context,
672 event, 1);
673 return; 1147 return;
674 } 1148 }
675 1149
676retry: 1150retry:
677 task_oncpu_function_call(task, __perf_event_remove_from_context, 1151 if (!task_function_call(task, __perf_remove_from_context, event))
678 event); 1152 return;
679 1153
680 raw_spin_lock_irq(&ctx->lock); 1154 raw_spin_lock_irq(&ctx->lock);
681 /* 1155 /*
682 * If the context is active we need to retry the smp call. 1156 * If we failed to find a running task, but find the context active now
1157 * that we've acquired the ctx->lock, retry.
683 */ 1158 */
684 if (ctx->nr_active && !list_empty(&event->group_entry)) { 1159 if (ctx->is_active) {
685 raw_spin_unlock_irq(&ctx->lock); 1160 raw_spin_unlock_irq(&ctx->lock);
686 goto retry; 1161 goto retry;
687 } 1162 }
688 1163
689 /* 1164 /*
690 * The lock prevents that this context is scheduled in so we 1165 * Since the task isn't running, its safe to remove the event, us
691 * can remove the event safely, if the call above did not 1166 * holding the ctx->lock ensures the task won't get scheduled in.
692 * succeed.
693 */ 1167 */
694 if (!list_empty(&event->group_entry)) 1168 list_del_event(event, ctx);
695 list_del_event(event, ctx);
696 raw_spin_unlock_irq(&ctx->lock); 1169 raw_spin_unlock_irq(&ctx->lock);
697} 1170}
698 1171
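The shape of this function recurs almost verbatim in perf_event_disable(), perf_install_in_context() and perf_event_enable() below; condensed into one illustrative helper (hypothetical name, not a drop-in):

static void example_remote_or_locked(struct perf_event_context *ctx,
				     struct task_struct *task,
				     int (*func)(void *info), void *info)
{
retry:
	if (!task_function_call(task, func, info))
		return;			/* ran on the CPU the task occupies */

	raw_spin_lock_irq(&ctx->lock);
	if (ctx->is_active) {
		/* raced with a sched-in; the cross call must be retried */
		raw_spin_unlock_irq(&ctx->lock);
		goto retry;
	}
	/*
	 * The task is not running and holding ctx->lock keeps it from
	 * being scheduled in, so the context can be mutated right here.
	 */
	raw_spin_unlock_irq(&ctx->lock);
}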
699/* 1172/*
700 * Cross CPU call to disable a performance event 1173 * Cross CPU call to disable a performance event
701 */ 1174 */
702static void __perf_event_disable(void *info) 1175static int __perf_event_disable(void *info)
703{ 1176{
704 struct perf_event *event = info; 1177 struct perf_event *event = info;
705 struct perf_event_context *ctx = event->ctx; 1178 struct perf_event_context *ctx = event->ctx;
@@ -708,9 +1181,12 @@ static void __perf_event_disable(void *info)
708 /* 1181 /*
709 * If this is a per-task event, need to check whether this 1182 * If this is a per-task event, need to check whether this
710 * event's task is the current task on this cpu. 1183 * event's task is the current task on this cpu.
1184 *
1185 * Can trigger due to concurrent perf_event_context_sched_out()
1186 * flipping contexts around.
711 */ 1187 */
712 if (ctx->task && cpuctx->task_ctx != ctx) 1188 if (ctx->task && cpuctx->task_ctx != ctx)
713 return; 1189 return -EINVAL;
714 1190
715 raw_spin_lock(&ctx->lock); 1191 raw_spin_lock(&ctx->lock);
716 1192
@@ -720,6 +1196,7 @@ static void __perf_event_disable(void *info)
720 */ 1196 */
721 if (event->state >= PERF_EVENT_STATE_INACTIVE) { 1197 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
722 update_context_time(ctx); 1198 update_context_time(ctx);
1199 update_cgrp_time_from_event(event);
723 update_group_times(event); 1200 update_group_times(event);
724 if (event == event->group_leader) 1201 if (event == event->group_leader)
725 group_sched_out(event, cpuctx, ctx); 1202 group_sched_out(event, cpuctx, ctx);
@@ -729,6 +1206,8 @@ static void __perf_event_disable(void *info)
729 } 1206 }
730 1207
731 raw_spin_unlock(&ctx->lock); 1208 raw_spin_unlock(&ctx->lock);
1209
1210 return 0;
732} 1211}
733 1212
734/* 1213/*
@@ -753,13 +1232,13 @@ void perf_event_disable(struct perf_event *event)
753 /* 1232 /*
754 * Disable the event on the cpu that it's on 1233 * Disable the event on the cpu that it's on
755 */ 1234 */
756 smp_call_function_single(event->cpu, __perf_event_disable, 1235 cpu_function_call(event->cpu, __perf_event_disable, event);
757 event, 1);
758 return; 1236 return;
759 } 1237 }
760 1238
761retry: 1239retry:
762 task_oncpu_function_call(task, __perf_event_disable, event); 1240 if (!task_function_call(task, __perf_event_disable, event))
1241 return;
763 1242
764 raw_spin_lock_irq(&ctx->lock); 1243 raw_spin_lock_irq(&ctx->lock);
765 /* 1244 /*
@@ -767,6 +1246,11 @@ retry:
767 */ 1246 */
768 if (event->state == PERF_EVENT_STATE_ACTIVE) { 1247 if (event->state == PERF_EVENT_STATE_ACTIVE) {
769 raw_spin_unlock_irq(&ctx->lock); 1248 raw_spin_unlock_irq(&ctx->lock);
1249 /*
1250 * Reload the task pointer, it might have been changed by
1251 * a concurrent perf_event_context_sched_out().
1252 */
1253 task = ctx->task;
770 goto retry; 1254 goto retry;
771 } 1255 }
772 1256
@@ -778,10 +1262,44 @@ retry:
778 update_group_times(event); 1262 update_group_times(event);
779 event->state = PERF_EVENT_STATE_OFF; 1263 event->state = PERF_EVENT_STATE_OFF;
780 } 1264 }
781
782 raw_spin_unlock_irq(&ctx->lock); 1265 raw_spin_unlock_irq(&ctx->lock);
783} 1266}
784 1267
1268static void perf_set_shadow_time(struct perf_event *event,
1269 struct perf_event_context *ctx,
1270 u64 tstamp)
1271{
1272 /*
1273 * use the correct time source for the time snapshot
1274 *
1275 * We could get by without this by leveraging the
1276 * fact that to get to this function, the caller
1277 * has most likely already called update_context_time()
 1278 * and update_cgrp_time_xx() and thus both timestamps
 1279 * are identical (or very close). Given that tstamp is
1280 * already adjusted for cgroup, we could say that:
1281 * tstamp - ctx->timestamp
1282 * is equivalent to
1283 * tstamp - cgrp->timestamp.
1284 *
1285 * Then, in perf_output_read(), the calculation would
1286 * work with no changes because:
1287 * - event is guaranteed scheduled in
1288 * - no scheduled out in between
1289 * - thus the timestamp would be the same
1290 *
1291 * But this is a bit hairy.
1292 *
1293 * So instead, we have an explicit cgroup call to remain
 1294 * within the same time source all along. We believe it
1295 * is cleaner and simpler to understand.
1296 */
1297 if (is_cgroup_event(event))
1298 perf_cgroup_set_shadow_time(event, tstamp);
1299 else
1300 event->shadow_ctx_time = tstamp - ctx->timestamp;
1301}
1302
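Restated compactly (same content as the comment above, just as a relation):

/*
 *   cgroup event:      shadow_ctx_time = tstamp - cgrp->info[event->cpu].timestamp
 *   non-cgroup event:  shadow_ctx_time = tstamp - ctx->timestamp
 *
 * so the shadow time is always measured against the same time source
 * (cgroup or context) that the event's time accounting uses.
 */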
785#define MAX_INTERRUPTS (~0ULL) 1303#define MAX_INTERRUPTS (~0ULL)
786 1304
787static void perf_log_throttle(struct perf_event *event, int enable); 1305static void perf_log_throttle(struct perf_event *event, int enable);
@@ -822,7 +1340,7 @@ event_sched_in(struct perf_event *event,
822 1340
823 event->tstamp_running += tstamp - event->tstamp_stopped; 1341 event->tstamp_running += tstamp - event->tstamp_stopped;
824 1342
825 event->shadow_ctx_time = tstamp - ctx->timestamp; 1343 perf_set_shadow_time(event, ctx, tstamp);
826 1344
827 if (!is_software_event(event)) 1345 if (!is_software_event(event))
828 cpuctx->active_oncpu++; 1346 cpuctx->active_oncpu++;
@@ -943,12 +1461,15 @@ static void add_event_to_ctx(struct perf_event *event,
943 event->tstamp_stopped = tstamp; 1461 event->tstamp_stopped = tstamp;
944} 1462}
945 1463
1464static void perf_event_context_sched_in(struct perf_event_context *ctx,
1465 struct task_struct *tsk);
1466
946/* 1467/*
947 * Cross CPU call to install and enable a performance event 1468 * Cross CPU call to install and enable a performance event
948 * 1469 *
949 * Must be called with ctx->mutex held 1470 * Must be called with ctx->mutex held
950 */ 1471 */
951static void __perf_install_in_context(void *info) 1472static int __perf_install_in_context(void *info)
952{ 1473{
953 struct perf_event *event = info; 1474 struct perf_event *event = info;
954 struct perf_event_context *ctx = event->ctx; 1475 struct perf_event_context *ctx = event->ctx;
@@ -957,21 +1478,22 @@ static void __perf_install_in_context(void *info)
957 int err; 1478 int err;
958 1479
959 /* 1480 /*
960 * If this is a task context, we need to check whether it is 1481 * In case we're installing a new context to an already running task,
961 * the current task context of this cpu. If not it has been 1482 * could also happen before perf_event_task_sched_in() on architectures
962 * scheduled out before the smp call arrived. 1483 * which do context switches with IRQs enabled.
963 * Or possibly this is the right context but it isn't
964 * on this cpu because it had no events.
965 */ 1484 */
966 if (ctx->task && cpuctx->task_ctx != ctx) { 1485 if (ctx->task && !cpuctx->task_ctx)
967 if (cpuctx->task_ctx || ctx->task != current) 1486 perf_event_context_sched_in(ctx, ctx->task);
968 return;
969 cpuctx->task_ctx = ctx;
970 }
971 1487
972 raw_spin_lock(&ctx->lock); 1488 raw_spin_lock(&ctx->lock);
973 ctx->is_active = 1; 1489 ctx->is_active = 1;
974 update_context_time(ctx); 1490 update_context_time(ctx);
1491 /*
1492 * update cgrp time only if current cgrp
1493 * matches event->cgrp. Must be done before
1494 * calling add_event_to_ctx()
1495 */
1496 update_cgrp_time_from_event(event);
975 1497
976 add_event_to_ctx(event, ctx); 1498 add_event_to_ctx(event, ctx);
977 1499
@@ -1012,6 +1534,8 @@ static void __perf_install_in_context(void *info)
1012 1534
1013unlock: 1535unlock:
1014 raw_spin_unlock(&ctx->lock); 1536 raw_spin_unlock(&ctx->lock);
1537
1538 return 0;
1015} 1539}
1016 1540
1017/* 1541/*
@@ -1023,8 +1547,6 @@ unlock:
1023 * If the event is attached to a task which is on a CPU we use a smp 1547 * If the event is attached to a task which is on a CPU we use a smp
1024 * call to enable it in the task context. The task might have been 1548 * call to enable it in the task context. The task might have been
1025 * scheduled away, but we check this in the smp call again. 1549 * scheduled away, but we check this in the smp call again.
1026 *
1027 * Must be called with ctx->mutex held.
1028 */ 1550 */
1029static void 1551static void
1030perf_install_in_context(struct perf_event_context *ctx, 1552perf_install_in_context(struct perf_event_context *ctx,
@@ -1033,6 +1555,8 @@ perf_install_in_context(struct perf_event_context *ctx,
1033{ 1555{
1034 struct task_struct *task = ctx->task; 1556 struct task_struct *task = ctx->task;
1035 1557
1558 lockdep_assert_held(&ctx->mutex);
1559
1036 event->ctx = ctx; 1560 event->ctx = ctx;
1037 1561
1038 if (!task) { 1562 if (!task) {
@@ -1040,31 +1564,29 @@ perf_install_in_context(struct perf_event_context *ctx,
1040 * Per cpu events are installed via an smp call and 1564 * Per cpu events are installed via an smp call and
1041 * the install is always successful. 1565 * the install is always successful.
1042 */ 1566 */
1043 smp_call_function_single(cpu, __perf_install_in_context, 1567 cpu_function_call(cpu, __perf_install_in_context, event);
1044 event, 1);
1045 return; 1568 return;
1046 } 1569 }
1047 1570
1048retry: 1571retry:
1049 task_oncpu_function_call(task, __perf_install_in_context, 1572 if (!task_function_call(task, __perf_install_in_context, event))
1050 event); 1573 return;
1051 1574
1052 raw_spin_lock_irq(&ctx->lock); 1575 raw_spin_lock_irq(&ctx->lock);
1053 /* 1576 /*
1054 * we need to retry the smp call. 1577 * If we failed to find a running task, but find the context active now
1578 * that we've acquired the ctx->lock, retry.
1055 */ 1579 */
1056 if (ctx->is_active && list_empty(&event->group_entry)) { 1580 if (ctx->is_active) {
1057 raw_spin_unlock_irq(&ctx->lock); 1581 raw_spin_unlock_irq(&ctx->lock);
1058 goto retry; 1582 goto retry;
1059 } 1583 }
1060 1584
1061 /* 1585 /*
1062 * The lock prevents that this context is scheduled in so we 1586 * Since the task isn't running, it's safe to add the event, us holding
1063 * can add the event safely, if it the call above did not 1587 * the ctx->lock ensures the task won't get scheduled in.
1064 * succeed.
1065 */ 1588 */
1066 if (list_empty(&event->group_entry)) 1589 add_event_to_ctx(event, ctx);
1067 add_event_to_ctx(event, ctx);
1068 raw_spin_unlock_irq(&ctx->lock); 1590 raw_spin_unlock_irq(&ctx->lock);
1069} 1591}
1070 1592
@@ -1093,7 +1615,7 @@ static void __perf_event_mark_enabled(struct perf_event *event,
1093/* 1615/*
1094 * Cross CPU call to enable a performance event 1616 * Cross CPU call to enable a performance event
1095 */ 1617 */
1096static void __perf_event_enable(void *info) 1618static int __perf_event_enable(void *info)
1097{ 1619{
1098 struct perf_event *event = info; 1620 struct perf_event *event = info;
1099 struct perf_event_context *ctx = event->ctx; 1621 struct perf_event_context *ctx = event->ctx;
@@ -1101,26 +1623,27 @@ static void __perf_event_enable(void *info)
1101 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1623 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1102 int err; 1624 int err;
1103 1625
1104 /* 1626 if (WARN_ON_ONCE(!ctx->is_active))
1105 * If this is a per-task event, need to check whether this 1627 return -EINVAL;
1106 * event's task is the current task on this cpu.
1107 */
1108 if (ctx->task && cpuctx->task_ctx != ctx) {
1109 if (cpuctx->task_ctx || ctx->task != current)
1110 return;
1111 cpuctx->task_ctx = ctx;
1112 }
1113 1628
1114 raw_spin_lock(&ctx->lock); 1629 raw_spin_lock(&ctx->lock);
1115 ctx->is_active = 1;
1116 update_context_time(ctx); 1630 update_context_time(ctx);
1117 1631
1118 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1632 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1119 goto unlock; 1633 goto unlock;
1634
1635 /*
1636 * set current task's cgroup time reference point
1637 */
1638 perf_cgroup_set_timestamp(current, ctx);
1639
1120 __perf_event_mark_enabled(event, ctx); 1640 __perf_event_mark_enabled(event, ctx);
1121 1641
1122 if (!event_filter_match(event)) 1642 if (!event_filter_match(event)) {
1643 if (is_cgroup_event(event))
1644 perf_cgroup_defer_enabled(event);
1123 goto unlock; 1645 goto unlock;
1646 }
1124 1647
1125 /* 1648 /*
1126 * If the event is in a group and isn't the group leader, 1649 * If the event is in a group and isn't the group leader,
@@ -1153,6 +1676,8 @@ static void __perf_event_enable(void *info)
1153 1676
1154unlock: 1677unlock:
1155 raw_spin_unlock(&ctx->lock); 1678 raw_spin_unlock(&ctx->lock);
1679
1680 return 0;
1156} 1681}
1157 1682
1158/* 1683/*
@@ -1173,8 +1698,7 @@ void perf_event_enable(struct perf_event *event)
1173 /* 1698 /*
1174 * Enable the event on the cpu that it's on 1699 * Enable the event on the cpu that it's on
1175 */ 1700 */
1176 smp_call_function_single(event->cpu, __perf_event_enable, 1701 cpu_function_call(event->cpu, __perf_event_enable, event);
1177 event, 1);
1178 return; 1702 return;
1179 } 1703 }
1180 1704
@@ -1193,8 +1717,15 @@ void perf_event_enable(struct perf_event *event)
1193 event->state = PERF_EVENT_STATE_OFF; 1717 event->state = PERF_EVENT_STATE_OFF;
1194 1718
1195retry: 1719retry:
1720 if (!ctx->is_active) {
1721 __perf_event_mark_enabled(event, ctx);
1722 goto out;
1723 }
1724
1196 raw_spin_unlock_irq(&ctx->lock); 1725 raw_spin_unlock_irq(&ctx->lock);
1197 task_oncpu_function_call(task, __perf_event_enable, event); 1726
1727 if (!task_function_call(task, __perf_event_enable, event))
1728 return;
1198 1729
1199 raw_spin_lock_irq(&ctx->lock); 1730 raw_spin_lock_irq(&ctx->lock);
1200 1731
@@ -1202,15 +1733,14 @@ retry:
1202 * If the context is active and the event is still off, 1733 * If the context is active and the event is still off,
1203 * we need to retry the cross-call. 1734 * we need to retry the cross-call.
1204 */ 1735 */
1205 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) 1736 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1737 /*
1738 * task could have been flipped by a concurrent
1739 * perf_event_context_sched_out()
1740 */
1741 task = ctx->task;
1206 goto retry; 1742 goto retry;
1207 1743 }
1208 /*
1209 * Since we have the lock this context can't be scheduled
1210 * in, so we can change the state safely.
1211 */
1212 if (event->state == PERF_EVENT_STATE_OFF)
1213 __perf_event_mark_enabled(event, ctx);
1214 1744
1215out: 1745out:
1216 raw_spin_unlock_irq(&ctx->lock); 1746 raw_spin_unlock_irq(&ctx->lock);
@@ -1242,6 +1772,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1242 if (likely(!ctx->nr_events)) 1772 if (likely(!ctx->nr_events))
1243 goto out; 1773 goto out;
1244 update_context_time(ctx); 1774 update_context_time(ctx);
1775 update_cgrp_time_from_cpuctx(cpuctx);
1245 1776
1246 if (!ctx->nr_active) 1777 if (!ctx->nr_active)
1247 goto out; 1778 goto out;
@@ -1354,8 +1885,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1354 } 1885 }
1355} 1886}
1356 1887
1357void perf_event_context_sched_out(struct task_struct *task, int ctxn, 1888static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1358 struct task_struct *next) 1889 struct task_struct *next)
1359{ 1890{
1360 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 1891 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1361 struct perf_event_context *next_ctx; 1892 struct perf_event_context *next_ctx;
@@ -1431,6 +1962,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
1431 1962
1432 for_each_task_context_nr(ctxn) 1963 for_each_task_context_nr(ctxn)
1433 perf_event_context_sched_out(task, ctxn, next); 1964 perf_event_context_sched_out(task, ctxn, next);
1965
1966 /*
1967 * if cgroup events exist on this CPU, then we need
1968 * to check if we have to switch out PMU state.
1969 * cgroup event are system-wide mode only
1970 */
1971 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1972 perf_cgroup_sched_out(task);
1434} 1973}
1435 1974
1436static void task_ctx_sched_out(struct perf_event_context *ctx, 1975static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1469,6 +2008,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1469 if (!event_filter_match(event)) 2008 if (!event_filter_match(event))
1470 continue; 2009 continue;
1471 2010
2011 /* may need to reset tstamp_enabled */
2012 if (is_cgroup_event(event))
2013 perf_cgroup_mark_enabled(event, ctx);
2014
1472 if (group_can_go_on(event, cpuctx, 1)) 2015 if (group_can_go_on(event, cpuctx, 1))
1473 group_sched_in(event, cpuctx, ctx); 2016 group_sched_in(event, cpuctx, ctx);
1474 2017
@@ -1501,6 +2044,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1501 if (!event_filter_match(event)) 2044 if (!event_filter_match(event))
1502 continue; 2045 continue;
1503 2046
2047 /* may need to reset tstamp_enabled */
2048 if (is_cgroup_event(event))
2049 perf_cgroup_mark_enabled(event, ctx);
2050
1504 if (group_can_go_on(event, cpuctx, can_add_hw)) { 2051 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1505 if (group_sched_in(event, cpuctx, ctx)) 2052 if (group_sched_in(event, cpuctx, ctx))
1506 can_add_hw = 0; 2053 can_add_hw = 0;
@@ -1511,15 +2058,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1511static void 2058static void
1512ctx_sched_in(struct perf_event_context *ctx, 2059ctx_sched_in(struct perf_event_context *ctx,
1513 struct perf_cpu_context *cpuctx, 2060 struct perf_cpu_context *cpuctx,
1514 enum event_type_t event_type) 2061 enum event_type_t event_type,
2062 struct task_struct *task)
1515{ 2063{
2064 u64 now;
2065
1516 raw_spin_lock(&ctx->lock); 2066 raw_spin_lock(&ctx->lock);
1517 ctx->is_active = 1; 2067 ctx->is_active = 1;
1518 if (likely(!ctx->nr_events)) 2068 if (likely(!ctx->nr_events))
1519 goto out; 2069 goto out;
1520 2070
1521 ctx->timestamp = perf_clock(); 2071 now = perf_clock();
1522 2072 ctx->timestamp = now;
2073 perf_cgroup_set_timestamp(task, ctx);
1523 /* 2074 /*
1524 * First go through the list and put on any pinned groups 2075 * First go through the list and put on any pinned groups
1525 * in order to give them the best chance of going on. 2076 * in order to give them the best chance of going on.
@@ -1536,11 +2087,12 @@ out:
1536} 2087}
1537 2088
1538static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2089static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1539 enum event_type_t event_type) 2090 enum event_type_t event_type,
2091 struct task_struct *task)
1540{ 2092{
1541 struct perf_event_context *ctx = &cpuctx->ctx; 2093 struct perf_event_context *ctx = &cpuctx->ctx;
1542 2094
1543 ctx_sched_in(ctx, cpuctx, event_type); 2095 ctx_sched_in(ctx, cpuctx, event_type, task);
1544} 2096}
1545 2097
1546static void task_ctx_sched_in(struct perf_event_context *ctx, 2098static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1548,15 +2100,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
1548{ 2100{
1549 struct perf_cpu_context *cpuctx; 2101 struct perf_cpu_context *cpuctx;
1550 2102
1551 cpuctx = __get_cpu_context(ctx); 2103 cpuctx = __get_cpu_context(ctx);
1552 if (cpuctx->task_ctx == ctx) 2104 if (cpuctx->task_ctx == ctx)
1553 return; 2105 return;
1554 2106
1555 ctx_sched_in(ctx, cpuctx, event_type); 2107 ctx_sched_in(ctx, cpuctx, event_type, NULL);
1556 cpuctx->task_ctx = ctx; 2108 cpuctx->task_ctx = ctx;
1557} 2109}
1558 2110
1559void perf_event_context_sched_in(struct perf_event_context *ctx) 2111static void perf_event_context_sched_in(struct perf_event_context *ctx,
2112 struct task_struct *task)
1560{ 2113{
1561 struct perf_cpu_context *cpuctx; 2114 struct perf_cpu_context *cpuctx;
1562 2115
@@ -1572,9 +2125,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
1572 */ 2125 */
1573 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2126 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1574 2127
1575 ctx_sched_in(ctx, cpuctx, EVENT_PINNED); 2128 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1576 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2129 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1577 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 2130 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1578 2131
1579 cpuctx->task_ctx = ctx; 2132 cpuctx->task_ctx = ctx;
1580 2133
@@ -1607,8 +2160,15 @@ void __perf_event_task_sched_in(struct task_struct *task)
1607 if (likely(!ctx)) 2160 if (likely(!ctx))
1608 continue; 2161 continue;
1609 2162
1610 perf_event_context_sched_in(ctx); 2163 perf_event_context_sched_in(ctx, task);
1611 } 2164 }
2165 /*
2166 * if cgroup events exist on this CPU, then we need
2167 * to check if we have to switch in PMU state.
2168 * cgroup event are system-wide mode only
2169 */
2170 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2171 perf_cgroup_sched_in(task);
1612} 2172}
1613 2173
1614static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2174static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -1638,7 +2198,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1638 * Reduce accuracy by one bit such that @a and @b converge 2198 * Reduce accuracy by one bit such that @a and @b converge
1639 * to a similar magnitude. 2199 * to a similar magnitude.
1640 */ 2200 */
1641#define REDUCE_FLS(a, b) \ 2201#define REDUCE_FLS(a, b) \
1642do { \ 2202do { \
1643 if (a##_fls > b##_fls) { \ 2203 if (a##_fls > b##_fls) { \
1644 a >>= 1; \ 2204 a >>= 1; \
@@ -1808,7 +2368,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1808 if (ctx) 2368 if (ctx)
1809 rotate_ctx(ctx); 2369 rotate_ctx(ctx);
1810 2370
1811 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2371 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
1812 if (ctx) 2372 if (ctx)
1813 task_ctx_sched_in(ctx, EVENT_FLEXIBLE); 2373 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1814 2374
@@ -1887,7 +2447,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1887 2447
1888 raw_spin_unlock(&ctx->lock); 2448 raw_spin_unlock(&ctx->lock);
1889 2449
1890 perf_event_context_sched_in(ctx); 2450 perf_event_context_sched_in(ctx, ctx->task);
1891out: 2451out:
1892 local_irq_restore(flags); 2452 local_irq_restore(flags);
1893} 2453}
@@ -1912,8 +2472,10 @@ static void __perf_event_read(void *info)
1912 return; 2472 return;
1913 2473
1914 raw_spin_lock(&ctx->lock); 2474 raw_spin_lock(&ctx->lock);
1915 if (ctx->is_active) 2475 if (ctx->is_active) {
1916 update_context_time(ctx); 2476 update_context_time(ctx);
2477 update_cgrp_time_from_event(event);
2478 }
1917 update_event_times(event); 2479 update_event_times(event);
1918 if (event->state == PERF_EVENT_STATE_ACTIVE) 2480 if (event->state == PERF_EVENT_STATE_ACTIVE)
1919 event->pmu->read(event); 2481 event->pmu->read(event);
@@ -1944,8 +2506,10 @@ static u64 perf_event_read(struct perf_event *event)
1944 * (e.g., thread is blocked), in that case 2506 * (e.g., thread is blocked), in that case
1945 * we cannot update context time 2507 * we cannot update context time
1946 */ 2508 */
1947 if (ctx->is_active) 2509 if (ctx->is_active) {
1948 update_context_time(ctx); 2510 update_context_time(ctx);
2511 update_cgrp_time_from_event(event);
2512 }
1949 update_event_times(event); 2513 update_event_times(event);
1950 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2514 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1951 } 2515 }
@@ -2224,6 +2788,9 @@ errout:
2224 2788
2225} 2789}
2226 2790
2791/*
2792 * Returns a matching context with refcount and pincount.
2793 */
2227static struct perf_event_context * 2794static struct perf_event_context *
2228find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) 2795find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2229{ 2796{
@@ -2248,6 +2815,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2248 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 2815 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2249 ctx = &cpuctx->ctx; 2816 ctx = &cpuctx->ctx;
2250 get_ctx(ctx); 2817 get_ctx(ctx);
2818 ++ctx->pin_count;
2251 2819
2252 return ctx; 2820 return ctx;
2253 } 2821 }
@@ -2261,6 +2829,7 @@ retry:
2261 ctx = perf_lock_task_context(task, ctxn, &flags); 2829 ctx = perf_lock_task_context(task, ctxn, &flags);
2262 if (ctx) { 2830 if (ctx) {
2263 unclone_ctx(ctx); 2831 unclone_ctx(ctx);
2832 ++ctx->pin_count;
2264 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2833 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2265 } 2834 }
2266 2835
@@ -2282,8 +2851,10 @@ retry:
2282 err = -ESRCH; 2851 err = -ESRCH;
2283 else if (task->perf_event_ctxp[ctxn]) 2852 else if (task->perf_event_ctxp[ctxn])
2284 err = -EAGAIN; 2853 err = -EAGAIN;
2285 else 2854 else {
2855 ++ctx->pin_count;
2286 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 2856 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2857 }
2287 mutex_unlock(&task->perf_event_mutex); 2858 mutex_unlock(&task->perf_event_mutex);
2288 2859
2289 if (unlikely(err)) { 2860 if (unlikely(err)) {
@@ -2323,7 +2894,7 @@ static void free_event(struct perf_event *event)
2323 2894
2324 if (!event->parent) { 2895 if (!event->parent) {
2325 if (event->attach_state & PERF_ATTACH_TASK) 2896 if (event->attach_state & PERF_ATTACH_TASK)
2326 jump_label_dec(&perf_task_events); 2897 jump_label_dec(&perf_sched_events);
2327 if (event->attr.mmap || event->attr.mmap_data) 2898 if (event->attr.mmap || event->attr.mmap_data)
2328 atomic_dec(&nr_mmap_events); 2899 atomic_dec(&nr_mmap_events);
2329 if (event->attr.comm) 2900 if (event->attr.comm)
@@ -2332,6 +2903,10 @@ static void free_event(struct perf_event *event)
2332 atomic_dec(&nr_task_events); 2903 atomic_dec(&nr_task_events);
2333 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 2904 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2334 put_callchain_buffers(); 2905 put_callchain_buffers();
2906 if (is_cgroup_event(event)) {
2907 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2908 jump_label_dec(&perf_sched_events);
2909 }
2335 } 2910 }
2336 2911
2337 if (event->buffer) { 2912 if (event->buffer) {
@@ -2339,6 +2914,9 @@ static void free_event(struct perf_event *event)
2339 event->buffer = NULL; 2914 event->buffer = NULL;
2340 } 2915 }
2341 2916
2917 if (is_cgroup_event(event))
2918 perf_detach_cgroup(event);
2919
2342 if (event->destroy) 2920 if (event->destroy)
2343 event->destroy(event); 2921 event->destroy(event);
2344 2922
@@ -4406,26 +4984,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4406 if (unlikely(!is_sampling_event(event))) 4984 if (unlikely(!is_sampling_event(event)))
4407 return 0; 4985 return 0;
4408 4986
4409 if (!throttle) { 4987 if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
4410 hwc->interrupts++; 4988 if (throttle) {
4411 } else { 4989 hwc->interrupts = MAX_INTERRUPTS;
4412 if (hwc->interrupts != MAX_INTERRUPTS) { 4990 perf_log_throttle(event, 0);
4413 hwc->interrupts++;
4414 if (HZ * hwc->interrupts >
4415 (u64)sysctl_perf_event_sample_rate) {
4416 hwc->interrupts = MAX_INTERRUPTS;
4417 perf_log_throttle(event, 0);
4418 ret = 1;
4419 }
4420 } else {
4421 /*
4422 * Keep re-disabling events even though on the previous
4423 * pass we disabled it - just in case we raced with a
4424 * sched-in and the event got enabled again:
4425 */
4426 ret = 1; 4991 ret = 1;
4427 } 4992 }
4428 } 4993 } else
4994 hwc->interrupts++;
4429 4995
4430 if (event->attr.freq) { 4996 if (event->attr.freq) {
4431 u64 now = perf_clock(); 4997 u64 now = perf_clock();
@@ -4567,7 +5133,7 @@ static int perf_exclude_event(struct perf_event *event,
4567 struct pt_regs *regs) 5133 struct pt_regs *regs)
4568{ 5134{
4569 if (event->hw.state & PERF_HES_STOPPED) 5135 if (event->hw.state & PERF_HES_STOPPED)
4570 return 0; 5136 return 1;
4571 5137
4572 if (regs) { 5138 if (regs) {
4573 if (event->attr.exclude_user && user_mode(regs)) 5139 if (event->attr.exclude_user && user_mode(regs))
@@ -4923,6 +5489,8 @@ static int perf_tp_event_match(struct perf_event *event,
4923 struct perf_sample_data *data, 5489 struct perf_sample_data *data,
4924 struct pt_regs *regs) 5490 struct pt_regs *regs)
4925{ 5491{
5492 if (event->hw.state & PERF_HES_STOPPED)
5493 return 0;
4926 /* 5494 /*
4927 * All tracepoints are from kernel-space. 5495 * All tracepoints are from kernel-space.
4928 */ 5496 */
@@ -5062,6 +5630,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5062 u64 period; 5630 u64 period;
5063 5631
5064 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 5632 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
5633
5634 if (event->state != PERF_EVENT_STATE_ACTIVE)
5635 return HRTIMER_NORESTART;
5636
5065 event->pmu->read(event); 5637 event->pmu->read(event);
5066 5638
5067 perf_sample_data_init(&data, 0); 5639 perf_sample_data_init(&data, 0);
@@ -5088,9 +5660,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
5088 if (!is_sampling_event(event)) 5660 if (!is_sampling_event(event))
5089 return; 5661 return;
5090 5662
5091 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5092 hwc->hrtimer.function = perf_swevent_hrtimer;
5093
5094 period = local64_read(&hwc->period_left); 5663 period = local64_read(&hwc->period_left);
5095 if (period) { 5664 if (period) {
5096 if (period < 0) 5665 if (period < 0)
@@ -5117,6 +5686,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5117 } 5686 }
5118} 5687}
5119 5688
5689static void perf_swevent_init_hrtimer(struct perf_event *event)
5690{
5691 struct hw_perf_event *hwc = &event->hw;
5692
5693 if (!is_sampling_event(event))
5694 return;
5695
5696 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5697 hwc->hrtimer.function = perf_swevent_hrtimer;
5698
5699 /*
5700 * Since hrtimers have a fixed rate, we can do a static freq->period
5701 * mapping and avoid the whole period adjust feedback stuff.
5702 */
5703 if (event->attr.freq) {
5704 long freq = event->attr.sample_freq;
5705
5706 event->attr.sample_period = NSEC_PER_SEC / freq;
5707 hwc->sample_period = event->attr.sample_period;
5708 local64_set(&hwc->period_left, hwc->sample_period);
5709 event->attr.freq = 0;
5710 }
5711}
5712
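A worked instance of the static freq->period mapping above (4 kHz is just an example value): attr.sample_freq = 4000 becomes a fixed hrtimer period of NSEC_PER_SEC / 4000 = 250000 ns.

static u64 example_hrtimer_period_ns(long sample_freq)
{
	return NSEC_PER_SEC / sample_freq;	/* 4000 -> 250000 ns (250 us) */
}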
5120/* 5713/*
5121 * Software event: cpu wall time clock 5714 * Software event: cpu wall time clock
5122 */ 5715 */
@@ -5169,6 +5762,8 @@ static int cpu_clock_event_init(struct perf_event *event)
5169 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 5762 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5170 return -ENOENT; 5763 return -ENOENT;
5171 5764
5765 perf_swevent_init_hrtimer(event);
5766
5172 return 0; 5767 return 0;
5173} 5768}
5174 5769
@@ -5224,16 +5819,9 @@ static void task_clock_event_del(struct perf_event *event, int flags)
5224 5819
5225static void task_clock_event_read(struct perf_event *event) 5820static void task_clock_event_read(struct perf_event *event)
5226{ 5821{
5227 u64 time; 5822 u64 now = perf_clock();
5228 5823 u64 delta = now - event->ctx->timestamp;
5229 if (!in_nmi()) { 5824 u64 time = event->ctx->time + delta;
5230 update_context_time(event->ctx);
5231 time = event->ctx->time;
5232 } else {
5233 u64 now = perf_clock();
5234 u64 delta = now - event->ctx->timestamp;
5235 time = event->ctx->time + delta;
5236 }
5237 5825
5238 task_clock_event_update(event, time); 5826 task_clock_event_update(event, time);
5239} 5827}
@@ -5246,6 +5834,8 @@ static int task_clock_event_init(struct perf_event *event)
5246 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 5834 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5247 return -ENOENT; 5835 return -ENOENT;
5248 5836
5837 perf_swevent_init_hrtimer(event);
5838
5249 return 0; 5839 return 0;
5250} 5840}
5251 5841
@@ -5517,17 +6107,22 @@ struct pmu *perf_init_event(struct perf_event *event)
5517{ 6107{
5518 struct pmu *pmu = NULL; 6108 struct pmu *pmu = NULL;
5519 int idx; 6109 int idx;
6110 int ret;
5520 6111
5521 idx = srcu_read_lock(&pmus_srcu); 6112 idx = srcu_read_lock(&pmus_srcu);
5522 6113
5523 rcu_read_lock(); 6114 rcu_read_lock();
5524 pmu = idr_find(&pmu_idr, event->attr.type); 6115 pmu = idr_find(&pmu_idr, event->attr.type);
5525 rcu_read_unlock(); 6116 rcu_read_unlock();
5526 if (pmu) 6117 if (pmu) {
6118 ret = pmu->event_init(event);
6119 if (ret)
6120 pmu = ERR_PTR(ret);
5527 goto unlock; 6121 goto unlock;
6122 }
5528 6123
5529 list_for_each_entry_rcu(pmu, &pmus, entry) { 6124 list_for_each_entry_rcu(pmu, &pmus, entry) {
5530 int ret = pmu->event_init(event); 6125 ret = pmu->event_init(event);
5531 if (!ret) 6126 if (!ret)
5532 goto unlock; 6127 goto unlock;
5533 6128
@@ -5653,7 +6248,7 @@ done:
5653 6248
5654 if (!event->parent) { 6249 if (!event->parent) {
5655 if (event->attach_state & PERF_ATTACH_TASK) 6250 if (event->attach_state & PERF_ATTACH_TASK)
5656 jump_label_inc(&perf_task_events); 6251 jump_label_inc(&perf_sched_events);
5657 if (event->attr.mmap || event->attr.mmap_data) 6252 if (event->attr.mmap || event->attr.mmap_data)
5658 atomic_inc(&nr_mmap_events); 6253 atomic_inc(&nr_mmap_events);
5659 if (event->attr.comm) 6254 if (event->attr.comm)
@@ -5828,7 +6423,7 @@ SYSCALL_DEFINE5(perf_event_open,
5828 int err; 6423 int err;
5829 6424
5830 /* for future expandability... */ 6425 /* for future expandability... */
5831 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) 6426 if (flags & ~PERF_FLAG_ALL)
5832 return -EINVAL; 6427 return -EINVAL;
5833 6428
5834 err = perf_copy_attr(attr_uptr, &attr); 6429 err = perf_copy_attr(attr_uptr, &attr);
@@ -5845,6 +6440,15 @@ SYSCALL_DEFINE5(perf_event_open,
5845 return -EINVAL; 6440 return -EINVAL;
5846 } 6441 }
5847 6442
6443 /*
6444 * In cgroup mode, the pid argument is used to pass the fd
6445 * opened to the cgroup directory in cgroupfs. The cpu argument
6446 * designates the cpu on which to monitor threads from that
6447 * cgroup.
6448 */
6449 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6450 return -EINVAL;
6451
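A userspace sketch of the new cgroup mode (assumptions: a uapi perf_event.h that already carries PERF_FLAG_PID_CGROUP from this series, a perf cgroup hierarchy mounted somewhere such as /dev/cgroup, and an arbitrary choice of counter). The cgroup directory fd travels in the pid argument and cpu must name a real CPU:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int open_cgroup_cycles_counter(const char *cgroup_dir, int cpu)
{
	struct perf_event_attr attr;
	int cgrp_fd, pfd;

	cgrp_fd = open(cgroup_dir, O_RDONLY);	/* e.g. "/dev/cgroup/test" */
	if (cgrp_fd < 0)
		return -1;

	memset(&attr, 0, sizeof(attr));
	attr.size   = sizeof(attr);
	attr.type   = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/* pid <- cgroup dir fd, cpu >= 0, flags carry PERF_FLAG_PID_CGROUP */
	pfd = syscall(__NR_perf_event_open, &attr, cgrp_fd, cpu,
		      -1 /* group_fd */, PERF_FLAG_PID_CGROUP);

	close(cgrp_fd);		/* the event holds its own cgroup reference */
	return pfd;
}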
5848 event_fd = get_unused_fd_flags(O_RDWR); 6452 event_fd = get_unused_fd_flags(O_RDWR);
5849 if (event_fd < 0) 6453 if (event_fd < 0)
5850 return event_fd; 6454 return event_fd;
@@ -5862,7 +6466,7 @@ SYSCALL_DEFINE5(perf_event_open,
5862 group_leader = NULL; 6466 group_leader = NULL;
5863 } 6467 }
5864 6468
5865 if (pid != -1) { 6469 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
5866 task = find_lively_task_by_vpid(pid); 6470 task = find_lively_task_by_vpid(pid);
5867 if (IS_ERR(task)) { 6471 if (IS_ERR(task)) {
5868 err = PTR_ERR(task); 6472 err = PTR_ERR(task);
@@ -5876,6 +6480,19 @@ SYSCALL_DEFINE5(perf_event_open,
5876 goto err_task; 6480 goto err_task;
5877 } 6481 }
5878 6482
6483 if (flags & PERF_FLAG_PID_CGROUP) {
6484 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6485 if (err)
6486 goto err_alloc;
6487 /*
6488 * one more event:
6489 * - that has cgroup constraint on event->cpu
6490 * - that may need work on context switch
6491 */
6492 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6493 jump_label_inc(&perf_sched_events);
6494 }
6495
5879 /* 6496 /*
5880 * Special case software events and allow them to be part of 6497 * Special case software events and allow them to be part of
5881 * any hardware group. 6498 * any hardware group.
@@ -5961,10 +6578,10 @@ SYSCALL_DEFINE5(perf_event_open,
5961 struct perf_event_context *gctx = group_leader->ctx; 6578 struct perf_event_context *gctx = group_leader->ctx;
5962 6579
5963 mutex_lock(&gctx->mutex); 6580 mutex_lock(&gctx->mutex);
5964 perf_event_remove_from_context(group_leader); 6581 perf_remove_from_context(group_leader);
5965 list_for_each_entry(sibling, &group_leader->sibling_list, 6582 list_for_each_entry(sibling, &group_leader->sibling_list,
5966 group_entry) { 6583 group_entry) {
5967 perf_event_remove_from_context(sibling); 6584 perf_remove_from_context(sibling);
5968 put_ctx(gctx); 6585 put_ctx(gctx);
5969 } 6586 }
5970 mutex_unlock(&gctx->mutex); 6587 mutex_unlock(&gctx->mutex);
@@ -5987,6 +6604,7 @@ SYSCALL_DEFINE5(perf_event_open,
5987 6604
5988 perf_install_in_context(ctx, event, cpu); 6605 perf_install_in_context(ctx, event, cpu);
5989 ++ctx->generation; 6606 ++ctx->generation;
6607 perf_unpin_context(ctx);
5990 mutex_unlock(&ctx->mutex); 6608 mutex_unlock(&ctx->mutex);
5991 6609
5992 event->owner = current; 6610 event->owner = current;
@@ -6012,6 +6630,7 @@ SYSCALL_DEFINE5(perf_event_open,
6012 return event_fd; 6630 return event_fd;
6013 6631
6014err_context: 6632err_context:
6633 perf_unpin_context(ctx);
6015 put_ctx(ctx); 6634 put_ctx(ctx);
6016err_alloc: 6635err_alloc:
6017 free_event(event); 6636 free_event(event);
@@ -6062,6 +6681,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6062 mutex_lock(&ctx->mutex); 6681 mutex_lock(&ctx->mutex);
6063 perf_install_in_context(ctx, event, cpu); 6682 perf_install_in_context(ctx, event, cpu);
6064 ++ctx->generation; 6683 ++ctx->generation;
6684 perf_unpin_context(ctx);
6065 mutex_unlock(&ctx->mutex); 6685 mutex_unlock(&ctx->mutex);
6066 6686
6067 return event; 6687 return event;
@@ -6113,17 +6733,20 @@ __perf_event_exit_task(struct perf_event *child_event,
6113 struct perf_event_context *child_ctx, 6733 struct perf_event_context *child_ctx,
6114 struct task_struct *child) 6734 struct task_struct *child)
6115{ 6735{
6116 struct perf_event *parent_event; 6736 if (child_event->parent) {
6737 raw_spin_lock_irq(&child_ctx->lock);
6738 perf_group_detach(child_event);
6739 raw_spin_unlock_irq(&child_ctx->lock);
6740 }
6117 6741
6118 perf_event_remove_from_context(child_event); 6742 perf_remove_from_context(child_event);
6119 6743
6120 parent_event = child_event->parent;
6121 /* 6744 /*
6122 * It can happen that parent exits first, and has events 6745 * It can happen that the parent exits first, and has events
6123 * that are still around due to the child reference. These 6746 * that are still around due to the child reference. These
6124 * events need to be zapped - but otherwise linger. 6747 * events need to be zapped.
6125 */ 6748 */
6126 if (parent_event) { 6749 if (child_event->parent) {
6127 sync_child_event(child_event, child); 6750 sync_child_event(child_event, child);
6128 free_event(child_event); 6751 free_event(child_event);
6129 } 6752 }
@@ -6422,7 +7045,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
6422 return 0; 7045 return 0;
6423 } 7046 }
6424 7047
6425 child_ctx = child->perf_event_ctxp[ctxn]; 7048 child_ctx = child->perf_event_ctxp[ctxn];
6426 if (!child_ctx) { 7049 if (!child_ctx) {
6427 /* 7050 /*
6428 * This is executed from the parent task context, so 7051 * This is executed from the parent task context, so
@@ -6537,6 +7160,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6537 mutex_unlock(&parent_ctx->mutex); 7160 mutex_unlock(&parent_ctx->mutex);
6538 7161
6539 perf_unpin_context(parent_ctx); 7162 perf_unpin_context(parent_ctx);
7163 put_ctx(parent_ctx);
6540 7164
6541 return ret; 7165 return ret;
6542} 7166}
@@ -6606,9 +7230,9 @@ static void __perf_event_exit_context(void *__info)
6606 perf_pmu_rotate_stop(ctx->pmu); 7230 perf_pmu_rotate_stop(ctx->pmu);
6607 7231
6608 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 7232 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
6609 __perf_event_remove_from_context(event); 7233 __perf_remove_from_context(event);
6610 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 7234 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
6611 __perf_event_remove_from_context(event); 7235 __perf_remove_from_context(event);
6612} 7236}
6613 7237
6614static void perf_event_exit_cpu_context(int cpu) 7238static void perf_event_exit_cpu_context(int cpu)
@@ -6732,3 +7356,83 @@ unlock:
6732 return ret; 7356 return ret;
6733} 7357}
6734device_initcall(perf_event_sysfs_init); 7358device_initcall(perf_event_sysfs_init);
7359
7360#ifdef CONFIG_CGROUP_PERF
7361static struct cgroup_subsys_state *perf_cgroup_create(
7362 struct cgroup_subsys *ss, struct cgroup *cont)
7363{
7364 struct perf_cgroup *jc;
7365
7366 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
7367 if (!jc)
7368 return ERR_PTR(-ENOMEM);
7369
7370 jc->info = alloc_percpu(struct perf_cgroup_info);
7371 if (!jc->info) {
7372 kfree(jc);
7373 return ERR_PTR(-ENOMEM);
7374 }
7375
7376 return &jc->css;
7377}
7378
7379static void perf_cgroup_destroy(struct cgroup_subsys *ss,
7380 struct cgroup *cont)
7381{
7382 struct perf_cgroup *jc;
7383 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
7384 struct perf_cgroup, css);
7385 free_percpu(jc->info);
7386 kfree(jc);
7387}
7388
7389static int __perf_cgroup_move(void *info)
7390{
7391 struct task_struct *task = info;
7392 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
7393 return 0;
7394}
7395
7396static void perf_cgroup_move(struct task_struct *task)
7397{
7398 task_function_call(task, __perf_cgroup_move, task);
7399}
7400
7401static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7402 struct cgroup *old_cgrp, struct task_struct *task,
7403 bool threadgroup)
7404{
7405 perf_cgroup_move(task);
7406 if (threadgroup) {
7407 struct task_struct *c;
7408 rcu_read_lock();
7409 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7410 perf_cgroup_move(c);
7411 }
7412 rcu_read_unlock();
7413 }
7414}
7415
7416static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7417 struct cgroup *old_cgrp, struct task_struct *task)
7418{
7419 /*
7420 * cgroup_exit() is called in the copy_process() failure path.
7421 * Ignore this case since the task hasn't run yet; this avoids
7422 * trying to poke a half-freed task state from generic code.
7423 */
7424 if (!(task->flags & PF_EXITING))
7425 return;
7426
7427 perf_cgroup_move(task);
7428}
7429
7430struct cgroup_subsys perf_subsys = {
7431 .name = "perf_event",
7432 .subsys_id = perf_subsys_id,
7433 .create = perf_cgroup_create,
7434 .destroy = perf_cgroup_destroy,
7435 .exit = perf_cgroup_exit,
7436 .attach = perf_cgroup_attach,
7437};
7438#endif /* CONFIG_CGROUP_PERF */
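
The attach and exit callbacks above are what migrate running counters when a task changes groups. Filling such a group from userspace is ordinary cgroupfs manipulation; a small sketch, assuming the perf_event hierarchy is mounted at /sys/fs/cgroup/perf_event and using a made-up group name:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        char buf[32];
        int fd, len;

        /* Create a child group in the mounted perf_event hierarchy. */
        if (mkdir("/sys/fs/cgroup/perf_event/mygroup", 0755) && errno != EEXIST) {
                perror("mkdir");
                return 1;
        }

        /*
         * Moving a task is a write of its pid into the group's tasks file;
         * this is where perf_cgroup_attach() runs in the kernel.
         */
        fd = open("/sys/fs/cgroup/perf_event/mygroup/tasks", O_WRONLY);
        if (fd < 0) {
                perror("open tasks");
                return 1;
        }
        len = snprintf(buf, sizeof(buf), "%d\n", getpid());
        if (write(fd, buf, len) != len)
                perror("write tasks");
        close(fd);
        return 0;
}
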
diff --git a/kernel/pid.c b/kernel/pid.c
index 39b65b69584f..02f221274265 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -435,6 +435,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
435 rcu_read_unlock(); 435 rcu_read_unlock();
436 return pid; 436 return pid;
437} 437}
438EXPORT_SYMBOL_GPL(get_task_pid);
438 439
439struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) 440struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
440{ 441{
@@ -446,6 +447,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
446 rcu_read_unlock(); 447 rcu_read_unlock();
447 return result; 448 return result;
448} 449}
450EXPORT_SYMBOL_GPL(get_pid_task);
449 451
450struct pid *find_get_pid(pid_t nr) 452struct pid *find_get_pid(pid_t nr)
451{ 453{
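
The two exports above make the pid reference helpers available to modules. A minimal sketch of the intended take/use/drop pattern; the module itself and the choice of current as the probed task are illustrative only:

#include <linux/module.h>
#include <linux/pid.h>
#include <linux/sched.h>

/* Take and drop the references explicitly; that is what the helpers are for. */
static void inspect_task(struct task_struct *task)
{
        struct pid *pid;
        struct task_struct *t;

        pid = get_task_pid(task, PIDTYPE_PID);          /* gets a ref on the pid */
        if (!pid)
                return;

        t = get_pid_task(pid, PIDTYPE_PID);             /* gets a ref on the task */
        if (t) {
                pr_info("task %s has pid %d\n", t->comm, pid_nr(pid));
                put_task_struct(t);
        }
        put_pid(pid);
}

static int __init pidref_init(void)
{
        inspect_task(current);
        return 0;
}

static void __exit pidref_exit(void)
{
}

module_init(pidref_init);
module_exit(pidref_exit);
MODULE_LICENSE("GPL");
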
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94e1f0b..e9c9adc84ca6 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -14,6 +14,7 @@
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h>
17 18
18#define BITS_PER_PAGE (PAGE_SIZE*8) 19#define BITS_PER_PAGE (PAGE_SIZE*8)
19 20
@@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
72{ 73{
73 struct pid_namespace *ns; 74 struct pid_namespace *ns;
74 unsigned int level = parent_pid_ns->level + 1; 75 unsigned int level = parent_pid_ns->level + 1;
75 int i; 76 int i, err = -ENOMEM;
76 77
77 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 78 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
78 if (ns == NULL) 79 if (ns == NULL)
@@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
96 for (i = 1; i < PIDMAP_ENTRIES; i++) 97 for (i = 1; i < PIDMAP_ENTRIES; i++)
97 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 98 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
98 99
100 err = pid_ns_prepare_proc(ns);
101 if (err)
102 goto out_put_parent_pid_ns;
103
99 return ns; 104 return ns;
100 105
106out_put_parent_pid_ns:
107 put_pid_ns(parent_pid_ns);
101out_free_map: 108out_free_map:
102 kfree(ns->pidmap[0].page); 109 kfree(ns->pidmap[0].page);
103out_free: 110out_free:
104 kmem_cache_free(pid_ns_cachep, ns); 111 kmem_cache_free(pid_ns_cachep, ns);
105out: 112out:
106 return ERR_PTR(-ENOMEM); 113 return ERR_PTR(err);
107} 114}
108 115
109static void destroy_pid_namespace(struct pid_namespace *ns) 116static void destroy_pid_namespace(struct pid_namespace *ns)
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index aeaa7f846821..0da058bff8eb 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -103,11 +103,14 @@ static struct pm_qos_object *pm_qos_array[] = {
103 103
104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
105 size_t count, loff_t *f_pos); 105 size_t count, loff_t *f_pos);
106static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
107 size_t count, loff_t *f_pos);
106static int pm_qos_power_open(struct inode *inode, struct file *filp); 108static int pm_qos_power_open(struct inode *inode, struct file *filp);
107static int pm_qos_power_release(struct inode *inode, struct file *filp); 109static int pm_qos_power_release(struct inode *inode, struct file *filp);
108 110
109static const struct file_operations pm_qos_power_fops = { 111static const struct file_operations pm_qos_power_fops = {
110 .write = pm_qos_power_write, 112 .write = pm_qos_power_write,
113 .read = pm_qos_power_read,
111 .open = pm_qos_power_open, 114 .open = pm_qos_power_open,
112 .release = pm_qos_power_release, 115 .release = pm_qos_power_release,
113 .llseek = noop_llseek, 116 .llseek = noop_llseek,
@@ -376,6 +379,27 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
376} 379}
377 380
378 381
382static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
383 size_t count, loff_t *f_pos)
384{
385 s32 value;
386 unsigned long flags;
387 struct pm_qos_object *o;
388 struct pm_qos_request_list *pm_qos_req = filp->private_data;
389
390 if (!pm_qos_req)
391 return -EINVAL;
392 if (!pm_qos_request_active(pm_qos_req))
393 return -EINVAL;
394
395 o = pm_qos_array[pm_qos_req->pm_qos_class];
396 spin_lock_irqsave(&pm_qos_lock, flags);
397 value = pm_qos_get_value(o);
398 spin_unlock_irqrestore(&pm_qos_lock, flags);
399
400 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
401}
402
379static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 403static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
380 size_t count, loff_t *f_pos) 404 size_t count, loff_t *f_pos)
381{ 405{
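
With the read() handler added above, a process that holds a PM QoS request can read back the currently effective target of its class from the same misc device it used to file the request. A userspace sketch, assuming the usual /dev/cpu_dma_latency node (values are in microseconds):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int32_t request = 20;           /* ask for <= 20 us wakeup latency */
        int32_t current_value;
        int fd = open("/dev/cpu_dma_latency", O_RDWR);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, &request, sizeof(request)) != sizeof(request))
                perror("write");
        if (read(fd, &current_value, sizeof(current_value)) == sizeof(current_value))
                printf("effective cpu_dma_latency target: %d us\n",
                       (int)current_value);

        close(fd);                      /* dropping the fd removes the request */
        return 0;
}
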
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 05bb7173850e..67fea9d25d55 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p)
176 return p->utime; 176 return p->utime;
177} 177}
178 178
179int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 179static int
180posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
180{ 181{
181 int error = check_clock(which_clock); 182 int error = check_clock(which_clock);
182 if (!error) { 183 if (!error) {
@@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
194 return error; 195 return error;
195} 196}
196 197
197int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) 198static int
199posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
198{ 200{
199 /* 201 /*
200 * You can never reset a CPU clock, but we check for other errors 202 * You can never reset a CPU clock, but we check for other errors
@@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
317} 319}
318 320
319 321
320int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 322static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
321{ 323{
322 const pid_t pid = CPUCLOCK_PID(which_clock); 324 const pid_t pid = CPUCLOCK_PID(which_clock);
323 int error = -EINVAL; 325 int error = -EINVAL;
@@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
379 * This is called from sys_timer_create() and do_cpu_nanosleep() with the 381 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
380 * new timer already all-zeros initialized. 382 * new timer already all-zeros initialized.
381 */ 383 */
382int posix_cpu_timer_create(struct k_itimer *new_timer) 384static int posix_cpu_timer_create(struct k_itimer *new_timer)
383{ 385{
384 int ret = 0; 386 int ret = 0;
385 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); 387 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
@@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
425 * If we return TIMER_RETRY, it's necessary to release the timer's lock 427 * If we return TIMER_RETRY, it's necessary to release the timer's lock
426 * and try again. (This happens when the timer is in the middle of firing.) 428 * and try again. (This happens when the timer is in the middle of firing.)
427 */ 429 */
428int posix_cpu_timer_del(struct k_itimer *timer) 430static int posix_cpu_timer_del(struct k_itimer *timer)
429{ 431{
430 struct task_struct *p = timer->it.cpu.task; 432 struct task_struct *p = timer->it.cpu.task;
431 int ret = 0; 433 int ret = 0;
@@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
665 * If we return TIMER_RETRY, it's necessary to release the timer's lock 667 * If we return TIMER_RETRY, it's necessary to release the timer's lock
666 * and try again. (This happens when the timer is in the middle of firing.) 668 * and try again. (This happens when the timer is in the middle of firing.)
667 */ 669 */
668int posix_cpu_timer_set(struct k_itimer *timer, int flags, 670static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
669 struct itimerspec *new, struct itimerspec *old) 671 struct itimerspec *new, struct itimerspec *old)
670{ 672{
671 struct task_struct *p = timer->it.cpu.task; 673 struct task_struct *p = timer->it.cpu.task;
672 union cpu_time_count old_expires, new_expires, old_incr, val; 674 union cpu_time_count old_expires, new_expires, old_incr, val;
@@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
820 return ret; 822 return ret;
821} 823}
822 824
823void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) 825static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
824{ 826{
825 union cpu_time_count now; 827 union cpu_time_count now;
826 struct task_struct *p = timer->it.cpu.task; 828 struct task_struct *p = timer->it.cpu.task;
@@ -1481,11 +1483,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1481 return error; 1483 return error;
1482} 1484}
1483 1485
1484int posix_cpu_nsleep(const clockid_t which_clock, int flags, 1486static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1485 struct timespec *rqtp, struct timespec __user *rmtp) 1487
1488static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1489 struct timespec *rqtp, struct timespec __user *rmtp)
1486{ 1490{
1487 struct restart_block *restart_block = 1491 struct restart_block *restart_block =
1488 &current_thread_info()->restart_block; 1492 &current_thread_info()->restart_block;
1489 struct itimerspec it; 1493 struct itimerspec it;
1490 int error; 1494 int error;
1491 1495
@@ -1501,56 +1505,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1501 1505
1502 if (error == -ERESTART_RESTARTBLOCK) { 1506 if (error == -ERESTART_RESTARTBLOCK) {
1503 1507
1504 if (flags & TIMER_ABSTIME) 1508 if (flags & TIMER_ABSTIME)
1505 return -ERESTARTNOHAND; 1509 return -ERESTARTNOHAND;
1506 /* 1510 /*
1507 * Report back to the user the time still remaining. 1511 * Report back to the user the time still remaining.
1508 */ 1512 */
1509 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1513 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1510 return -EFAULT; 1514 return -EFAULT;
1511 1515
1512 restart_block->fn = posix_cpu_nsleep_restart; 1516 restart_block->fn = posix_cpu_nsleep_restart;
1513 restart_block->arg0 = which_clock; 1517 restart_block->nanosleep.index = which_clock;
1514 restart_block->arg1 = (unsigned long) rmtp; 1518 restart_block->nanosleep.rmtp = rmtp;
1515 restart_block->arg2 = rqtp->tv_sec; 1519 restart_block->nanosleep.expires = timespec_to_ns(rqtp);
1516 restart_block->arg3 = rqtp->tv_nsec;
1517 } 1520 }
1518 return error; 1521 return error;
1519} 1522}
1520 1523
1521long posix_cpu_nsleep_restart(struct restart_block *restart_block) 1524static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1522{ 1525{
1523 clockid_t which_clock = restart_block->arg0; 1526 clockid_t which_clock = restart_block->nanosleep.index;
1524 struct timespec __user *rmtp;
1525 struct timespec t; 1527 struct timespec t;
1526 struct itimerspec it; 1528 struct itimerspec it;
1527 int error; 1529 int error;
1528 1530
1529 rmtp = (struct timespec __user *) restart_block->arg1; 1531 t = ns_to_timespec(restart_block->nanosleep.expires);
1530 t.tv_sec = restart_block->arg2;
1531 t.tv_nsec = restart_block->arg3;
1532 1532
1533 restart_block->fn = do_no_restart_syscall;
1534 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); 1533 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1535 1534
1536 if (error == -ERESTART_RESTARTBLOCK) { 1535 if (error == -ERESTART_RESTARTBLOCK) {
1536 struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
1537 /* 1537 /*
1538 * Report back to the user the time still remaining. 1538 * Report back to the user the time still remaining.
1539 */ 1539 */
1540 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1540 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1541 return -EFAULT; 1541 return -EFAULT;
1542 1542
1543 restart_block->fn = posix_cpu_nsleep_restart; 1543 restart_block->nanosleep.expires = timespec_to_ns(&t);
1544 restart_block->arg0 = which_clock;
1545 restart_block->arg1 = (unsigned long) rmtp;
1546 restart_block->arg2 = t.tv_sec;
1547 restart_block->arg3 = t.tv_nsec;
1548 } 1544 }
1549 return error; 1545 return error;
1550 1546
1551} 1547}
1552 1548
1553
1554#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) 1549#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
1555#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) 1550#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1556 1551
@@ -1594,38 +1589,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
1594 timer->it_clock = THREAD_CLOCK; 1589 timer->it_clock = THREAD_CLOCK;
1595 return posix_cpu_timer_create(timer); 1590 return posix_cpu_timer_create(timer);
1596} 1591}
1597static int thread_cpu_nsleep(const clockid_t which_clock, int flags, 1592
1598 struct timespec *rqtp, struct timespec __user *rmtp) 1593struct k_clock clock_posix_cpu = {
1599{ 1594 .clock_getres = posix_cpu_clock_getres,
1600 return -EINVAL; 1595 .clock_set = posix_cpu_clock_set,
1601} 1596 .clock_get = posix_cpu_clock_get,
1602static long thread_cpu_nsleep_restart(struct restart_block *restart_block) 1597 .timer_create = posix_cpu_timer_create,
1603{ 1598 .nsleep = posix_cpu_nsleep,
1604 return -EINVAL; 1599 .nsleep_restart = posix_cpu_nsleep_restart,
1605} 1600 .timer_set = posix_cpu_timer_set,
1601 .timer_del = posix_cpu_timer_del,
1602 .timer_get = posix_cpu_timer_get,
1603};
1606 1604
1607static __init int init_posix_cpu_timers(void) 1605static __init int init_posix_cpu_timers(void)
1608{ 1606{
1609 struct k_clock process = { 1607 struct k_clock process = {
1610 .clock_getres = process_cpu_clock_getres, 1608 .clock_getres = process_cpu_clock_getres,
1611 .clock_get = process_cpu_clock_get, 1609 .clock_get = process_cpu_clock_get,
1612 .clock_set = do_posix_clock_nosettime, 1610 .timer_create = process_cpu_timer_create,
1613 .timer_create = process_cpu_timer_create, 1611 .nsleep = process_cpu_nsleep,
1614 .nsleep = process_cpu_nsleep, 1612 .nsleep_restart = process_cpu_nsleep_restart,
1615 .nsleep_restart = process_cpu_nsleep_restart,
1616 }; 1613 };
1617 struct k_clock thread = { 1614 struct k_clock thread = {
1618 .clock_getres = thread_cpu_clock_getres, 1615 .clock_getres = thread_cpu_clock_getres,
1619 .clock_get = thread_cpu_clock_get, 1616 .clock_get = thread_cpu_clock_get,
1620 .clock_set = do_posix_clock_nosettime, 1617 .timer_create = thread_cpu_timer_create,
1621 .timer_create = thread_cpu_timer_create,
1622 .nsleep = thread_cpu_nsleep,
1623 .nsleep_restart = thread_cpu_nsleep_restart,
1624 }; 1618 };
1625 struct timespec ts; 1619 struct timespec ts;
1626 1620
1627 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); 1621 posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1628 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); 1622 posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1629 1623
1630 cputime_to_timespec(cputime_one_jiffy, &ts); 1624 cputime_to_timespec(cputime_one_jiffy, &ts);
1631 onecputick = ts.tv_nsec; 1625 onecputick = ts.tv_nsec;
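
Nothing changes for userspace here: the CPU clocks are still reached through the usual clockids, they are simply dispatched through the clock_posix_cpu table instead of the old hard-wired posix_cpu_* branch. A short sketch of the path being exercised; clock_getcpuclockid() builds the negative per-process clockid, and older glibc needs -lrt:

#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
        clockid_t cid;
        struct timespec ts;
        int err;

        err = clock_getcpuclockid(getpid(), &cid);      /* per-process CPU clock */
        if (err) {
                fprintf(stderr, "clock_getcpuclockid: error %d\n", err);
                return 1;
        }

        if (clock_gettime(cid, &ts) == 0)               /* dispatched via clock_posix_cpu */
                printf("process CPU time: %ld.%09ld s\n",
                       (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}
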
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 93bd2eb2bc53..4c0124919f9a 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -41,6 +41,7 @@
41#include <linux/init.h> 41#include <linux/init.h>
42#include <linux/compiler.h> 42#include <linux/compiler.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/posix-clock.h>
44#include <linux/posix-timers.h> 45#include <linux/posix-timers.h>
45#include <linux/syscalls.h> 46#include <linux/syscalls.h>
46#include <linux/wait.h> 47#include <linux/wait.h>
@@ -81,6 +82,14 @@ static DEFINE_SPINLOCK(idr_lock);
81#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" 82#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
82#endif 83#endif
83 84
85/*
86 * parisc wants ENOTSUP instead of EOPNOTSUPP
87 */
88#ifndef ENOTSUP
89# define ENANOSLEEP_NOTSUP EOPNOTSUPP
90#else
91# define ENANOSLEEP_NOTSUP ENOTSUP
92#endif
84 93
85/* 94/*
86 * The timer ID is turned into a timer address by idr_find(). 95 * The timer ID is turned into a timer address by idr_find().
@@ -94,11 +103,7 @@ static DEFINE_SPINLOCK(idr_lock);
94/* 103/*
95 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us 104 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
96 * to implement others. This structure defines the various 105 * to implement others. This structure defines the various
97 * clocks and allows the possibility of adding others. We 106 * clocks.
98 * provide an interface to add clocks to the table and expect
99 * the "arch" code to add at least one clock that is high
100 * resolution. Here we define the standard CLOCK_REALTIME as a
101 * 1/HZ resolution clock.
102 * 107 *
103 * RESOLUTION: Clock resolution is used to round up timer and interval 108 * RESOLUTION: Clock resolution is used to round up timer and interval
104 * times, NOT to report clock times, which are reported with as 109 * times, NOT to report clock times, which are reported with as
@@ -108,20 +113,13 @@ static DEFINE_SPINLOCK(idr_lock);
108 * necessary code is written. The standard says we should say 113 * necessary code is written. The standard says we should say
109 * something about this issue in the documentation... 114 * something about this issue in the documentation...
110 * 115 *
111 * FUNCTIONS: The CLOCKs structure defines possible functions to handle 116 * FUNCTIONS: The CLOCKs structure defines possible functions to
112 * various clock functions. For clocks that use the standard 117 * handle various clock functions.
113 * system timer code these entries should be NULL. This will
114 * allow dispatch without the overhead of indirect function
115 * calls. CLOCKS that depend on other sources (e.g. WWV or GPS)
116 * must supply functions here, even if the function just returns
117 * ENOSYS. The standard POSIX timer management code assumes the
118 * following: 1.) The k_itimer struct (sched.h) is used for the
119 * timer. 2.) The list, it_lock, it_clock, it_id and it_pid
120 * fields are not modified by timer code.
121 * 118 *
122 * At this time all functions EXCEPT clock_nanosleep can be 119 * The standard POSIX timer management code assumes the
123 * redirected by the CLOCKS structure. Clock_nanosleep is in 120 * following: 1.) The k_itimer struct (sched.h) is used for
124 * there, but the code ignores it. 121 * the timer. 2.) The list, it_lock, it_clock, it_id and
122 * it_pid fields are not modified by timer code.
125 * 123 *
126 * Permissions: It is assumed that the clock_settime() function defined 124 * Permissions: It is assumed that the clock_settime() function defined
127 * for each clock will take care of permission checks. Some 125 * for each clock will take care of permission checks. Some
@@ -138,6 +136,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
138 */ 136 */
139static int common_nsleep(const clockid_t, int flags, struct timespec *t, 137static int common_nsleep(const clockid_t, int flags, struct timespec *t,
140 struct timespec __user *rmtp); 138 struct timespec __user *rmtp);
139static int common_timer_create(struct k_itimer *new_timer);
141static void common_timer_get(struct k_itimer *, struct itimerspec *); 140static void common_timer_get(struct k_itimer *, struct itimerspec *);
142static int common_timer_set(struct k_itimer *, int, 141static int common_timer_set(struct k_itimer *, int,
143 struct itimerspec *, struct itimerspec *); 142 struct itimerspec *, struct itimerspec *);
@@ -158,76 +157,24 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
158 spin_unlock_irqrestore(&timr->it_lock, flags); 157 spin_unlock_irqrestore(&timr->it_lock, flags);
159} 158}
160 159
161/* 160/* Get clock_realtime */
162 * Call the k_clock hook function if non-null, or the default function. 161static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
163 */
164#define CLOCK_DISPATCH(clock, call, arglist) \
165 ((clock) < 0 ? posix_cpu_##call arglist : \
166 (posix_clocks[clock].call != NULL \
167 ? (*posix_clocks[clock].call) arglist : common_##call arglist))
168
169/*
170 * Default clock hook functions when the struct k_clock passed
171 * to register_posix_clock leaves a function pointer null.
172 *
173 * The function common_CALL is the default implementation for
174 * the function pointer CALL in struct k_clock.
175 */
176
177static inline int common_clock_getres(const clockid_t which_clock,
178 struct timespec *tp)
179{
180 tp->tv_sec = 0;
181 tp->tv_nsec = posix_clocks[which_clock].res;
182 return 0;
183}
184
185/*
186 * Get real time for posix timers
187 */
188static int common_clock_get(clockid_t which_clock, struct timespec *tp)
189{ 162{
190 ktime_get_real_ts(tp); 163 ktime_get_real_ts(tp);
191 return 0; 164 return 0;
192} 165}
193 166
194static inline int common_clock_set(const clockid_t which_clock, 167/* Set clock_realtime */
195 struct timespec *tp) 168static int posix_clock_realtime_set(const clockid_t which_clock,
169 const struct timespec *tp)
196{ 170{
197 return do_sys_settimeofday(tp, NULL); 171 return do_sys_settimeofday(tp, NULL);
198} 172}
199 173
200static int common_timer_create(struct k_itimer *new_timer) 174static int posix_clock_realtime_adj(const clockid_t which_clock,
201{ 175 struct timex *t)
202 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
203 return 0;
204}
205
206static int no_timer_create(struct k_itimer *new_timer)
207{
208 return -EOPNOTSUPP;
209}
210
211static int no_nsleep(const clockid_t which_clock, int flags,
212 struct timespec *tsave, struct timespec __user *rmtp)
213{
214 return -EOPNOTSUPP;
215}
216
217/*
218 * Return nonzero if we know a priori this clockid_t value is bogus.
219 */
220static inline int invalid_clockid(const clockid_t which_clock)
221{ 176{
222 if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ 177 return do_adjtimex(t);
223 return 0;
224 if ((unsigned) which_clock >= MAX_CLOCKS)
225 return 1;
226 if (posix_clocks[which_clock].clock_getres != NULL)
227 return 0;
228 if (posix_clocks[which_clock].res != 0)
229 return 0;
230 return 1;
231} 178}
232 179
233/* 180/*
@@ -240,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
240} 187}
241 188
242/* 189/*
243 * Get monotonic time for posix timers 190 * Get monotonic-raw time for posix timers
244 */ 191 */
245static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) 192static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
246{ 193{
@@ -267,46 +214,70 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp
267 *tp = ktime_to_timespec(KTIME_LOW_RES); 214 *tp = ktime_to_timespec(KTIME_LOW_RES);
268 return 0; 215 return 0;
269} 216}
217
218static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
219{
220 get_monotonic_boottime(tp);
221 return 0;
222}
223
224
270/* 225/*
271 * Initialize everything, well, just everything in Posix clocks/timers ;) 226 * Initialize everything, well, just everything in Posix clocks/timers ;)
272 */ 227 */
273static __init int init_posix_timers(void) 228static __init int init_posix_timers(void)
274{ 229{
275 struct k_clock clock_realtime = { 230 struct k_clock clock_realtime = {
276 .clock_getres = hrtimer_get_res, 231 .clock_getres = hrtimer_get_res,
232 .clock_get = posix_clock_realtime_get,
233 .clock_set = posix_clock_realtime_set,
234 .clock_adj = posix_clock_realtime_adj,
235 .nsleep = common_nsleep,
236 .nsleep_restart = hrtimer_nanosleep_restart,
237 .timer_create = common_timer_create,
238 .timer_set = common_timer_set,
239 .timer_get = common_timer_get,
240 .timer_del = common_timer_del,
277 }; 241 };
278 struct k_clock clock_monotonic = { 242 struct k_clock clock_monotonic = {
279 .clock_getres = hrtimer_get_res, 243 .clock_getres = hrtimer_get_res,
280 .clock_get = posix_ktime_get_ts, 244 .clock_get = posix_ktime_get_ts,
281 .clock_set = do_posix_clock_nosettime, 245 .nsleep = common_nsleep,
246 .nsleep_restart = hrtimer_nanosleep_restart,
247 .timer_create = common_timer_create,
248 .timer_set = common_timer_set,
249 .timer_get = common_timer_get,
250 .timer_del = common_timer_del,
282 }; 251 };
283 struct k_clock clock_monotonic_raw = { 252 struct k_clock clock_monotonic_raw = {
284 .clock_getres = hrtimer_get_res, 253 .clock_getres = hrtimer_get_res,
285 .clock_get = posix_get_monotonic_raw, 254 .clock_get = posix_get_monotonic_raw,
286 .clock_set = do_posix_clock_nosettime,
287 .timer_create = no_timer_create,
288 .nsleep = no_nsleep,
289 }; 255 };
290 struct k_clock clock_realtime_coarse = { 256 struct k_clock clock_realtime_coarse = {
291 .clock_getres = posix_get_coarse_res, 257 .clock_getres = posix_get_coarse_res,
292 .clock_get = posix_get_realtime_coarse, 258 .clock_get = posix_get_realtime_coarse,
293 .clock_set = do_posix_clock_nosettime,
294 .timer_create = no_timer_create,
295 .nsleep = no_nsleep,
296 }; 259 };
297 struct k_clock clock_monotonic_coarse = { 260 struct k_clock clock_monotonic_coarse = {
298 .clock_getres = posix_get_coarse_res, 261 .clock_getres = posix_get_coarse_res,
299 .clock_get = posix_get_monotonic_coarse, 262 .clock_get = posix_get_monotonic_coarse,
300 .clock_set = do_posix_clock_nosettime, 263 };
301 .timer_create = no_timer_create, 264 struct k_clock clock_boottime = {
302 .nsleep = no_nsleep, 265 .clock_getres = hrtimer_get_res,
266 .clock_get = posix_get_boottime,
267 .nsleep = common_nsleep,
268 .nsleep_restart = hrtimer_nanosleep_restart,
269 .timer_create = common_timer_create,
270 .timer_set = common_timer_set,
271 .timer_get = common_timer_get,
272 .timer_del = common_timer_del,
303 }; 273 };
304 274
305 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 275 posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
306 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 276 posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
307 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); 277 posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
308 register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); 278 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
309 register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); 279 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
280 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
310 281
311 posix_timers_cache = kmem_cache_create("posix_timers_cache", 282 posix_timers_cache = kmem_cache_create("posix_timers_cache",
312 sizeof (struct k_itimer), 0, SLAB_PANIC, 283 sizeof (struct k_itimer), 0, SLAB_PANIC,
@@ -482,17 +453,29 @@ static struct pid *good_sigevent(sigevent_t * event)
482 return task_pid(rtn); 453 return task_pid(rtn);
483} 454}
484 455
485void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) 456void posix_timers_register_clock(const clockid_t clock_id,
457 struct k_clock *new_clock)
486{ 458{
487 if ((unsigned) clock_id >= MAX_CLOCKS) { 459 if ((unsigned) clock_id >= MAX_CLOCKS) {
488 printk("POSIX clock register failed for clock_id %d\n", 460 printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
461 clock_id);
462 return;
463 }
464
465 if (!new_clock->clock_get) {
466 printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
467 clock_id);
468 return;
469 }
470 if (!new_clock->clock_getres) {
471 printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
489 clock_id); 472 clock_id);
490 return; 473 return;
491 } 474 }
492 475
493 posix_clocks[clock_id] = *new_clock; 476 posix_clocks[clock_id] = *new_clock;
494} 477}
495EXPORT_SYMBOL_GPL(register_posix_clock); 478EXPORT_SYMBOL_GPL(posix_timers_register_clock);
496 479
497static struct k_itimer * alloc_posix_timer(void) 480static struct k_itimer * alloc_posix_timer(void)
498{ 481{
@@ -523,19 +506,39 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
523 kmem_cache_free(posix_timers_cache, tmr); 506 kmem_cache_free(posix_timers_cache, tmr);
524} 507}
525 508
509static struct k_clock *clockid_to_kclock(const clockid_t id)
510{
511 if (id < 0)
512 return (id & CLOCKFD_MASK) == CLOCKFD ?
513 &clock_posix_dynamic : &clock_posix_cpu;
514
515 if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
516 return NULL;
517 return &posix_clocks[id];
518}
519
520static int common_timer_create(struct k_itimer *new_timer)
521{
522 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
523 return 0;
524}
525
526/* Create a POSIX.1b interval timer. */ 526/* Create a POSIX.1b interval timer. */
527 527
528SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, 528SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
529 struct sigevent __user *, timer_event_spec, 529 struct sigevent __user *, timer_event_spec,
530 timer_t __user *, created_timer_id) 530 timer_t __user *, created_timer_id)
531{ 531{
532 struct k_clock *kc = clockid_to_kclock(which_clock);
532 struct k_itimer *new_timer; 533 struct k_itimer *new_timer;
533 int error, new_timer_id; 534 int error, new_timer_id;
534 sigevent_t event; 535 sigevent_t event;
535 int it_id_set = IT_ID_NOT_SET; 536 int it_id_set = IT_ID_NOT_SET;
536 537
537 if (invalid_clockid(which_clock)) 538 if (!kc)
538 return -EINVAL; 539 return -EINVAL;
540 if (!kc->timer_create)
541 return -EOPNOTSUPP;
539 542
540 new_timer = alloc_posix_timer(); 543 new_timer = alloc_posix_timer();
541 if (unlikely(!new_timer)) 544 if (unlikely(!new_timer))
@@ -597,7 +600,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
597 goto out; 600 goto out;
598 } 601 }
599 602
600 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); 603 error = kc->timer_create(new_timer);
601 if (error) 604 if (error)
602 goto out; 605 goto out;
603 606
@@ -607,7 +610,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
607 spin_unlock_irq(&current->sighand->siglock); 610 spin_unlock_irq(&current->sighand->siglock);
608 611
609 return 0; 612 return 0;
610 /* 613 /*
611 * In the case of the timer belonging to another task, after 614 * In the case of the timer belonging to another task, after
612 * the task is unlocked, the timer is owned by the other task 615 * the task is unlocked, the timer is owned by the other task
613 * and may cease to exist at any time. Don't use or modify 616 * and may cease to exist at any time. Don't use or modify
@@ -709,22 +712,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
709SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, 712SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
710 struct itimerspec __user *, setting) 713 struct itimerspec __user *, setting)
711{ 714{
712 struct k_itimer *timr;
713 struct itimerspec cur_setting; 715 struct itimerspec cur_setting;
716 struct k_itimer *timr;
717 struct k_clock *kc;
714 unsigned long flags; 718 unsigned long flags;
719 int ret = 0;
715 720
716 timr = lock_timer(timer_id, &flags); 721 timr = lock_timer(timer_id, &flags);
717 if (!timr) 722 if (!timr)
718 return -EINVAL; 723 return -EINVAL;
719 724
720 CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); 725 kc = clockid_to_kclock(timr->it_clock);
726 if (WARN_ON_ONCE(!kc || !kc->timer_get))
727 ret = -EINVAL;
728 else
729 kc->timer_get(timr, &cur_setting);
721 730
722 unlock_timer(timr, flags); 731 unlock_timer(timr, flags);
723 732
724 if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) 733 if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
725 return -EFAULT; 734 return -EFAULT;
726 735
727 return 0; 736 return ret;
728} 737}
729 738
730/* 739/*
@@ -813,6 +822,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
813 int error = 0; 822 int error = 0;
814 unsigned long flag; 823 unsigned long flag;
815 struct itimerspec *rtn = old_setting ? &old_spec : NULL; 824 struct itimerspec *rtn = old_setting ? &old_spec : NULL;
825 struct k_clock *kc;
816 826
817 if (!new_setting) 827 if (!new_setting)
818 return -EINVAL; 828 return -EINVAL;
@@ -828,8 +838,11 @@ retry:
828 if (!timr) 838 if (!timr)
829 return -EINVAL; 839 return -EINVAL;
830 840
831 error = CLOCK_DISPATCH(timr->it_clock, timer_set, 841 kc = clockid_to_kclock(timr->it_clock);
832 (timr, flags, &new_spec, rtn)); 842 if (WARN_ON_ONCE(!kc || !kc->timer_set))
843 error = -EINVAL;
844 else
845 error = kc->timer_set(timr, flags, &new_spec, rtn);
833 846
834 unlock_timer(timr, flag); 847 unlock_timer(timr, flag);
835 if (error == TIMER_RETRY) { 848 if (error == TIMER_RETRY) {
@@ -844,7 +857,7 @@ retry:
844 return error; 857 return error;
845} 858}
846 859
847static inline int common_timer_del(struct k_itimer *timer) 860static int common_timer_del(struct k_itimer *timer)
848{ 861{
849 timer->it.real.interval.tv64 = 0; 862 timer->it.real.interval.tv64 = 0;
850 863
@@ -855,7 +868,11 @@ static inline int common_timer_del(struct k_itimer *timer)
855 868
856static inline int timer_delete_hook(struct k_itimer *timer) 869static inline int timer_delete_hook(struct k_itimer *timer)
857{ 870{
858 return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); 871 struct k_clock *kc = clockid_to_kclock(timer->it_clock);
872
873 if (WARN_ON_ONCE(!kc || !kc->timer_del))
874 return -EINVAL;
875 return kc->timer_del(timer);
859} 876}
860 877
861/* Delete a POSIX.1b interval timer. */ 878/* Delete a POSIX.1b interval timer. */
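
The timer syscalls in the hunks above now look up their k_clock once via clockid_to_kclock() and call its methods directly. The standard userspace sequence touches all four of them; SIGEV_NONE keeps the sketch free of signal handling (link with -lrt on older glibc):

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
        timer_t tid;
        struct sigevent sev;
        struct itimerspec its, cur;

        memset(&sev, 0, sizeof(sev));
        sev.sigev_notify = SIGEV_NONE;                  /* no notification needed */

        if (timer_create(CLOCK_MONOTONIC, &sev, &tid) != 0) {   /* kc->timer_create */
                perror("timer_create");
                return 1;
        }

        memset(&its, 0, sizeof(its));
        its.it_value.tv_sec = 5;                        /* one-shot, 5 s from now */
        if (timer_settime(tid, 0, &its, NULL) != 0)     /* kc->timer_set */
                perror("timer_settime");

        if (timer_gettime(tid, &cur) == 0)              /* kc->timer_get */
                printf("remaining: %ld.%09ld s\n",
                       (long)cur.it_value.tv_sec, cur.it_value.tv_nsec);

        timer_delete(tid);                              /* kc->timer_del */
        return 0;
}
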
@@ -927,69 +944,76 @@ void exit_itimers(struct signal_struct *sig)
927 } 944 }
928} 945}
929 946
930/* Not available / possible... functions */
931int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
932{
933 return -EINVAL;
934}
935EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
936
937int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
938 struct timespec *t, struct timespec __user *r)
939{
940#ifndef ENOTSUP
941 return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */
942#else /* parisc does define it separately. */
943 return -ENOTSUP;
944#endif
945}
946EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
947
948SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, 947SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
949 const struct timespec __user *, tp) 948 const struct timespec __user *, tp)
950{ 949{
950 struct k_clock *kc = clockid_to_kclock(which_clock);
951 struct timespec new_tp; 951 struct timespec new_tp;
952 952
953 if (invalid_clockid(which_clock)) 953 if (!kc || !kc->clock_set)
954 return -EINVAL; 954 return -EINVAL;
955
955 if (copy_from_user(&new_tp, tp, sizeof (*tp))) 956 if (copy_from_user(&new_tp, tp, sizeof (*tp)))
956 return -EFAULT; 957 return -EFAULT;
957 958
958 return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); 959 return kc->clock_set(which_clock, &new_tp);
959} 960}
960 961
961SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 962SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
962 struct timespec __user *,tp) 963 struct timespec __user *,tp)
963{ 964{
965 struct k_clock *kc = clockid_to_kclock(which_clock);
964 struct timespec kernel_tp; 966 struct timespec kernel_tp;
965 int error; 967 int error;
966 968
967 if (invalid_clockid(which_clock)) 969 if (!kc)
968 return -EINVAL; 970 return -EINVAL;
969 error = CLOCK_DISPATCH(which_clock, clock_get, 971
970 (which_clock, &kernel_tp)); 972 error = kc->clock_get(which_clock, &kernel_tp);
973
971 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) 974 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
972 error = -EFAULT; 975 error = -EFAULT;
973 976
974 return error; 977 return error;
978}
979
980SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
981 struct timex __user *, utx)
982{
983 struct k_clock *kc = clockid_to_kclock(which_clock);
984 struct timex ktx;
985 int err;
986
987 if (!kc)
988 return -EINVAL;
989 if (!kc->clock_adj)
990 return -EOPNOTSUPP;
991
992 if (copy_from_user(&ktx, utx, sizeof(ktx)))
993 return -EFAULT;
994
995 err = kc->clock_adj(which_clock, &ktx);
996
997 if (!err && copy_to_user(utx, &ktx, sizeof(ktx)))
998 return -EFAULT;
975 999
1000 return err;
976} 1001}
977 1002
978SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, 1003SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
979 struct timespec __user *, tp) 1004 struct timespec __user *, tp)
980{ 1005{
1006 struct k_clock *kc = clockid_to_kclock(which_clock);
981 struct timespec rtn_tp; 1007 struct timespec rtn_tp;
982 int error; 1008 int error;
983 1009
984 if (invalid_clockid(which_clock)) 1010 if (!kc)
985 return -EINVAL; 1011 return -EINVAL;
986 1012
987 error = CLOCK_DISPATCH(which_clock, clock_getres, 1013 error = kc->clock_getres(which_clock, &rtn_tp);
988 (which_clock, &rtn_tp));
989 1014
990 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { 1015 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
991 error = -EFAULT; 1016 error = -EFAULT;
992 }
993 1017
994 return error; 1018 return error;
995} 1019}
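
clock_adjtime() is a brand-new syscall at this point, so userspace has to invoke it by number until a libc wrapper exists. A read-only query sketch, assuming __NR_clock_adjtime is present in the installed headers (modes = 0 only reads the clock's adjtimex state):

#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/timex.h>

int main(void)
{
        struct timex tx;
        long state;

        memset(&tx, 0, sizeof(tx));             /* tx.modes = 0: query only */

        state = syscall(__NR_clock_adjtime, CLOCK_REALTIME, &tx);
        if (state < 0) {
                perror("clock_adjtime");
                return 1;
        }
        printf("clock state %ld, freq offset %ld, maxerror %ld us\n",
               state, tx.freq, tx.maxerror);
        return 0;
}
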
@@ -1009,10 +1033,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1009 const struct timespec __user *, rqtp, 1033 const struct timespec __user *, rqtp,
1010 struct timespec __user *, rmtp) 1034 struct timespec __user *, rmtp)
1011{ 1035{
1036 struct k_clock *kc = clockid_to_kclock(which_clock);
1012 struct timespec t; 1037 struct timespec t;
1013 1038
1014 if (invalid_clockid(which_clock)) 1039 if (!kc)
1015 return -EINVAL; 1040 return -EINVAL;
1041 if (!kc->nsleep)
1042 return -ENANOSLEEP_NOTSUP;
1016 1043
1017 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 1044 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
1018 return -EFAULT; 1045 return -EFAULT;
@@ -1020,27 +1047,20 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1020 if (!timespec_valid(&t)) 1047 if (!timespec_valid(&t))
1021 return -EINVAL; 1048 return -EINVAL;
1022 1049
1023 return CLOCK_DISPATCH(which_clock, nsleep, 1050 return kc->nsleep(which_clock, flags, &t, rmtp);
1024 (which_clock, flags, &t, rmtp));
1025}
1026
1027/*
1028 * nanosleep_restart for monotonic and realtime clocks
1029 */
1030static int common_nsleep_restart(struct restart_block *restart_block)
1031{
1032 return hrtimer_nanosleep_restart(restart_block);
1033} 1051}
1034 1052
1035/* 1053/*
1036 * This will restart clock_nanosleep. This is required only by 1054 * This will restart clock_nanosleep. This is required only by
1037 * compat_clock_nanosleep_restart for now. 1055 * compat_clock_nanosleep_restart for now.
1038 */ 1056 */
1039long 1057long clock_nanosleep_restart(struct restart_block *restart_block)
1040clock_nanosleep_restart(struct restart_block *restart_block)
1041{ 1058{
1042 clockid_t which_clock = restart_block->arg0; 1059 clockid_t which_clock = restart_block->nanosleep.index;
1060 struct k_clock *kc = clockid_to_kclock(which_clock);
1061
1062 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
1063 return -EINVAL;
1043 1064
1044 return CLOCK_DISPATCH(which_clock, nsleep_restart, 1065 return kc->nsleep_restart(restart_block);
1045 (restart_block));
1046} 1066}
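
For completeness, a sketch of what registering a clock looks like under the reworked interface: clock_get() and clock_getres() are mandatory (posix_timers_register_clock() now rejects clocks without them), and any method left NULL is simply reported as unsupported by the corresponding syscall. The clock id, time source and init hook below are invented for illustration and are not part of this patch:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/posix-timers.h>
#include <linux/time.h>

/* A hypothetical, otherwise unused slot below MAX_CLOCKS. */
#define CLOCK_EXAMPLE 15

static int example_clock_get(const clockid_t which_clock, struct timespec *tp)
{
        ktime_get_ts(tp);                       /* stand-in time source */
        return 0;
}

static int example_clock_getres(const clockid_t which_clock, struct timespec *tp)
{
        tp->tv_sec  = 0;
        tp->tv_nsec = NSEC_PER_SEC / HZ;        /* tick resolution */
        return 0;
}

static struct k_clock example_clock = {
        .clock_get      = example_clock_get,
        .clock_getres   = example_clock_getres,
        /*
         * No .clock_set, .timer_create or .nsleep: the corresponding
         * syscalls report this clock id as unsupported.
         */
};

static int __init example_clock_init(void)
{
        posix_timers_register_clock(CLOCK_EXAMPLE, &example_clock);
        return 0;
}
device_initcall(example_clock_init);
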
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 265729966ece..4603f08dc47b 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,125 +1,12 @@
1config PM
2 bool "Power Management support"
3 depends on !IA64_HP_SIM
4 ---help---
5 "Power Management" means that parts of your computer are shut
6 off or put into a power conserving "sleep" mode if they are not
7 being used. There are two competing standards for doing this: APM
8 and ACPI. If you want to use either one, say Y here and then also
9 to the requisite support below.
10
11 Power Management is most important for battery powered laptop
12 computers; if you have a laptop, check out the Linux Laptop home
13 page on the WWW at <http://www.linux-on-laptops.com/> or
14 Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
15 and the Battery Powered Linux mini-HOWTO, available from
16 <http://www.tldp.org/docs.html#howto>.
17
18 Note that, even if you say N here, Linux on the x86 architecture
19 will issue the hlt instruction if nothing is to be done, thereby
20 sending the processor to sleep and saving power.
21
22config PM_DEBUG
23 bool "Power Management Debug Support"
24 depends on PM
25 ---help---
26 This option enables various debugging support in the Power Management
27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support.
29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
39config PM_VERBOSE
40 bool "Verbose Power Management debugging"
41 depends on PM_DEBUG
42 default n
43 ---help---
44 This option enables verbose messages from the Power Management code.
45
46config CAN_PM_TRACE
47 def_bool y
48 depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL
49
50config PM_TRACE
51 bool
52 help
53 This enables code to save the last PM event point across
54 reboot. The architecture needs to support this, x86 for
55 example does by saving things in the RTC, see below.
56
57 The architecture specific code must provide the extern
58 functions from <linux/resume-trace.h> as well as the
59 <asm/resume-trace.h> header with a TRACE_RESUME() macro.
60
61 The way the information is presented is architecture-
62 dependent, x86 will print the information during a
63 late_initcall.
64
65config PM_TRACE_RTC
66 bool "Suspend/resume event tracing"
67 depends on CAN_PM_TRACE
68 depends on X86
69 select PM_TRACE
70 default n
71 ---help---
72 This enables some cheesy code to save the last PM event point in the
73 RTC across reboots, so that you can debug a machine that just hangs
74 during suspend (or more commonly, during resume).
75
76 To use this debugging feature you should attempt to suspend the
77 machine, reboot it and then run
78
79 dmesg -s 1000000 | grep 'hash matches'
80
81 CAUTION: this option will cause your machine's real-time clock to be
82 set to an invalid time after a resume.
83
84config PM_SLEEP_SMP
85 bool
86 depends on SMP
87 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
88 depends on PM_SLEEP
89 select HOTPLUG
90 select HOTPLUG_CPU
91 default y
92
93config PM_SLEEP
94 bool
95 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
96 default y
97
98config PM_SLEEP_ADVANCED_DEBUG
99 bool
100 depends on PM_ADVANCED_DEBUG
101 default n
102
103config SUSPEND 1config SUSPEND
104 bool "Suspend to RAM and standby" 2 bool "Suspend to RAM and standby"
105 depends on PM && ARCH_SUSPEND_POSSIBLE 3 depends on ARCH_SUSPEND_POSSIBLE
106 default y 4 default y
107 ---help--- 5 ---help---
108 Allow the system to enter sleep states in which main memory is 6 Allow the system to enter sleep states in which main memory is
109 powered and thus its contents are preserved, such as the 7 powered and thus its contents are preserved, such as the
110 suspend-to-RAM state (e.g. the ACPI S3 state). 8 suspend-to-RAM state (e.g. the ACPI S3 state).
111 9
112config PM_TEST_SUSPEND
113 bool "Test suspend/resume and wakealarm during bootup"
114 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
115 ---help---
116 This option will let you suspend your machine during bootup, and
117 make it wake up a few seconds later using an RTC wakeup alarm.
118 Enable this with a kernel parameter like "test_suspend=mem".
119
120 You probably want to have your system's RTC driver statically
121 linked, ensuring that it's available when this test runs.
122
123config SUSPEND_FREEZER 10config SUSPEND_FREEZER
124 bool "Enable freezer for suspend to RAM/standby" \ 11 bool "Enable freezer for suspend to RAM/standby" \
125 if ARCH_WANTS_FREEZER_CONTROL || BROKEN 12 if ARCH_WANTS_FREEZER_CONTROL || BROKEN
@@ -133,7 +20,7 @@ config SUSPEND_FREEZER
133 20
134config HIBERNATION 21config HIBERNATION
135 bool "Hibernation (aka 'suspend to disk')" 22 bool "Hibernation (aka 'suspend to disk')"
136 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 23 depends on SWAP && ARCH_HIBERNATION_POSSIBLE
137 select LZO_COMPRESS 24 select LZO_COMPRESS
138 select LZO_DECOMPRESS 25 select LZO_DECOMPRESS
139 ---help--- 26 ---help---
@@ -196,6 +83,106 @@ config PM_STD_PARTITION
196 suspended image to. It will simply pick the first available swap 83 suspended image to. It will simply pick the first available swap
197 device. 84 device.
198 85
86config PM_SLEEP
87 def_bool y
88 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
89
90config PM_SLEEP_SMP
91 def_bool y
92 depends on SMP
93 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
94 depends on PM_SLEEP
95 select HOTPLUG
96 select HOTPLUG_CPU
97
98config PM_RUNTIME
99 bool "Run-time PM core functionality"
100 depends on !IA64_HP_SIM
101 ---help---
102 Enable functionality allowing I/O devices to be put into energy-saving
103 (low power) states at run time (or autosuspended) after a specified
104 period of inactivity and woken up in response to a hardware-generated
105 wake-up event or a driver's request.
106
107 Hardware support is generally required for this functionality to work
108 and the bus type drivers of the buses the devices are on are
109 responsible for the actual handling of the autosuspend requests and
110 wake-up events.
111
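
PM_RUNTIME keeps its meaning; it just stops depending on a user-visible PM switch, since PM itself becomes a derived option below. On the driver side, the functionality it gates follows roughly this pattern; the probe/remove hooks, the device pointer and the I/O path are placeholders:

#include <linux/device.h>
#include <linux/pm_runtime.h>

static int example_probe(struct device *dev)
{
        pm_runtime_set_active(dev);     /* hardware is powered after probe */
        pm_runtime_enable(dev);         /* allow the core to suspend it */
        return 0;
}

static int example_do_io(struct device *dev)
{
        int ret;

        ret = pm_runtime_get_sync(dev); /* resume if it was autosuspended */
        if (ret < 0) {
                pm_runtime_put(dev);    /* drop the reference taken above */
                return ret;
        }

        /* ... talk to the hardware here ... */

        pm_runtime_put(dev);            /* allow autosuspend again */
        return 0;
}

static void example_remove(struct device *dev)
{
        pm_runtime_disable(dev);
}
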
112config PM
113 def_bool y
114 depends on PM_SLEEP || PM_RUNTIME
115
116config PM_DEBUG
117 bool "Power Management Debug Support"
118 depends on PM
119 ---help---
120 This option enables various debugging support in the Power Management
121 code. This is helpful when debugging and reporting PM bugs, like
122 suspend support.
123
124config PM_VERBOSE
125 bool "Verbose Power Management debugging"
126 depends on PM_DEBUG
127 ---help---
128 This option enables verbose messages from the Power Management code.
129
130config PM_ADVANCED_DEBUG
131 bool "Extra PM attributes in sysfs for low-level debugging/testing"
132 depends on PM_DEBUG
133 ---help---
134 Add extra sysfs attributes allowing one to access some Power Management
135 fields of device objects from user space. If you are not a kernel
136 developer interested in debugging/testing Power Management, say "no".
137
138config PM_TEST_SUSPEND
139 bool "Test suspend/resume and wakealarm during bootup"
140 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
141 ---help---
142 This option will let you suspend your machine during bootup, and
143 make it wake up a few seconds later using an RTC wakeup alarm.
144 Enable this with a kernel parameter like "test_suspend=mem".
145
146 You probably want to have your system's RTC driver statically
147 linked, ensuring that it's available when this test runs.
148
149config CAN_PM_TRACE
150 def_bool y
151 depends on PM_DEBUG && PM_SLEEP
152
153config PM_TRACE
154 bool
155 help
156 This enables code to save the last PM event point across
157 reboot. The architecture needs to support this, x86 for
158 example does by saving things in the RTC, see below.
159
160 The architecture specific code must provide the extern
161 functions from <linux/resume-trace.h> as well as the
162 <asm/resume-trace.h> header with a TRACE_RESUME() macro.
163
164 The way the information is presented is architecture-
165 dependent, x86 will print the information during a
166 late_initcall.
167
168config PM_TRACE_RTC
169 bool "Suspend/resume event tracing"
170 depends on CAN_PM_TRACE
171 depends on X86
172 select PM_TRACE
173 ---help---
174 This enables some cheesy code to save the last PM event point in the
175 RTC across reboots, so that you can debug a machine that just hangs
176 during suspend (or more commonly, during resume).
177
178 To use this debugging feature you should attempt to suspend the
179 machine, reboot it and then run
180
181 dmesg -s 1000000 | grep 'hash matches'
182
183 CAUTION: this option will cause your machine's real-time clock to be
184 set to an invalid time after a resume.
185
199config APM_EMULATION 186config APM_EMULATION
200 tristate "Advanced Power Management Emulation" 187 tristate "Advanced Power Management Emulation"
201 depends on PM && SYS_SUPPORTS_APM_EMULATION 188 depends on PM && SYS_SUPPORTS_APM_EMULATION
@@ -222,31 +209,11 @@ config APM_EMULATION
222 anything, try disabling/enabling this option (or disabling/enabling 209 anything, try disabling/enabling this option (or disabling/enabling
223 APM in your BIOS). 210 APM in your BIOS).
224 211
225config PM_RUNTIME
226 bool "Run-time PM core functionality"
227 depends on PM
228 ---help---
229 Enable functionality allowing I/O devices to be put into energy-saving
230 (low power) states at run time (or autosuspended) after a specified
231 period of inactivity and woken up in response to a hardware-generated
232 wake-up event or a driver's request.
233
234 Hardware support is generally required for this functionality to work
235 and the bus type drivers of the buses the devices are on are
236 responsible for the actual handling of the autosuspend requests and
237 wake-up events.
238
239config PM_OPS
240 bool
241 depends on PM_SLEEP || PM_RUNTIME
242 default y
243
244config ARCH_HAS_OPP 212config ARCH_HAS_OPP
245 bool 213 bool
246 214
247config PM_OPP 215config PM_OPP
248 bool "Operating Performance Point (OPP) Layer library" 216 bool "Operating Performance Point (OPP) Layer library"
249 depends on PM
250 depends on ARCH_HAS_OPP 217 depends on ARCH_HAS_OPP
251 ---help--- 218 ---help---
252 SOCs have a standard set of tuples consisting of frequency and 219 SOCs have a standard set of tuples consisting of frequency and
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c350e18b53e3..c5ebc6a90643 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,4 +1,5 @@
1ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
2 3
3obj-$(CONFIG_PM) += main.o 4obj-$(CONFIG_PM) += main.o
4obj-$(CONFIG_PM_SLEEP) += console.o 5obj-$(CONFIG_PM_SLEEP) += console.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 83bbc7c02df9..d09dd10c5a5e 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
28static int submit(int rw, struct block_device *bdev, sector_t sector, 28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain) 29 struct page *page, struct bio **bio_chain)
30{ 30{
31 const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; 31 const int bio_rw = rw | REQ_SYNC;
32 struct bio *bio; 32 struct bio *bio;
33 33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1832bd264219..aeabd26e3342 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -23,6 +23,7 @@
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <linux/syscore_ops.h>
26#include <scsi/scsi_scan.h> 27#include <scsi/scsi_scan.h>
27#include <asm/suspend.h> 28#include <asm/suspend.h>
28 29
@@ -272,6 +273,8 @@ static int create_image(int platform_mode)
272 local_irq_disable(); 273 local_irq_disable();
273 274
274 error = sysdev_suspend(PMSG_FREEZE); 275 error = sysdev_suspend(PMSG_FREEZE);
276 if (!error)
277 error = syscore_suspend();
275 if (error) { 278 if (error) {
276 printk(KERN_ERR "PM: Some system devices failed to power down, " 279 printk(KERN_ERR "PM: Some system devices failed to power down, "
277 "aborting hibernation\n"); 280 "aborting hibernation\n");
@@ -295,6 +298,7 @@ static int create_image(int platform_mode)
295 } 298 }
296 299
297 Power_up: 300 Power_up:
301 syscore_resume();
298 sysdev_resume(); 302 sysdev_resume();
299 /* NOTE: dpm_resume_noirq() is just a resume() for devices 303 /* NOTE: dpm_resume_noirq() is just a resume() for devices
300 * that suspended with irqs off ... no overall powerup. 304 * that suspended with irqs off ... no overall powerup.
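
The pattern being added here (call sysdev_suspend(), chain into syscore_suspend() only on success, then undo both in reverse order on the power-up path) is the usual suspend error-chaining idiom. A stand-alone sketch of that idiom with made-up stage names, not the kernel functions:

#include <stdio.h>

/* Hypothetical stages standing in for sysdev_suspend()/syscore_suspend(). */
static int  stage_a_suspend(void) { puts("stage A down"); return 0; }
static int  stage_b_suspend(void) { puts("stage B down"); return 0; }
static void stage_b_resume(void)  { puts("stage B up"); }
static void stage_a_resume(void)  { puts("stage A up"); }

static int enter_low_power(void)
{
	int error = stage_a_suspend();

	if (error)
		return error;		/* nothing to unwind yet */

	error = stage_b_suspend();	/* chained: only runs if stage A succeeded */
	if (!error) {
		/* ...the platform would actually power down here... */
		stage_b_resume();	/* unwind in reverse order */
	}
	stage_a_resume();
	return error;
}

int main(void)
{
	return enter_low_power();
}
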
@@ -403,6 +407,8 @@ static int resume_target_kernel(bool platform_mode)
403 local_irq_disable(); 407 local_irq_disable();
404 408
405 error = sysdev_suspend(PMSG_QUIESCE); 409 error = sysdev_suspend(PMSG_QUIESCE);
410 if (!error)
411 error = syscore_suspend();
406 if (error) 412 if (error)
407 goto Enable_irqs; 413 goto Enable_irqs;
408 414
@@ -429,6 +435,7 @@ static int resume_target_kernel(bool platform_mode)
429 restore_processor_state(); 435 restore_processor_state();
430 touch_softlockup_watchdog(); 436 touch_softlockup_watchdog();
431 437
438 syscore_resume();
432 sysdev_resume(); 439 sysdev_resume();
433 440
434 Enable_irqs: 441 Enable_irqs:
@@ -516,6 +523,7 @@ int hibernation_platform_enter(void)
516 523
517 local_irq_disable(); 524 local_irq_disable();
518 sysdev_suspend(PMSG_HIBERNATE); 525 sysdev_suspend(PMSG_HIBERNATE);
526 syscore_suspend();
519 if (pm_wakeup_pending()) { 527 if (pm_wakeup_pending()) {
520 error = -EAGAIN; 528 error = -EAGAIN;
521 goto Power_up; 529 goto Power_up;
@@ -526,6 +534,7 @@ int hibernation_platform_enter(void)
526 while (1); 534 while (1);
527 535
528 Power_up: 536 Power_up:
537 syscore_resume();
529 sysdev_resume(); 538 sysdev_resume();
530 local_irq_enable(); 539 local_irq_enable();
531 enable_nonboot_cpus(); 540 enable_nonboot_cpus();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 701853042c28..8eaba5f27b10 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -17,9 +17,6 @@
17 17
18DEFINE_MUTEX(pm_mutex); 18DEFINE_MUTEX(pm_mutex);
19 19
20unsigned int pm_flags;
21EXPORT_SYMBOL(pm_flags);
22
23#ifdef CONFIG_PM_SLEEP 20#ifdef CONFIG_PM_SLEEP
24 21
25/* Routines for PM-transition notifications */ 22/* Routines for PM-transition notifications */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 64db648ff911..ca0aacc24874 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -42,15 +42,15 @@ static void swsusp_unset_page_forbidden(struct page *);
42 42
43/* 43/*
44 * Preferred image size in bytes (tunable via /sys/power/image_size). 44 * Preferred image size in bytes (tunable via /sys/power/image_size).
45 * When it is set to N, swsusp will do its best to ensure the image 45 * When it is set to N, the image creating code will do its best to
46 * size will not exceed N bytes, but if that is impossible, it will 46 * ensure the image size will not exceed N bytes, but if that is
47 * try to create the smallest image possible. 47 * impossible, it will try to create the smallest image possible.
48 */ 48 */
49unsigned long image_size; 49unsigned long image_size;
50 50
51void __init hibernate_image_size_init(void) 51void __init hibernate_image_size_init(void)
52{ 52{
53 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; 53 image_size = (totalram_pages / 3) * PAGE_SIZE;
54} 54}
55 55
56/* List of PBEs needed for restoring the pages that were allocated before 56/* List of PBEs needed for restoring the pages that were allocated before
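
The default cap drops from two fifths of RAM to one third. A rough worked example, assuming 4 KiB pages and 1 GiB of RAM, shows the preferred image shrinking from about 409 MiB to about 341 MiB:

#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumed page size for this example */

int main(void)
{
	unsigned long totalram_pages = (1UL << 30) / PAGE_SIZE;	/* 1 GiB of RAM */
	unsigned long old_default = ((totalram_pages * 2) / 5) * PAGE_SIZE;
	unsigned long new_default = (totalram_pages / 3) * PAGE_SIZE;

	printf("old: %lu MiB, new: %lu MiB\n",
	       old_default >> 20, new_default >> 20);	/* 409 vs 341 */
	return 0;
}
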
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index de6f86bfa303..2814c32aed51 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,7 @@
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/suspend.h> 24#include <linux/suspend.h>
25#include <linux/syscore_ops.h>
25#include <trace/events/power.h> 26#include <trace/events/power.h>
26 27
27#include "power.h" 28#include "power.h"
@@ -163,11 +164,14 @@ static int suspend_enter(suspend_state_t state)
163 BUG_ON(!irqs_disabled()); 164 BUG_ON(!irqs_disabled());
164 165
165 error = sysdev_suspend(PMSG_SUSPEND); 166 error = sysdev_suspend(PMSG_SUSPEND);
167 if (!error)
168 error = syscore_suspend();
166 if (!error) { 169 if (!error) {
167 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { 170 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
168 error = suspend_ops->enter(state); 171 error = suspend_ops->enter(state);
169 events_check_enabled = false; 172 events_check_enabled = false;
170 } 173 }
174 syscore_resume();
171 sysdev_resume(); 175 sysdev_resume();
172 } 176 }
173 177
diff --git a/kernel/printk.c b/kernel/printk.c
index 36231525e22f..da8ca817eae3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -53,7 +53,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
53#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 53#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
54 54
55/* printk's without a loglevel use this.. */ 55/* printk's without a loglevel use this.. */
56#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ 56#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
57 57
58/* We show everything that is MORE important than this.. */ 58/* We show everything that is MORE important than this.. */
59#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ 59#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
@@ -113,6 +113,11 @@ static unsigned con_start; /* Index into log_buf: next char to be sent to consol
113static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ 113static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
114 114
115/* 115/*
116 * If exclusive_console is non-NULL then only this console is to be printed to.
117 */
118static struct console *exclusive_console;
119
120/*
116 * Array of consoles built from command line options (console=) 121 * Array of consoles built from command line options (console=)
117 */ 122 */
118struct console_cmdline 123struct console_cmdline
@@ -476,6 +481,8 @@ static void __call_console_drivers(unsigned start, unsigned end)
476 struct console *con; 481 struct console *con;
477 482
478 for_each_console(con) { 483 for_each_console(con) {
484 if (exclusive_console && con != exclusive_console)
485 continue;
479 if ((con->flags & CON_ENABLED) && con->write && 486 if ((con->flags & CON_ENABLED) && con->write &&
480 (cpu_online(smp_processor_id()) || 487 (cpu_online(smp_processor_id()) ||
481 (con->flags & CON_ANYTIME))) 488 (con->flags & CON_ANYTIME)))
@@ -515,6 +522,71 @@ static void _call_console_drivers(unsigned start,
515} 522}
516 523
517/* 524/*
 525 * Parse the syslog header <[0-9]*>. The decimal value is a 32-bit quantity;
 526 * the lower 3 bits are the log level, the rest is the log facility. In case
527 * userspace passes usual userspace syslog messages to /dev/kmsg or
528 * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
529 * to extract the correct log level for in-kernel processing, and not mangle
530 * the original value.
531 *
532 * If a prefix is found, the length of the prefix is returned. If 'level' is
533 * passed, it will be filled in with the log level without a possible facility
534 * value. If 'special' is passed, the special printk prefix chars are accepted
535 * and returned. If no valid header is found, 0 is returned and the passed
536 * variables are not touched.
537 */
538static size_t log_prefix(const char *p, unsigned int *level, char *special)
539{
540 unsigned int lev = 0;
541 char sp = '\0';
542 size_t len;
543
544 if (p[0] != '<' || !p[1])
545 return 0;
546 if (p[2] == '>') {
547 /* usual single digit level number or special char */
548 switch (p[1]) {
549 case '0' ... '7':
550 lev = p[1] - '0';
551 break;
552 case 'c': /* KERN_CONT */
553 case 'd': /* KERN_DEFAULT */
554 sp = p[1];
555 break;
556 default:
557 return 0;
558 }
559 len = 3;
560 } else {
561 /* multi digit including the level and facility number */
562 char *endp = NULL;
563
 564 if (p[1] < '0' || p[1] > '9')
565 return 0;
566
567 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
568 if (endp == NULL || endp[0] != '>')
569 return 0;
570 len = (endp + 1) - p;
571 }
572
573 /* do not accept special char if not asked for */
574 if (sp && !special)
575 return 0;
576
577 if (special) {
578 *special = sp;
579 /* return special char, do not touch level */
580 if (sp)
581 return len;
582 }
583
584 if (level)
585 *level = lev;
586 return len;
587}
588
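
To make the header rules concrete, here is a small user-space re-implementation of the same parsing (simplified: strtoul() instead of the kernel helper, and the 'c'/'d' specials are left out). A single-digit "<3>" yields level 3, while a syslog-style "<13>" (facility 1, level 5) yields level 5:

#include <stdio.h>
#include <stdlib.h>

/* Simplified model of log_prefix(); not the kernel function itself. */
static size_t parse_prefix(const char *p, unsigned int *level)
{
	char *endp;
	unsigned long val;

	if (p[0] != '<' || !p[1])
		return 0;
	if (p[1] < '0' || p[1] > '9')
		return 0;
	val = strtoul(&p[1], &endp, 10);
	if (*endp != '>')
		return 0;
	if (level)
		*level = val & 7;	/* low 3 bits = level, the rest = facility */
	return (endp + 1) - p;
}

int main(void)
{
	unsigned int level;
	size_t len;

	len = parse_prefix("<3>oops", &level);
	printf("len=%zu level=%u\n", len, level);	/* len=3 level=3 */

	len = parse_prefix("<13>user message", &level);
	printf("len=%zu level=%u\n", len, level);	/* len=4 level=5 */
	return 0;
}
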
589/*
518 * Call the console drivers, asking them to write out 590 * Call the console drivers, asking them to write out
519 * log_buf[start] to log_buf[end - 1]. 591 * log_buf[start] to log_buf[end - 1].
520 * The console_lock must be held. 592 * The console_lock must be held.
@@ -529,13 +601,9 @@ static void call_console_drivers(unsigned start, unsigned end)
529 cur_index = start; 601 cur_index = start;
530 start_print = start; 602 start_print = start;
531 while (cur_index != end) { 603 while (cur_index != end) {
532 if (msg_level < 0 && ((end - cur_index) > 2) && 604 if (msg_level < 0 && ((end - cur_index) > 2)) {
533 LOG_BUF(cur_index + 0) == '<' && 605 /* strip log prefix */
534 LOG_BUF(cur_index + 1) >= '0' && 606 cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
535 LOG_BUF(cur_index + 1) <= '7' &&
536 LOG_BUF(cur_index + 2) == '>') {
537 msg_level = LOG_BUF(cur_index + 1) - '0';
538 cur_index += 3;
539 start_print = cur_index; 607 start_print = cur_index;
540 } 608 }
541 while (cur_index != end) { 609 while (cur_index != end) {
@@ -733,6 +801,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
733 unsigned long flags; 801 unsigned long flags;
734 int this_cpu; 802 int this_cpu;
735 char *p; 803 char *p;
804 size_t plen;
805 char special;
736 806
737 boot_delay_msec(); 807 boot_delay_msec();
738 printk_delay(); 808 printk_delay();
@@ -773,45 +843,52 @@ asmlinkage int vprintk(const char *fmt, va_list args)
773 printed_len += vscnprintf(printk_buf + printed_len, 843 printed_len += vscnprintf(printk_buf + printed_len,
774 sizeof(printk_buf) - printed_len, fmt, args); 844 sizeof(printk_buf) - printed_len, fmt, args);
775 845
776
777 p = printk_buf; 846 p = printk_buf;
778 847
779 /* Do we have a loglevel in the string? */ 848 /* Read log level and handle special printk prefix */
780 if (p[0] == '<') { 849 plen = log_prefix(p, &current_log_level, &special);
781 unsigned char c = p[1]; 850 if (plen) {
782 if (c && p[2] == '>') { 851 p += plen;
783 switch (c) { 852
784 case '0' ... '7': /* loglevel */ 853 switch (special) {
785 current_log_level = c - '0'; 854 case 'c': /* Strip <c> KERN_CONT, continue line */
786 /* Fallthrough - make sure we're on a new line */ 855 plen = 0;
787 case 'd': /* KERN_DEFAULT */ 856 break;
788 if (!new_text_line) { 857 case 'd': /* Strip <d> KERN_DEFAULT, start new line */
789 emit_log_char('\n'); 858 plen = 0;
790 new_text_line = 1; 859 default:
791 } 860 if (!new_text_line) {
792 /* Fallthrough - skip the loglevel */ 861 emit_log_char('\n');
793 case 'c': /* KERN_CONT */ 862 new_text_line = 1;
794 p += 3;
795 break;
796 } 863 }
797 } 864 }
798 } 865 }
799 866
800 /* 867 /*
801 * Copy the output into log_buf. If the caller didn't provide 868 * Copy the output into log_buf. If the caller didn't provide
802 * appropriate log level tags, we insert them here 869 * the appropriate log prefix, we insert them here
803 */ 870 */
804 for ( ; *p; p++) { 871 for (; *p; p++) {
805 if (new_text_line) { 872 if (new_text_line) {
806 /* Always output the token */
807 emit_log_char('<');
808 emit_log_char(current_log_level + '0');
809 emit_log_char('>');
810 printed_len += 3;
811 new_text_line = 0; 873 new_text_line = 0;
812 874
875 if (plen) {
876 /* Copy original log prefix */
877 int i;
878
879 for (i = 0; i < plen; i++)
880 emit_log_char(printk_buf[i]);
881 printed_len += plen;
882 } else {
883 /* Add log prefix */
884 emit_log_char('<');
885 emit_log_char(current_log_level + '0');
886 emit_log_char('>');
887 printed_len += 3;
888 }
889
813 if (printk_time) { 890 if (printk_time) {
814 /* Follow the token with the time */ 891 /* Add the current time stamp */
815 char tbuf[50], *tp; 892 char tbuf[50], *tp;
816 unsigned tlen; 893 unsigned tlen;
817 unsigned long long t; 894 unsigned long long t;
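
The net effect of the rewritten block: a line that already carried a numeric prefix keeps its original bytes, while a line without one gets "<level>" synthesized in front of it. A tiny user-space model of just that decision (timestamps and the KERN_CONT/KERN_DEFAULT specials are ignored):

#include <stdio.h>

/* Model only: emit one log line, preserving an existing "<n>" prefix
 * or adding a default one. Not the kernel's vprintk(). */
static void emit_line(const char *msg, int default_level)
{
	size_t plen = 0;

	if (msg[0] == '<' && msg[1] >= '0' && msg[1] <= '9' && msg[2] == '>')
		plen = 3;	/* single-digit prefix already present */

	if (plen)
		printf("%s\n", msg);			/* keep the original prefix */
	else
		printf("<%d>%s\n", default_level, msg);	/* synthesize one */
}

int main(void)
{
	emit_line("<3>disk error", 4);	/* prints "<3>disk error" */
	emit_line("hello world", 4);	/* prints "<4>hello world" */
	return 0;
}
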
@@ -1160,6 +1237,11 @@ void console_unlock(void)
1160 local_irq_restore(flags); 1237 local_irq_restore(flags);
1161 } 1238 }
1162 console_locked = 0; 1239 console_locked = 0;
1240
1241 /* Release the exclusive_console once it is used */
1242 if (unlikely(exclusive_console))
1243 exclusive_console = NULL;
1244
1163 up(&console_sem); 1245 up(&console_sem);
1164 spin_unlock_irqrestore(&logbuf_lock, flags); 1246 spin_unlock_irqrestore(&logbuf_lock, flags);
1165 if (wake_klogd) 1247 if (wake_klogd)
@@ -1246,6 +1328,18 @@ void console_start(struct console *console)
1246} 1328}
1247EXPORT_SYMBOL(console_start); 1329EXPORT_SYMBOL(console_start);
1248 1330
1331static int __read_mostly keep_bootcon;
1332
1333static int __init keep_bootcon_setup(char *str)
1334{
1335 keep_bootcon = 1;
1336 printk(KERN_INFO "debug: skip boot console de-registration.\n");
1337
1338 return 0;
1339}
1340
1341early_param("keep_bootcon", keep_bootcon_setup);
1342
1249/* 1343/*
1250 * The console driver calls this routine during kernel initialization 1344 * The console driver calls this routine during kernel initialization
1251 * to register the console printing procedure with printk() and to 1345 * to register the console printing procedure with printk() and to
@@ -1382,6 +1476,12 @@ void register_console(struct console *newcon)
1382 spin_lock_irqsave(&logbuf_lock, flags); 1476 spin_lock_irqsave(&logbuf_lock, flags);
1383 con_start = log_start; 1477 con_start = log_start;
1384 spin_unlock_irqrestore(&logbuf_lock, flags); 1478 spin_unlock_irqrestore(&logbuf_lock, flags);
1479 /*
1480 * We're about to replay the log buffer. Only do this to the
1481 * just-registered console to avoid excessive message spam to
1482 * the already-registered consoles.
1483 */
1484 exclusive_console = newcon;
1385 } 1485 }
1386 console_unlock(); 1486 console_unlock();
1387 console_sysfs_notify(); 1487 console_sysfs_notify();
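
The replay mechanism is easy to model: while exclusive_console is set, the driver loop skips every other console, and console_unlock() drops the pointer after the first flush so normal fan-out resumes. A stand-alone sketch with a plain array in place of the kernel's console list:

#include <stdio.h>

struct console {
	const char *name;
};

static struct console consoles[] = { { "ttyS0" }, { "tty0" }, { "netcon" } };
static struct console *exclusive_console;	/* models the new static pointer */

static void call_console_drivers(const char *msg)
{
	for (unsigned int i = 0; i < sizeof(consoles) / sizeof(consoles[0]); i++) {
		if (exclusive_console && &consoles[i] != exclusive_console)
			continue;	/* replay goes only to the new console */
		printf("[%s] %s\n", consoles[i].name, msg);
	}
}

int main(void)
{
	exclusive_console = &consoles[2];	/* the freshly registered console */
	call_console_drivers("replayed: old buffered message");
	exclusive_console = NULL;		/* cleared after one flush */
	call_console_drivers("new message fans out to every console");
	return 0;
}
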
@@ -1393,7 +1493,9 @@ void register_console(struct console *newcon)
1393 * users know there might be something in the kernel's log buffer that 1493 * users know there might be something in the kernel's log buffer that
1394 * went to the bootconsole (that they do not see on the real console) 1494 * went to the bootconsole (that they do not see on the real console)
1395 */ 1495 */
1396 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { 1496 if (bcon &&
1497 ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
1498 !keep_bootcon) {
1397 /* we need to iterate through twice, to make sure we print 1499 /* we need to iterate through twice, to make sure we print
1398 * everything out, before we unregister the console(s) 1500 * everything out, before we unregister the console(s)
1399 */ 1501 */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index e2302e40b360..0fc1eed28d27 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -134,21 +134,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
134 return 0; 134 return 0;
135 rcu_read_lock(); 135 rcu_read_lock();
136 tcred = __task_cred(task); 136 tcred = __task_cred(task);
137 if ((cred->uid != tcred->euid || 137 if (cred->user->user_ns == tcred->user->user_ns &&
138 cred->uid != tcred->suid || 138 (cred->uid == tcred->euid &&
139 cred->uid != tcred->uid || 139 cred->uid == tcred->suid &&
140 cred->gid != tcred->egid || 140 cred->uid == tcred->uid &&
141 cred->gid != tcred->sgid || 141 cred->gid == tcred->egid &&
142 cred->gid != tcred->gid) && 142 cred->gid == tcred->sgid &&
143 !capable(CAP_SYS_PTRACE)) { 143 cred->gid == tcred->gid))
144 rcu_read_unlock(); 144 goto ok;
145 return -EPERM; 145 if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
146 } 146 goto ok;
147 rcu_read_unlock();
148 return -EPERM;
149ok:
147 rcu_read_unlock(); 150 rcu_read_unlock();
148 smp_rmb(); 151 smp_rmb();
149 if (task->mm) 152 if (task->mm)
150 dumpable = get_dumpable(task->mm); 153 dumpable = get_dumpable(task->mm);
151 if (!dumpable && !capable(CAP_SYS_PTRACE)) 154 if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
152 return -EPERM; 155 return -EPERM;
153 156
154 return security_ptrace_access_check(task, mode); 157 return security_ptrace_access_check(task, mode);
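
The rewritten check reads as: if the tracer shares the target's user namespace and every uid/gid matches, access is granted outright; otherwise CAP_SYS_PTRACE must be held in the target's user namespace. A stand-alone model of that decision over plain structs (struct cred, the namespace type, and ns_capable() are only mimicked here):

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's credential and namespace types. */
struct user_ns { int id; };
struct cred {
	unsigned int uid, gid, euid, egid, suid, sgid;
	struct user_ns *ns;
};

static bool has_cap_sys_ptrace_in(const struct user_ns *ns)
{
	(void)ns;
	return false;			/* pretend the caller is unprivileged */
}

static bool may_ptrace(const struct cred *tracer, const struct cred *target)
{
	if (tracer->ns == target->ns &&
	    tracer->uid == target->euid && tracer->uid == target->suid &&
	    tracer->uid == target->uid  && tracer->gid == target->egid &&
	    tracer->gid == target->sgid && tracer->gid == target->gid)
		return true;
	return has_cap_sys_ptrace_in(target->ns);
}

int main(void)
{
	struct user_ns init_ns = { 0 };
	struct cred user = { 1000, 1000, 1000, 1000, 1000, 1000, &init_ns };
	struct cred root = { 0, 0, 0, 0, 0, 0, &init_ns };

	printf("same-user trace: %d\n", may_ptrace(&user, &user));	/* 1 */
	printf("trace root:      %d\n", may_ptrace(&user, &root));	/* 0 */
	return 0;
}
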
@@ -198,7 +201,7 @@ static int ptrace_attach(struct task_struct *task)
198 goto unlock_tasklist; 201 goto unlock_tasklist;
199 202
200 task->ptrace = PT_PTRACED; 203 task->ptrace = PT_PTRACED;
201 if (capable(CAP_SYS_PTRACE)) 204 if (task_ns_capable(task, CAP_SYS_PTRACE))
202 task->ptrace |= PT_PTRACE_CAP; 205 task->ptrace |= PT_PTRACE_CAP;
203 206
204 __ptrace_link(task, current); 207 __ptrace_link(task, current);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a23a57a976d1..f3240e987928 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -214,11 +214,12 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
214 * Ensure that queued callbacks are all executed. 214 * Ensure that queued callbacks are all executed.
215 * If we detect that we are nested in a RCU read-side critical 215 * If we detect that we are nested in a RCU read-side critical
216 * section, we should simply fail, otherwise we would deadlock. 216 * section, we should simply fail, otherwise we would deadlock.
217 * Note that the machinery to reliably determine whether
218 * or not we are in an RCU read-side critical section
219 * exists only in the preemptible RCU implementations
220 * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why
221 * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT.
217 */ 222 */
218#ifndef CONFIG_PREEMPT
219 WARN_ON(1);
220 return 0;
221#else
222 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 223 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
223 irqs_disabled()) { 224 irqs_disabled()) {
224 WARN_ON(1); 225 WARN_ON(1);
@@ -229,7 +230,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
229 rcu_barrier_bh(); 230 rcu_barrier_bh();
230 debug_object_free(head, &rcuhead_debug_descr); 231 debug_object_free(head, &rcuhead_debug_descr);
231 return 1; 232 return 1;
232#endif
233 default: 233 default:
234 return 0; 234 return 0;
235 } 235 }
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 015abaea962a..3cb8e362e883 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -852,7 +852,7 @@ void exit_rcu(void)
852 if (t->rcu_read_lock_nesting == 0) 852 if (t->rcu_read_lock_nesting == 0)
853 return; 853 return;
854 t->rcu_read_lock_nesting = 1; 854 t->rcu_read_lock_nesting = 1;
855 rcu_read_unlock(); 855 __rcu_read_unlock();
856} 856}
857 857
858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 89613f97ff26..c224da41890c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,7 +47,6 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50#include <linux/sched.h>
51 50
52MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index c7eaa37a768b..34683efa2cce 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
126 pos, buf, s - buf); 126 pos, buf, s - buf);
127} 127}
128 128
129#if BITS_PER_LONG == 32
130u64 res_counter_read_u64(struct res_counter *counter, int member)
131{
132 unsigned long flags;
133 u64 ret;
134
135 spin_lock_irqsave(&counter->lock, flags);
136 ret = *res_counter_member(counter, member);
137 spin_unlock_irqrestore(&counter->lock, flags);
138
139 return ret;
140}
141#else
129u64 res_counter_read_u64(struct res_counter *counter, int member) 142u64 res_counter_read_u64(struct res_counter *counter, int member)
130{ 143{
131 return *res_counter_member(counter, member); 144 return *res_counter_member(counter, member);
132} 145}
146#endif
133 147
134int res_counter_memparse_write_strategy(const char *buf, 148int res_counter_memparse_write_strategy(const char *buf,
135 unsigned long long *res) 149 unsigned long long *res)
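
The 32-bit-only locked variant exists because a 64-bit counter cannot be read with a single load on a 32-bit machine, so an unlocked read could observe half of an old value and half of a new one (a torn read). A stand-alone sketch of the same guard, with a pthread mutex standing in for the res_counter spinlock:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct counter {
	pthread_mutex_t lock;
	uint64_t usage;			/* updated under the lock elsewhere */
};

/* On 32-bit, a bare "return c->usage;" compiles to two loads and can
 * mix the halves of two different values; the lock prevents that. */
static uint64_t counter_read_u64(struct counter *c)
{
	uint64_t val;

	pthread_mutex_lock(&c->lock);
	val = c->usage;
	pthread_mutex_unlock(&c->lock);
	return val;
}

int main(void)
{
	struct counter c = { PTHREAD_MUTEX_INITIALIZER, 0x100000000ULL };

	printf("usage = %llu\n", (unsigned long long)counter_read_u64(&c));
	return 0;
}
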
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index ddabb54bb5c8..3c7cbc2c33be 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
215 put_pid(waiter->deadlock_task_pid); 215 put_pid(waiter->deadlock_task_pid);
216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); 216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
218 TRACE_WARN_ON(waiter->task);
219 memset(waiter, 0x22, sizeof(*waiter)); 218 memset(waiter, 0x22, sizeof(*waiter));
220} 219}
221 220
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 66cb89bc5ef1..5c9ccd380966 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -9,7 +9,6 @@
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/smp_lock.h>
13#include <linux/spinlock.h> 12#include <linux/spinlock.h>
14#include <linux/sysdev.h> 13#include <linux/sysdev.h>
15#include <linux/timer.h> 14#include <linux/timer.h>
@@ -27,7 +26,6 @@ struct test_thread_data {
27 int opcode; 26 int opcode;
28 int opdata; 27 int opdata;
29 int mutexes[MAX_RT_TEST_MUTEXES]; 28 int mutexes[MAX_RT_TEST_MUTEXES];
30 int bkl;
31 int event; 29 int event;
32 struct sys_device sysdev; 30 struct sys_device sysdev;
33}; 31};
@@ -46,9 +44,8 @@ enum test_opcodes {
46 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ 44 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
47 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ 45 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
48 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ 46 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
49 RTTEST_LOCKBKL, /* 9 Lock BKL */ 47 /* 9, 10 - reserved for BKL commemoration */
50 RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ 48 RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
51 RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
52 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ 49 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
53 RTTEST_RESET = 99, /* 99 Reset all pending operations */ 50 RTTEST_RESET = 99, /* 99 Reset all pending operations */
54}; 51};
@@ -74,13 +71,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
74 td->mutexes[i] = 0; 71 td->mutexes[i] = 0;
75 } 72 }
76 } 73 }
77
78 if (!lockwakeup && td->bkl == 4) {
79#ifdef CONFIG_LOCK_KERNEL
80 unlock_kernel();
81#endif
82 td->bkl = 0;
83 }
84 return 0; 74 return 0;
85 75
86 case RTTEST_RESETEVENT: 76 case RTTEST_RESETEVENT:
@@ -131,25 +121,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
131 td->mutexes[id] = 0; 121 td->mutexes[id] = 0;
132 return 0; 122 return 0;
133 123
134 case RTTEST_LOCKBKL:
135 if (td->bkl)
136 return 0;
137 td->bkl = 1;
138#ifdef CONFIG_LOCK_KERNEL
139 lock_kernel();
140#endif
141 td->bkl = 4;
142 return 0;
143
144 case RTTEST_UNLOCKBKL:
145 if (td->bkl != 4)
146 break;
147#ifdef CONFIG_LOCK_KERNEL
148 unlock_kernel();
149#endif
150 td->bkl = 0;
151 return 0;
152
153 default: 124 default:
154 break; 125 break;
155 } 126 }
@@ -196,7 +167,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
196 td->event = atomic_add_return(1, &rttest_event); 167 td->event = atomic_add_return(1, &rttest_event);
197 break; 168 break;
198 169
199 case RTTEST_LOCKBKL:
200 default: 170 default:
201 break; 171 break;
202 } 172 }
@@ -229,8 +199,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
229 td->event = atomic_add_return(1, &rttest_event); 199 td->event = atomic_add_return(1, &rttest_event);
230 return; 200 return;
231 201
232 case RTTEST_LOCKBKL:
233 return;
234 default: 202 default:
235 return; 203 return;
236 } 204 }
@@ -380,11 +348,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
380 spin_lock(&rttest_lock); 348 spin_lock(&rttest_lock);
381 349
382 curr += sprintf(curr, 350 curr += sprintf(curr,
383 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", 351 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
384 td->opcode, td->event, tsk->state, 352 td->opcode, td->event, tsk->state,
385 (MAX_RT_PRIO - 1) - tsk->prio, 353 (MAX_RT_PRIO - 1) - tsk->prio,
386 (MAX_RT_PRIO - 1) - tsk->normal_prio, 354 (MAX_RT_PRIO - 1) - tsk->normal_prio,
387 tsk->pi_blocked_on, td->bkl); 355 tsk->pi_blocked_on);
388 356
389 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) 357 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
390 curr += sprintf(curr, "%d", td->mutexes[i]); 358 curr += sprintf(curr, "%d", td->mutexes[i]);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a9604815786a..ab449117aaf2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -20,41 +20,34 @@
20/* 20/*
21 * lock->owner state tracking: 21 * lock->owner state tracking:
22 * 22 *
23 * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 23 * lock->owner holds the task_struct pointer of the owner. Bit 0
24 * are used to keep track of the "owner is pending" and "lock has 24 * is used to keep track of the "lock has waiters" state.
25 * waiters" state.
26 * 25 *
27 * owner bit1 bit0 26 * owner bit0
28 * NULL 0 0 lock is free (fast acquire possible) 27 * NULL 0 lock is free (fast acquire possible)
29 * NULL 0 1 invalid state 28 * NULL 1 lock is free and has waiters and the top waiter
30 * NULL 1 0 Transitional State* 29 * is going to take the lock*
31 * NULL 1 1 invalid state 30 * taskpointer 0 lock is held (fast release possible)
32 * taskpointer 0 0 lock is held (fast release possible) 31 * taskpointer 1 lock is held and has waiters**
33 * taskpointer 0 1 task is pending owner
34 * taskpointer 1 0 lock is held and has waiters
35 * taskpointer 1 1 task is pending owner and lock has more waiters
36 *
37 * Pending ownership is assigned to the top (highest priority)
38 * waiter of the lock, when the lock is released. The thread is woken
39 * up and can now take the lock. Until the lock is taken (bit 0
40 * cleared) a competing higher priority thread can steal the lock
41 * which puts the woken up thread back on the waiters list.
42 * 32 *
43 * The fast atomic compare exchange based acquire and release is only 33 * The fast atomic compare exchange based acquire and release is only
44 * possible when bit 0 and 1 of lock->owner are 0. 34 * possible when bit 0 of lock->owner is 0.
35 *
 36 * (*) It also can be a transitional state while grabbing the lock
 37 * with ->wait_lock held. To prevent any fast path cmpxchg to the lock,
 38 * we need to set bit 0 before looking at the lock, and the owner may be
 39 * NULL in this small window, hence this can be a transitional state.
45 * 40 *
46 * (*) There's a small time where the owner can be NULL and the 41 * (**) There is a small time when bit 0 is set but there are no
47 * "lock has waiters" bit is set. This can happen when grabbing the lock. 42 * waiters. This can happen when grabbing the lock in the slow path.
48 * To prevent a cmpxchg of the owner releasing the lock, we need to set this 43 * To prevent a cmpxchg of the owner releasing the lock, we need to
49 * bit before looking at the lock, hence the reason this is a transitional 44 * set this bit before looking at the lock.
50 * state.
51 */ 45 */
52 46
53static void 47static void
54rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, 48rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
55 unsigned long mask)
56{ 49{
57 unsigned long val = (unsigned long)owner | mask; 50 unsigned long val = (unsigned long)owner;
58 51
59 if (rt_mutex_has_waiters(lock)) 52 if (rt_mutex_has_waiters(lock))
60 val |= RT_MUTEX_HAS_WAITERS; 53 val |= RT_MUTEX_HAS_WAITERS;
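
The owner word is simply the task pointer with the "has waiters" flag packed into bit 0 (task_struct pointers are word aligned, so that bit is always free). A stand-alone sketch of the encoding and of the masking that rt_mutex_owner() performs:

#include <stdint.h>
#include <stdio.h>

#define HAS_WAITERS	1UL		/* mirrors RT_MUTEX_HAS_WAITERS */
#define OWNER_MASKALL	1UL		/* mirrors RT_MUTEX_OWNER_MASKALL */

struct task { char name[16]; };		/* stand-in for struct task_struct */

static uintptr_t pack_owner(struct task *owner, int has_waiters)
{
	uintptr_t val = (uintptr_t)owner;

	if (has_waiters)
		val |= HAS_WAITERS;	/* the flag lives in the low bit */
	return val;
}

static struct task *unpack_owner(uintptr_t val)
{
	return (struct task *)(val & ~OWNER_MASKALL);	/* strip the flag */
}

int main(void)
{
	static struct task t = { "owner" };
	uintptr_t word = pack_owner(&t, 1);

	printf("has waiters: %lu, owner: %s\n",
	       (unsigned long)(word & HAS_WAITERS), unpack_owner(word)->name);
	return 0;
}
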
@@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
203 * reached or the state of the chain has changed while we 196 * reached or the state of the chain has changed while we
204 * dropped the locks. 197 * dropped the locks.
205 */ 198 */
206 if (!waiter || !waiter->task) 199 if (!waiter)
207 goto out_unlock_pi; 200 goto out_unlock_pi;
208 201
209 /* 202 /*
210 * Check the orig_waiter state. After we dropped the locks, 203 * Check the orig_waiter state. After we dropped the locks,
211 * the previous owner of the lock might have released the lock 204 * the previous owner of the lock might have released the lock.
212 * and made us the pending owner:
213 */ 205 */
214 if (orig_waiter && !orig_waiter->task) 206 if (orig_waiter && !rt_mutex_owner(orig_lock))
215 goto out_unlock_pi; 207 goto out_unlock_pi;
216 208
217 /* 209 /*
@@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
254 246
255 /* Release the task */ 247 /* Release the task */
256 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 248 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
249 if (!rt_mutex_owner(lock)) {
250 /*
251 * If the requeue above changed the top waiter, then we need
252 * to wake the new top waiter up to try to get the lock.
253 */
254
255 if (top_waiter != rt_mutex_top_waiter(lock))
256 wake_up_process(rt_mutex_top_waiter(lock)->task);
257 raw_spin_unlock(&lock->wait_lock);
258 goto out_put_task;
259 }
257 put_task_struct(task); 260 put_task_struct(task);
258 261
259 /* Grab the next task */ 262 /* Grab the next task */
@@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
296} 299}
297 300
298/* 301/*
299 * Optimization: check if we can steal the lock from the
300 * assigned pending owner [which might not have taken the
301 * lock yet]:
302 */
303static inline int try_to_steal_lock(struct rt_mutex *lock,
304 struct task_struct *task)
305{
306 struct task_struct *pendowner = rt_mutex_owner(lock);
307 struct rt_mutex_waiter *next;
308 unsigned long flags;
309
310 if (!rt_mutex_owner_pending(lock))
311 return 0;
312
313 if (pendowner == task)
314 return 1;
315
316 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
317 if (task->prio >= pendowner->prio) {
318 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 0;
320 }
321
322 /*
323 * Check if a waiter is enqueued on the pending owners
324 * pi_waiters list. Remove it and readjust pending owners
325 * priority.
326 */
327 if (likely(!rt_mutex_has_waiters(lock))) {
328 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
329 return 1;
330 }
331
332 /* No chain handling, pending owner is not blocked on anything: */
333 next = rt_mutex_top_waiter(lock);
334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
335 __rt_mutex_adjust_prio(pendowner);
336 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
337
338 /*
339 * We are going to steal the lock and a waiter was
340 * enqueued on the pending owners pi_waiters queue. So
341 * we have to enqueue this waiter into
342 * task->pi_waiters list. This covers the case,
343 * where task is boosted because it holds another
344 * lock and gets unboosted because the booster is
345 * interrupted, so we would delay a waiter with higher
346 * priority as task->normal_prio.
347 *
348 * Note: in the rare case of a SCHED_OTHER task changing
349 * its priority and thus stealing the lock, next->task
350 * might be task:
351 */
352 if (likely(next->task != task)) {
353 raw_spin_lock_irqsave(&task->pi_lock, flags);
354 plist_add(&next->pi_list_entry, &task->pi_waiters);
355 __rt_mutex_adjust_prio(task);
356 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
357 }
358 return 1;
359}
360
361/*
362 * Try to take an rt-mutex 302 * Try to take an rt-mutex
363 * 303 *
364 * This fails
365 * - when the lock has a real owner
366 * - when a different pending owner exists and has higher priority than current
367 *
368 * Must be called with lock->wait_lock held. 304 * Must be called with lock->wait_lock held.
305 *
306 * @lock: the lock to be acquired.
307 * @task: the task which wants to acquire the lock
308 * @waiter: the waiter that is queued to the lock's wait list. (could be NULL)
369 */ 309 */
370static int try_to_take_rt_mutex(struct rt_mutex *lock) 310static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
311 struct rt_mutex_waiter *waiter)
371{ 312{
372 /* 313 /*
373 * We have to be careful here if the atomic speedups are 314 * We have to be careful here if the atomic speedups are
@@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
390 */ 331 */
391 mark_rt_mutex_waiters(lock); 332 mark_rt_mutex_waiters(lock);
392 333
393 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) 334 if (rt_mutex_owner(lock))
394 return 0; 335 return 0;
395 336
337 /*
 338 * The task can take the lock when one of these conditions holds:
 339 * 1) there is no other waiter
 340 * 2) the task has higher priority than the waiters
 341 * 3) the task is the top waiter
342 */
343 if (rt_mutex_has_waiters(lock)) {
344 if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) {
345 if (!waiter || waiter != rt_mutex_top_waiter(lock))
346 return 0;
347 }
348 }
349
350 if (waiter || rt_mutex_has_waiters(lock)) {
351 unsigned long flags;
352 struct rt_mutex_waiter *top;
353
354 raw_spin_lock_irqsave(&task->pi_lock, flags);
355
356 /* remove the queued waiter. */
357 if (waiter) {
358 plist_del(&waiter->list_entry, &lock->wait_list);
359 task->pi_blocked_on = NULL;
360 }
361
362 /*
 363 * We have to enqueue the top waiter (if it exists) into
364 * task->pi_waiters list.
365 */
366 if (rt_mutex_has_waiters(lock)) {
367 top = rt_mutex_top_waiter(lock);
368 top->pi_list_entry.prio = top->list_entry.prio;
369 plist_add(&top->pi_list_entry, &task->pi_waiters);
370 }
371 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
372 }
373
396 /* We got the lock. */ 374 /* We got the lock. */
397 debug_rt_mutex_lock(lock); 375 debug_rt_mutex_lock(lock);
398 376
399 rt_mutex_set_owner(lock, current, 0); 377 rt_mutex_set_owner(lock, task);
400 378
401 rt_mutex_deadlock_account_lock(lock, current); 379 rt_mutex_deadlock_account_lock(lock, task);
402 380
403 return 1; 381 return 1;
404} 382}
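
Setting the pi_waiters bookkeeping aside, the acquisition decision itself is small: fail if the lock already has an owner, and fail if another waiter queued ahead of us has higher priority (numerically lower, per the kernel convention). A stand-alone model of just that predicate:

#include <stdbool.h>
#include <stdio.h>

struct waiter { int prio; };	/* stand-in for struct rt_mutex_waiter */

/*
 * Model of the decision: a task with priority 'prio' (queued as 'me',
 * or NULL if not queued) tries to take a lock whose top queued waiter
 * is 'top'; 'owned' says whether someone already holds the lock.
 */
static bool can_take(bool owned, int prio,
		     const struct waiter *me, const struct waiter *top)
{
	if (owned)
		return false;
	if (top && prio >= top->prio && (!me || me != top))
		return false;	/* a better-placed waiter gets it instead */
	return true;
}

int main(void)
{
	struct waiter top = { .prio = 10 }, me = { .prio = 20 };

	printf("%d\n", can_take(false, 20, NULL, &top));	/* 0: the top waiter wins */
	printf("%d\n", can_take(false, 20, &me,  &me));		/* 1: we are the top waiter */
	printf("%d\n", can_take(false,  5, NULL, &top));	/* 1: higher priority than the top waiter */
	return 0;
}
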
@@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
436 414
437 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 415 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
438 416
417 if (!owner)
418 return 0;
419
439 if (waiter == rt_mutex_top_waiter(lock)) { 420 if (waiter == rt_mutex_top_waiter(lock)) {
440 raw_spin_lock_irqsave(&owner->pi_lock, flags); 421 raw_spin_lock_irqsave(&owner->pi_lock, flags);
441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 422 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
@@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
472/* 453/*
473 * Wake up the next waiter on the lock. 454 * Wake up the next waiter on the lock.
474 * 455 *
475 * Remove the top waiter from the current tasks waiter list and from 456 * Remove the top waiter from the current tasks waiter list and wake it up.
476 * the lock waiter list. Set it as pending owner. Then wake it up.
477 * 457 *
478 * Called with lock->wait_lock held. 458 * Called with lock->wait_lock held.
479 */ 459 */
480static void wakeup_next_waiter(struct rt_mutex *lock) 460static void wakeup_next_waiter(struct rt_mutex *lock)
481{ 461{
482 struct rt_mutex_waiter *waiter; 462 struct rt_mutex_waiter *waiter;
483 struct task_struct *pendowner;
484 unsigned long flags; 463 unsigned long flags;
485 464
486 raw_spin_lock_irqsave(&current->pi_lock, flags); 465 raw_spin_lock_irqsave(&current->pi_lock, flags);
487 466
488 waiter = rt_mutex_top_waiter(lock); 467 waiter = rt_mutex_top_waiter(lock);
489 plist_del(&waiter->list_entry, &lock->wait_list);
490 468
491 /* 469 /*
492 * Remove it from current->pi_waiters. We do not adjust a 470 * Remove it from current->pi_waiters. We do not adjust a
@@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
495 * lock->wait_lock. 473 * lock->wait_lock.
496 */ 474 */
497 plist_del(&waiter->pi_list_entry, &current->pi_waiters); 475 plist_del(&waiter->pi_list_entry, &current->pi_waiters);
498 pendowner = waiter->task;
499 waiter->task = NULL;
500 476
501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); 477 rt_mutex_set_owner(lock, NULL);
502 478
503 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 479 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
504 480
505 /* 481 wake_up_process(waiter->task);
506 * Clear the pi_blocked_on variable and enqueue a possible
507 * waiter into the pi_waiters list of the pending owner. This
508 * prevents that in case the pending owner gets unboosted a
509 * waiter with higher priority than pending-owner->normal_prio
510 * is blocked on the unboosted (pending) owner.
511 */
512 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
513
514 WARN_ON(!pendowner->pi_blocked_on);
515 WARN_ON(pendowner->pi_blocked_on != waiter);
516 WARN_ON(pendowner->pi_blocked_on->lock != lock);
517
518 pendowner->pi_blocked_on = NULL;
519
520 if (rt_mutex_has_waiters(lock)) {
521 struct rt_mutex_waiter *next;
522
523 next = rt_mutex_top_waiter(lock);
524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
525 }
526 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
527
528 wake_up_process(pendowner);
529} 482}
530 483
531/* 484/*
532 * Remove a waiter from a lock 485 * Remove a waiter from a lock and give up
533 * 486 *
534 * Must be called with lock->wait_lock held 487 * Must be called with lock->wait_lock held and
488 * have just failed to try_to_take_rt_mutex().
535 */ 489 */
536static void remove_waiter(struct rt_mutex *lock, 490static void remove_waiter(struct rt_mutex *lock,
537 struct rt_mutex_waiter *waiter) 491 struct rt_mutex_waiter *waiter)
@@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock,
543 497
544 raw_spin_lock_irqsave(&current->pi_lock, flags); 498 raw_spin_lock_irqsave(&current->pi_lock, flags);
545 plist_del(&waiter->list_entry, &lock->wait_list); 499 plist_del(&waiter->list_entry, &lock->wait_list);
546 waiter->task = NULL;
547 current->pi_blocked_on = NULL; 500 current->pi_blocked_on = NULL;
548 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 501 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
549 502
550 if (first && owner != current) { 503 if (!owner)
504 return;
505
506 if (first) {
551 507
552 raw_spin_lock_irqsave(&owner->pi_lock, flags); 508 raw_spin_lock_irqsave(&owner->pi_lock, flags);
553 509
@@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task)
614 * or TASK_UNINTERRUPTIBLE) 570 * or TASK_UNINTERRUPTIBLE)
615 * @timeout: the pre-initialized and started timer, or NULL for none 571 * @timeout: the pre-initialized and started timer, or NULL for none
616 * @waiter: the pre-initialized rt_mutex_waiter 572 * @waiter: the pre-initialized rt_mutex_waiter
617 * @detect_deadlock: passed to task_blocks_on_rt_mutex
618 * 573 *
619 * lock->wait_lock must be held by the caller. 574 * lock->wait_lock must be held by the caller.
620 */ 575 */
621static int __sched 576static int __sched
622__rt_mutex_slowlock(struct rt_mutex *lock, int state, 577__rt_mutex_slowlock(struct rt_mutex *lock, int state,
623 struct hrtimer_sleeper *timeout, 578 struct hrtimer_sleeper *timeout,
624 struct rt_mutex_waiter *waiter, 579 struct rt_mutex_waiter *waiter)
625 int detect_deadlock)
626{ 580{
627 int ret = 0; 581 int ret = 0;
628 582
629 for (;;) { 583 for (;;) {
630 /* Try to acquire the lock: */ 584 /* Try to acquire the lock: */
631 if (try_to_take_rt_mutex(lock)) 585 if (try_to_take_rt_mutex(lock, current, waiter))
632 break; 586 break;
633 587
634 /* 588 /*
@@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
645 break; 599 break;
646 } 600 }
647 601
648 /*
649 * waiter->task is NULL the first time we come here and
650 * when we have been woken up by the previous owner
651 * but the lock got stolen by a higher prio task.
652 */
653 if (!waiter->task) {
654 ret = task_blocks_on_rt_mutex(lock, waiter, current,
655 detect_deadlock);
656 /*
657 * If we got woken up by the owner then start loop
658 * all over without going into schedule to try
659 * to get the lock now:
660 */
661 if (unlikely(!waiter->task)) {
662 /*
663 * Reset the return value. We might
664 * have returned with -EDEADLK and the
665 * owner released the lock while we
666 * were walking the pi chain.
667 */
668 ret = 0;
669 continue;
670 }
671 if (unlikely(ret))
672 break;
673 }
674
675 raw_spin_unlock(&lock->wait_lock); 602 raw_spin_unlock(&lock->wait_lock);
676 603
677 debug_rt_mutex_print_deadlock(waiter); 604 debug_rt_mutex_print_deadlock(waiter);
678 605
679 if (waiter->task) 606 schedule_rt_mutex(lock);
680 schedule_rt_mutex(lock);
681 607
682 raw_spin_lock(&lock->wait_lock); 608 raw_spin_lock(&lock->wait_lock);
683 set_current_state(state); 609 set_current_state(state);
@@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
698 int ret = 0; 624 int ret = 0;
699 625
700 debug_rt_mutex_init_waiter(&waiter); 626 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL;
702 627
703 raw_spin_lock(&lock->wait_lock); 628 raw_spin_lock(&lock->wait_lock);
704 629
705 /* Try to acquire the lock again: */ 630 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) { 631 if (try_to_take_rt_mutex(lock, current, NULL)) {
707 raw_spin_unlock(&lock->wait_lock); 632 raw_spin_unlock(&lock->wait_lock);
708 return 0; 633 return 0;
709 } 634 }
@@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
717 timeout->task = NULL; 642 timeout->task = NULL;
718 } 643 }
719 644
720 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, 645 ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
721 detect_deadlock); 646
647 if (likely(!ret))
648 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
722 649
723 set_current_state(TASK_RUNNING); 650 set_current_state(TASK_RUNNING);
724 651
725 if (unlikely(waiter.task)) 652 if (unlikely(ret))
726 remove_waiter(lock, &waiter); 653 remove_waiter(lock, &waiter);
727 654
728 /* 655 /*
@@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
737 if (unlikely(timeout)) 664 if (unlikely(timeout))
738 hrtimer_cancel(&timeout->timer); 665 hrtimer_cancel(&timeout->timer);
739 666
740 /*
741 * Readjust priority, when we did not get the lock. We might
742 * have been the pending owner and boosted. Since we did not
743 * take the lock, the PI boost has to go.
744 */
745 if (unlikely(ret))
746 rt_mutex_adjust_prio(current);
747
748 debug_rt_mutex_free_waiter(&waiter); 667 debug_rt_mutex_free_waiter(&waiter);
749 668
750 return ret; 669 return ret;
@@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
762 681
763 if (likely(rt_mutex_owner(lock) != current)) { 682 if (likely(rt_mutex_owner(lock) != current)) {
764 683
765 ret = try_to_take_rt_mutex(lock); 684 ret = try_to_take_rt_mutex(lock, current, NULL);
766 /* 685 /*
767 * try_to_take_rt_mutex() sets the lock waiters 686 * try_to_take_rt_mutex() sets the lock waiters
768 * bit unconditionally. Clean this up. 687 * bit unconditionally. Clean this up.
@@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
992{ 911{
993 __rt_mutex_init(lock, NULL); 912 __rt_mutex_init(lock, NULL);
994 debug_rt_mutex_proxy_lock(lock, proxy_owner); 913 debug_rt_mutex_proxy_lock(lock, proxy_owner);
995 rt_mutex_set_owner(lock, proxy_owner, 0); 914 rt_mutex_set_owner(lock, proxy_owner);
996 rt_mutex_deadlock_account_lock(lock, proxy_owner); 915 rt_mutex_deadlock_account_lock(lock, proxy_owner);
997} 916}
998 917
@@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
1008 struct task_struct *proxy_owner) 927 struct task_struct *proxy_owner)
1009{ 928{
1010 debug_rt_mutex_proxy_unlock(lock); 929 debug_rt_mutex_proxy_unlock(lock);
1011 rt_mutex_set_owner(lock, NULL, 0); 930 rt_mutex_set_owner(lock, NULL);
1012 rt_mutex_deadlock_account_unlock(proxy_owner); 931 rt_mutex_deadlock_account_unlock(proxy_owner);
1013} 932}
1014 933
@@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1034 953
1035 raw_spin_lock(&lock->wait_lock); 954 raw_spin_lock(&lock->wait_lock);
1036 955
1037 mark_rt_mutex_waiters(lock); 956 if (try_to_take_rt_mutex(lock, task, NULL)) {
1038
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0);
1043 raw_spin_unlock(&lock->wait_lock); 957 raw_spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1; 958 return 1;
1046 } 959 }
1047 960
1048 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 961 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1049 962
1050 if (ret && !waiter->task) { 963 if (ret && !rt_mutex_owner(lock)) {
1051 /* 964 /*
1052 * Reset the return value. We might have 965 * Reset the return value. We might have
1053 * returned with -EDEADLK and the owner 966 * returned with -EDEADLK and the owner
@@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1056 */ 969 */
1057 ret = 0; 970 ret = 0;
1058 } 971 }
972
973 if (unlikely(ret))
974 remove_waiter(lock, waiter);
975
1059 raw_spin_unlock(&lock->wait_lock); 976 raw_spin_unlock(&lock->wait_lock);
1060 977
1061 debug_rt_mutex_print_deadlock(waiter); 978 debug_rt_mutex_print_deadlock(waiter);
@@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1110 1027
1111 set_current_state(TASK_INTERRUPTIBLE); 1028 set_current_state(TASK_INTERRUPTIBLE);
1112 1029
1113 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, 1030 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
1114 detect_deadlock);
1115 1031
1116 set_current_state(TASK_RUNNING); 1032 set_current_state(TASK_RUNNING);
1117 1033
1118 if (unlikely(waiter->task)) 1034 if (unlikely(ret))
1119 remove_waiter(lock, waiter); 1035 remove_waiter(lock, waiter);
1120 1036
1121 /* 1037 /*
@@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1126 1042
1127 raw_spin_unlock(&lock->wait_lock); 1043 raw_spin_unlock(&lock->wait_lock);
1128 1044
1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been
1131 * the pending owner and boosted. Since we did not take the lock, the
1132 * PI boost has to go.
1133 */
1134 if (unlikely(ret))
1135 rt_mutex_adjust_prio(current);
1136
1137 return ret; 1045 return ret;
1138} 1046}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 97a2f81866af..53a66c85261b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p)
91/* 91/*
92 * lock->owner state tracking: 92 * lock->owner state tracking:
93 */ 93 */
94#define RT_MUTEX_OWNER_PENDING 1UL 94#define RT_MUTEX_HAS_WAITERS 1UL
95#define RT_MUTEX_HAS_WAITERS 2UL 95#define RT_MUTEX_OWNER_MASKALL 1UL
96#define RT_MUTEX_OWNER_MASKALL 3UL
97 96
98static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) 97static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
99{ 98{
@@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
101 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); 100 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
102} 101}
103 102
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{
106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108}
109
110static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
111{
112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
113}
114
115/* 103/*
116 * PI-futex support (proxy locking functions, etc.): 104 * PI-futex support (proxy locking functions, etc.):
117 */ 105 */
diff --git a/kernel/sched.c b/kernel/sched.c
index 42eab5a8437d..f592ce6f8616 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/uaccess.h> 33#include <linux/uaccess.h>
34#include <linux/highmem.h> 34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h> 35#include <asm/mmu_context.h>
37#include <linux/interrupt.h> 36#include <linux/interrupt.h>
38#include <linux/capability.h> 37#include <linux/capability.h>
@@ -324,7 +323,7 @@ struct cfs_rq {
324 * 'curr' points to currently running entity on this cfs_rq. 323 * 'curr' points to currently running entity on this cfs_rq.
325 * It is set to NULL otherwise (i.e when none are currently running). 324 * It is set to NULL otherwise (i.e when none are currently running).
326 */ 325 */
327 struct sched_entity *curr, *next, *last; 326 struct sched_entity *curr, *next, *last, *skip;
328 327
329 unsigned int nr_spread_over; 328 unsigned int nr_spread_over;
330 329
@@ -606,9 +605,6 @@ static inline struct task_group *task_group(struct task_struct *p)
606 struct task_group *tg; 605 struct task_group *tg;
607 struct cgroup_subsys_state *css; 606 struct cgroup_subsys_state *css;
608 607
609 if (p->flags & PF_EXITING)
610 return &root_task_group;
611
612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 608 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
613 lockdep_is_held(&task_rq(p)->lock)); 609 lockdep_is_held(&task_rq(p)->lock));
614 tg = container_of(css, struct task_group, css); 610 tg = container_of(css, struct task_group, css);
@@ -664,10 +660,9 @@ static void update_rq_clock(struct rq *rq)
664#endif 660#endif
665 661
666/** 662/**
667 * runqueue_is_locked 663 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
668 * @cpu: the processor in question. 664 * @cpu: the processor in question.
669 * 665 *
670 * Returns true if the current cpu runqueue is locked.
671 * This interface allows printk to be called with the runqueue lock 666 * This interface allows printk to be called with the runqueue lock
672 * held and know whether or not it is OK to wake up the klogd. 667 * held and know whether or not it is OK to wake up the klogd.
673 */ 668 */
@@ -1686,6 +1681,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1686 __release(rq2->lock); 1681 __release(rq2->lock);
1687} 1682}
1688 1683
1684#else /* CONFIG_SMP */
1685
1686/*
1687 * double_rq_lock - safely lock two runqueues
1688 *
1689 * Note this does not disable interrupts like task_rq_lock,
1690 * you need to do so manually before calling.
1691 */
1692static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1693 __acquires(rq1->lock)
1694 __acquires(rq2->lock)
1695{
1696 BUG_ON(!irqs_disabled());
1697 BUG_ON(rq1 != rq2);
1698 raw_spin_lock(&rq1->lock);
1699 __acquire(rq2->lock); /* Fake it out ;) */
1700}
1701
1702/*
1703 * double_rq_unlock - safely unlock two runqueues
1704 *
1705 * Note this does not restore interrupts like task_rq_unlock,
1706 * you need to do so manually after calling.
1707 */
1708static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1709 __releases(rq1->lock)
1710 __releases(rq2->lock)
1711{
1712 BUG_ON(rq1 != rq2);
1713 raw_spin_unlock(&rq1->lock);
1714 __release(rq2->lock);
1715}
1716
1689#endif 1717#endif
1690 1718
1691static void calc_load_account_idle(struct rq *this_rq); 1719static void calc_load_account_idle(struct rq *this_rq);
@@ -1880,7 +1908,7 @@ void account_system_vtime(struct task_struct *curr)
1880 */ 1908 */
1881 if (hardirq_count()) 1909 if (hardirq_count())
1882 __this_cpu_add(cpu_hardirq_time, delta); 1910 __this_cpu_add(cpu_hardirq_time, delta);
1883 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) 1911 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
1884 __this_cpu_add(cpu_softirq_time, delta); 1912 __this_cpu_add(cpu_softirq_time, delta);
1885 1913
1886 irq_time_write_end(); 1914 irq_time_write_end();
@@ -1920,8 +1948,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
1920 sched_rt_avg_update(rq, irq_delta); 1948 sched_rt_avg_update(rq, irq_delta);
1921} 1949}
1922 1950
1951static int irqtime_account_hi_update(void)
1952{
1953 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1954 unsigned long flags;
1955 u64 latest_ns;
1956 int ret = 0;
1957
1958 local_irq_save(flags);
1959 latest_ns = this_cpu_read(cpu_hardirq_time);
1960 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
1961 ret = 1;
1962 local_irq_restore(flags);
1963 return ret;
1964}
1965
1966static int irqtime_account_si_update(void)
1967{
1968 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1969 unsigned long flags;
1970 u64 latest_ns;
1971 int ret = 0;
1972
1973 local_irq_save(flags);
1974 latest_ns = this_cpu_read(cpu_softirq_time);
1975 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
1976 ret = 1;
1977 local_irq_restore(flags);
1978 return ret;
1979}
1980
1923#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 1981#else /* CONFIG_IRQ_TIME_ACCOUNTING */
1924 1982
1983#define sched_clock_irqtime (0)
1984
1925static void update_rq_clock_task(struct rq *rq, s64 delta) 1985static void update_rq_clock_task(struct rq *rq, s64 delta)
1926{ 1986{
1927 rq->clock_task += delta; 1987 rq->clock_task += delta;
@@ -2025,14 +2085,14 @@ inline int task_curr(const struct task_struct *p)
2025 2085
2026static inline void check_class_changed(struct rq *rq, struct task_struct *p, 2086static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2027 const struct sched_class *prev_class, 2087 const struct sched_class *prev_class,
2028 int oldprio, int running) 2088 int oldprio)
2029{ 2089{
2030 if (prev_class != p->sched_class) { 2090 if (prev_class != p->sched_class) {
2031 if (prev_class->switched_from) 2091 if (prev_class->switched_from)
2032 prev_class->switched_from(rq, p, running); 2092 prev_class->switched_from(rq, p);
2033 p->sched_class->switched_to(rq, p, running); 2093 p->sched_class->switched_to(rq, p);
2034 } else 2094 } else if (oldprio != p->prio)
2035 p->sched_class->prio_changed(rq, p, oldprio, running); 2095 p->sched_class->prio_changed(rq, p, oldprio);
2036} 2096}
2037 2097
2038static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 2098static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@ -2224,7 +2284,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2224 * yield - it could be a while. 2284 * yield - it could be a while.
2225 */ 2285 */
2226 if (unlikely(on_rq)) { 2286 if (unlikely(on_rq)) {
2227 schedule_timeout_uninterruptible(1); 2287 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2288
2289 set_current_state(TASK_UNINTERRUPTIBLE);
2290 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2228 continue; 2291 continue;
2229 } 2292 }
2230 2293
@@ -2265,27 +2328,6 @@ void kick_process(struct task_struct *p)
2265EXPORT_SYMBOL_GPL(kick_process); 2328EXPORT_SYMBOL_GPL(kick_process);
2266#endif /* CONFIG_SMP */ 2329#endif /* CONFIG_SMP */
2267 2330
2268/**
2269 * task_oncpu_function_call - call a function on the cpu on which a task runs
2270 * @p: the task to evaluate
2271 * @func: the function to be called
2272 * @info: the function call argument
2273 *
2274 * Calls the function @func when the task is currently running. This might
2275 * be on the current CPU, which just calls the function directly
2276 */
2277void task_oncpu_function_call(struct task_struct *p,
2278 void (*func) (void *info), void *info)
2279{
2280 int cpu;
2281
2282 preempt_disable();
2283 cpu = task_cpu(p);
2284 if (task_curr(p))
2285 smp_call_function_single(cpu, func, info, 1);
2286 preempt_enable();
2287}
2288
2289#ifdef CONFIG_SMP 2331#ifdef CONFIG_SMP
2290/* 2332/*
2291 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2333 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
@@ -2566,6 +2608,7 @@ static void __sched_fork(struct task_struct *p)
2566 p->se.sum_exec_runtime = 0; 2608 p->se.sum_exec_runtime = 0;
2567 p->se.prev_sum_exec_runtime = 0; 2609 p->se.prev_sum_exec_runtime = 0;
2568 p->se.nr_migrations = 0; 2610 p->se.nr_migrations = 0;
2611 p->se.vruntime = 0;
2569 2612
2570#ifdef CONFIG_SCHEDSTATS 2613#ifdef CONFIG_SCHEDSTATS
2571 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2614 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -2776,9 +2819,12 @@ static inline void
2776prepare_task_switch(struct rq *rq, struct task_struct *prev, 2819prepare_task_switch(struct rq *rq, struct task_struct *prev,
2777 struct task_struct *next) 2820 struct task_struct *next)
2778{ 2821{
2822 sched_info_switch(prev, next);
2823 perf_event_task_sched_out(prev, next);
2779 fire_sched_out_preempt_notifiers(prev, next); 2824 fire_sched_out_preempt_notifiers(prev, next);
2780 prepare_lock_switch(rq, next); 2825 prepare_lock_switch(rq, next);
2781 prepare_arch_switch(next); 2826 prepare_arch_switch(next);
2827 trace_sched_switch(prev, next);
2782} 2828}
2783 2829
2784/** 2830/**
@@ -2911,7 +2957,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2911 struct mm_struct *mm, *oldmm; 2957 struct mm_struct *mm, *oldmm;
2912 2958
2913 prepare_task_switch(rq, prev, next); 2959 prepare_task_switch(rq, prev, next);
2914 trace_sched_switch(prev, next); 2960
2915 mm = next->mm; 2961 mm = next->mm;
2916 oldmm = prev->active_mm; 2962 oldmm = prev->active_mm;
2917 /* 2963 /*
@@ -3568,6 +3614,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3568} 3614}
3569 3615
3570/* 3616/*
3617 * Account system cpu time to a process and desired cpustat field
3618 * @p: the process that the cpu time gets accounted to
3619 * @cputime: the cpu time spent in kernel space since the last update
3620 * @cputime_scaled: cputime scaled by cpu frequency
3621 * @target_cputime64: pointer to cpustat field that has to be updated
3622 */
3623static inline
3624void __account_system_time(struct task_struct *p, cputime_t cputime,
3625 cputime_t cputime_scaled, cputime64_t *target_cputime64)
3626{
3627 cputime64_t tmp = cputime_to_cputime64(cputime);
3628
3629 /* Add system time to process. */
3630 p->stime = cputime_add(p->stime, cputime);
3631 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3632 account_group_system_time(p, cputime);
3633
3634 /* Add system time to cpustat. */
3635 *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3636 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3637
3638 /* Account for system time used */
3639 acct_update_integrals(p);
3640}
3641
3642/*
3571 * Account system cpu time to a process. 3643 * Account system cpu time to a process.
3572 * @p: the process that the cpu time gets accounted to 3644 * @p: the process that the cpu time gets accounted to
3573 * @hardirq_offset: the offset to subtract from hardirq_count() 3645 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3578,36 +3650,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3578 cputime_t cputime, cputime_t cputime_scaled) 3650 cputime_t cputime, cputime_t cputime_scaled)
3579{ 3651{
3580 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3652 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3581 cputime64_t tmp; 3653 cputime64_t *target_cputime64;
3582 3654
3583 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3655 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3584 account_guest_time(p, cputime, cputime_scaled); 3656 account_guest_time(p, cputime, cputime_scaled);
3585 return; 3657 return;
3586 } 3658 }
3587 3659
3588 /* Add system time to process. */
3589 p->stime = cputime_add(p->stime, cputime);
3590 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3591 account_group_system_time(p, cputime);
3592
3593 /* Add system time to cpustat. */
3594 tmp = cputime_to_cputime64(cputime);
3595 if (hardirq_count() - hardirq_offset) 3660 if (hardirq_count() - hardirq_offset)
3596 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3661 target_cputime64 = &cpustat->irq;
3597 else if (in_serving_softirq()) 3662 else if (in_serving_softirq())
3598 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3663 target_cputime64 = &cpustat->softirq;
3599 else 3664 else
3600 cpustat->system = cputime64_add(cpustat->system, tmp); 3665 target_cputime64 = &cpustat->system;
3601 3666
3602 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); 3667 __account_system_time(p, cputime, cputime_scaled, target_cputime64);
3603
3604 /* Account for system time used */
3605 acct_update_integrals(p);
3606} 3668}
3607 3669
3608/* 3670/*
3609 * Account for involuntary wait time. 3671 * Account for involuntary wait time.
3610 * @steal: the cpu time spent in involuntary wait 3672 * @cputime: the cpu time spent in involuntary wait
3611 */ 3673 */
3612void account_steal_time(cputime_t cputime) 3674void account_steal_time(cputime_t cputime)
3613{ 3675{
@@ -3635,6 +3697,73 @@ void account_idle_time(cputime_t cputime)
3635 3697
3636#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3698#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3637 3699
3700#ifdef CONFIG_IRQ_TIME_ACCOUNTING
3701/*
3702 * Account a tick to a process and cpustat
3703 * @p: the process that the cpu time gets accounted to
3704 * @user_tick: is the tick from userspace
3705 * @rq: the pointer to rq
3706 *
3707 * Tick demultiplexing follows the order
3708 * - pending hardirq update
3709 * - pending softirq update
3710 * - user_time
3711 * - idle_time
3712 * - system time
3713 * - check for guest_time
3714 * - else account as system_time
3715 *
3716 * The check for hardirq is done for both system and user time, as there is
3717 * no timer going off while we are in hardirq context and hence we may never
3718 * get an opportunity to update it in system time alone.
3719 * p->stime and friends are only updated on system time, not on irq or
3720 * softirq time, as those no longer count towards the task's exec_runtime.
3721 */
3722static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3723 struct rq *rq)
3724{
3725 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3726 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3727 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3728
3729 if (irqtime_account_hi_update()) {
3730 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3731 } else if (irqtime_account_si_update()) {
3732 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3733 } else if (this_cpu_ksoftirqd() == p) {
3734 /*
3735 * ksoftirqd time does not get accounted in cpu_softirq_time.
3736 * So, we have to handle it separately here.
3737 * Also, p->stime needs to be updated for ksoftirqd.
3738 */
3739 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3740 &cpustat->softirq);
3741 } else if (user_tick) {
3742 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3743 } else if (p == rq->idle) {
3744 account_idle_time(cputime_one_jiffy);
3745 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3746 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3747 } else {
3748 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3749 &cpustat->system);
3750 }
3751}
3752
3753static void irqtime_account_idle_ticks(int ticks)
3754{
3755 int i;
3756 struct rq *rq = this_rq();
3757
3758 for (i = 0; i < ticks; i++)
3759 irqtime_account_process_tick(current, 0, rq);
3760}
3761#else /* CONFIG_IRQ_TIME_ACCOUNTING */
3762static void irqtime_account_idle_ticks(int ticks) {}
3763static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3764 struct rq *rq) {}
3765#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3766
3638/* 3767/*
3639 * Account a single tick of cpu time. 3768 * Account a single tick of cpu time.
3640 * @p: the process that the cpu time gets accounted to 3769 * @p: the process that the cpu time gets accounted to
@@ -3645,6 +3774,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
3645 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3774 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3646 struct rq *rq = this_rq(); 3775 struct rq *rq = this_rq();
3647 3776
3777 if (sched_clock_irqtime) {
3778 irqtime_account_process_tick(p, user_tick, rq);
3779 return;
3780 }
3781
3648 if (user_tick) 3782 if (user_tick)
3649 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3783 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3650 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 3784 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3670,6 +3804,12 @@ void account_steal_ticks(unsigned long ticks)
3670 */ 3804 */
3671void account_idle_ticks(unsigned long ticks) 3805void account_idle_ticks(unsigned long ticks)
3672{ 3806{
3807
3808 if (sched_clock_irqtime) {
3809 irqtime_account_idle_ticks(ticks);
3810 return;
3811 }
3812
3673 account_idle_time(jiffies_to_cputime(ticks)); 3813 account_idle_time(jiffies_to_cputime(ticks));
3674} 3814}
3675 3815
@@ -3945,9 +4085,6 @@ need_resched:
3945 rcu_note_context_switch(cpu); 4085 rcu_note_context_switch(cpu);
3946 prev = rq->curr; 4086 prev = rq->curr;
3947 4087
3948 release_kernel_lock(prev);
3949need_resched_nonpreemptible:
3950
3951 schedule_debug(prev); 4088 schedule_debug(prev);
3952 4089
3953 if (sched_feat(HRTICK)) 4090 if (sched_feat(HRTICK))
@@ -3978,6 +4115,16 @@ need_resched_nonpreemptible:
3978 switch_count = &prev->nvcsw; 4115 switch_count = &prev->nvcsw;
3979 } 4116 }
3980 4117
4118 /*
4119 * If we are going to sleep and we have plugged IO queued, make
4120 * sure to submit it to avoid deadlocks.
4121 */
4122 if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
4123 raw_spin_unlock(&rq->lock);
4124 blk_flush_plug(prev);
4125 raw_spin_lock(&rq->lock);
4126 }
4127
3981 pre_schedule(rq, prev); 4128 pre_schedule(rq, prev);
3982 4129
3983 if (unlikely(!rq->nr_running)) 4130 if (unlikely(!rq->nr_running))
@@ -3989,9 +4136,6 @@ need_resched_nonpreemptible:
3989 rq->skip_clock_update = 0; 4136 rq->skip_clock_update = 0;
3990 4137
3991 if (likely(prev != next)) { 4138 if (likely(prev != next)) {
3992 sched_info_switch(prev, next);
3993 perf_event_task_sched_out(prev, next);
3994
3995 rq->nr_switches++; 4139 rq->nr_switches++;
3996 rq->curr = next; 4140 rq->curr = next;
3997 ++*switch_count; 4141 ++*switch_count;
@@ -4010,9 +4154,6 @@ need_resched_nonpreemptible:
4010 4154
4011 post_schedule(rq); 4155 post_schedule(rq);
4012 4156
4013 if (unlikely(reacquire_kernel_lock(prev)))
4014 goto need_resched_nonpreemptible;
4015
4016 preempt_enable_no_resched(); 4157 preempt_enable_no_resched();
4017 if (need_resched()) 4158 if (need_resched())
4018 goto need_resched; 4159 goto need_resched;
@@ -4571,11 +4712,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4571 4712
4572 if (running) 4713 if (running)
4573 p->sched_class->set_curr_task(rq); 4714 p->sched_class->set_curr_task(rq);
4574 if (on_rq) { 4715 if (on_rq)
4575 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4716 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4576 4717
4577 check_class_changed(rq, p, prev_class, oldprio, running); 4718 check_class_changed(rq, p, prev_class, oldprio);
4578 }
4579 task_rq_unlock(rq, &flags); 4719 task_rq_unlock(rq, &flags);
4580} 4720}
4581 4721
@@ -4762,8 +4902,11 @@ static bool check_same_owner(struct task_struct *p)
4762 4902
4763 rcu_read_lock(); 4903 rcu_read_lock();
4764 pcred = __task_cred(p); 4904 pcred = __task_cred(p);
4765 match = (cred->euid == pcred->euid || 4905 if (cred->user->user_ns == pcred->user->user_ns)
4766 cred->euid == pcred->uid); 4906 match = (cred->euid == pcred->euid ||
4907 cred->euid == pcred->uid);
4908 else
4909 match = false;
4767 rcu_read_unlock(); 4910 rcu_read_unlock();
4768 return match; 4911 return match;
4769} 4912}
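The change to check_same_owner() above adds one extra gate in front of the euid/uid comparison: credentials that live in different user namespaces never count as the same owner. A standalone model of the resulting rule, using simplified types rather than the kernel's struct cred (field names are assumptions):

/* Simplified stand-in for struct cred. */
struct fake_cred {
        int user_ns_id;                 /* identifies the user namespace */
        unsigned int uid, euid;
};

static int same_owner(const struct fake_cred *cred,
                      const struct fake_cred *pcred)
{
        if (cred->user_ns_id != pcred->user_ns_id)
                return 0;               /* cross-namespace: never the same owner */
        return cred->euid == pcred->euid || cred->euid == pcred->uid;
}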
@@ -4823,12 +4966,15 @@ recheck:
4823 param->sched_priority > rlim_rtprio) 4966 param->sched_priority > rlim_rtprio)
4824 return -EPERM; 4967 return -EPERM;
4825 } 4968 }
4969
4826 /* 4970 /*
4827 * Like positive nice levels, dont allow tasks to 4971 * Treat SCHED_IDLE as nice 20. Only allow a switch to
4828 * move out of SCHED_IDLE either: 4972 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4829 */ 4973 */
4830 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 4974 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4831 return -EPERM; 4975 if (!can_nice(p, TASK_NICE(p)))
4976 return -EPERM;
4977 }
4832 4978
4833 /* can't change other user's priorities */ 4979 /* can't change other user's priorities */
4834 if (!check_same_owner(p)) 4980 if (!check_same_owner(p))
@@ -4903,11 +5049,10 @@ recheck:
4903 5049
4904 if (running) 5050 if (running)
4905 p->sched_class->set_curr_task(rq); 5051 p->sched_class->set_curr_task(rq);
4906 if (on_rq) { 5052 if (on_rq)
4907 activate_task(rq, p, 0); 5053 activate_task(rq, p, 0);
4908 5054
4909 check_class_changed(rq, p, prev_class, oldprio, running); 5055 check_class_changed(rq, p, prev_class, oldprio);
4910 }
4911 __task_rq_unlock(rq); 5056 __task_rq_unlock(rq);
4912 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5057 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4913 5058
@@ -5089,7 +5234,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5089 goto out_free_cpus_allowed; 5234 goto out_free_cpus_allowed;
5090 } 5235 }
5091 retval = -EPERM; 5236 retval = -EPERM;
5092 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5237 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
5093 goto out_unlock; 5238 goto out_unlock;
5094 5239
5095 retval = security_task_setscheduler(p); 5240 retval = security_task_setscheduler(p);
@@ -5324,6 +5469,67 @@ void __sched yield(void)
5324} 5469}
5325EXPORT_SYMBOL(yield); 5470EXPORT_SYMBOL(yield);
5326 5471
5472/**
5473 * yield_to - yield the current processor to another thread in
5474 * your thread group, or accelerate that thread toward the
5475 * processor it's on.
5476 * @p: target task
5477 * @preempt: whether task preemption is allowed or not
5478 *
5479 * It's the caller's job to ensure that the target task struct
5480 * can't go away on us before we can do any checks.
5481 *
5482 * Returns true if we indeed boosted the target task.
5483 */
5484bool __sched yield_to(struct task_struct *p, bool preempt)
5485{
5486 struct task_struct *curr = current;
5487 struct rq *rq, *p_rq;
5488 unsigned long flags;
5489 bool yielded = 0;
5490
5491 local_irq_save(flags);
5492 rq = this_rq();
5493
5494again:
5495 p_rq = task_rq(p);
5496 double_rq_lock(rq, p_rq);
5497 while (task_rq(p) != p_rq) {
5498 double_rq_unlock(rq, p_rq);
5499 goto again;
5500 }
5501
5502 if (!curr->sched_class->yield_to_task)
5503 goto out;
5504
5505 if (curr->sched_class != p->sched_class)
5506 goto out;
5507
5508 if (task_running(p_rq, p) || p->state)
5509 goto out;
5510
5511 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5512 if (yielded) {
5513 schedstat_inc(rq, yld_count);
5514 /*
5515 * Make p's CPU reschedule; pick_next_entity takes care of
5516 * fairness.
5517 */
5518 if (preempt && rq != p_rq)
5519 resched_task(p_rq->curr);
5520 }
5521
5522out:
5523 double_rq_unlock(rq, p_rq);
5524 local_irq_restore(flags);
5525
5526 if (yielded)
5527 schedule();
5528
5529 return yielded;
5530}
5531EXPORT_SYMBOL_GPL(yield_to);
5532
5327/* 5533/*
5328 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5534 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5329 * that process accounting knows that this is a task in IO wait state. 5535 * that process accounting knows that this is a task in IO wait state.
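The kernel-doc for yield_to() above leaves the target's lifetime to the caller. As a rough, hypothetical illustration (not part of this patch), a directed-yield user such as a hypervisor's lock-holder boosting path might pin the target with RCU before calling it; only yield_to() itself comes from the hunk above, everything else is assumed:

/* Hypothetical caller; boost_lock_holder() and the notion of a "holder"
 * task are assumptions for illustration only. */
static void boost_lock_holder(struct task_struct *holder)
{
        bool boosted;

        rcu_read_lock();                     /* keep holder's task_struct alive */
        boosted = yield_to(holder, true);    /* preempt=true: kick holder's CPU */
        rcu_read_unlock();

        if (!boosted)
                pr_debug("could not boost (target running or not runnable)\n");
}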
@@ -5334,6 +5540,7 @@ void __sched io_schedule(void)
5334 5540
5335 delayacct_blkio_start(); 5541 delayacct_blkio_start();
5336 atomic_inc(&rq->nr_iowait); 5542 atomic_inc(&rq->nr_iowait);
5543 blk_flush_plug(current);
5337 current->in_iowait = 1; 5544 current->in_iowait = 1;
5338 schedule(); 5545 schedule();
5339 current->in_iowait = 0; 5546 current->in_iowait = 0;
@@ -5349,6 +5556,7 @@ long __sched io_schedule_timeout(long timeout)
5349 5556
5350 delayacct_blkio_start(); 5557 delayacct_blkio_start();
5351 atomic_inc(&rq->nr_iowait); 5558 atomic_inc(&rq->nr_iowait);
5559 blk_flush_plug(current);
5352 current->in_iowait = 1; 5560 current->in_iowait = 1;
5353 ret = schedule_timeout(timeout); 5561 ret = schedule_timeout(timeout);
5354 current->in_iowait = 0; 5562 current->in_iowait = 0;
@@ -5572,7 +5780,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5572 * The idle tasks have their own, simple scheduling class: 5780 * The idle tasks have their own, simple scheduling class:
5573 */ 5781 */
5574 idle->sched_class = &idle_sched_class; 5782 idle->sched_class = &idle_sched_class;
5575 ftrace_graph_init_task(idle); 5783 ftrace_graph_init_idle_task(idle, cpu);
5576} 5784}
5577 5785
5578/* 5786/*
@@ -7797,6 +8005,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7797 INIT_LIST_HEAD(&cfs_rq->tasks); 8005 INIT_LIST_HEAD(&cfs_rq->tasks);
7798#ifdef CONFIG_FAIR_GROUP_SCHED 8006#ifdef CONFIG_FAIR_GROUP_SCHED
7799 cfs_rq->rq = rq; 8007 cfs_rq->rq = rq;
8008 /* allow initial update_cfs_load() to truncate */
8009#ifdef CONFIG_SMP
8010 cfs_rq->load_stamp = 1;
8011#endif
7800#endif 8012#endif
7801 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 8013 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7802} 8014}
@@ -8075,7 +8287,7 @@ static inline int preempt_count_equals(int preempt_offset)
8075{ 8287{
8076 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8288 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
8077 8289
8078 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 8290 return (nested == preempt_offset);
8079} 8291}
8080 8292
8081void __might_sleep(const char *file, int line, int preempt_offset) 8293void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8110,6 +8322,8 @@ EXPORT_SYMBOL(__might_sleep);
8110#ifdef CONFIG_MAGIC_SYSRQ 8322#ifdef CONFIG_MAGIC_SYSRQ
8111static void normalize_task(struct rq *rq, struct task_struct *p) 8323static void normalize_task(struct rq *rq, struct task_struct *p)
8112{ 8324{
8325 const struct sched_class *prev_class = p->sched_class;
8326 int old_prio = p->prio;
8113 int on_rq; 8327 int on_rq;
8114 8328
8115 on_rq = p->se.on_rq; 8329 on_rq = p->se.on_rq;
@@ -8120,6 +8334,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8120 activate_task(rq, p, 0); 8334 activate_task(rq, p, 0);
8121 resched_task(rq->curr); 8335 resched_task(rq->curr);
8122 } 8336 }
8337
8338 check_class_changed(rq, p, prev_class, old_prio);
8123} 8339}
8124 8340
8125void normalize_rt_tasks(void) 8341void normalize_rt_tasks(void)
@@ -8235,7 +8451,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8235{ 8451{
8236 struct cfs_rq *cfs_rq; 8452 struct cfs_rq *cfs_rq;
8237 struct sched_entity *se; 8453 struct sched_entity *se;
8238 struct rq *rq;
8239 int i; 8454 int i;
8240 8455
8241 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8456 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8248,8 +8463,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8248 tg->shares = NICE_0_LOAD; 8463 tg->shares = NICE_0_LOAD;
8249 8464
8250 for_each_possible_cpu(i) { 8465 for_each_possible_cpu(i) {
8251 rq = cpu_rq(i);
8252
8253 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8466 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8254 GFP_KERNEL, cpu_to_node(i)); 8467 GFP_KERNEL, cpu_to_node(i));
8255 if (!cfs_rq) 8468 if (!cfs_rq)
@@ -8511,7 +8724,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8511 /* Propagate contribution to hierarchy */ 8724 /* Propagate contribution to hierarchy */
8512 raw_spin_lock_irqsave(&rq->lock, flags); 8725 raw_spin_lock_irqsave(&rq->lock, flags);
8513 for_each_sched_entity(se) 8726 for_each_sched_entity(se)
8514 update_cfs_shares(group_cfs_rq(se), 0); 8727 update_cfs_shares(group_cfs_rq(se));
8515 raw_spin_unlock_irqrestore(&rq->lock, flags); 8728 raw_spin_unlock_irqrestore(&rq->lock, flags);
8516 } 8729 }
8517 8730
@@ -8885,7 +9098,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8885} 9098}
8886 9099
8887static void 9100static void
8888cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) 9101cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
9102 struct cgroup *old_cgrp, struct task_struct *task)
8889{ 9103{
8890 /* 9104 /*
8891 * cgroup_exit() is called in the copy_process() failure path. 9105 * cgroup_exit() is called in the copy_process() failure path.
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 9fb656283157..5946ac515602 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -12,7 +12,6 @@ static atomic_t autogroup_seq_nr;
12static void __init autogroup_init(struct task_struct *init_task) 12static void __init autogroup_init(struct task_struct *init_task)
13{ 13{
14 autogroup_default.tg = &root_task_group; 14 autogroup_default.tg = &root_task_group;
15 root_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref); 15 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock); 16 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default; 17 init_task->signal->autogroup = &autogroup_default;
@@ -130,7 +129,7 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
130 129
131static inline bool task_group_is_autogroup(struct task_group *tg) 130static inline bool task_group_is_autogroup(struct task_group *tg)
132{ 131{
133 return tg != &root_task_group && tg->autogroup; 132 return !!tg->autogroup;
134} 133}
135 134
136static inline struct task_group * 135static inline struct task_group *
@@ -161,11 +160,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
161 160
162 p->signal->autogroup = autogroup_kref_get(ag); 161 p->signal->autogroup = autogroup_kref_get(ag);
163 162
163 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
164 goto out;
165
164 t = p; 166 t = p;
165 do { 167 do {
166 sched_move_task(t); 168 sched_move_task(t);
167 } while_each_thread(p, t); 169 } while_each_thread(p, t);
168 170
171out:
169 unlock_task_sighand(p, &flags); 172 unlock_task_sighand(p, &flags);
170 autogroup_kref_put(prev); 173 autogroup_kref_put(prev);
171} 174}
@@ -247,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
247{ 250{
248 struct autogroup *ag = autogroup_task_get(p); 251 struct autogroup *ag = autogroup_task_get(p);
249 252
253 if (!task_group_is_autogroup(ag->tg))
254 goto out;
255
250 down_read(&ag->lock); 256 down_read(&ag->lock);
251 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); 257 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
252 up_read(&ag->lock); 258 up_read(&ag->lock);
253 259
260out:
254 autogroup_kref_put(ag); 261 autogroup_kref_put(ag);
255} 262}
256#endif /* CONFIG_PROC_FS */ 263#endif /* CONFIG_PROC_FS */
@@ -258,9 +265,7 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
258#ifdef CONFIG_SCHED_DEBUG 265#ifdef CONFIG_SCHED_DEBUG
259static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
260{ 267{
261 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); 268 if (!task_group_is_autogroup(tg))
262
263 if (!enabled || !tg->autogroup)
264 return 0; 269 return 0;
265 270
266 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 271 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 7b859ffe5dad..05577055cfca 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -1,6 +1,11 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3struct autogroup { 3struct autogroup {
4 /*
5 * The reference count doesn't mean how many threads are attached
6 * to this autogroup now. It just stands for the number of tasks
7 * that could use this autogroup.
8 */
4 struct kref kref; 9 struct kref kref;
5 struct task_group *tg; 10 struct task_group *tg;
6 struct rw_semaphore lock; 11 struct rw_semaphore lock;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index eb6cb8edd075..7bacd83a4158 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
179 179
180 raw_spin_lock_irqsave(&rq->lock, flags); 180 raw_spin_lock_irqsave(&rq->lock, flags);
181 if (cfs_rq->rb_leftmost) 181 if (cfs_rq->rb_leftmost)
182 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 182 MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
183 last = __pick_last_entity(cfs_rq); 183 last = __pick_last_entity(cfs_rq);
184 if (last) 184 if (last)
185 max_vruntime = last->vruntime; 185 max_vruntime = last->vruntime;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c26e2df450e..3f7ec9e27ee1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -69,14 +69,6 @@ static unsigned int sched_nr_latency = 8;
69unsigned int sysctl_sched_child_runs_first __read_mostly; 69unsigned int sysctl_sched_child_runs_first __read_mostly;
70 70
71/* 71/*
72 * sys_sched_yield() compat mode
73 *
74 * This option switches the agressive yield implementation of the
75 * old scheduler back on.
76 */
77unsigned int __read_mostly sysctl_sched_compat_yield;
78
79/*
80 * SCHED_OTHER wake-up granularity. 72 * SCHED_OTHER wake-up granularity.
81 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 73 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
82 * 74 *
@@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
419 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 411 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
420} 412}
421 413
422static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 414static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
423{ 415{
424 struct rb_node *left = cfs_rq->rb_leftmost; 416 struct rb_node *left = cfs_rq->rb_leftmost;
425 417
@@ -429,6 +421,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
429 return rb_entry(left, struct sched_entity, run_node); 421 return rb_entry(left, struct sched_entity, run_node);
430} 422}
431 423
424static struct sched_entity *__pick_next_entity(struct sched_entity *se)
425{
426 struct rb_node *next = rb_next(&se->run_node);
427
428 if (!next)
429 return NULL;
430
431 return rb_entry(next, struct sched_entity, run_node);
432}
433
434#ifdef CONFIG_SCHED_DEBUG
432static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 435static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
433{ 436{
434 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 437 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
443 * Scheduling class statistics methods: 446 * Scheduling class statistics methods:
444 */ 447 */
445 448
446#ifdef CONFIG_SCHED_DEBUG
447int sched_proc_update_handler(struct ctl_table *table, int write, 449int sched_proc_update_handler(struct ctl_table *table, int write,
448 void __user *buffer, size_t *lenp, 450 void __user *buffer, size_t *lenp,
449 loff_t *ppos) 451 loff_t *ppos)
@@ -540,7 +542,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
540} 542}
541 543
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); 544static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); 545static void update_cfs_shares(struct cfs_rq *cfs_rq);
544 546
545/* 547/*
546 * Update the current task's runtime statistics. Skip current tasks that 548 * Update the current task's runtime statistics. Skip current tasks that
@@ -733,6 +735,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
733 now - cfs_rq->load_last > 4 * period) { 735 now - cfs_rq->load_last > 4 * period) {
734 cfs_rq->load_period = 0; 736 cfs_rq->load_period = 0;
735 cfs_rq->load_avg = 0; 737 cfs_rq->load_avg = 0;
738 delta = period - 1;
736 } 739 }
737 740
738 cfs_rq->load_stamp = now; 741 cfs_rq->load_stamp = now;
@@ -763,16 +766,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
763 list_del_leaf_cfs_rq(cfs_rq); 766 list_del_leaf_cfs_rq(cfs_rq);
764} 767}
765 768
766static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, 769static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
767 long weight_delta)
768{ 770{
769 long load_weight, load, shares; 771 long load_weight, load, shares;
770 772
771 load = cfs_rq->load.weight + weight_delta; 773 load = cfs_rq->load.weight;
772 774
773 load_weight = atomic_read(&tg->load_weight); 775 load_weight = atomic_read(&tg->load_weight);
774 load_weight -= cfs_rq->load_contribution;
775 load_weight += load; 776 load_weight += load;
777 load_weight -= cfs_rq->load_contribution;
776 778
777 shares = (tg->shares * load); 779 shares = (tg->shares * load);
778 if (load_weight) 780 if (load_weight)
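With the weight_delta parameter gone, calc_cfs_shares() works purely from the runqueue's current load: the group's total load_weight is taken with this cpu's stale contribution swapped for its current load, and the group's shares are split in proportion. A worked example of that computation as a standalone program, with made-up numbers (all values are assumptions chosen for illustration):

#include <stdio.h>

int main(void)
{
        long tg_shares      = 1024;     /* group's configured shares */
        long cfs_rq_load    = 2048;     /* this cpu's current cfs_rq load */
        long tg_load_weight = 6144;     /* group load summed over all cpus */
        long contrib        = 1024;     /* this cpu's stale load_contribution */

        /* replace the stale contribution with the current load */
        long load_weight = tg_load_weight + cfs_rq_load - contrib;   /* 7168 */

        long shares = tg_shares * cfs_rq_load;
        if (load_weight)
                shares /= load_weight;

        printf("per-cpu shares = %ld\n", shares);   /* 1024*2048/7168 = 292 */
        return 0;
}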
@@ -790,7 +792,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
790{ 792{
791 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { 793 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
792 update_cfs_load(cfs_rq, 0); 794 update_cfs_load(cfs_rq, 0);
793 update_cfs_shares(cfs_rq, 0); 795 update_cfs_shares(cfs_rq);
794 } 796 }
795} 797}
796# else /* CONFIG_SMP */ 798# else /* CONFIG_SMP */
@@ -798,8 +800,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
798{ 800{
799} 801}
800 802
801static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, 803static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
802 long weight_delta)
803{ 804{
804 return tg->shares; 805 return tg->shares;
805} 806}
@@ -824,7 +825,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
824 account_entity_enqueue(cfs_rq, se); 825 account_entity_enqueue(cfs_rq, se);
825} 826}
826 827
827static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) 828static void update_cfs_shares(struct cfs_rq *cfs_rq)
828{ 829{
829 struct task_group *tg; 830 struct task_group *tg;
830 struct sched_entity *se; 831 struct sched_entity *se;
@@ -838,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
838 if (likely(se->load.weight == tg->shares)) 839 if (likely(se->load.weight == tg->shares))
839 return; 840 return;
840#endif 841#endif
841 shares = calc_cfs_shares(cfs_rq, tg, weight_delta); 842 shares = calc_cfs_shares(cfs_rq, tg);
842 843
843 reweight_entity(cfs_rq_of(se), se, shares); 844 reweight_entity(cfs_rq_of(se), se, shares);
844} 845}
@@ -847,7 +848,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
847{ 848{
848} 849}
849 850
850static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) 851static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
851{ 852{
852} 853}
853 854
@@ -978,8 +979,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
978 */ 979 */
979 update_curr(cfs_rq); 980 update_curr(cfs_rq);
980 update_cfs_load(cfs_rq, 0); 981 update_cfs_load(cfs_rq, 0);
981 update_cfs_shares(cfs_rq, se->load.weight);
982 account_entity_enqueue(cfs_rq, se); 982 account_entity_enqueue(cfs_rq, se);
983 update_cfs_shares(cfs_rq);
983 984
984 if (flags & ENQUEUE_WAKEUP) { 985 if (flags & ENQUEUE_WAKEUP) {
985 place_entity(cfs_rq, se, 0); 986 place_entity(cfs_rq, se, 0);
@@ -996,19 +997,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
996 list_add_leaf_cfs_rq(cfs_rq); 997 list_add_leaf_cfs_rq(cfs_rq);
997} 998}
998 999
999static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1000static void __clear_buddies_last(struct sched_entity *se)
1001{
1002 for_each_sched_entity(se) {
1003 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1004 if (cfs_rq->last == se)
1005 cfs_rq->last = NULL;
1006 else
1007 break;
1008 }
1009}
1010
1011static void __clear_buddies_next(struct sched_entity *se)
1000{ 1012{
1001 if (!se || cfs_rq->last == se) 1013 for_each_sched_entity(se) {
1002 cfs_rq->last = NULL; 1014 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1015 if (cfs_rq->next == se)
1016 cfs_rq->next = NULL;
1017 else
1018 break;
1019 }
1020}
1003 1021
1004 if (!se || cfs_rq->next == se) 1022static void __clear_buddies_skip(struct sched_entity *se)
1005 cfs_rq->next = NULL; 1023{
1024 for_each_sched_entity(se) {
1025 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1026 if (cfs_rq->skip == se)
1027 cfs_rq->skip = NULL;
1028 else
1029 break;
1030 }
1006} 1031}
1007 1032
1008static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1033static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1009{ 1034{
1010 for_each_sched_entity(se) 1035 if (cfs_rq->last == se)
1011 __clear_buddies(cfs_rq_of(se), se); 1036 __clear_buddies_last(se);
1037
1038 if (cfs_rq->next == se)
1039 __clear_buddies_next(se);
1040
1041 if (cfs_rq->skip == se)
1042 __clear_buddies_skip(se);
1012} 1043}
1013 1044
1014static void 1045static void
@@ -1041,7 +1072,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1041 update_cfs_load(cfs_rq, 0); 1072 update_cfs_load(cfs_rq, 0);
1042 account_entity_dequeue(cfs_rq, se); 1073 account_entity_dequeue(cfs_rq, se);
1043 update_min_vruntime(cfs_rq); 1074 update_min_vruntime(cfs_rq);
1044 update_cfs_shares(cfs_rq, 0); 1075 update_cfs_shares(cfs_rq);
1045 1076
1046 /* 1077 /*
1047 * Normalize the entity after updating the min_vruntime because the 1078 * Normalize the entity after updating the min_vruntime because the
@@ -1084,7 +1115,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1084 return; 1115 return;
1085 1116
1086 if (cfs_rq->nr_running > 1) { 1117 if (cfs_rq->nr_running > 1) {
1087 struct sched_entity *se = __pick_next_entity(cfs_rq); 1118 struct sched_entity *se = __pick_first_entity(cfs_rq);
1088 s64 delta = curr->vruntime - se->vruntime; 1119 s64 delta = curr->vruntime - se->vruntime;
1089 1120
1090 if (delta < 0) 1121 if (delta < 0)
@@ -1128,13 +1159,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1128static int 1159static int
1129wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); 1160wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1130 1161
1162/*
1163 * Pick the next process, keeping these things in mind, in this order:
1164 * 1) keep things fair between processes/task groups
1165 * 2) pick the "next" process, since someone really wants that to run
1166 * 3) pick the "last" process, for cache locality
1167 * 4) do not run the "skip" process, if something else is available
1168 */
1131static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 1169static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1132{ 1170{
1133 struct sched_entity *se = __pick_next_entity(cfs_rq); 1171 struct sched_entity *se = __pick_first_entity(cfs_rq);
1134 struct sched_entity *left = se; 1172 struct sched_entity *left = se;
1135 1173
1136 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) 1174 /*
1137 se = cfs_rq->next; 1175 * Avoid running the skip buddy, if running something else can
1176 * be done without getting too unfair.
1177 */
1178 if (cfs_rq->skip == se) {
1179 struct sched_entity *second = __pick_next_entity(se);
1180 if (second && wakeup_preempt_entity(second, left) < 1)
1181 se = second;
1182 }
1138 1183
1139 /* 1184 /*
1140 * Prefer last buddy, try to return the CPU to a preempted task. 1185 * Prefer last buddy, try to return the CPU to a preempted task.
@@ -1142,6 +1187,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1142 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) 1187 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1143 se = cfs_rq->last; 1188 se = cfs_rq->last;
1144 1189
1190 /*
1191 * Someone really wants this to run. If it's not unfair, run it.
1192 */
1193 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1194 se = cfs_rq->next;
1195
1145 clear_buddies(cfs_rq, se); 1196 clear_buddies(cfs_rq, se);
1146 1197
1147 return se; 1198 return se;
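Taken together, the two hunks above give pick_next_entity() a fixed preference order on top of the leftmost (most fair) entity: dodge the skip buddy, prefer the last buddy for cache locality, and let the next buddy override both when it is not too unfair. A compact standalone model of that order; entity_too_unfair() is an assumption standing in for wakeup_preempt_entity() returning 1:

struct ent { long long vruntime; };

/* Stand-in for the fairness check: the candidate must not trail the
 * leftmost entity by more than roughly one granularity. */
static int entity_too_unfair(const struct ent *cand, const struct ent *left)
{
        return cand->vruntime - left->vruntime > 1000000;   /* ~1 ms */
}

static const struct ent *pick(const struct ent *leftmost,
                              const struct ent *second,   /* rb_next(leftmost) */
                              const struct ent *skip,
                              const struct ent *last,
                              const struct ent *next)
{
        const struct ent *se = leftmost;

        if (se == skip && second && !entity_too_unfair(second, leftmost))
                se = second;                    /* avoid the skip buddy */
        if (last && !entity_too_unfair(last, leftmost))
                se = last;                      /* cache-hot previous task */
        if (next && !entity_too_unfair(next, leftmost))
                se = next;                      /* someone asked for this one */
        return se;
}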
@@ -1282,7 +1333,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1282 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1333 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1283 1334
1284 update_cfs_load(cfs_rq, 0); 1335 update_cfs_load(cfs_rq, 0);
1285 update_cfs_shares(cfs_rq, 0); 1336 update_cfs_shares(cfs_rq);
1286 } 1337 }
1287 1338
1288 hrtick_update(rq); 1339 hrtick_update(rq);
@@ -1312,58 +1363,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1312 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1363 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1313 1364
1314 update_cfs_load(cfs_rq, 0); 1365 update_cfs_load(cfs_rq, 0);
1315 update_cfs_shares(cfs_rq, 0); 1366 update_cfs_shares(cfs_rq);
1316 } 1367 }
1317 1368
1318 hrtick_update(rq); 1369 hrtick_update(rq);
1319} 1370}
1320 1371
1321/*
1322 * sched_yield() support is very simple - we dequeue and enqueue.
1323 *
1324 * If compat_yield is turned on then we requeue to the end of the tree.
1325 */
1326static void yield_task_fair(struct rq *rq)
1327{
1328 struct task_struct *curr = rq->curr;
1329 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1330 struct sched_entity *rightmost, *se = &curr->se;
1331
1332 /*
1333 * Are we the only task in the tree?
1334 */
1335 if (unlikely(cfs_rq->nr_running == 1))
1336 return;
1337
1338 clear_buddies(cfs_rq, se);
1339
1340 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
1341 update_rq_clock(rq);
1342 /*
1343 * Update run-time statistics of the 'current'.
1344 */
1345 update_curr(cfs_rq);
1346
1347 return;
1348 }
1349 /*
1350 * Find the rightmost entry in the rbtree:
1351 */
1352 rightmost = __pick_last_entity(cfs_rq);
1353 /*
1354 * Already in the rightmost position?
1355 */
1356 if (unlikely(!rightmost || entity_before(rightmost, se)))
1357 return;
1358
1359 /*
1360 * Minimally necessary key value to be last in the tree:
1361 * Upon rescheduling, sched_class::put_prev_task() will place
1362 * 'current' within the tree based on its new key value.
1363 */
1364 se->vruntime = rightmost->vruntime + 1;
1365}
1366
1367#ifdef CONFIG_SMP 1372#ifdef CONFIG_SMP
1368 1373
1369static void task_waking_fair(struct rq *rq, struct task_struct *p) 1374static void task_waking_fair(struct rq *rq, struct task_struct *p)
@@ -1834,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se)
1834 } 1839 }
1835} 1840}
1836 1841
1842static void set_skip_buddy(struct sched_entity *se)
1843{
1844 if (likely(task_of(se)->policy != SCHED_IDLE)) {
1845 for_each_sched_entity(se)
1846 cfs_rq_of(se)->skip = se;
1847 }
1848}
1849
1837/* 1850/*
1838 * Preempt the current task with a newly woken task if needed: 1851 * Preempt the current task with a newly woken task if needed:
1839 */ 1852 */
@@ -1857,16 +1870,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1857 if (test_tsk_need_resched(curr)) 1870 if (test_tsk_need_resched(curr))
1858 return; 1871 return;
1859 1872
1873 /* Idle tasks are by definition preempted by non-idle tasks. */
1874 if (unlikely(curr->policy == SCHED_IDLE) &&
1875 likely(p->policy != SCHED_IDLE))
1876 goto preempt;
1877
1860 /* 1878 /*
1861 * Batch and idle tasks do not preempt (their preemption is driven by 1879 * Batch and idle tasks do not preempt non-idle tasks (their preemption
1862 * the tick): 1880 * is driven by the tick):
1863 */ 1881 */
1864 if (unlikely(p->policy != SCHED_NORMAL)) 1882 if (unlikely(p->policy != SCHED_NORMAL))
1865 return; 1883 return;
1866 1884
1867 /* Idle tasks are by definition preempted by everybody. */
1868 if (unlikely(curr->policy == SCHED_IDLE))
1869 goto preempt;
1870 1885
1871 if (!sched_feat(WAKEUP_PREEMPT)) 1886 if (!sched_feat(WAKEUP_PREEMPT))
1872 return; 1887 return;
@@ -1932,6 +1947,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1932 } 1947 }
1933} 1948}
1934 1949
1950/*
1951 * sched_yield() is very simple
1952 *
1953 * The magic of dealing with the ->skip buddy is in pick_next_entity.
1954 */
1955static void yield_task_fair(struct rq *rq)
1956{
1957 struct task_struct *curr = rq->curr;
1958 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1959 struct sched_entity *se = &curr->se;
1960
1961 /*
1962 * Are we the only task in the tree?
1963 */
1964 if (unlikely(rq->nr_running == 1))
1965 return;
1966
1967 clear_buddies(cfs_rq, se);
1968
1969 if (curr->policy != SCHED_BATCH) {
1970 update_rq_clock(rq);
1971 /*
1972 * Update run-time statistics of the 'current'.
1973 */
1974 update_curr(cfs_rq);
1975 }
1976
1977 set_skip_buddy(se);
1978}
1979
1980static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
1981{
1982 struct sched_entity *se = &p->se;
1983
1984 if (!se->on_rq)
1985 return false;
1986
1987 /* Tell the scheduler that we'd really like p to run next. */
1988 set_next_buddy(se);
1989
1990 yield_task_fair(rq);
1991
1992 return true;
1993}
1994
1935#ifdef CONFIG_SMP 1995#ifdef CONFIG_SMP
1936/************************************************** 1996/**************************************************
1937 * Fair scheduling class load-balancing methods: 1997 * Fair scheduling class load-balancing methods:
@@ -2123,7 +2183,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
2123 * We need to update shares after updating tg->load_weight in 2183 * We need to update shares after updating tg->load_weight in
2124 * order to adjust the weight of groups with long running tasks. 2184 * order to adjust the weight of groups with long running tasks.
2125 */ 2185 */
2126 update_cfs_shares(cfs_rq, 0); 2186 update_cfs_shares(cfs_rq);
2127 2187
2128 raw_spin_unlock_irqrestore(&rq->lock, flags); 2188 raw_spin_unlock_irqrestore(&rq->lock, flags);
2129 2189
@@ -2610,7 +2670,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2610 * @this_cpu: Cpu for which load balance is currently performed. 2670 * @this_cpu: Cpu for which load balance is currently performed.
2611 * @idle: Idle status of this_cpu 2671 * @idle: Idle status of this_cpu
2612 * @load_idx: Load index of sched_domain of this_cpu for load calc. 2672 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2613 * @sd_idle: Idle status of the sched_domain containing group.
2614 * @local_group: Does group contain this_cpu. 2673 * @local_group: Does group contain this_cpu.
2615 * @cpus: Set of cpus considered for load balancing. 2674 * @cpus: Set of cpus considered for load balancing.
2616 * @balance: Should we balance. 2675 * @balance: Should we balance.
@@ -2618,7 +2677,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2618 */ 2677 */
2619static inline void update_sg_lb_stats(struct sched_domain *sd, 2678static inline void update_sg_lb_stats(struct sched_domain *sd,
2620 struct sched_group *group, int this_cpu, 2679 struct sched_group *group, int this_cpu,
2621 enum cpu_idle_type idle, int load_idx, int *sd_idle, 2680 enum cpu_idle_type idle, int load_idx,
2622 int local_group, const struct cpumask *cpus, 2681 int local_group, const struct cpumask *cpus,
2623 int *balance, struct sg_lb_stats *sgs) 2682 int *balance, struct sg_lb_stats *sgs)
2624{ 2683{
@@ -2638,9 +2697,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2638 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 2697 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2639 struct rq *rq = cpu_rq(i); 2698 struct rq *rq = cpu_rq(i);
2640 2699
2641 if (*sd_idle && rq->nr_running)
2642 *sd_idle = 0;
2643
2644 /* Bias balancing toward cpus of our domain */ 2700 /* Bias balancing toward cpus of our domain */
2645 if (local_group) { 2701 if (local_group) {
2646 if (idle_cpu(i) && !first_idle_cpu) { 2702 if (idle_cpu(i) && !first_idle_cpu) {
@@ -2685,7 +2741,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2685 2741
2686 /* 2742 /*
2687 * Consider the group unbalanced when the imbalance is larger 2743 * Consider the group unbalanced when the imbalance is larger
2688 * than the average weight of two tasks. 2744 * than the average weight of a task.
2689 * 2745 *
2690 * APZ: with cgroup the avg task weight can vary wildly and 2746 * APZ: with cgroup the avg task weight can vary wildly and
2691 * might not be a suitable number - should we keep a 2747 * might not be a suitable number - should we keep a
@@ -2695,7 +2751,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2695 if (sgs->sum_nr_running) 2751 if (sgs->sum_nr_running)
2696 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 2752 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2697 2753
2698 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) 2754 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
2699 sgs->group_imb = 1; 2755 sgs->group_imb = 1;
2700 2756
2701 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2757 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
@@ -2755,15 +2811,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
2755 * @sd: sched_domain whose statistics are to be updated. 2811 * @sd: sched_domain whose statistics are to be updated.
2756 * @this_cpu: Cpu for which load balance is currently performed. 2812 * @this_cpu: Cpu for which load balance is currently performed.
2757 * @idle: Idle status of this_cpu 2813 * @idle: Idle status of this_cpu
2758 * @sd_idle: Idle status of the sched_domain containing sg.
2759 * @cpus: Set of cpus considered for load balancing. 2814 * @cpus: Set of cpus considered for load balancing.
2760 * @balance: Should we balance. 2815 * @balance: Should we balance.
2761 * @sds: variable to hold the statistics for this sched_domain. 2816 * @sds: variable to hold the statistics for this sched_domain.
2762 */ 2817 */
2763static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 2818static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2764 enum cpu_idle_type idle, int *sd_idle, 2819 enum cpu_idle_type idle, const struct cpumask *cpus,
2765 const struct cpumask *cpus, int *balance, 2820 int *balance, struct sd_lb_stats *sds)
2766 struct sd_lb_stats *sds)
2767{ 2821{
2768 struct sched_domain *child = sd->child; 2822 struct sched_domain *child = sd->child;
2769 struct sched_group *sg = sd->groups; 2823 struct sched_group *sg = sd->groups;
@@ -2781,7 +2835,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2781 2835
2782 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 2836 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2783 memset(&sgs, 0, sizeof(sgs)); 2837 memset(&sgs, 0, sizeof(sgs));
2784 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, 2838 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
2785 local_group, cpus, balance, &sgs); 2839 local_group, cpus, balance, &sgs);
2786 2840
2787 if (local_group && !(*balance)) 2841 if (local_group && !(*balance))
@@ -3033,7 +3087,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3033 * @imbalance: Variable which stores amount of weighted load which should 3087 * @imbalance: Variable which stores amount of weighted load which should
3034 * be moved to restore balance/put a group to idle. 3088 * be moved to restore balance/put a group to idle.
3035 * @idle: The idle status of this_cpu. 3089 * @idle: The idle status of this_cpu.
3036 * @sd_idle: The idleness of sd
3037 * @cpus: The set of CPUs under consideration for load-balancing. 3090 * @cpus: The set of CPUs under consideration for load-balancing.
3038 * @balance: Pointer to a variable indicating if this_cpu 3091 * @balance: Pointer to a variable indicating if this_cpu
3039 * is the appropriate cpu to perform load balancing at this_level. 3092 * is the appropriate cpu to perform load balancing at this_level.
@@ -3046,7 +3099,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3046static struct sched_group * 3099static struct sched_group *
3047find_busiest_group(struct sched_domain *sd, int this_cpu, 3100find_busiest_group(struct sched_domain *sd, int this_cpu,
3048 unsigned long *imbalance, enum cpu_idle_type idle, 3101 unsigned long *imbalance, enum cpu_idle_type idle,
3049 int *sd_idle, const struct cpumask *cpus, int *balance) 3102 const struct cpumask *cpus, int *balance)
3050{ 3103{
3051 struct sd_lb_stats sds; 3104 struct sd_lb_stats sds;
3052 3105
@@ -3056,22 +3109,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3056 * Compute the various statistics relevant for load balancing at 3109
3057 * this level. 3110 * this level.
3058 */ 3111 */
3059 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, 3112 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
3060 balance, &sds); 3113
3061 3114 /*
3062 /* Cases where imbalance does not exist from POV of this_cpu */ 3115 * this_cpu is not the appropriate cpu to perform load balancing at
3063 /* 1) this_cpu is not the appropriate cpu to perform load balancing 3116 * this level.
3064 * at this level.
3065 * 2) There is no busy sibling group to pull from.
3066 * 3) This group is the busiest group.
3067 * 4) This group is more busy than the avg busieness at this
3068 * sched_domain.
3069 * 5) The imbalance is within the specified limit.
3070 *
3071 * Note: when doing newidle balance, if the local group has excess
3072 * capacity (i.e. nr_running < group_capacity) and the busiest group
3073 * does not have any capacity, we force a load balance to pull tasks
3074 * to the local group. In this case, we skip past checks 3, 4 and 5.
3075 */ 3117 */
3076 if (!(*balance)) 3118 if (!(*balance))
3077 goto ret; 3119 goto ret;
@@ -3080,41 +3122,55 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3080 check_asym_packing(sd, &sds, this_cpu, imbalance)) 3122 check_asym_packing(sd, &sds, this_cpu, imbalance))
3081 return sds.busiest; 3123 return sds.busiest;
3082 3124
3125 /* There is no busy sibling group to pull tasks from */
3083 if (!sds.busiest || sds.busiest_nr_running == 0) 3126 if (!sds.busiest || sds.busiest_nr_running == 0)
3084 goto out_balanced; 3127 goto out_balanced;
3085 3128
3086 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 3129 /*
3130 * If the busiest group is imbalanced the below checks don't
3131 * work because they assume all things are equal, which typically
3132 * isn't true due to cpus_allowed constraints and the like.
3133 */
3134 if (sds.group_imb)
3135 goto force_balance;
3136
3137 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
3087 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 3138 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
3088 !sds.busiest_has_capacity) 3139 !sds.busiest_has_capacity)
3089 goto force_balance; 3140 goto force_balance;
3090 3141
3142 /*
3143 * If the local group is more busy than the selected busiest group
3144 * don't try and pull any tasks.
3145 */
3091 if (sds.this_load >= sds.max_load) 3146 if (sds.this_load >= sds.max_load)
3092 goto out_balanced; 3147 goto out_balanced;
3093 3148
3149 /*
3150 * Don't pull any tasks if this group is already above the domain
3151 * average load.
3152 */
3094 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; 3153 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3095
3096 if (sds.this_load >= sds.avg_load) 3154 if (sds.this_load >= sds.avg_load)
3097 goto out_balanced; 3155 goto out_balanced;
3098 3156
3099 /* 3157 if (idle == CPU_IDLE) {
3100 * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
3101 * And to check for busy balance use !idle_cpu instead of
3102 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
3103 * even when they are idle.
3104 */
3105 if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
3106 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3107 goto out_balanced;
3108 } else {
3109 /* 3158 /*
3110 * This cpu is idle. If the busiest group load doesn't 3159 * This cpu is idle. If the busiest group load doesn't
3111 * have more tasks than the number of available cpu's and 3160 * have more tasks than the number of available cpu's and
3112 * there is no imbalance between this and busiest group 3161 * there is no imbalance between this and busiest group
3113 * wrt to idle cpu's, it is balanced. 3162 * wrt to idle cpu's, it is balanced.
3114 */ 3163 */
3115 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && 3164 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
3116 sds.busiest_nr_running <= sds.busiest_group_weight) 3165 sds.busiest_nr_running <= sds.busiest_group_weight)
3117 goto out_balanced; 3166 goto out_balanced;
3167 } else {
3168 /*
3169 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
3170 * imbalance_pct to be conservative.
3171 */
3172 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3173 goto out_balanced;
3118 } 3174 }
3119 3175
3120force_balance: 3176force_balance:
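The rewritten comments above spell out the order of the early-exit checks in find_busiest_group() now that sd_idle is gone. A condensed standalone model of that order; the field and function names are assumptions, and the CPU_IDLE special case that compares idle-cpu counts is folded into a single flag here:

struct lb_snapshot {
        int has_busiest;                /* a busy sibling group exists */
        int group_imb;                  /* busiest group internally imbalanced */
        int newly_idle_can_pull;        /* CPU_NEWLY_IDLE, local capacity free,
                                           busiest group has none */
        unsigned long this_load, max_load, avg_load;
};

/* Returns nonzero when a balance attempt should go ahead. */
static int should_balance(const struct lb_snapshot *s, unsigned int imbalance_pct)
{
        if (!s->has_busiest)
                return 0;               /* nothing to pull from */
        if (s->group_imb)
                return 1;               /* force: cpus_allowed skew and the like */
        if (s->newly_idle_can_pull)
                return 1;               /* SD_BALANCE_NEWIDLE trumps SMP nice */
        if (s->this_load >= s->max_load)
                return 0;               /* local group is the busy one */
        if (s->this_load >= s->avg_load)
                return 0;               /* already above the domain average */
        /* conservative threshold, as in the non-CPU_IDLE path above */
        return 100 * s->max_load > imbalance_pct * s->this_load;
}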
@@ -3193,7 +3249,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3193/* Working cpumask for load_balance and load_balance_newidle. */ 3249/* Working cpumask for load_balance and load_balance_newidle. */
3194static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 3250static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3195 3251
3196static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, 3252static int need_active_balance(struct sched_domain *sd, int idle,
3197 int busiest_cpu, int this_cpu) 3253 int busiest_cpu, int this_cpu)
3198{ 3254{
3199 if (idle == CPU_NEWLY_IDLE) { 3255 if (idle == CPU_NEWLY_IDLE) {
@@ -3225,10 +3281,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
3225 * move_tasks() will succeed. ld_moved will be true and this 3281 * move_tasks() will succeed. ld_moved will be true and this
3226 * active balance code will not be triggered. 3282 * active balance code will not be triggered.
3227 */ 3283 */
3228 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3229 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3230 return 0;
3231
3232 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) 3284 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
3233 return 0; 3285 return 0;
3234 } 3286 }
@@ -3246,7 +3298,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3246 struct sched_domain *sd, enum cpu_idle_type idle, 3298 struct sched_domain *sd, enum cpu_idle_type idle,
3247 int *balance) 3299 int *balance)
3248{ 3300{
3249 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3301 int ld_moved, all_pinned = 0, active_balance = 0;
3250 struct sched_group *group; 3302 struct sched_group *group;
3251 unsigned long imbalance; 3303 unsigned long imbalance;
3252 struct rq *busiest; 3304 struct rq *busiest;
@@ -3255,20 +3307,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3255 3307
3256 cpumask_copy(cpus, cpu_active_mask); 3308 cpumask_copy(cpus, cpu_active_mask);
3257 3309
3258 /*
3259 * When power savings policy is enabled for the parent domain, idle
3260 * sibling can pick up load irrespective of busy siblings. In this case,
3261 * let the state of idle sibling percolate up as CPU_IDLE, instead of
3262 * portraying it as CPU_NOT_IDLE.
3263 */
3264 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3265 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3266 sd_idle = 1;
3267
3268 schedstat_inc(sd, lb_count[idle]); 3310 schedstat_inc(sd, lb_count[idle]);
3269 3311
3270redo: 3312redo:
3271 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3313 group = find_busiest_group(sd, this_cpu, &imbalance, idle,
3272 cpus, balance); 3314 cpus, balance);
3273 3315
3274 if (*balance == 0) 3316 if (*balance == 0)
@@ -3330,8 +3372,7 @@ redo:
3330 if (idle != CPU_NEWLY_IDLE) 3372 if (idle != CPU_NEWLY_IDLE)
3331 sd->nr_balance_failed++; 3373 sd->nr_balance_failed++;
3332 3374
3333 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), 3375 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
3334 this_cpu)) {
3335 raw_spin_lock_irqsave(&busiest->lock, flags); 3376 raw_spin_lock_irqsave(&busiest->lock, flags);
3336 3377
3337 /* don't kick the active_load_balance_cpu_stop, 3378 /* don't kick the active_load_balance_cpu_stop,
@@ -3386,10 +3427,6 @@ redo:
3386 sd->balance_interval *= 2; 3427 sd->balance_interval *= 2;
3387 } 3428 }
3388 3429
3389 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3390 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3391 ld_moved = -1;
3392
3393 goto out; 3430 goto out;
3394 3431
3395out_balanced: 3432out_balanced:
@@ -3403,11 +3440,7 @@ out_one_pinned:
3403 (sd->balance_interval < sd->max_interval)) 3440 (sd->balance_interval < sd->max_interval))
3404 sd->balance_interval *= 2; 3441 sd->balance_interval *= 2;
3405 3442
3406 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3443 ld_moved = 0;
3407 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3408 ld_moved = -1;
3409 else
3410 ld_moved = 0;
3411out: 3444out:
3412 return ld_moved; 3445 return ld_moved;
3413} 3446}
@@ -3831,8 +3864,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3831 if (load_balance(cpu, rq, sd, idle, &balance)) { 3864 if (load_balance(cpu, rq, sd, idle, &balance)) {
3832 /* 3865 /*
3833 * We've pulled tasks over so either we're no 3866 * We've pulled tasks over so either we're no
3834 * longer idle, or one of our SMT siblings is 3867 * longer idle.
3835 * not idle.
3836 */ 3868 */
3837 idle = CPU_NOT_IDLE; 3869 idle = CPU_NOT_IDLE;
3838 } 3870 }
@@ -4079,33 +4111,62 @@ static void task_fork_fair(struct task_struct *p)
4079 * Priority of the task has changed. Check to see if we preempt 4111 * Priority of the task has changed. Check to see if we preempt
4080 * the current task. 4112 * the current task.
4081 */ 4113 */
4082static void prio_changed_fair(struct rq *rq, struct task_struct *p, 4114static void
4083 int oldprio, int running) 4115prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
4084{ 4116{
4117 if (!p->se.on_rq)
4118 return;
4119
4085 /* 4120 /*
4086 * Reschedule if we are currently running on this runqueue and 4121 * Reschedule if we are currently running on this runqueue and
4087 * our priority decreased, or if we are not currently running on 4122 * our priority decreased, or if we are not currently running on
4088 * this runqueue and our priority is higher than the current's 4123 * this runqueue and our priority is higher than the current's
4089 */ 4124 */
4090 if (running) { 4125 if (rq->curr == p) {
4091 if (p->prio > oldprio) 4126 if (p->prio > oldprio)
4092 resched_task(rq->curr); 4127 resched_task(rq->curr);
4093 } else 4128 } else
4094 check_preempt_curr(rq, p, 0); 4129 check_preempt_curr(rq, p, 0);
4095} 4130}
4096 4131
4132static void switched_from_fair(struct rq *rq, struct task_struct *p)
4133{
4134 struct sched_entity *se = &p->se;
4135 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4136
4137 /*
4138 * Ensure the task's vruntime is normalized, so that when its
4139 * switched back to the fair class the enqueue_entity(.flags=0) will
4140 * do the right thing.
4141 *
4142 * If it was on_rq, then the dequeue_entity(.flags=0) will already
4143 * have normalized the vruntime, if it was !on_rq, then only when
4144 * the task is sleeping will it still have non-normalized vruntime.
4145 */
4146 if (!se->on_rq && p->state != TASK_RUNNING) {
4147 /*
4148 * Fix up our vruntime so that the current sleep doesn't
 4149 * cause an 'unlimited' sleep bonus.
4150 */
4151 place_entity(cfs_rq, se, 0);
4152 se->vruntime -= cfs_rq->min_vruntime;
4153 }
4154}
4155
4097/* 4156/*
4098 * We switched to the sched_fair class. 4157 * We switched to the sched_fair class.
4099 */ 4158 */
4100static void switched_to_fair(struct rq *rq, struct task_struct *p, 4159static void switched_to_fair(struct rq *rq, struct task_struct *p)
4101 int running)
4102{ 4160{
4161 if (!p->se.on_rq)
4162 return;
4163
4103 /* 4164 /*
4104 * We were most likely switched from sched_rt, so 4165 * We were most likely switched from sched_rt, so
4105 * kick off the schedule if running, otherwise just see 4166 * kick off the schedule if running, otherwise just see
4106 * if we can still preempt the current task. 4167 * if we can still preempt the current task.
4107 */ 4168 */
4108 if (running) 4169 if (rq->curr == p)
4109 resched_task(rq->curr); 4170 resched_task(rq->curr);
4110 else 4171 else
4111 check_preempt_curr(rq, p, 0); 4172 check_preempt_curr(rq, p, 0);
@@ -4171,6 +4232,7 @@ static const struct sched_class fair_sched_class = {
4171 .enqueue_task = enqueue_task_fair, 4232 .enqueue_task = enqueue_task_fair,
4172 .dequeue_task = dequeue_task_fair, 4233 .dequeue_task = dequeue_task_fair,
4173 .yield_task = yield_task_fair, 4234 .yield_task = yield_task_fair,
4235 .yield_to_task = yield_to_task_fair,
4174 4236
4175 .check_preempt_curr = check_preempt_wakeup, 4237 .check_preempt_curr = check_preempt_wakeup,
4176 4238
@@ -4191,6 +4253,7 @@ static const struct sched_class fair_sched_class = {
4191 .task_fork = task_fork_fair, 4253 .task_fork = task_fork_fair,
4192 4254
4193 .prio_changed = prio_changed_fair, 4255 .prio_changed = prio_changed_fair,
4256 .switched_from = switched_from_fair,
4194 .switched_to = switched_to_fair, 4257 .switched_to = switched_to_fair,
4195 4258
4196 .get_rr_interval = get_rr_interval_fair, 4259 .get_rr_interval = get_rr_interval_fair,
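
The sched_fair hunks above (and the matching sched_rt/sched_idletask/sched_stoptask changes below) converge on one convention once the 'running' argument is dropped: a callback first bails out if the task is not queued, then derives "was it running" from rq->curr. A condensed sketch of that shape, illustrative only; the function name is hypothetical and the real per-class versions differ in their preemption details:

        /* Sketch of the reworked callback convention (scheduler-internal types,
         * so this is not independently buildable). */
        static void prio_changed_sketch(struct rq *rq, struct task_struct *p, int oldprio)
        {
                if (!p->se.on_rq)               /* not queued: nothing to preempt */
                        return;

                if (rq->curr == p) {            /* replaces the old 'running' flag */
                        if (p->prio > oldprio)  /* priority dropped while running  */
                                resched_task(rq->curr);
                } else {
                        check_preempt_curr(rq, p, 0);
                }
        }
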
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f402c87c..a776a6396427 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq)
52{ 52{
53} 53}
54 54
55static void switched_to_idle(struct rq *rq, struct task_struct *p, 55static void switched_to_idle(struct rq *rq, struct task_struct *p)
56 int running)
57{ 56{
58 /* Can this actually happen?? */ 57 BUG();
59 if (running)
60 resched_task(rq->curr);
61 else
62 check_preempt_curr(rq, p, 0);
63} 58}
64 59
65static void prio_changed_idle(struct rq *rq, struct task_struct *p, 60static void
66 int oldprio, int running) 61prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
67{ 62{
68 /* This can happen for hot plug CPUS */ 63 BUG();
69
70 /*
71 * Reschedule if we are currently running on this runqueue and
72 * our priority decreased, or if we are not currently running on
73 * this runqueue and our priority is higher than the current's
74 */
75 if (running) {
76 if (p->prio > oldprio)
77 resched_task(rq->curr);
78 } else
79 check_preempt_curr(rq, p, 0);
80} 64}
81 65
82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 66static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
@@ -110,6 +94,4 @@ static const struct sched_class idle_sched_class = {
110 94
111 .prio_changed = prio_changed_idle, 95 .prio_changed = prio_changed_idle,
112 .switched_to = switched_to_idle, 96 .switched_to = switched_to_idle,
113
114 /* no .task_new for idle tasks */
115}; 97};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 01f75a5f17af..db308cb08b75 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1599,8 +1599,7 @@ static void rq_offline_rt(struct rq *rq)
1599 * When switch from the rt queue, we bring ourselves to a position 1599 * When switch from the rt queue, we bring ourselves to a position
1600 * that we might want to pull RT tasks from other runqueues. 1600 * that we might want to pull RT tasks from other runqueues.
1601 */ 1601 */
1602static void switched_from_rt(struct rq *rq, struct task_struct *p, 1602static void switched_from_rt(struct rq *rq, struct task_struct *p)
1603 int running)
1604{ 1603{
1605 /* 1604 /*
1606 * If there are other RT tasks then we will reschedule 1605 * If there are other RT tasks then we will reschedule
@@ -1609,7 +1608,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
1609 * we may need to handle the pulling of RT tasks 1608 * we may need to handle the pulling of RT tasks
1610 * now. 1609 * now.
1611 */ 1610 */
1612 if (!rq->rt.rt_nr_running) 1611 if (p->se.on_rq && !rq->rt.rt_nr_running)
1613 pull_rt_task(rq); 1612 pull_rt_task(rq);
1614} 1613}
1615 1614
@@ -1628,8 +1627,7 @@ static inline void init_sched_rt_class(void)
1628 * with RT tasks. In this case we try to push them off to 1627 * with RT tasks. In this case we try to push them off to
1629 * other runqueues. 1628 * other runqueues.
1630 */ 1629 */
1631static void switched_to_rt(struct rq *rq, struct task_struct *p, 1630static void switched_to_rt(struct rq *rq, struct task_struct *p)
1632 int running)
1633{ 1631{
1634 int check_resched = 1; 1632 int check_resched = 1;
1635 1633
@@ -1640,7 +1638,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1640 * If that current running task is also an RT task 1638 * If that current running task is also an RT task
1641 * then see if we can move to another run queue. 1639 * then see if we can move to another run queue.
1642 */ 1640 */
1643 if (!running) { 1641 if (p->se.on_rq && rq->curr != p) {
1644#ifdef CONFIG_SMP 1642#ifdef CONFIG_SMP
1645 if (rq->rt.overloaded && push_rt_task(rq) && 1643 if (rq->rt.overloaded && push_rt_task(rq) &&
1646 /* Don't resched if we changed runqueues */ 1644 /* Don't resched if we changed runqueues */
@@ -1656,10 +1654,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1656 * Priority of the task has changed. This may cause 1654 * Priority of the task has changed. This may cause
1657 * us to initiate a push or pull. 1655 * us to initiate a push or pull.
1658 */ 1656 */
1659static void prio_changed_rt(struct rq *rq, struct task_struct *p, 1657static void
1660 int oldprio, int running) 1658prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1661{ 1659{
1662 if (running) { 1660 if (!p->se.on_rq)
1661 return;
1662
1663 if (rq->curr == p) {
1663#ifdef CONFIG_SMP 1664#ifdef CONFIG_SMP
1664 /* 1665 /*
1665 * If our priority decreases while running, we 1666 * If our priority decreases while running, we
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 2bf6b47058c1..1ba2bd40fdac 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq)
59{ 59{
60} 60}
61 61
62static void switched_to_stop(struct rq *rq, struct task_struct *p, 62static void switched_to_stop(struct rq *rq, struct task_struct *p)
63 int running)
64{ 63{
65 BUG(); /* its impossible to change to this class */ 64 BUG(); /* its impossible to change to this class */
66} 65}
67 66
68static void prio_changed_stop(struct rq *rq, struct task_struct *p, 67static void
69 int oldprio, int running) 68prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
70{ 69{
71 BUG(); /* how!?, what priority? */ 70 BUG(); /* how!?, what priority? */
72} 71}
@@ -103,6 +102,4 @@ static const struct sched_class stop_sched_class = {
103 102
104 .prio_changed = prio_changed_stop, 103 .prio_changed = prio_changed_stop,
105 .switched_to = switched_to_stop, 104 .switched_to = switched_to_stop,
106
107 /* no .task_new for stop tasks */
108}; 105};
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff10fdce..324eff5468ad 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -636,13 +636,33 @@ static inline bool si_fromuser(const struct siginfo *info)
636} 636}
637 637
638/* 638/*
639 * called with RCU read lock from check_kill_permission()
640 */
641static int kill_ok_by_cred(struct task_struct *t)
642{
643 const struct cred *cred = current_cred();
644 const struct cred *tcred = __task_cred(t);
645
646 if (cred->user->user_ns == tcred->user->user_ns &&
647 (cred->euid == tcred->suid ||
648 cred->euid == tcred->uid ||
649 cred->uid == tcred->suid ||
650 cred->uid == tcred->uid))
651 return 1;
652
653 if (ns_capable(tcred->user->user_ns, CAP_KILL))
654 return 1;
655
656 return 0;
657}
658
659/*
639 * Bad permissions for sending the signal 660 * Bad permissions for sending the signal
640 * - the caller must hold the RCU read lock 661 * - the caller must hold the RCU read lock
641 */ 662 */
642static int check_kill_permission(int sig, struct siginfo *info, 663static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 664 struct task_struct *t)
644{ 665{
645 const struct cred *cred, *tcred;
646 struct pid *sid; 666 struct pid *sid;
647 int error; 667 int error;
648 668
@@ -656,14 +676,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
656 if (error) 676 if (error)
657 return error; 677 return error;
658 678
659 cred = current_cred();
660 tcred = __task_cred(t);
661 if (!same_thread_group(current, t) && 679 if (!same_thread_group(current, t) &&
662 (cred->euid ^ tcred->suid) && 680 !kill_ok_by_cred(t)) {
663 (cred->euid ^ tcred->uid) &&
664 (cred->uid ^ tcred->suid) &&
665 (cred->uid ^ tcred->uid) &&
666 !capable(CAP_KILL)) {
667 switch (sig) { 681 switch (sig) {
668 case SIGCONT: 682 case SIGCONT:
669 sid = task_session(t); 683 sid = task_session(t);
@@ -2421,9 +2435,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2421 return -EFAULT; 2435 return -EFAULT;
2422 2436
2423 /* Not even root can pretend to send signals from the kernel. 2437 /* Not even root can pretend to send signals from the kernel.
2424 Nor can they impersonate a kill(), which adds source info. */ 2438 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2425 if (info.si_code >= 0) 2439 */
2440 if (info.si_code != SI_QUEUE) {
2441 /* We used to allow any < 0 si_code */
2442 WARN_ON_ONCE(info.si_code < 0);
2426 return -EPERM; 2443 return -EPERM;
2444 }
2427 info.si_signo = sig; 2445 info.si_signo = sig;
2428 2446
2429 /* POSIX.1b doesn't mention process groups. */ 2447 /* POSIX.1b doesn't mention process groups. */
@@ -2437,9 +2455,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2437 return -EINVAL; 2455 return -EINVAL;
2438 2456
2439 /* Not even root can pretend to send signals from the kernel. 2457 /* Not even root can pretend to send signals from the kernel.
2440 Nor can they impersonate a kill(), which adds source info. */ 2458 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2441 if (info->si_code >= 0) 2459 */
2460 if (info->si_code != SI_QUEUE) {
2461 /* We used to allow any < 0 si_code */
2462 WARN_ON_ONCE(info->si_code < 0);
2442 return -EPERM; 2463 return -EPERM;
2464 }
2443 info->si_signo = sig; 2465 info->si_signo = sig;
2444 2466
2445 return do_send_specific(tgid, pid, sig, info); 2467 return do_send_specific(tgid, pid, sig, info);
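
A hedged userspace-side sketch of the user-visible effect of the si_code check above: queued-signal injection through rt_sigqueueinfo() must now use si_code == SI_QUEUE, and any other code is rejected with -EPERM. The helper name and the raw-syscall use are illustrative assumptions, not part of the patch.

        #include <signal.h>
        #include <string.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        static int send_queued(pid_t pid, int sig, int value)
        {
                siginfo_t info;

                memset(&info, 0, sizeof(info));
                info.si_signo = sig;
                info.si_code  = SI_QUEUE;       /* anything else now fails with EPERM */
                info.si_pid   = getpid();
                info.si_uid   = getuid();
                info.si_value.sival_int = value;

                return syscall(SYS_rt_sigqueueinfo, pid, sig, &info);
        }
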
diff --git a/kernel/smp.c b/kernel/smp.c
index 9910744f0856..73a195193558 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -194,7 +194,7 @@ void generic_smp_call_function_interrupt(void)
194 */ 194 */
195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
196 int refs; 196 int refs;
197 void (*func) (void *info); 197 smp_call_func_t func;
198 198
199 /* 199 /*
200 * Since we walk the list without any locks, we might 200 * Since we walk the list without any locks, we might
@@ -214,17 +214,17 @@ void generic_smp_call_function_interrupt(void)
214 if (atomic_read(&data->refs) == 0) 214 if (atomic_read(&data->refs) == 0)
215 continue; 215 continue;
216 216
217 func = data->csd.func; /* for later warn */ 217 func = data->csd.func; /* save for later warn */
218 data->csd.func(data->csd.info); 218 func(data->csd.info);
219 219
220 /* 220 /*
221 * If the cpu mask is not still set then it enabled interrupts, 221 * If the cpu mask is not still set then func enabled
222 * we took another smp interrupt, and executed the function 222 * interrupts (BUG), and this cpu took another smp call
223 * twice on this cpu. In theory that copy decremented refs. 223 * function interrupt and executed func(info) twice
224 * on this cpu. That nested execution decremented refs.
224 */ 225 */
225 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { 226 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
226 WARN(1, "%pS enabled interrupts and double executed\n", 227 WARN(1, "%pf enabled interrupts and double executed\n", func);
227 func);
228 continue; 228 continue;
229 } 229 }
230 230
@@ -450,7 +450,7 @@ void smp_call_function_many(const struct cpumask *mask,
450{ 450{
451 struct call_function_data *data; 451 struct call_function_data *data;
452 unsigned long flags; 452 unsigned long flags;
453 int cpu, next_cpu, this_cpu = smp_processor_id(); 453 int refs, cpu, next_cpu, this_cpu = smp_processor_id();
454 454
455 /* 455 /*
456 * Can deadlock when called with interrupts disabled. 456 * Can deadlock when called with interrupts disabled.
@@ -461,7 +461,7 @@ void smp_call_function_many(const struct cpumask *mask,
461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
462 && !oops_in_progress && !early_boot_irqs_disabled); 462 && !oops_in_progress && !early_boot_irqs_disabled);
463 463
464 /* So, what's a CPU they want? Ignoring this one. */ 464 /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
465 cpu = cpumask_first_and(mask, cpu_online_mask); 465 cpu = cpumask_first_and(mask, cpu_online_mask);
466 if (cpu == this_cpu) 466 if (cpu == this_cpu)
467 cpu = cpumask_next_and(cpu, mask, cpu_online_mask); 467 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
@@ -483,22 +483,49 @@ void smp_call_function_many(const struct cpumask *mask,
483 483
484 data = &__get_cpu_var(cfd_data); 484 data = &__get_cpu_var(cfd_data);
485 csd_lock(&data->csd); 485 csd_lock(&data->csd);
486
487 /* This BUG_ON verifies our reuse assertions and can be removed */
486 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); 488 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
487 489
490 /*
491 * The global call function queue list add and delete are protected
492 * by a lock, but the list is traversed without any lock, relying
493 * on the rcu list add and delete to allow safe concurrent traversal.
494 * We reuse the call function data without waiting for any grace
495 * period after some other cpu removes it from the global queue.
496 * This means a cpu might find our data block as it is being
497 * filled out.
498 *
499 * We hold off the interrupt handler on the other cpu by
500 * ordering our writes to the cpu mask vs our setting of the
501 * refs counter. We assert only the cpu owning the data block
502 * will set a bit in cpumask, and each bit will only be cleared
503 * by the subject cpu. Each cpu must first find its bit is
504 * set and then check that refs is set indicating the element is
505 * ready to be processed, otherwise it must skip the entry.
506 *
507 * On the previous iteration refs was set to 0 by another cpu.
508 * To avoid the use of transitivity, set the counter to 0 here
509 * so the wmb will pair with the rmb in the interrupt handler.
510 */
511 atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
512
488 data->csd.func = func; 513 data->csd.func = func;
489 data->csd.info = info; 514 data->csd.info = info;
490 cpumask_and(data->cpumask, mask, cpu_online_mask);
491 cpumask_clear_cpu(this_cpu, data->cpumask);
492 515
493 /* 516 /* Ensure 0 refs is visible before mask. Also orders func and info */
494 * To ensure the interrupt handler gets an complete view
495 * we order the cpumask and refs writes and order the read
496 * of them in the interrupt handler. In addition we may
497 * only clear our own cpu bit from the mask.
498 */
499 smp_wmb(); 517 smp_wmb();
500 518
501 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 519 /* We rely on the "and" being processed before the store */
520 cpumask_and(data->cpumask, mask, cpu_online_mask);
521 cpumask_clear_cpu(this_cpu, data->cpumask);
522 refs = cpumask_weight(data->cpumask);
523
524 /* Some callers race with other cpus changing the passed mask */
525 if (unlikely(!refs)) {
526 csd_unlock(&data->csd);
527 return;
528 }
502 529
503 raw_spin_lock_irqsave(&call_function.lock, flags); 530 raw_spin_lock_irqsave(&call_function.lock, flags);
504 /* 531 /*
@@ -507,6 +534,12 @@ void smp_call_function_many(const struct cpumask *mask,
507 * will not miss any other list entries: 534 * will not miss any other list entries:
508 */ 535 */
509 list_add_rcu(&data->csd.list, &call_function.queue); 536 list_add_rcu(&data->csd.list, &call_function.queue);
537 /*
538 * We rely on the wmb() in list_add_rcu to complete our writes
539 * to the cpumask before this write to refs, which indicates
540 * data is on the list and is ready to be processed.
541 */
542 atomic_set(&data->refs, refs);
510 raw_spin_unlock_irqrestore(&call_function.lock, flags); 543 raw_spin_unlock_irqrestore(&call_function.lock, flags);
511 544
512 /* 545 /*
@@ -571,6 +604,87 @@ void ipi_call_unlock_irq(void)
571} 604}
572#endif /* USE_GENERIC_SMP_HELPERS */ 605#endif /* USE_GENERIC_SMP_HELPERS */
573 606
607/* Setup configured maximum number of CPUs to activate */
608unsigned int setup_max_cpus = NR_CPUS;
609EXPORT_SYMBOL(setup_max_cpus);
610
611
612/*
613 * Setup routine for controlling SMP activation
614 *
615 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
616 * activation entirely (the MPS table probe still happens, though).
617 *
618 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
619 * greater than 0, limits the maximum number of CPUs activated in
620 * SMP mode to <NUM>.
621 */
622
623void __weak arch_disable_smp_support(void) { }
624
625static int __init nosmp(char *str)
626{
627 setup_max_cpus = 0;
628 arch_disable_smp_support();
629
630 return 0;
631}
632
633early_param("nosmp", nosmp);
634
 635/* this is a hard limit */
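PLACEHOLDER_SHOULD_NOT_BE_USED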
636static int __init nrcpus(char *str)
637{
638 int nr_cpus;
639
640 get_option(&str, &nr_cpus);
641 if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
642 nr_cpu_ids = nr_cpus;
643
644 return 0;
645}
646
647early_param("nr_cpus", nrcpus);
648
649static int __init maxcpus(char *str)
650{
651 get_option(&str, &setup_max_cpus);
652 if (setup_max_cpus == 0)
653 arch_disable_smp_support();
654
655 return 0;
656}
657
658early_param("maxcpus", maxcpus);
659
660/* Setup number of possible processor ids */
661int nr_cpu_ids __read_mostly = NR_CPUS;
662EXPORT_SYMBOL(nr_cpu_ids);
663
664/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
665void __init setup_nr_cpu_ids(void)
666{
667 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
668}
669
670/* Called by boot processor to activate the rest. */
671void __init smp_init(void)
672{
673 unsigned int cpu;
674
675 /* FIXME: This should be done in userspace --RR */
676 for_each_present_cpu(cpu) {
677 if (num_online_cpus() >= setup_max_cpus)
678 break;
679 if (!cpu_online(cpu))
680 cpu_up(cpu);
681 }
682
683 /* Any cleanup work */
684 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
685 smp_cpus_done(setup_max_cpus);
686}
687
574/* 688/*
575 * Call a function on all processors. May be used during early boot while 689 * Call a function on all processors. May be used during early boot while
576 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead 690 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
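
The long comment added to smp_call_function_many() above boils down to a publish/consume pairing: the sender fills the element (cpumask, func, info), then publishes it by setting refs non-zero, while the interrupt handler checks its cpumask bit and refs before touching the element. A hedged analogue in userspace C11 (not kernel code; release/acquire fences stand in for smp_wmb()/smp_rmb(), and the names below are invented for illustration):

        #include <stdatomic.h>

        struct call_slot {
                int payload;            /* stands in for cpumask/func/info */
                atomic_int ready;       /* stands in for data->refs        */
        };

        static void publish(struct call_slot *s, int value)
        {
                s->payload = value;                             /* fill the element   */
                atomic_store_explicit(&s->ready, 1,
                                      memory_order_release);    /* then mark it ready */
        }

        static int consume(struct call_slot *s, int *out)
        {
                if (!atomic_load_explicit(&s->ready, memory_order_acquire))
                        return 0;               /* not published yet, skip the entry */
                *out = s->payload;              /* safe to read the payload now      */
                return 1;
        }
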
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 68eb5efec388..735d87095172 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat);
54 54
55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; 55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
56 56
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
@@ -311,9 +311,21 @@ void irq_enter(void)
311} 311}
312 312
313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
314# define invoke_softirq() __do_softirq() 314static inline void invoke_softirq(void)
315{
316 if (!force_irqthreads)
317 __do_softirq();
318 else
319 wakeup_softirqd();
320}
315#else 321#else
316# define invoke_softirq() do_softirq() 322static inline void invoke_softirq(void)
323{
324 if (!force_irqthreads)
325 do_softirq();
326 else
327 wakeup_softirqd();
328}
317#endif 329#endif
318 330
319/* 331/*
@@ -721,7 +733,6 @@ static int run_ksoftirqd(void * __bind_cpu)
721{ 733{
722 set_current_state(TASK_INTERRUPTIBLE); 734 set_current_state(TASK_INTERRUPTIBLE);
723 735
724 current->flags |= PF_KSOFTIRQD;
725 while (!kthread_should_stop()) { 736 while (!kthread_should_stop()) {
726 preempt_disable(); 737 preempt_disable();
727 if (!local_softirq_pending()) { 738 if (!local_softirq_pending()) {
@@ -738,7 +749,10 @@ static int run_ksoftirqd(void * __bind_cpu)
738 don't process */ 749 don't process */
739 if (cpu_is_offline((long)__bind_cpu)) 750 if (cpu_is_offline((long)__bind_cpu))
740 goto wait_to_die; 751 goto wait_to_die;
741 do_softirq(); 752 local_irq_disable();
753 if (local_softirq_pending())
754 __do_softirq();
755 local_irq_enable();
742 preempt_enable_no_resched(); 756 preempt_enable_no_resched();
743 cond_resched(); 757 cond_resched();
744 preempt_disable(); 758 preempt_disable();
@@ -831,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
831 switch (action) { 845 switch (action) {
832 case CPU_UP_PREPARE: 846 case CPU_UP_PREPARE:
833 case CPU_UP_PREPARE_FROZEN: 847 case CPU_UP_PREPARE_FROZEN:
834 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 848 p = kthread_create_on_node(run_ksoftirqd,
849 hcpu,
850 cpu_to_node(hotcpu),
851 "ksoftirqd/%d", hotcpu);
835 if (IS_ERR(p)) { 852 if (IS_ERR(p)) {
836 printk("ksoftirqd for %i failed\n", hotcpu); 853 printk("ksoftirqd for %i failed\n", hotcpu);
837 return notifier_from_errno(PTR_ERR(p)); 854 return notifier_from_errno(PTR_ERR(p));
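
The two invoke_softirq() variants added above only differ in whether interrupts are already disabled at irq exit; the decision they share is sketched below (illustrative restatement, not the verbatim source, and force_irqthreads/wakeup_softirqd are internal to the irq/softirq code):

        static inline void invoke_softirq_sketch(void)
        {
                if (!force_irqthreads)
                        __do_softirq();         /* run softirqs inline, as before */
                else
                        wakeup_softirqd();      /* defer to the per-cpu ksoftirqd */
        }
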
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2df820b03beb..e3516b29076c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
301 case CPU_UP_PREPARE: 301 case CPU_UP_PREPARE:
302 BUG_ON(stopper->thread || stopper->enabled || 302 BUG_ON(stopper->thread || stopper->enabled ||
303 !list_empty(&stopper->works)); 303 !list_empty(&stopper->works));
304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", 304 p = kthread_create_on_node(cpu_stopper_thread,
305 cpu); 305 stopper,
306 cpu_to_node(cpu),
307 "migration/%d", cpu);
306 if (IS_ERR(p)) 308 if (IS_ERR(p))
307 return notifier_from_errno(PTR_ERR(p)); 309 return notifier_from_errno(PTR_ERR(p));
308 get_task_struct(p); 310 get_task_struct(p);
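
Both the ksoftirqd and migration-thread hunks switch to the NUMA-aware creation call. A hedged sketch of that pattern (worker_fn and arg are hypothetical placeholders; the bind/wake steps mirror what the callers around these hunks already do):

        #include <linux/err.h>
        #include <linux/kthread.h>
        #include <linux/sched.h>
        #include <linux/topology.h>

        static int start_percpu_worker(int (*worker_fn)(void *), void *arg, int cpu)
        {
                struct task_struct *p;

                /* allocate the thread's stack/task_struct on the CPU's memory node */
                p = kthread_create_on_node(worker_fn, arg, cpu_to_node(cpu),
                                           "worker/%d", cpu);
                if (IS_ERR(p))
                        return PTR_ERR(p);

                kthread_bind(p, cpu);           /* keep the thread on its CPU */
                wake_up_process(p);
                return 0;
        }
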
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702ec813..af468edf096a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -37,6 +37,7 @@
37#include <linux/ptrace.h> 37#include <linux/ptrace.h>
38#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
39#include <linux/gfp.h> 39#include <linux/gfp.h>
40#include <linux/syscore_ops.h>
40 41
41#include <linux/compat.h> 42#include <linux/compat.h>
42#include <linux/syscalls.h> 43#include <linux/syscalls.h>
@@ -119,16 +120,33 @@ EXPORT_SYMBOL(cad_pid);
119void (*pm_power_off_prepare)(void); 120void (*pm_power_off_prepare)(void);
120 121
121/* 122/*
 123 * Returns true if current's euid is the same as p's uid or euid,
 124 * or if current has CAP_SYS_NICE in p's user_ns.
125 *
126 * Called with rcu_read_lock, creds are safe
127 */
128static bool set_one_prio_perm(struct task_struct *p)
129{
130 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
131
132 if (pcred->user->user_ns == cred->user->user_ns &&
133 (pcred->uid == cred->euid ||
134 pcred->euid == cred->euid))
135 return true;
136 if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
137 return true;
138 return false;
139}
140
141/*
122 * set the priority of a task 142 * set the priority of a task
123 * - the caller must hold the RCU read lock 143 * - the caller must hold the RCU read lock
124 */ 144 */
125static int set_one_prio(struct task_struct *p, int niceval, int error) 145static int set_one_prio(struct task_struct *p, int niceval, int error)
126{ 146{
127 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
128 int no_nice; 147 int no_nice;
129 148
130 if (pcred->uid != cred->euid && 149 if (!set_one_prio_perm(p)) {
131 pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
132 error = -EPERM; 150 error = -EPERM;
133 goto out; 151 goto out;
134 } 152 }
@@ -298,6 +316,7 @@ void kernel_restart_prepare(char *cmd)
298 system_state = SYSTEM_RESTART; 316 system_state = SYSTEM_RESTART;
299 device_shutdown(); 317 device_shutdown();
300 sysdev_shutdown(); 318 sysdev_shutdown();
319 syscore_shutdown();
301} 320}
302 321
303/** 322/**
@@ -336,6 +355,7 @@ void kernel_halt(void)
336{ 355{
337 kernel_shutdown_prepare(SYSTEM_HALT); 356 kernel_shutdown_prepare(SYSTEM_HALT);
338 sysdev_shutdown(); 357 sysdev_shutdown();
358 syscore_shutdown();
339 printk(KERN_EMERG "System halted.\n"); 359 printk(KERN_EMERG "System halted.\n");
340 kmsg_dump(KMSG_DUMP_HALT); 360 kmsg_dump(KMSG_DUMP_HALT);
341 machine_halt(); 361 machine_halt();
@@ -355,6 +375,7 @@ void kernel_power_off(void)
355 pm_power_off_prepare(); 375 pm_power_off_prepare();
356 disable_nonboot_cpus(); 376 disable_nonboot_cpus();
357 sysdev_shutdown(); 377 sysdev_shutdown();
378 syscore_shutdown();
358 printk(KERN_EMERG "Power down.\n"); 379 printk(KERN_EMERG "Power down.\n");
359 kmsg_dump(KMSG_DUMP_POWEROFF); 380 kmsg_dump(KMSG_DUMP_POWEROFF);
360 machine_power_off(); 381 machine_power_off();
@@ -502,7 +523,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
502 if (rgid != (gid_t) -1) { 523 if (rgid != (gid_t) -1) {
503 if (old->gid == rgid || 524 if (old->gid == rgid ||
504 old->egid == rgid || 525 old->egid == rgid ||
505 capable(CAP_SETGID)) 526 nsown_capable(CAP_SETGID))
506 new->gid = rgid; 527 new->gid = rgid;
507 else 528 else
508 goto error; 529 goto error;
@@ -511,7 +532,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
511 if (old->gid == egid || 532 if (old->gid == egid ||
512 old->egid == egid || 533 old->egid == egid ||
513 old->sgid == egid || 534 old->sgid == egid ||
514 capable(CAP_SETGID)) 535 nsown_capable(CAP_SETGID))
515 new->egid = egid; 536 new->egid = egid;
516 else 537 else
517 goto error; 538 goto error;
@@ -546,7 +567,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
546 old = current_cred(); 567 old = current_cred();
547 568
548 retval = -EPERM; 569 retval = -EPERM;
549 if (capable(CAP_SETGID)) 570 if (nsown_capable(CAP_SETGID))
550 new->gid = new->egid = new->sgid = new->fsgid = gid; 571 new->gid = new->egid = new->sgid = new->fsgid = gid;
551 else if (gid == old->gid || gid == old->sgid) 572 else if (gid == old->gid || gid == old->sgid)
552 new->egid = new->fsgid = gid; 573 new->egid = new->fsgid = gid;
@@ -613,7 +634,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
613 new->uid = ruid; 634 new->uid = ruid;
614 if (old->uid != ruid && 635 if (old->uid != ruid &&
615 old->euid != ruid && 636 old->euid != ruid &&
616 !capable(CAP_SETUID)) 637 !nsown_capable(CAP_SETUID))
617 goto error; 638 goto error;
618 } 639 }
619 640
@@ -622,7 +643,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
622 if (old->uid != euid && 643 if (old->uid != euid &&
623 old->euid != euid && 644 old->euid != euid &&
624 old->suid != euid && 645 old->suid != euid &&
625 !capable(CAP_SETUID)) 646 !nsown_capable(CAP_SETUID))
626 goto error; 647 goto error;
627 } 648 }
628 649
@@ -670,7 +691,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
670 old = current_cred(); 691 old = current_cred();
671 692
672 retval = -EPERM; 693 retval = -EPERM;
673 if (capable(CAP_SETUID)) { 694 if (nsown_capable(CAP_SETUID)) {
674 new->suid = new->uid = uid; 695 new->suid = new->uid = uid;
675 if (uid != old->uid) { 696 if (uid != old->uid) {
676 retval = set_user(new); 697 retval = set_user(new);
@@ -712,7 +733,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
712 old = current_cred(); 733 old = current_cred();
713 734
714 retval = -EPERM; 735 retval = -EPERM;
715 if (!capable(CAP_SETUID)) { 736 if (!nsown_capable(CAP_SETUID)) {
716 if (ruid != (uid_t) -1 && ruid != old->uid && 737 if (ruid != (uid_t) -1 && ruid != old->uid &&
717 ruid != old->euid && ruid != old->suid) 738 ruid != old->euid && ruid != old->suid)
718 goto error; 739 goto error;
@@ -776,7 +797,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
776 old = current_cred(); 797 old = current_cred();
777 798
778 retval = -EPERM; 799 retval = -EPERM;
779 if (!capable(CAP_SETGID)) { 800 if (!nsown_capable(CAP_SETGID)) {
780 if (rgid != (gid_t) -1 && rgid != old->gid && 801 if (rgid != (gid_t) -1 && rgid != old->gid &&
781 rgid != old->egid && rgid != old->sgid) 802 rgid != old->egid && rgid != old->sgid)
782 goto error; 803 goto error;
@@ -836,7 +857,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
836 857
837 if (uid == old->uid || uid == old->euid || 858 if (uid == old->uid || uid == old->euid ||
838 uid == old->suid || uid == old->fsuid || 859 uid == old->suid || uid == old->fsuid ||
839 capable(CAP_SETUID)) { 860 nsown_capable(CAP_SETUID)) {
840 if (uid != old_fsuid) { 861 if (uid != old_fsuid) {
841 new->fsuid = uid; 862 new->fsuid = uid;
842 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 863 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -869,7 +890,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
869 890
870 if (gid == old->gid || gid == old->egid || 891 if (gid == old->gid || gid == old->egid ||
871 gid == old->sgid || gid == old->fsgid || 892 gid == old->sgid || gid == old->fsgid ||
872 capable(CAP_SETGID)) { 893 nsown_capable(CAP_SETGID)) {
873 if (gid != old_fsgid) { 894 if (gid != old_fsgid) {
874 new->fsgid = gid; 895 new->fsgid = gid;
875 goto change_okay; 896 goto change_okay;
@@ -1177,8 +1198,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1177 int errno; 1198 int errno;
1178 char tmp[__NEW_UTS_LEN]; 1199 char tmp[__NEW_UTS_LEN];
1179 1200
1180 if (!capable(CAP_SYS_ADMIN)) 1201 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1181 return -EPERM; 1202 return -EPERM;
1203
1182 if (len < 0 || len > __NEW_UTS_LEN) 1204 if (len < 0 || len > __NEW_UTS_LEN)
1183 return -EINVAL; 1205 return -EINVAL;
1184 down_write(&uts_sem); 1206 down_write(&uts_sem);
@@ -1226,7 +1248,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1226 int errno; 1248 int errno;
1227 char tmp[__NEW_UTS_LEN]; 1249 char tmp[__NEW_UTS_LEN];
1228 1250
1229 if (!capable(CAP_SYS_ADMIN)) 1251 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1230 return -EPERM; 1252 return -EPERM;
1231 if (len < 0 || len > __NEW_UTS_LEN) 1253 if (len < 0 || len > __NEW_UTS_LEN)
1232 return -EINVAL; 1254 return -EINVAL;
@@ -1341,6 +1363,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
1341 rlim = tsk->signal->rlim + resource; 1363 rlim = tsk->signal->rlim + resource;
1342 task_lock(tsk->group_leader); 1364 task_lock(tsk->group_leader);
1343 if (new_rlim) { 1365 if (new_rlim) {
1366 /* Keep the capable check against init_user_ns until
1367 cgroups can contain all limits */
1344 if (new_rlim->rlim_max > rlim->rlim_max && 1368 if (new_rlim->rlim_max > rlim->rlim_max &&
1345 !capable(CAP_SYS_RESOURCE)) 1369 !capable(CAP_SYS_RESOURCE))
1346 retval = -EPERM; 1370 retval = -EPERM;
@@ -1384,19 +1408,22 @@ static int check_prlimit_permission(struct task_struct *task)
1384{ 1408{
1385 const struct cred *cred = current_cred(), *tcred; 1409 const struct cred *cred = current_cred(), *tcred;
1386 1410
1387 tcred = __task_cred(task); 1411 if (current == task)
1388 if (current != task && 1412 return 0;
1389 (cred->uid != tcred->euid ||
1390 cred->uid != tcred->suid ||
1391 cred->uid != tcred->uid ||
1392 cred->gid != tcred->egid ||
1393 cred->gid != tcred->sgid ||
1394 cred->gid != tcred->gid) &&
1395 !capable(CAP_SYS_RESOURCE)) {
1396 return -EPERM;
1397 }
1398 1413
1399 return 0; 1414 tcred = __task_cred(task);
1415 if (cred->user->user_ns == tcred->user->user_ns &&
1416 (cred->uid == tcred->euid &&
1417 cred->uid == tcred->suid &&
1418 cred->uid == tcred->uid &&
1419 cred->gid == tcred->egid &&
1420 cred->gid == tcred->sgid &&
1421 cred->gid == tcred->gid))
1422 return 0;
1423 if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
1424 return 0;
1425
1426 return -EPERM;
1400} 1427}
1401 1428
1402SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, 1429SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
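
The sys.c conversions above repeat one check: plain uid/euid comparisons only count when both tasks share a user namespace, and the capability fallback is scoped to the target's namespace (nsown_capable() is assumed to be the same check against the caller's own namespace). A hedged restatement, modelled on set_one_prio_perm() above; the helper name is invented:

        #include <linux/capability.h>
        #include <linux/cred.h>

        static bool may_act_on(const struct cred *cred, const struct cred *tcred, int cap)
        {
                /* same user namespace and matching ids: allowed without privilege */
                if (cred->user->user_ns == tcred->user->user_ns &&
                    (cred->euid == tcred->uid || cred->euid == tcred->euid))
                        return true;

                /* otherwise require the capability in the *target's* namespace */
                return ns_capable(tcred->user->user_ns, cap);
        }
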
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c782fe9924c7..25cc41cd8f33 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -186,3 +186,8 @@ cond_syscall(sys_perf_event_open);
186/* fanotify! */ 186/* fanotify! */
187cond_syscall(sys_fanotify_init); 187cond_syscall(sys_fanotify_init);
188cond_syscall(sys_fanotify_mark); 188cond_syscall(sys_fanotify_mark);
189
190/* open by handle */
191cond_syscall(sys_name_to_handle_at);
192cond_syscall(sys_open_by_handle_at);
193cond_syscall(compat_sys_open_by_handle_at);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4eed0af5d144..c0bb32414b17 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -117,6 +117,7 @@ static int neg_one = -1;
117static int zero; 117static int zero;
118static int __maybe_unused one = 1; 118static int __maybe_unused one = 1;
119static int __maybe_unused two = 2; 119static int __maybe_unused two = 2;
120static int __maybe_unused three = 3;
120static unsigned long one_ul = 1; 121static unsigned long one_ul = 1;
121static int one_hundred = 100; 122static int one_hundred = 100;
122#ifdef CONFIG_PRINTK 123#ifdef CONFIG_PRINTK
@@ -169,6 +170,11 @@ static int proc_taint(struct ctl_table *table, int write,
169 void __user *buffer, size_t *lenp, loff_t *ppos); 170 void __user *buffer, size_t *lenp, loff_t *ppos);
170#endif 171#endif
171 172
173#ifdef CONFIG_PRINTK
174static int proc_dmesg_restrict(struct ctl_table *table, int write,
175 void __user *buffer, size_t *lenp, loff_t *ppos);
176#endif
177
172#ifdef CONFIG_MAGIC_SYSRQ 178#ifdef CONFIG_MAGIC_SYSRQ
173/* Note: sysrq code uses it's own private copy */ 179/* Note: sysrq code uses it's own private copy */
174static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 180static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -361,20 +367,13 @@ static struct ctl_table kern_table[] = {
361 .mode = 0644, 367 .mode = 0644,
362 .proc_handler = sched_rt_handler, 368 .proc_handler = sched_rt_handler,
363 }, 369 },
364 {
365 .procname = "sched_compat_yield",
366 .data = &sysctl_sched_compat_yield,
367 .maxlen = sizeof(unsigned int),
368 .mode = 0644,
369 .proc_handler = proc_dointvec,
370 },
371#ifdef CONFIG_SCHED_AUTOGROUP 370#ifdef CONFIG_SCHED_AUTOGROUP
372 { 371 {
373 .procname = "sched_autogroup_enabled", 372 .procname = "sched_autogroup_enabled",
374 .data = &sysctl_sched_autogroup_enabled, 373 .data = &sysctl_sched_autogroup_enabled,
375 .maxlen = sizeof(unsigned int), 374 .maxlen = sizeof(unsigned int),
376 .mode = 0644, 375 .mode = 0644,
377 .proc_handler = proc_dointvec, 376 .proc_handler = proc_dointvec_minmax,
378 .extra1 = &zero, 377 .extra1 = &zero,
379 .extra2 = &one, 378 .extra2 = &one,
380 }, 379 },
@@ -713,7 +712,7 @@ static struct ctl_table kern_table[] = {
713 .data = &kptr_restrict, 712 .data = &kptr_restrict,
714 .maxlen = sizeof(int), 713 .maxlen = sizeof(int),
715 .mode = 0644, 714 .mode = 0644,
716 .proc_handler = proc_dointvec_minmax, 715 .proc_handler = proc_dmesg_restrict,
717 .extra1 = &zero, 716 .extra1 = &zero,
718 .extra2 = &two, 717 .extra2 = &two,
719 }, 718 },
@@ -948,7 +947,7 @@ static struct ctl_table kern_table[] = {
948 .data = &sysctl_perf_event_sample_rate, 947 .data = &sysctl_perf_event_sample_rate,
949 .maxlen = sizeof(sysctl_perf_event_sample_rate), 948 .maxlen = sizeof(sysctl_perf_event_sample_rate),
950 .mode = 0644, 949 .mode = 0644,
951 .proc_handler = proc_dointvec, 950 .proc_handler = perf_proc_update_handler,
952 }, 951 },
953#endif 952#endif
954#ifdef CONFIG_KMEMCHECK 953#ifdef CONFIG_KMEMCHECK
@@ -978,14 +977,18 @@ static struct ctl_table vm_table[] = {
978 .data = &sysctl_overcommit_memory, 977 .data = &sysctl_overcommit_memory,
979 .maxlen = sizeof(sysctl_overcommit_memory), 978 .maxlen = sizeof(sysctl_overcommit_memory),
980 .mode = 0644, 979 .mode = 0644,
981 .proc_handler = proc_dointvec, 980 .proc_handler = proc_dointvec_minmax,
981 .extra1 = &zero,
982 .extra2 = &two,
982 }, 983 },
983 { 984 {
984 .procname = "panic_on_oom", 985 .procname = "panic_on_oom",
985 .data = &sysctl_panic_on_oom, 986 .data = &sysctl_panic_on_oom,
986 .maxlen = sizeof(sysctl_panic_on_oom), 987 .maxlen = sizeof(sysctl_panic_on_oom),
987 .mode = 0644, 988 .mode = 0644,
988 .proc_handler = proc_dointvec, 989 .proc_handler = proc_dointvec_minmax,
990 .extra1 = &zero,
991 .extra2 = &two,
989 }, 992 },
990 { 993 {
991 .procname = "oom_kill_allocating_task", 994 .procname = "oom_kill_allocating_task",
@@ -1013,7 +1016,8 @@ static struct ctl_table vm_table[] = {
1013 .data = &page_cluster, 1016 .data = &page_cluster,
1014 .maxlen = sizeof(int), 1017 .maxlen = sizeof(int),
1015 .mode = 0644, 1018 .mode = 0644,
1016 .proc_handler = proc_dointvec, 1019 .proc_handler = proc_dointvec_minmax,
1020 .extra1 = &zero,
1017 }, 1021 },
1018 { 1022 {
1019 .procname = "dirty_background_ratio", 1023 .procname = "dirty_background_ratio",
@@ -1061,7 +1065,8 @@ static struct ctl_table vm_table[] = {
1061 .data = &dirty_expire_interval, 1065 .data = &dirty_expire_interval,
1062 .maxlen = sizeof(dirty_expire_interval), 1066 .maxlen = sizeof(dirty_expire_interval),
1063 .mode = 0644, 1067 .mode = 0644,
1064 .proc_handler = proc_dointvec, 1068 .proc_handler = proc_dointvec_minmax,
1069 .extra1 = &zero,
1065 }, 1070 },
1066 { 1071 {
1067 .procname = "nr_pdflush_threads", 1072 .procname = "nr_pdflush_threads",
@@ -1137,6 +1142,8 @@ static struct ctl_table vm_table[] = {
1137 .maxlen = sizeof(int), 1142 .maxlen = sizeof(int),
1138 .mode = 0644, 1143 .mode = 0644,
1139 .proc_handler = drop_caches_sysctl_handler, 1144 .proc_handler = drop_caches_sysctl_handler,
1145 .extra1 = &one,
1146 .extra2 = &three,
1140 }, 1147 },
1141#ifdef CONFIG_COMPACTION 1148#ifdef CONFIG_COMPACTION
1142 { 1149 {
@@ -1690,13 +1697,8 @@ static int test_perm(int mode, int op)
1690 1697
1691int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) 1698int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1692{ 1699{
1693 int error;
1694 int mode; 1700 int mode;
1695 1701
1696 error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
1697 if (error)
1698 return error;
1699
1700 if (root->permissions) 1702 if (root->permissions)
1701 mode = root->permissions(root, current->nsproxy, table); 1703 mode = root->permissions(root, current->nsproxy, table);
1702 else 1704 else
@@ -2397,6 +2399,17 @@ static int proc_taint(struct ctl_table *table, int write,
2397 return err; 2399 return err;
2398} 2400}
2399 2401
2402#ifdef CONFIG_PRINTK
2403static int proc_dmesg_restrict(struct ctl_table *table, int write,
2404 void __user *buffer, size_t *lenp, loff_t *ppos)
2405{
2406 if (write && !capable(CAP_SYS_ADMIN))
2407 return -EPERM;
2408
2409 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2410}
2411#endif
2412
2400struct do_proc_dointvec_minmax_conv_param { 2413struct do_proc_dointvec_minmax_conv_param {
2401 int *min; 2414 int *min;
2402 int *max; 2415 int *max;
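
Several vm_table entries above switch from proc_dointvec to proc_dointvec_minmax and gain extra1/extra2 bounds, so out-of-range writes are rejected instead of silently accepted. A hedged illustration of the resulting table shape (values mirror the panic_on_oom hunk; the table and variable names below are invented):

        #include <linux/oom.h>
        #include <linux/sysctl.h>

        static int range_min;           /* lower bound: 0 */
        static int range_max = 2;       /* upper bound: 2 */

        static struct ctl_table example_vm_table[] = {
                {
                        .procname     = "panic_on_oom",
                        .data         = &sysctl_panic_on_oom,
                        .maxlen       = sizeof(sysctl_panic_on_oom),
                        .mode         = 0644,
                        .proc_handler = proc_dointvec_minmax,
                        .extra1       = &range_min,     /* writes below 0 return -EINVAL */
                        .extra2       = &range_max,     /* writes above 2 return -EINVAL */
                },
                { }     /* sentinel */
        };
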
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index b875bedf7c9a..3b8e028b9601 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1321 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1321 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1322{ 1322{
1323 const struct bin_table *table = NULL; 1323 const struct bin_table *table = NULL;
1324 struct nameidata nd;
1325 struct vfsmount *mnt; 1324 struct vfsmount *mnt;
1326 struct file *file; 1325 struct file *file;
1327 ssize_t result; 1326 ssize_t result;
1328 char *pathname; 1327 char *pathname;
1329 int flags; 1328 int flags;
1330 int acc_mode;
1331 1329
1332 pathname = sysctl_getname(name, nlen, &table); 1330 pathname = sysctl_getname(name, nlen, &table);
1333 result = PTR_ERR(pathname); 1331 result = PTR_ERR(pathname);
@@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1337 /* How should the sysctl be accessed? */ 1335 /* How should the sysctl be accessed? */
1338 if (oldval && oldlen && newval && newlen) { 1336 if (oldval && oldlen && newval && newlen) {
1339 flags = O_RDWR; 1337 flags = O_RDWR;
1340 acc_mode = MAY_READ | MAY_WRITE;
1341 } else if (newval && newlen) { 1338 } else if (newval && newlen) {
1342 flags = O_WRONLY; 1339 flags = O_WRONLY;
1343 acc_mode = MAY_WRITE;
1344 } else if (oldval && oldlen) { 1340 } else if (oldval && oldlen) {
1345 flags = O_RDONLY; 1341 flags = O_RDONLY;
1346 acc_mode = MAY_READ;
1347 } else { 1342 } else {
1348 result = 0; 1343 result = 0;
1349 goto out_putname; 1344 goto out_putname;
1350 } 1345 }
1351 1346
1352 mnt = current->nsproxy->pid_ns->proc_mnt; 1347 mnt = current->nsproxy->pid_ns->proc_mnt;
1353 result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); 1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
1354 if (result)
1355 goto out_putname;
1356
1357 result = may_open(&nd.path, acc_mode, flags);
1358 if (result)
1359 goto out_putpath;
1360
1361 file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
1362 result = PTR_ERR(file); 1349 result = PTR_ERR(file);
1363 if (IS_ERR(file)) 1350 if (IS_ERR(file))
1364 goto out_putname; 1351 goto out_putname;
@@ -1370,10 +1357,6 @@ out_putname:
1370 putname(pathname); 1357 putname(pathname);
1371out: 1358out:
1372 return result; 1359 return result;
1373
1374out_putpath:
1375 path_put(&nd.path);
1376 goto out_putname;
1377} 1360}
1378 1361
1379 1362
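
The binary_sysctl() hunk above replaces the hand-rolled vfs_path_lookup()/may_open()/dentry_open() sequence with a single call. A minimal hedged sketch of the new open path (wrapper name is illustrative):

        #include <linux/err.h>
        #include <linux/fs.h>
        #include <linux/mount.h>

        static struct file *open_under_root(struct vfsmount *mnt,
                                            const char *pathname, int flags)
        {
                /* lookup, permission check and open in one step;
                 * returns ERR_PTR() on failure, so callers test with IS_ERR() */
                return file_open_root(mnt->mnt_root, mnt, pathname, flags);
        }
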
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 10b90d8a03c4..4e4932a7b360 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
111 const char *fail = NULL; 111 const char *fail = NULL;
112 112
113 if (table->parent) { 113 if (table->parent) {
114 if (table->procname && !table->parent->procname) 114 if (!table->parent->procname)
115 set_fail(&fail, table, "Parent without procname"); 115 set_fail(&fail, table, "Parent without procname");
116 } 116 }
117 if (!table->procname)
118 set_fail(&fail, table, "No procname");
119 if (table->child) { 117 if (table->child) {
120 if (table->data) 118 if (table->data)
121 set_fail(&fail, table, "Directory with data?"); 119 set_fail(&fail, table, "Directory with data?");
@@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
144 set_fail(&fail, table, "No maxlen"); 142 set_fail(&fail, table, "No maxlen");
145 } 143 }
146#ifdef CONFIG_PROC_SYSCTL 144#ifdef CONFIG_PROC_SYSCTL
147 if (table->procname && !table->proc_handler) 145 if (!table->proc_handler)
148 set_fail(&fail, table, "No proc_handler"); 146 set_fail(&fail, table, "No proc_handler");
149#endif 147#endif
150#if 0
151 if (!table->procname && table->proc_handler)
152 set_fail(&fail, table, "proc_handler without procname");
153#endif
154 sysctl_check_leaf(namespaces, table, &fail); 148 sysctl_check_leaf(namespaces, table, &fail);
155 } 149 }
156 if (table->mode > 0777) 150 if (table->mode > 0777)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 3971c6b9d58d..9ffea360a778 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -685,7 +685,7 @@ static int __init taskstats_init(void)
685 goto err_cgroup_ops; 685 goto err_cgroup_ops;
686 686
687 family_registered = 1; 687 family_registered = 1;
688 printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 688 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
689 return 0; 689 return 0;
690err_cgroup_ops: 690err_cgroup_ops:
691 genl_unregister_ops(&family, &taskstats_ops); 691 genl_unregister_ops(&family, &taskstats_ops);
diff --git a/kernel/time.c b/kernel/time.c
index 32174359576f..8e8dc6d705c9 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -150,7 +150,7 @@ static inline void warp_clock(void)
150 * various programs will get confused when the clock gets warped. 150 * various programs will get confused when the clock gets warped.
151 */ 151 */
152 152
153int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) 153int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
154{ 154{
155 static int firsttime = 1; 155 static int firsttime = 1;
156 int error = 0; 156 int error = 0;
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x)
645} 645}
646 646
647/** 647/**
648 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies 648 * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
649 * 649 *
650 * @n: nsecs in u64 650 * @n: nsecs in u64
651 * 651 *
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x)
657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) 657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years 658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
659 */ 659 */
660unsigned long nsecs_to_jiffies(u64 n) 660u64 nsecs_to_jiffies64(u64 n)
661{ 661{
662#if (NSEC_PER_SEC % HZ) == 0 662#if (NSEC_PER_SEC % HZ) == 0
663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ 663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
@@ -674,22 +674,23 @@ unsigned long nsecs_to_jiffies(u64 n)
674#endif 674#endif
675} 675}
676 676
677#if (BITS_PER_LONG < 64) 677/**
678u64 get_jiffies_64(void) 678 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
679 *
680 * @n: nsecs in u64
681 *
682 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
683 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
684 * for scheduler, not for use in device drivers to calculate timeout value.
685 *
686 * note:
687 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
688 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
689 */
690unsigned long nsecs_to_jiffies(u64 n)
679{ 691{
680 unsigned long seq; 692 return (unsigned long)nsecs_to_jiffies64(n);
681 u64 ret;
682
683 do {
684 seq = read_seqbegin(&xtime_lock);
685 ret = jiffies_64;
686 } while (read_seqretry(&xtime_lock, seq));
687 return ret;
688} 693}
689EXPORT_SYMBOL(get_jiffies_64);
690#endif
691
692EXPORT_SYMBOL(jiffies);
693 694
694/* 695/*
695 * Add two timespec values and do a safety check for overflow. 696 * Add two timespec values and do a safety check for overflow.
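
The time.c hunk above splits the conversion: the scheduler-facing nsecs_to_jiffies64() keeps the full 64-bit result, and the old nsecs_to_jiffies() survives as a truncating wrapper. A hedged caller sketch (the function below is illustrative, not from the patch):

        #include <linux/jiffies.h>
        #include <linux/kernel.h>

        static void report_runtime(u64 runtime_ns)
        {
                u64 j64         = nsecs_to_jiffies64(runtime_ns);  /* full 64-bit result */
                unsigned long j = nsecs_to_jiffies(runtime_ns);    /* truncated to long  */

                printk(KERN_DEBUG "runtime: %llu jiffies64 (%lu as unsigned long)\n",
                       (unsigned long long)j64, j);
        }
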
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ee266620b06c..b0425991e9ac 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,5 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2obj-y += timeconv.o posix-clock.o
2 3
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d7395fdfb9f3..0d74b9ba90c8 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,7 +18,6 @@
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 5404a8456909..b2fa506667c0 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -22,8 +22,11 @@
22************************************************************************/ 22************************************************************************/
23#include <linux/clocksource.h> 23#include <linux/clocksource.h>
24#include <linux/jiffies.h> 24#include <linux/jiffies.h>
25#include <linux/module.h>
25#include <linux/init.h> 26#include <linux/init.h>
26 27
28#include "tick-internal.h"
29
27/* The Jiffies based clocksource is the lowest common 30/* The Jiffies based clocksource is the lowest common
28 * denominator clock source which should function on 31 * denominator clock source which should function on
29 * all systems. It has the same coarse resolution as 32 * all systems. It has the same coarse resolution as
@@ -64,6 +67,23 @@ struct clocksource clocksource_jiffies = {
64 .shift = JIFFIES_SHIFT, 67 .shift = JIFFIES_SHIFT,
65}; 68};
66 69
70#if (BITS_PER_LONG < 64)
71u64 get_jiffies_64(void)
72{
73 unsigned long seq;
74 u64 ret;
75
76 do {
77 seq = read_seqbegin(&xtime_lock);
78 ret = jiffies_64;
79 } while (read_seqretry(&xtime_lock, seq));
80 return ret;
81}
82EXPORT_SYMBOL(get_jiffies_64);
83#endif
84
85EXPORT_SYMBOL(jiffies);
86
67static int __init init_jiffies_clocksource(void) 87static int __init init_jiffies_clocksource(void)
68{ 88{
69 return clocksource_register(&clocksource_jiffies); 89 return clocksource_register(&clocksource_jiffies);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5c00242fa921..5f1bb8e2008f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -16,6 +16,8 @@
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h> 17#include <linux/module.h>
18 18
19#include "tick-internal.h"
20
19/* 21/*
20 * NTP timekeeping variables: 22 * NTP timekeeping variables:
21 */ 23 */
@@ -646,6 +648,17 @@ int do_adjtimex(struct timex *txc)
646 hrtimer_cancel(&leap_timer); 648 hrtimer_cancel(&leap_timer);
647 } 649 }
648 650
651 if (txc->modes & ADJ_SETOFFSET) {
652 struct timespec delta;
653 delta.tv_sec = txc->time.tv_sec;
654 delta.tv_nsec = txc->time.tv_usec;
655 if (!(txc->modes & ADJ_NANO))
656 delta.tv_nsec *= 1000;
657 result = timekeeping_inject_offset(&delta);
658 if (result)
659 return result;
660 }
661
649 getnstimeofday(&ts); 662 getnstimeofday(&ts);
650 663
651 write_seqlock_irq(&xtime_lock); 664 write_seqlock_irq(&xtime_lock);
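
A hedged userspace sketch of the new ADJ_SETOFFSET mode handled above: step the clock by a signed offset in one adjtimex() call instead of a read-modify-write settimeofday(). It assumes ADJ_SETOFFSET and ADJ_NANO are available from headers matching this kernel; the helper name is illustrative.

        #include <string.h>
        #include <sys/timex.h>

        static int step_clock_by(long sec, long nsec)
        {
                struct timex tx;

                memset(&tx, 0, sizeof(tx));
                tx.modes        = ADJ_SETOFFSET | ADJ_NANO;
                tx.time.tv_sec  = sec;
                tx.time.tv_usec = nsec; /* nanoseconds because ADJ_NANO is set */

                return adjtimex(&tx);   /* negative return means the offset was rejected */
        }
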
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
new file mode 100644
index 000000000000..25028dd4fa18
--- /dev/null
+++ b/kernel/time/posix-clock.c
@@ -0,0 +1,451 @@
1/*
2 * posix-clock.c - support for dynamic clock devices
3 *
4 * Copyright (C) 2010 OMICRON electronics GmbH
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20#include <linux/device.h>
21#include <linux/file.h>
22#include <linux/mutex.h>
23#include <linux/posix-clock.h>
24#include <linux/slab.h>
25#include <linux/syscalls.h>
26#include <linux/uaccess.h>
27
28static void delete_clock(struct kref *kref);
29
30/*
31 * Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
32 */
33static struct posix_clock *get_posix_clock(struct file *fp)
34{
35 struct posix_clock *clk = fp->private_data;
36
37 mutex_lock(&clk->mutex);
38
39 if (!clk->zombie)
40 return clk;
41
42 mutex_unlock(&clk->mutex);
43
44 return NULL;
45}
46
47static void put_posix_clock(struct posix_clock *clk)
48{
49 mutex_unlock(&clk->mutex);
50}
51
52static ssize_t posix_clock_read(struct file *fp, char __user *buf,
53 size_t count, loff_t *ppos)
54{
55 struct posix_clock *clk = get_posix_clock(fp);
56 int err = -EINVAL;
57
58 if (!clk)
59 return -ENODEV;
60
61 if (clk->ops.read)
62 err = clk->ops.read(clk, fp->f_flags, buf, count);
63
64 put_posix_clock(clk);
65
66 return err;
67}
68
69static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
70{
71 struct posix_clock *clk = get_posix_clock(fp);
72 int result = 0;
73
74 if (!clk)
75 return -ENODEV;
76
77 if (clk->ops.poll)
78 result = clk->ops.poll(clk, fp, wait);
79
80 put_posix_clock(clk);
81
82 return result;
83}
84
85static int posix_clock_fasync(int fd, struct file *fp, int on)
86{
87 struct posix_clock *clk = get_posix_clock(fp);
88 int err = 0;
89
90 if (!clk)
91 return -ENODEV;
92
93 if (clk->ops.fasync)
94 err = clk->ops.fasync(clk, fd, fp, on);
95
96 put_posix_clock(clk);
97
98 return err;
99}
100
101static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma)
102{
103 struct posix_clock *clk = get_posix_clock(fp);
104 int err = -ENODEV;
105
106 if (!clk)
107 return -ENODEV;
108
109 if (clk->ops.mmap)
110 err = clk->ops.mmap(clk, vma);
111
112 put_posix_clock(clk);
113
114 return err;
115}
116
117static long posix_clock_ioctl(struct file *fp,
118 unsigned int cmd, unsigned long arg)
119{
120 struct posix_clock *clk = get_posix_clock(fp);
121 int err = -ENOTTY;
122
123 if (!clk)
124 return -ENODEV;
125
126 if (clk->ops.ioctl)
127 err = clk->ops.ioctl(clk, cmd, arg);
128
129 put_posix_clock(clk);
130
131 return err;
132}
133
134#ifdef CONFIG_COMPAT
135static long posix_clock_compat_ioctl(struct file *fp,
136 unsigned int cmd, unsigned long arg)
137{
138 struct posix_clock *clk = get_posix_clock(fp);
139 int err = -ENOTTY;
140
141 if (!clk)
142 return -ENODEV;
143
144 if (clk->ops.ioctl)
145 err = clk->ops.ioctl(clk, cmd, arg);
146
147 put_posix_clock(clk);
148
149 return err;
150}
151#endif
152
153static int posix_clock_open(struct inode *inode, struct file *fp)
154{
155 int err;
156 struct posix_clock *clk =
157 container_of(inode->i_cdev, struct posix_clock, cdev);
158
159 mutex_lock(&clk->mutex);
160
161 if (clk->zombie) {
162 err = -ENODEV;
163 goto out;
164 }
165 if (clk->ops.open)
166 err = clk->ops.open(clk, fp->f_mode);
167 else
168 err = 0;
169
170 if (!err) {
171 kref_get(&clk->kref);
172 fp->private_data = clk;
173 }
174out:
175 mutex_unlock(&clk->mutex);
176 return err;
177}
178
179static int posix_clock_release(struct inode *inode, struct file *fp)
180{
181 struct posix_clock *clk = fp->private_data;
182 int err = 0;
183
184 if (clk->ops.release)
185 err = clk->ops.release(clk);
186
187 kref_put(&clk->kref, delete_clock);
188
189 fp->private_data = NULL;
190
191 return err;
192}
193
194static const struct file_operations posix_clock_file_operations = {
195 .owner = THIS_MODULE,
196 .llseek = no_llseek,
197 .read = posix_clock_read,
198 .poll = posix_clock_poll,
199 .unlocked_ioctl = posix_clock_ioctl,
200 .open = posix_clock_open,
201 .release = posix_clock_release,
202 .fasync = posix_clock_fasync,
203 .mmap = posix_clock_mmap,
204#ifdef CONFIG_COMPAT
205 .compat_ioctl = posix_clock_compat_ioctl,
206#endif
207};
208
209int posix_clock_register(struct posix_clock *clk, dev_t devid)
210{
211 int err;
212
213 kref_init(&clk->kref);
214 mutex_init(&clk->mutex);
215
216 cdev_init(&clk->cdev, &posix_clock_file_operations);
217 clk->cdev.owner = clk->ops.owner;
218 err = cdev_add(&clk->cdev, devid, 1);
219 if (err)
220 goto no_cdev;
221
222 return err;
223no_cdev:
224 mutex_destroy(&clk->mutex);
225 return err;
226}
227EXPORT_SYMBOL_GPL(posix_clock_register);
228
229static void delete_clock(struct kref *kref)
230{
231 struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
232 mutex_destroy(&clk->mutex);
233 if (clk->release)
234 clk->release(clk);
235}
236
237void posix_clock_unregister(struct posix_clock *clk)
238{
239 cdev_del(&clk->cdev);
240
241 mutex_lock(&clk->mutex);
242 clk->zombie = true;
243 mutex_unlock(&clk->mutex);
244
245 kref_put(&clk->kref, delete_clock);
246}
247EXPORT_SYMBOL_GPL(posix_clock_unregister);
248
249struct posix_clock_desc {
250 struct file *fp;
251 struct posix_clock *clk;
252};
253
254static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
255{
256 struct file *fp = fget(CLOCKID_TO_FD(id));
257 int err = -EINVAL;
258
259 if (!fp)
260 return err;
261
262 if (fp->f_op->open != posix_clock_open || !fp->private_data)
263 goto out;
264
265 cd->fp = fp;
266 cd->clk = get_posix_clock(fp);
267
268 err = cd->clk ? 0 : -ENODEV;
269out:
270 if (err)
271 fput(fp);
272 return err;
273}
274
275static void put_clock_desc(struct posix_clock_desc *cd)
276{
277 put_posix_clock(cd->clk);
278 fput(cd->fp);
279}
280
281static int pc_clock_adjtime(clockid_t id, struct timex *tx)
282{
283 struct posix_clock_desc cd;
284 int err;
285
286 err = get_clock_desc(id, &cd);
287 if (err)
288 return err;
289
290 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
291 err = -EACCES;
292 goto out;
293 }
294
295 if (cd.clk->ops.clock_adjtime)
296 err = cd.clk->ops.clock_adjtime(cd.clk, tx);
297 else
298 err = -EOPNOTSUPP;
299out:
300 put_clock_desc(&cd);
301
302 return err;
303}
304
305static int pc_clock_gettime(clockid_t id, struct timespec *ts)
306{
307 struct posix_clock_desc cd;
308 int err;
309
310 err = get_clock_desc(id, &cd);
311 if (err)
312 return err;
313
314 if (cd.clk->ops.clock_gettime)
315 err = cd.clk->ops.clock_gettime(cd.clk, ts);
316 else
317 err = -EOPNOTSUPP;
318
319 put_clock_desc(&cd);
320
321 return err;
322}
323
324static int pc_clock_getres(clockid_t id, struct timespec *ts)
325{
326 struct posix_clock_desc cd;
327 int err;
328
329 err = get_clock_desc(id, &cd);
330 if (err)
331 return err;
332
333 if (cd.clk->ops.clock_getres)
334 err = cd.clk->ops.clock_getres(cd.clk, ts);
335 else
336 err = -EOPNOTSUPP;
337
338 put_clock_desc(&cd);
339
340 return err;
341}
342
343static int pc_clock_settime(clockid_t id, const struct timespec *ts)
344{
345 struct posix_clock_desc cd;
346 int err;
347
348 err = get_clock_desc(id, &cd);
349 if (err)
350 return err;
351
352 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
353 err = -EACCES;
354 goto out;
355 }
356
357 if (cd.clk->ops.clock_settime)
358 err = cd.clk->ops.clock_settime(cd.clk, ts);
359 else
360 err = -EOPNOTSUPP;
361out:
362 put_clock_desc(&cd);
363
364 return err;
365}
366
367static int pc_timer_create(struct k_itimer *kit)
368{
369 clockid_t id = kit->it_clock;
370 struct posix_clock_desc cd;
371 int err;
372
373 err = get_clock_desc(id, &cd);
374 if (err)
375 return err;
376
377 if (cd.clk->ops.timer_create)
378 err = cd.clk->ops.timer_create(cd.clk, kit);
379 else
380 err = -EOPNOTSUPP;
381
382 put_clock_desc(&cd);
383
384 return err;
385}
386
387static int pc_timer_delete(struct k_itimer *kit)
388{
389 clockid_t id = kit->it_clock;
390 struct posix_clock_desc cd;
391 int err;
392
393 err = get_clock_desc(id, &cd);
394 if (err)
395 return err;
396
397 if (cd.clk->ops.timer_delete)
398 err = cd.clk->ops.timer_delete(cd.clk, kit);
399 else
400 err = -EOPNOTSUPP;
401
402 put_clock_desc(&cd);
403
404 return err;
405}
406
407static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
408{
409 clockid_t id = kit->it_clock;
410 struct posix_clock_desc cd;
411
412 if (get_clock_desc(id, &cd))
413 return;
414
415 if (cd.clk->ops.timer_gettime)
416 cd.clk->ops.timer_gettime(cd.clk, kit, ts);
417
418 put_clock_desc(&cd);
419}
420
421static int pc_timer_settime(struct k_itimer *kit, int flags,
422 struct itimerspec *ts, struct itimerspec *old)
423{
424 clockid_t id = kit->it_clock;
425 struct posix_clock_desc cd;
426 int err;
427
428 err = get_clock_desc(id, &cd);
429 if (err)
430 return err;
431
432 if (cd.clk->ops.timer_settime)
433 err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
434 else
435 err = -EOPNOTSUPP;
436
437 put_clock_desc(&cd);
438
439 return err;
440}
441
442struct k_clock clock_posix_dynamic = {
443 .clock_getres = pc_clock_getres,
444 .clock_set = pc_clock_settime,
445 .clock_get = pc_clock_gettime,
446 .clock_adj = pc_clock_adjtime,
447 .timer_create = pc_timer_create,
448 .timer_set = pc_timer_settime,
449 .timer_del = pc_timer_delete,
450 .timer_get = pc_timer_gettime,
451};
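
A sketch of how a clock driver might sit on top of this new file: struct posix_clock, posix_clock_register() and posix_clock_unregister() come from the code above, while the device, my_gettime() and the dev_t (obtained elsewhere, e.g. via alloc_chrdev_region()) are hypothetical.

    #include <linux/module.h>
    #include <linux/time.h>
    #include <linux/posix-clock.h>

    static struct posix_clock my_clock;         /* hypothetical device clock */

    static int my_gettime(struct posix_clock *pc, struct timespec *ts)
    {
            getnstimeofday(ts);                 /* stand-in for reading the hardware */
            return 0;
    }

    static int my_probe(dev_t devid)            /* devid allocated elsewhere */
    {
            my_clock.ops.owner         = THIS_MODULE;
            my_clock.ops.clock_gettime = my_gettime;

            /* Creates the chardev; clock_gettime() on the opened node reaches my_gettime(). */
            return posix_clock_register(&my_clock, devid);
    }

    static void my_remove(void)
    {
            /* Marks the clock zombie so in-flight users see -ENODEV, then drops the ref. */
            posix_clock_unregister(&my_clock);
    }
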
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index a3b5aff62606..da800ffa810c 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index ed228ef6f6b8..119528de8235 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include <asm/irq_regs.h> 22#include <asm/irq_regs.h>
24 23
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f65d3a723a64..1009b06d6f89 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
1/* 1/*
2 * tick internal variable and functions used by low/high res code 2 * tick internal variable and functions used by low/high res code
3 */ 3 */
4#include <linux/hrtimer.h>
5#include <linux/tick.h>
6
7#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
4 8
5#define TICK_DO_TIMER_NONE -1 9#define TICK_DO_TIMER_NONE -1
6#define TICK_DO_TIMER_BOOT -2 10#define TICK_DO_TIMER_BOOT -2
@@ -135,3 +139,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
135{ 139{
136 return !(dev->features & CLOCK_EVT_FEAT_DUMMY); 140 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
137} 141}
142
143#endif
144
145extern void do_timer(unsigned long ticks);
146extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 5cbc101f908b..2d04411a5f05 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c55ea2433471..d5097c44b407 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -19,7 +19,6 @@
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/tick.h>
23#include <linux/module.h> 22#include <linux/module.h>
24 23
25#include <asm/irq_regs.h> 24#include <asm/irq_regs.h>
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d27c7562902c..3bd7e3d5c632 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -353,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday);
353 * 353 *
354 * Sets the time of day to the new time, updates NTP and notifies hrtimers 354 * Sets the time of day to the new time, updates NTP and notifies hrtimers
355 */ 355 */
356int do_settimeofday(struct timespec *tv) 356int do_settimeofday(const struct timespec *tv)
357{ 357{
358 struct timespec ts_delta; 358 struct timespec ts_delta;
359 unsigned long flags; 359 unsigned long flags;
@@ -387,6 +387,42 @@ int do_settimeofday(struct timespec *tv)
387 387
388EXPORT_SYMBOL(do_settimeofday); 388EXPORT_SYMBOL(do_settimeofday);
389 389
390
391/**
392 * timekeeping_inject_offset - Adds or subtracts from the current time.
393 * @tv: pointer to the timespec variable containing the offset
394 *
395 * Adds or subtracts an offset value from the current time.
396 */
397int timekeeping_inject_offset(struct timespec *ts)
398{
399 unsigned long flags;
400
401 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
402 return -EINVAL;
403
404 write_seqlock_irqsave(&xtime_lock, flags);
405
406 timekeeping_forward_now();
407
408 xtime = timespec_add(xtime, *ts);
409 wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
410
411 timekeeper.ntp_error = 0;
412 ntp_clear();
413
414 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
415 timekeeper.mult);
416
417 write_sequnlock_irqrestore(&xtime_lock, flags);
418
419 /* signal hrtimers about time change */
420 clock_was_set();
421
422 return 0;
423}
424EXPORT_SYMBOL(timekeeping_inject_offset);
425
390/** 426/**
391 * change_clocksource - Swaps clocksources if a new one is available 427 * change_clocksource - Swaps clocksources if a new one is available
392 * 428 *
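
Because the tv_nsec check above rejects anything outside [0, NSEC_PER_SEC), a backwards correction has to be passed as a normalized timespec. A short kernel-side sketch (the caller is hypothetical; only timekeeping_inject_offset() comes from this hunk):

    #include <linux/time.h>

    /* Step the clock back by 0.3 s: written as -1 s + 0.7 s, not as tv_nsec = -300000000. */
    static int step_back_300ms(void)
    {
            struct timespec delta = {
                    .tv_sec  = -1,
                    .tv_nsec = 700000000,   /* 0 <= tv_nsec < NSEC_PER_SEC, so it passes */
            };

            return timekeeping_inject_offset(&delta);
    }
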
@@ -779,7 +815,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
779 * 815 *
780 * Called from the timer interrupt, must hold a write on xtime_lock. 816 * Called from the timer interrupt, must hold a write on xtime_lock.
781 */ 817 */
782void update_wall_time(void) 818static void update_wall_time(void)
783{ 819{
784 struct clocksource *clock; 820 struct clocksource *clock;
785 cycle_t offset; 821 cycle_t offset;
@@ -871,7 +907,7 @@ void update_wall_time(void)
871 * getboottime - Return the real time of system boot. 907 * getboottime - Return the real time of system boot.
872 * @ts: pointer to the timespec to be set 908 * @ts: pointer to the timespec to be set
873 * 909 *
874 * Returns the time of day in a timespec. 910 * Returns the wall-time of boot in a timespec.
875 * 911 *
876 * This is based on the wall_to_monotonic offset and the total suspend 912 * This is based on the wall_to_monotonic offset and the total suspend
877 * time. Calls to settimeofday will affect the value returned (which 913 * time. Calls to settimeofday will affect the value returned (which
@@ -889,6 +925,55 @@ void getboottime(struct timespec *ts)
889} 925}
890EXPORT_SYMBOL_GPL(getboottime); 926EXPORT_SYMBOL_GPL(getboottime);
891 927
928
929/**
930 * get_monotonic_boottime - Returns monotonic time since boot
931 * @ts: pointer to the timespec to be set
932 *
933 * Returns the monotonic time since boot in a timespec.
934 *
935 * This is similar to CLOCK_MONOTONIC/ktime_get_ts, but also
936 * includes the time spent in suspend.
937 */
938void get_monotonic_boottime(struct timespec *ts)
939{
940 struct timespec tomono, sleep;
941 unsigned int seq;
942 s64 nsecs;
943
944 WARN_ON(timekeeping_suspended);
945
946 do {
947 seq = read_seqbegin(&xtime_lock);
948 *ts = xtime;
949 tomono = wall_to_monotonic;
950 sleep = total_sleep_time;
951 nsecs = timekeeping_get_ns();
952
953 } while (read_seqretry(&xtime_lock, seq));
954
955 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
956 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
957}
958EXPORT_SYMBOL_GPL(get_monotonic_boottime);
959
960/**
961 * ktime_get_boottime - Returns monotonic time since boot in a ktime
962 *
963 * Returns the monotonic time since boot in a ktime
964 *
965 * This is similar to CLOCK_MONOTONIC/ktime_get, but also
966 * includes the time spent in suspend.
967 */
968ktime_t ktime_get_boottime(void)
969{
970 struct timespec ts;
971
972 get_monotonic_boottime(&ts);
973 return timespec_to_ktime(ts);
974}
975EXPORT_SYMBOL_GPL(ktime_get_boottime);
976
892/** 977/**
893 * monotonic_to_bootbased - Convert the monotonic time to boot based. 978 * monotonic_to_bootbased - Convert the monotonic time to boot based.
894 * @ts: pointer to the timespec to be converted 979 * @ts: pointer to the timespec to be converted
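
A small sketch of what the new boot-time helpers are for: timing an interval that must keep counting across suspend. The driver context is hypothetical; only ktime_get_boottime() is taken from this hunk.

    #include <linux/ktime.h>
    #include <linux/hrtimer.h>

    static ktime_t start;

    static void my_start(void)
    {
            start = ktime_get_boottime();   /* monotonic, but includes suspend time */
    }

    static s64 my_elapsed_ms(void)
    {
            return ktime_to_ms(ktime_sub(ktime_get_boottime(), start));
    }
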
@@ -910,11 +995,6 @@ struct timespec __current_kernel_time(void)
910 return xtime; 995 return xtime;
911} 996}
912 997
913struct timespec __get_wall_to_monotonic(void)
914{
915 return wall_to_monotonic;
916}
917
918struct timespec current_kernel_time(void) 998struct timespec current_kernel_time(void)
919{ 999{
920 struct timespec now; 1000 struct timespec now;
@@ -946,3 +1026,48 @@ struct timespec get_monotonic_coarse(void)
946 now.tv_nsec + mono.tv_nsec); 1026 now.tv_nsec + mono.tv_nsec);
947 return now; 1027 return now;
948} 1028}
1029
1030/*
1031 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1032 * without sampling the sequence number in xtime_lock.
1033 * jiffies is defined in the linker script...
1034 */
1035void do_timer(unsigned long ticks)
1036{
1037 jiffies_64 += ticks;
1038 update_wall_time();
1039 calc_global_load(ticks);
1040}
1041
1042/**
1043 * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
1044 * and sleep offsets.
1045 * @xtim: pointer to timespec to be set with xtime
1046 * @wtom: pointer to timespec to be set with wall_to_monotonic
1047 * @sleep: pointer to timespec to be set with time in suspend
1048 */
1049void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1050 struct timespec *wtom, struct timespec *sleep)
1051{
1052 unsigned long seq;
1053
1054 do {
1055 seq = read_seqbegin(&xtime_lock);
1056 *xtim = xtime;
1057 *wtom = wall_to_monotonic;
1058 *sleep = total_sleep_time;
1059 } while (read_seqretry(&xtime_lock, seq));
1060}
1061
1062/**
1063 * xtime_update() - advances the timekeeping infrastructure
1064 * @ticks: number of ticks that have elapsed since the last call.
1065 *
1066 * Must be called with interrupts disabled.
1067 */
1068void xtime_update(unsigned long ticks)
1069{
1070 write_seqlock(&xtime_lock);
1071 do_timer(ticks);
1072 write_sequnlock(&xtime_lock);
1073}
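
With do_timer() now wrapped by xtime_update(), an architecture's periodic tick handler no longer takes xtime_lock itself. A rough sketch (the handler name, irq wiring and the header providing the prototype are not shown in this hunk and are assumed):

    #include <linux/interrupt.h>
    #include <linux/time.h>

    static irqreturn_t my_arch_timer_interrupt(int irq, void *dev_id)
    {
            /* Takes the xtime_lock write side and calls do_timer(1) internally. */
            xtime_update(1);
            return IRQ_HANDLED;
    }
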
diff --git a/kernel/timer.c b/kernel/timer.c
index d6459923d245..fd6198692b57 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -404,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {}
404 404
405static struct debug_obj_descr timer_debug_descr; 405static struct debug_obj_descr timer_debug_descr;
406 406
407static void *timer_debug_hint(void *addr)
408{
409 return ((struct timer_list *) addr)->function;
410}
411
407/* 412/*
408 * fixup_init is called when: 413 * fixup_init is called when:
409 * - an active object is initialized 414 * - an active object is initialized
@@ -477,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
477 482
478static struct debug_obj_descr timer_debug_descr = { 483static struct debug_obj_descr timer_debug_descr = {
479 .name = "timer_list", 484 .name = "timer_list",
485 .debug_hint = timer_debug_hint,
480 .fixup_init = timer_fixup_init, 486 .fixup_init = timer_fixup_init,
481 .fixup_activate = timer_fixup_activate, 487 .fixup_activate = timer_fixup_activate,
482 .fixup_free = timer_fixup_free, 488 .fixup_free = timer_fixup_free,
@@ -964,6 +970,25 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
964 * add_timer_on(). Upon exit the timer is not queued and the handler is 970 * add_timer_on(). Upon exit the timer is not queued and the handler is
965 * not running on any CPU. 971 * not running on any CPU.
966 * 972 *
973 * Note: You must not hold locks that are held in interrupt context
974 * while calling this function. Even if the lock has nothing to do
975 * with the timer in question. Here's why:
976 *
977 * CPU0 CPU1
978 * ---- ----
979 * <SOFTIRQ>
980 * call_timer_fn();
981 * base->running_timer = mytimer;
982 * spin_lock_irq(somelock);
983 * <IRQ>
984 * spin_lock(somelock);
985 * del_timer_sync(mytimer);
986 * while (base->running_timer == mytimer);
987 *
988 * Now del_timer_sync() will never return and never release somelock.
989 * The interrupt on the other CPU is waiting to grab somelock but
990 * it has interrupted the softirq that CPU0 is waiting to finish.
991 *
967 * The function returns whether it has deactivated a pending timer or not. 992 * The function returns whether it has deactivated a pending timer or not.
968 */ 993 */
969int del_timer_sync(struct timer_list *timer) 994int del_timer_sync(struct timer_list *timer)
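
A compact sketch of the rule the comment above spells out: never call del_timer_sync() while holding a lock that is also taken from the timer's softirq/interrupt context. The lock and timer names here are hypothetical.

    #include <linux/spinlock.h>
    #include <linux/timer.h>

    static DEFINE_SPINLOCK(somelock);       /* also taken from interrupt context */
    static struct timer_list mytimer;

    static void my_shutdown(void)
    {
            /*
             * WRONG (can deadlock exactly as in the CPU0/CPU1 trace above):
             *      spin_lock_irq(&somelock);
             *      del_timer_sync(&mytimer);
             *      spin_unlock_irq(&somelock);
             */

            /* OK: publish the shutdown under the lock, then wait with it dropped. */
            spin_lock_irq(&somelock);
            /* ... detach mytimer's data from the shared state ... */
            spin_unlock_irq(&somelock);

            del_timer_sync(&mytimer);
    }
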
@@ -971,6 +996,10 @@ int del_timer_sync(struct timer_list *timer)
971#ifdef CONFIG_LOCKDEP 996#ifdef CONFIG_LOCKDEP
972 unsigned long flags; 997 unsigned long flags;
973 998
999 /*
1000 * If lockdep gives a backtrace here, please reference
1001 * the synchronization rules above.
1002 */
974 local_irq_save(flags); 1003 local_irq_save(flags);
975 lock_map_acquire(&timer->lockdep_map); 1004 lock_map_acquire(&timer->lockdep_map);
976 lock_map_release(&timer->lockdep_map); 1005 lock_map_release(&timer->lockdep_map);
@@ -1295,19 +1324,6 @@ void run_local_timers(void)
1295 raise_softirq(TIMER_SOFTIRQ); 1324 raise_softirq(TIMER_SOFTIRQ);
1296} 1325}
1297 1326
1298/*
1299 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1300 * without sampling the sequence number in xtime_lock.
1301 * jiffies is defined in the linker script...
1302 */
1303
1304void do_timer(unsigned long ticks)
1305{
1306 jiffies_64 += ticks;
1307 update_wall_time();
1308 calc_global_load(ticks);
1309}
1310
1311#ifdef __ARCH_WANT_SYS_ALARM 1327#ifdef __ARCH_WANT_SYS_ALARM
1312 1328
1313/* 1329/*
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 14674dce77a6..61d7d59f4a1a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -275,7 +275,7 @@ config PROFILE_ANNOTATED_BRANCHES
275 This tracer profiles all the likely and unlikely macros 275 This tracer profiles all the likely and unlikely macros
276 in the kernel. It will display the results in: 276 in the kernel. It will display the results in:
277 277
278 /sys/kernel/debug/tracing/profile_annotated_branch 278 /sys/kernel/debug/tracing/trace_stat/branch_annotated
279 279
280 Note: this will add a significant overhead; only turn this 280 Note: this will add a significant overhead; only turn this
281 on if you need to profile the system's use of these macros. 281 on if you need to profile the system's use of these macros.
@@ -288,7 +288,7 @@ config PROFILE_ALL_BRANCHES
288 taken in the kernel is recorded whether it hit or miss. 288 taken in the kernel is recorded whether it hit or miss.
289 The results will be displayed in: 289 The results will be displayed in:
290 290
291 /sys/kernel/debug/tracing/profile_branch 291 /sys/kernel/debug/tracing/trace_stat/branch_all
292 292
293 This option also enables the likely/unlikely profiler. 293 This option also enables the likely/unlikely profiler.
294 294
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index cbafed7d4f38..7aa40f8e182d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -703,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q)
703 * 703 *
704 **/ 704 **/
705static void blk_add_trace_rq(struct request_queue *q, struct request *rq, 705static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
706 u32 what) 706 u32 what)
707{ 707{
708 struct blk_trace *bt = q->blk_trace; 708 struct blk_trace *bt = q->blk_trace;
709 int rw = rq->cmd_flags & 0x03;
710 709
711 if (likely(!bt)) 710 if (likely(!bt))
712 return; 711 return;
713 712
714 if (rq->cmd_flags & REQ_DISCARD)
715 rw |= REQ_DISCARD;
716
717 if (rq->cmd_flags & REQ_SECURE)
718 rw |= REQ_SECURE;
719
720 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 713 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
721 what |= BLK_TC_ACT(BLK_TC_PC); 714 what |= BLK_TC_ACT(BLK_TC_PC);
722 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, 715 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags,
723 what, rq->errors, rq->cmd_len, rq->cmd); 716 what, rq->errors, rq->cmd_len, rq->cmd);
724 } else { 717 } else {
725 what |= BLK_TC_ACT(BLK_TC_FS); 718 what |= BLK_TC_ACT(BLK_TC_FS);
726 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, 719 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
727 what, rq->errors, 0, NULL); 720 rq->cmd_flags, what, rq->errors, 0, NULL);
728 } 721 }
729} 722}
730 723
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f3dadae83883..c075f4ea6b94 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1467,7 +1467,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1467 return t_hash_next(m, pos); 1467 return t_hash_next(m, pos);
1468 1468
1469 (*pos)++; 1469 (*pos)++;
1470 iter->pos = *pos; 1470 iter->pos = iter->func_pos = *pos;
1471 1471
1472 if (iter->flags & FTRACE_ITER_PRINTALL) 1472 if (iter->flags & FTRACE_ITER_PRINTALL)
1473 return t_hash_start(m, pos); 1473 return t_hash_start(m, pos);
@@ -1502,7 +1502,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1502 if (!rec) 1502 if (!rec)
1503 return t_hash_start(m, pos); 1503 return t_hash_start(m, pos);
1504 1504
1505 iter->func_pos = *pos;
1506 iter->func = rec; 1505 iter->func = rec;
1507 1506
1508 return iter; 1507 return iter;
@@ -3328,7 +3327,7 @@ static int start_graph_tracing(void)
3328 /* The cpu_boot init_task->ret_stack will never be freed */ 3327 /* The cpu_boot init_task->ret_stack will never be freed */
3329 for_each_online_cpu(cpu) { 3328 for_each_online_cpu(cpu) {
3330 if (!idle_task(cpu)->ret_stack) 3329 if (!idle_task(cpu)->ret_stack)
3331 ftrace_graph_init_task(idle_task(cpu)); 3330 ftrace_graph_init_idle_task(idle_task(cpu), cpu);
3332 } 3331 }
3333 3332
3334 do { 3333 do {
@@ -3418,6 +3417,49 @@ void unregister_ftrace_graph(void)
3418 mutex_unlock(&ftrace_lock); 3417 mutex_unlock(&ftrace_lock);
3419} 3418}
3420 3419
3420static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
3421
3422static void
3423graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
3424{
3425 atomic_set(&t->tracing_graph_pause, 0);
3426 atomic_set(&t->trace_overrun, 0);
3427 t->ftrace_timestamp = 0;
3428 /* make curr_ret_stack visible before we add the ret_stack */
3429 smp_wmb();
3430 t->ret_stack = ret_stack;
3431}
3432
3433/*
3434 * Allocate a return stack for the idle task. May be the first
3435 * time through, or it may be done by CPU hotplug online.
3436 */
3437void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
3438{
3439 t->curr_ret_stack = -1;
3440 /*
3441 * The idle task has no parent; it either has its own
3442 * stack or no stack at all.
3443 */
3444 if (t->ret_stack)
3445 WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
3446
3447 if (ftrace_graph_active) {
3448 struct ftrace_ret_stack *ret_stack;
3449
3450 ret_stack = per_cpu(idle_ret_stack, cpu);
3451 if (!ret_stack) {
3452 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
3453 * sizeof(struct ftrace_ret_stack),
3454 GFP_KERNEL);
3455 if (!ret_stack)
3456 return;
3457 per_cpu(idle_ret_stack, cpu) = ret_stack;
3458 }
3459 graph_init_task(t, ret_stack);
3460 }
3461}
3462
3421/* Allocate a return stack for newly created task */ 3463/* Allocate a return stack for newly created task */
3422void ftrace_graph_init_task(struct task_struct *t) 3464void ftrace_graph_init_task(struct task_struct *t)
3423{ 3465{
@@ -3433,12 +3475,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3433 GFP_KERNEL); 3475 GFP_KERNEL);
3434 if (!ret_stack) 3476 if (!ret_stack)
3435 return; 3477 return;
3436 atomic_set(&t->tracing_graph_pause, 0); 3478 graph_init_task(t, ret_stack);
3437 atomic_set(&t->trace_overrun, 0);
3438 t->ftrace_timestamp = 0;
3439 /* make curr_ret_stack visable before we add the ret_stack */
3440 smp_wmb();
3441 t->ret_stack = ret_stack;
3442 } 3479 }
3443} 3480}
3444 3481
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bd1c35a4fbcc..d9c8bcafb120 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5,7 +5,6 @@
5 */ 5 */
6#include <linux/ring_buffer.h> 6#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h> 7#include <linux/trace_clock.h>
8#include <linux/ftrace_irq.h>
9#include <linux/spinlock.h> 8#include <linux/spinlock.h>
10#include <linux/debugfs.h> 9#include <linux/debugfs.h>
11#include <linux/uaccess.h> 10#include <linux/uaccess.h>
@@ -669,7 +668,7 @@ static struct list_head *rb_list_head(struct list_head *list)
669 * the reader page). But if the next page is a header page, 668 * the reader page). But if the next page is a header page,
670 * its flags will be non zero. 669 * its flags will be non zero.
671 */ 670 */
672static int inline 671static inline int
673rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, 672rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
674 struct buffer_page *page, struct list_head *list) 673 struct buffer_page *page, struct list_head *list)
675{ 674{
@@ -1429,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1429} 1428}
1430EXPORT_SYMBOL_GPL(ring_buffer_resize); 1429EXPORT_SYMBOL_GPL(ring_buffer_resize);
1431 1430
1431void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
1432{
1433 mutex_lock(&buffer->mutex);
1434 if (val)
1435 buffer->flags |= RB_FL_OVERWRITE;
1436 else
1437 buffer->flags &= ~RB_FL_OVERWRITE;
1438 mutex_unlock(&buffer->mutex);
1439}
1440EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
1441
1432static inline void * 1442static inline void *
1433__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) 1443__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
1434{ 1444{
@@ -2162,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2162 if (likely(ts >= cpu_buffer->write_stamp)) { 2172 if (likely(ts >= cpu_buffer->write_stamp)) {
2163 delta = diff; 2173 delta = diff;
2164 if (unlikely(test_time_stamp(delta))) { 2174 if (unlikely(test_time_stamp(delta))) {
2175 int local_clock_stable = 1;
2176#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2177 local_clock_stable = sched_clock_stable;
2178#endif
2165 WARN_ONCE(delta > (1ULL << 59), 2179 WARN_ONCE(delta > (1ULL << 59),
2166 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", 2180 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
2167 (unsigned long long)delta, 2181 (unsigned long long)delta,
2168 (unsigned long long)ts, 2182 (unsigned long long)ts,
2169 (unsigned long long)cpu_buffer->write_stamp); 2183 (unsigned long long)cpu_buffer->write_stamp,
2184 local_clock_stable ? "" :
2185 "If you just came from a suspend/resume,\n"
2186 "please switch to the trace global clock:\n"
2187 " echo global > /sys/kernel/debug/tracing/trace_clock\n");
2170 add_timestamp = 1; 2188 add_timestamp = 1;
2171 } 2189 }
2172 } 2190 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dc53ecb80589..9541c27c1cf2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,8 +41,6 @@
41#include "trace.h" 41#include "trace.h"
42#include "trace_output.h" 42#include "trace_output.h"
43 43
44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
45
46/* 44/*
47 * On boot up, the ring buffer is set to the minimum size, so that 45 * On boot up, the ring buffer is set to the minimum size, so that
48 * we do not waste memory on systems that are not using tracing. 46 * we do not waste memory on systems that are not using tracing.
@@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
340/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
341unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
342 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
343 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
344 342
345static int trace_stop_count; 343static int trace_stop_count;
346static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
@@ -425,6 +423,7 @@ static const char *trace_options[] = {
425 "sleep-time", 423 "sleep-time",
426 "graph-time", 424 "graph-time",
427 "record-cmd", 425 "record-cmd",
426 "overwrite",
428 NULL 427 NULL
429}; 428};
430 429
@@ -780,6 +779,11 @@ __acquires(kernel_lock)
780 tracing_reset_online_cpus(tr); 779 tracing_reset_online_cpus(tr);
781 780
782 current_trace = type; 781 current_trace = type;
782
783 /* If we expanded the buffers, make sure the max is expanded too */
784 if (ring_buffer_expanded && type->use_max_tr)
785 ring_buffer_resize(max_tr.buffer, trace_buf_size);
786
783 /* the test is responsible for initializing and enabling */ 787 /* the test is responsible for initializing and enabling */
784 pr_info("Testing tracer %s: ", type->name); 788 pr_info("Testing tracer %s: ", type->name);
785 ret = type->selftest(type, tr); 789 ret = type->selftest(type, tr);
@@ -792,6 +796,10 @@ __acquires(kernel_lock)
792 /* Only reset on passing, to avoid touching corrupted buffers */ 796 /* Only reset on passing, to avoid touching corrupted buffers */
793 tracing_reset_online_cpus(tr); 797 tracing_reset_online_cpus(tr);
794 798
799 /* Shrink the max buffer again */
800 if (ring_buffer_expanded && type->use_max_tr)
801 ring_buffer_resize(max_tr.buffer, 1);
802
795 printk(KERN_CONT "PASSED\n"); 803 printk(KERN_CONT "PASSED\n");
796 } 804 }
797#endif 805#endif
@@ -1102,7 +1110,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1102 1110
1103 entry->preempt_count = pc & 0xff; 1111 entry->preempt_count = pc & 0xff;
1104 entry->pid = (tsk) ? tsk->pid : 0; 1112 entry->pid = (tsk) ? tsk->pid : 0;
1105 entry->lock_depth = (tsk) ? tsk->lock_depth : 0;
1106 entry->flags = 1113 entry->flags =
1107#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1114#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1108 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1115 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1749,10 +1756,9 @@ static void print_lat_help_header(struct seq_file *m)
1749 seq_puts(m, "# | / _----=> need-resched \n"); 1756 seq_puts(m, "# | / _----=> need-resched \n");
1750 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1757 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1751 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1758 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1752 seq_puts(m, "# |||| /_--=> lock-depth \n"); 1759 seq_puts(m, "# |||| / delay \n");
1753 seq_puts(m, "# |||||/ delay \n"); 1760 seq_puts(m, "# cmd pid ||||| time | caller \n");
1754 seq_puts(m, "# cmd pid |||||| time | caller \n"); 1761 seq_puts(m, "# \\ / ||||| \\ | / \n");
1755 seq_puts(m, "# \\ / |||||| \\ | / \n");
1756} 1762}
1757 1763
1758static void print_func_help_header(struct seq_file *m) 1764static void print_func_help_header(struct seq_file *m)
@@ -2529,6 +2535,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2529 2535
2530 if (mask == TRACE_ITER_RECORD_CMD) 2536 if (mask == TRACE_ITER_RECORD_CMD)
2531 trace_event_enable_cmd_record(enabled); 2537 trace_event_enable_cmd_record(enabled);
2538
2539 if (mask == TRACE_ITER_OVERWRITE)
2540 ring_buffer_change_overwrite(global_trace.buffer, enabled);
2532} 2541}
2533 2542
2534static ssize_t 2543static ssize_t
@@ -2710,6 +2719,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2710 2719
2711 mutex_lock(&trace_types_lock); 2720 mutex_lock(&trace_types_lock);
2712 if (tracer_enabled ^ val) { 2721 if (tracer_enabled ^ val) {
2722
2723 /* Only need to warn if this is used to change the state */
2724 WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
2725
2713 if (val) { 2726 if (val) {
2714 tracer_enabled = 1; 2727 tracer_enabled = 1;
2715 if (current_trace->start) 2728 if (current_trace->start)
@@ -4551,9 +4564,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4551__init static int tracer_alloc_buffers(void) 4564__init static int tracer_alloc_buffers(void)
4552{ 4565{
4553 int ring_buf_size; 4566 int ring_buf_size;
4567 enum ring_buffer_flags rb_flags;
4554 int i; 4568 int i;
4555 int ret = -ENOMEM; 4569 int ret = -ENOMEM;
4556 4570
4571
4557 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) 4572 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
4558 goto out; 4573 goto out;
4559 4574
@@ -4566,12 +4581,13 @@ __init static int tracer_alloc_buffers(void)
4566 else 4581 else
4567 ring_buf_size = 1; 4582 ring_buf_size = 1;
4568 4583
4584 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
4585
4569 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4586 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
4570 cpumask_copy(tracing_cpumask, cpu_all_mask); 4587 cpumask_copy(tracing_cpumask, cpu_all_mask);
4571 4588
4572 /* TODO: make the number of buffers hot pluggable with CPUS */ 4589 /* TODO: make the number of buffers hot pluggable with CPUS */
4573 global_trace.buffer = ring_buffer_alloc(ring_buf_size, 4590 global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
4574 TRACE_BUFFER_FLAGS);
4575 if (!global_trace.buffer) { 4591 if (!global_trace.buffer) {
4576 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 4592 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
4577 WARN_ON(1); 4593 WARN_ON(1);
@@ -4581,7 +4597,7 @@ __init static int tracer_alloc_buffers(void)
4581 4597
4582 4598
4583#ifdef CONFIG_TRACER_MAX_TRACE 4599#ifdef CONFIG_TRACER_MAX_TRACE
4584 max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); 4600 max_tr.buffer = ring_buffer_alloc(1, rb_flags);
4585 if (!max_tr.buffer) { 4601 if (!max_tr.buffer) {
4586 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4602 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
4587 WARN_ON(1); 4603 WARN_ON(1);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9021f8c0c0c3..5e9dfc6286dd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,8 +272,8 @@ struct tracer {
272 /* If you handled the flag setting, return 0 */ 272 /* If you handled the flag setting, return 0 */
273 int (*set_flag)(u32 old_flags, u32 bit, int set); 273 int (*set_flag)(u32 old_flags, u32 bit, int set);
274 struct tracer *next; 274 struct tracer *next;
275 int print_max;
276 struct tracer_flags *flags; 275 struct tracer_flags *flags;
276 int print_max;
277 int use_max_tr; 277 int use_max_tr;
278}; 278};
279 279
@@ -606,6 +606,7 @@ enum trace_iterator_flags {
606 TRACE_ITER_SLEEP_TIME = 0x40000, 606 TRACE_ITER_SLEEP_TIME = 0x40000,
607 TRACE_ITER_GRAPH_TIME = 0x80000, 607 TRACE_ITER_GRAPH_TIME = 0x80000,
608 TRACE_ITER_RECORD_CMD = 0x100000, 608 TRACE_ITER_RECORD_CMD = 0x100000,
609 TRACE_ITER_OVERWRITE = 0x200000,
609}; 610};
610 611
611/* 612/*
@@ -661,8 +662,10 @@ struct ftrace_event_field {
661}; 662};
662 663
663struct event_filter { 664struct event_filter {
664 int n_preds; 665 int n_preds; /* Number assigned */
665 struct filter_pred **preds; 666 int a_preds; /* allocated */
667 struct filter_pred *preds;
668 struct filter_pred *root;
666 char *filter_string; 669 char *filter_string;
667}; 670};
668 671
@@ -674,11 +677,23 @@ struct event_subsystem {
674 int nr_events; 677 int nr_events;
675}; 678};
676 679
680#define FILTER_PRED_INVALID ((unsigned short)-1)
681#define FILTER_PRED_IS_RIGHT (1 << 15)
682#define FILTER_PRED_FOLD (1 << 15)
683
684/*
685 * The max preds is the size of unsigned short with
686 * two flags at the MSBs. One bit is used for both the IS_RIGHT
687 * and FOLD flags. The other is reserved.
688 *
689 * 2^14 preds is way more than enough.
690 */
691#define MAX_FILTER_PRED 16384
692
677struct filter_pred; 693struct filter_pred;
678struct regex; 694struct regex;
679 695
680typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, 696typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
681 int val1, int val2);
682 697
683typedef int (*regex_match_func)(char *str, struct regex *r, int len); 698typedef int (*regex_match_func)(char *str, struct regex *r, int len);
684 699
@@ -700,11 +715,23 @@ struct filter_pred {
700 filter_pred_fn_t fn; 715 filter_pred_fn_t fn;
701 u64 val; 716 u64 val;
702 struct regex regex; 717 struct regex regex;
703 char *field_name; 718 /*
719 * Leaf nodes use field_name, ops is used by AND and OR
720 * nodes. The field_name is always freed when freeing a pred.
721 * We can overload field_name for ops and have it freed
722 * as well.
723 */
724 union {
725 char *field_name;
726 unsigned short *ops;
727 };
704 int offset; 728 int offset;
705 int not; 729 int not;
706 int op; 730 int op;
707 int pop_n; 731 unsigned short index;
732 unsigned short parent;
733 unsigned short left;
734 unsigned short right;
708}; 735};
709 736
710extern struct list_head ftrace_common_fields; 737extern struct list_head ftrace_common_fields;
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 6cf223764be8..1516cb3ec549 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
109 */ 109 */
110#define FTRACE_CTX_FIELDS \ 110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \ 111 __field( unsigned int, prev_pid ) \
112 __field( unsigned int, next_pid ) \
113 __field( unsigned int, next_cpu ) \
112 __field( unsigned char, prev_prio ) \ 114 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \ 115 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \ 116 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \ 117 __field( unsigned char, next_state )
117 __field( unsigned int, next_cpu )
118 118
119FTRACE_ENTRY(context_switch, ctx_switch_entry, 119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120 120
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5f499e0438a4..e88f74fe1d4c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,6 @@ static int trace_define_common_fields(void)
116 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
117 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
118 __common_field(int, pid); 118 __common_field(int, pid);
119 __common_field(int, lock_depth);
120 119
121 return ret; 120 return ret;
122} 121}
@@ -326,6 +325,7 @@ int trace_set_clr_event(const char *system, const char *event, int set)
326{ 325{
327 return __ftrace_set_clr_event(NULL, system, event, set); 326 return __ftrace_set_clr_event(NULL, system, event, set);
328} 327}
328EXPORT_SYMBOL_GPL(trace_set_clr_event);
329 329
330/* 128 should be much more than enough */ 330/* 128 should be much more than enough */
331#define EVENT_BUF_SIZE 127 331#define EVENT_BUF_SIZE 127
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 36d40104b17f..8008ddcfbf20 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -123,9 +123,13 @@ struct filter_parse_state {
123 } operand; 123 } operand;
124}; 124};
125 125
126struct pred_stack {
127 struct filter_pred **preds;
128 int index;
129};
130
126#define DEFINE_COMPARISON_PRED(type) \ 131#define DEFINE_COMPARISON_PRED(type) \
127static int filter_pred_##type(struct filter_pred *pred, void *event, \ 132static int filter_pred_##type(struct filter_pred *pred, void *event) \
128 int val1, int val2) \
129{ \ 133{ \
130 type *addr = (type *)(event + pred->offset); \ 134 type *addr = (type *)(event + pred->offset); \
131 type val = (type)pred->val; \ 135 type val = (type)pred->val; \
@@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \
152} 156}
153 157
154#define DEFINE_EQUALITY_PRED(size) \ 158#define DEFINE_EQUALITY_PRED(size) \
155static int filter_pred_##size(struct filter_pred *pred, void *event, \ 159static int filter_pred_##size(struct filter_pred *pred, void *event) \
156 int val1, int val2) \
157{ \ 160{ \
158 u##size *addr = (u##size *)(event + pred->offset); \ 161 u##size *addr = (u##size *)(event + pred->offset); \
159 u##size val = (u##size)pred->val; \ 162 u##size val = (u##size)pred->val; \
@@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32);
178DEFINE_EQUALITY_PRED(16); 181DEFINE_EQUALITY_PRED(16);
179DEFINE_EQUALITY_PRED(8); 182DEFINE_EQUALITY_PRED(8);
180 183
181static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
182 void *event __attribute((unused)),
183 int val1, int val2)
184{
185 return val1 && val2;
186}
187
188static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
189 void *event __attribute((unused)),
190 int val1, int val2)
191{
192 return val1 || val2;
193}
194
195/* Filter predicate for fixed sized arrays of characters */ 184/* Filter predicate for fixed sized arrays of characters */
196static int filter_pred_string(struct filter_pred *pred, void *event, 185static int filter_pred_string(struct filter_pred *pred, void *event)
197 int val1, int val2)
198{ 186{
199 char *addr = (char *)(event + pred->offset); 187 char *addr = (char *)(event + pred->offset);
200 int cmp, match; 188 int cmp, match;
@@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
207} 195}
208 196
209/* Filter predicate for char * pointers */ 197/* Filter predicate for char * pointers */
210static int filter_pred_pchar(struct filter_pred *pred, void *event, 198static int filter_pred_pchar(struct filter_pred *pred, void *event)
211 int val1, int val2)
212{ 199{
213 char **addr = (char **)(event + pred->offset); 200 char **addr = (char **)(event + pred->offset);
214 int cmp, match; 201 int cmp, match;
@@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
231 * and add it to the address of the entry, and at last we have 218 * and add it to the address of the entry, and at last we have
232 * the address of the string. 219 * the address of the string.
233 */ 220 */
234static int filter_pred_strloc(struct filter_pred *pred, void *event, 221static int filter_pred_strloc(struct filter_pred *pred, void *event)
235 int val1, int val2)
236{ 222{
237 u32 str_item = *(u32 *)(event + pred->offset); 223 u32 str_item = *(u32 *)(event + pred->offset);
238 int str_loc = str_item & 0xffff; 224 int str_loc = str_item & 0xffff;
@@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
247 return match; 233 return match;
248} 234}
249 235
250static int filter_pred_none(struct filter_pred *pred, void *event, 236static int filter_pred_none(struct filter_pred *pred, void *event)
251 int val1, int val2)
252{ 237{
253 return 0; 238 return 0;
254} 239}
@@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred)
377 pred->not ^= not; 362 pred->not ^= not;
378} 363}
379 364
365enum move_type {
366 MOVE_DOWN,
367 MOVE_UP_FROM_LEFT,
368 MOVE_UP_FROM_RIGHT
369};
370
371static struct filter_pred *
372get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
373 int index, enum move_type *move)
374{
375 if (pred->parent & FILTER_PRED_IS_RIGHT)
376 *move = MOVE_UP_FROM_RIGHT;
377 else
378 *move = MOVE_UP_FROM_LEFT;
379 pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
380
381 return pred;
382}
383
384/*
385 * A series of ANDs or ORs was found together. Instead of
386 * climbing up and down the tree branches, an array of the
387 * ops was made in the order of the checks. We can just move across
388 * the array and short circuit if needed.
389 */
390static int process_ops(struct filter_pred *preds,
391 struct filter_pred *op, void *rec)
392{
393 struct filter_pred *pred;
394 int match = 0;
395 int type;
396 int i;
397
398 /*
399 * Micro-optimization: We set type to true if op
400 * is an OR and false otherwise (AND). Then we
401 * just need to test if the match is equal to
402 * the type, and if it is, we can short circuit the
403 * rest of the checks:
404 *
405 * if ((match && op->op == OP_OR) ||
406 * (!match && op->op == OP_AND))
407 * return match;
408 */
409 type = op->op == OP_OR;
410
411 for (i = 0; i < op->val; i++) {
412 pred = &preds[op->ops[i]];
413 match = pred->fn(pred, rec);
414 if (!!match == type)
415 return match;
416 }
417 return match;
418}
419
380/* return 1 if event matches, 0 otherwise (discard) */ 420/* return 1 if event matches, 0 otherwise (discard) */
381int filter_match_preds(struct event_filter *filter, void *rec) 421int filter_match_preds(struct event_filter *filter, void *rec)
382{ 422{
383 int match, top = 0, val1 = 0, val2 = 0; 423 int match = -1;
384 int stack[MAX_FILTER_PRED]; 424 enum move_type move = MOVE_DOWN;
425 struct filter_pred *preds;
385 struct filter_pred *pred; 426 struct filter_pred *pred;
386 int i; 427 struct filter_pred *root;
428 int n_preds;
429 int done = 0;
430
431 /* no filter is considered a match */
432 if (!filter)
433 return 1;
434
435 n_preds = filter->n_preds;
436
437 if (!n_preds)
438 return 1;
439
440 /*
441 * n_preds, root and filter->preds are protected with preemption disabled.
442 */
443 preds = rcu_dereference_sched(filter->preds);
444 root = rcu_dereference_sched(filter->root);
445 if (!root)
446 return 1;
447
448 pred = root;
387 449
388 for (i = 0; i < filter->n_preds; i++) { 450 /* match is currently meaningless */
389 pred = filter->preds[i]; 451 match = -1;
390 if (!pred->pop_n) { 452
391 match = pred->fn(pred, rec, val1, val2); 453 do {
392 stack[top++] = match; 454 switch (move) {
455 case MOVE_DOWN:
456 /* only AND and OR have children */
457 if (pred->left != FILTER_PRED_INVALID) {
458 /* If ops is set, then it was folded. */
459 if (!pred->ops) {
460 /* keep going down the left side */
461 pred = &preds[pred->left];
462 continue;
463 }
464 /* We can treat folded ops as a leaf node */
465 match = process_ops(preds, pred, rec);
466 } else
467 match = pred->fn(pred, rec);
468 /* If this pred is the only pred */
469 if (pred == root)
470 break;
471 pred = get_pred_parent(pred, preds,
472 pred->parent, &move);
473 continue;
474 case MOVE_UP_FROM_LEFT:
475 /*
476 * Check for short circuits.
477 *
478 * Optimization: !!match == (pred->op == OP_OR)
479 * is the same as:
480 * if ((match && pred->op == OP_OR) ||
481 * (!match && pred->op == OP_AND))
482 */
483 if (!!match == (pred->op == OP_OR)) {
484 if (pred == root)
485 break;
486 pred = get_pred_parent(pred, preds,
487 pred->parent, &move);
488 continue;
489 }
490 /* now go down the right side of the tree. */
491 pred = &preds[pred->right];
492 move = MOVE_DOWN;
493 continue;
494 case MOVE_UP_FROM_RIGHT:
495 /* We finished this equation. */
496 if (pred == root)
497 break;
498 pred = get_pred_parent(pred, preds,
499 pred->parent, &move);
393 continue; 500 continue;
394 } 501 }
395 if (pred->pop_n > top) { 502 done = 1;
396 WARN_ON_ONCE(1); 503 } while (!done);
397 return 0;
398 }
399 val1 = stack[--top];
400 val2 = stack[--top];
401 match = pred->fn(pred, rec, val1, val2);
402 stack[top++] = match;
403 }
404 504
405 return stack[--top]; 505 return match;
406} 506}
407EXPORT_SYMBOL_GPL(filter_match_preds); 507EXPORT_SYMBOL_GPL(filter_match_preds);
408 508
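
A self-contained toy (plain userspace C, not kernel code) of the short-circuit idea used by process_ops() and the MOVE_UP_FROM_LEFT case: an OR stops at the first true child, an AND at the first false one.

    #include <stdio.h>

    enum { OP_AND, OP_OR };

    struct leaf { int value; };

    /* Mimics process_ops(): type is 1 for OR, 0 for AND; matching it short circuits. */
    static int eval_folded(int op, const struct leaf *kids, int n)
    {
            int type = (op == OP_OR);
            int match = 0;
            int i;

            for (i = 0; i < n; i++) {
                    match = kids[i].value;
                    if (!!match == type)
                            return match;   /* OR saw a 1, or AND saw a 0 */
            }
            return match;
    }

    int main(void)
    {
            struct leaf kids[] = { {0}, {1}, {0} };

            printf("OR  -> %d\n", eval_folded(OP_OR,  kids, 3));  /* 1, stops at the 2nd leaf */
            printf("AND -> %d\n", eval_folded(OP_AND, kids, 3));  /* 0, stops at the 1st leaf */
            return 0;
    }
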
@@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos)
414 514
415static void remove_filter_string(struct event_filter *filter) 515static void remove_filter_string(struct event_filter *filter)
416{ 516{
517 if (!filter)
518 return;
519
417 kfree(filter->filter_string); 520 kfree(filter->filter_string);
418 filter->filter_string = NULL; 521 filter->filter_string = NULL;
419} 522}
@@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps,
473 576
474void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) 577void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
475{ 578{
476 struct event_filter *filter = call->filter; 579 struct event_filter *filter;
477 580
478 mutex_lock(&event_mutex); 581 mutex_lock(&event_mutex);
582 filter = call->filter;
479 if (filter && filter->filter_string) 583 if (filter && filter->filter_string)
480 trace_seq_printf(s, "%s\n", filter->filter_string); 584 trace_seq_printf(s, "%s\n", filter->filter_string);
481 else 585 else
@@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
486void print_subsystem_event_filter(struct event_subsystem *system, 590void print_subsystem_event_filter(struct event_subsystem *system,
487 struct trace_seq *s) 591 struct trace_seq *s)
488{ 592{
489 struct event_filter *filter = system->filter; 593 struct event_filter *filter;
490 594
491 mutex_lock(&event_mutex); 595 mutex_lock(&event_mutex);
596 filter = system->filter;
492 if (filter && filter->filter_string) 597 if (filter && filter->filter_string)
493 trace_seq_printf(s, "%s\n", filter->filter_string); 598 trace_seq_printf(s, "%s\n", filter->filter_string);
494 else 599 else
@@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred)
539 pred->regex.len = 0; 644 pred->regex.len = 0;
540} 645}
541 646
542static int filter_set_pred(struct filter_pred *dest, 647static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
648{
649 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
650 if (!stack->preds)
651 return -ENOMEM;
652 stack->index = n_preds;
653 return 0;
654}
655
656static void __free_pred_stack(struct pred_stack *stack)
657{
658 kfree(stack->preds);
659 stack->index = 0;
660}
661
662static int __push_pred_stack(struct pred_stack *stack,
663 struct filter_pred *pred)
664{
665 int index = stack->index;
666
667 if (WARN_ON(index == 0))
668 return -ENOSPC;
669
670 stack->preds[--index] = pred;
671 stack->index = index;
672 return 0;
673}
674
675static struct filter_pred *
676__pop_pred_stack(struct pred_stack *stack)
677{
678 struct filter_pred *pred;
679 int index = stack->index;
680
681 pred = stack->preds[index++];
682 if (!pred)
683 return NULL;
684
685 stack->index = index;
686 return pred;
687}
688
689static int filter_set_pred(struct event_filter *filter,
690 int idx,
691 struct pred_stack *stack,
543 struct filter_pred *src, 692 struct filter_pred *src,
544 filter_pred_fn_t fn) 693 filter_pred_fn_t fn)
545{ 694{
695 struct filter_pred *dest = &filter->preds[idx];
696 struct filter_pred *left;
697 struct filter_pred *right;
698
546 *dest = *src; 699 *dest = *src;
547 if (src->field_name) { 700 if (src->field_name) {
548 dest->field_name = kstrdup(src->field_name, GFP_KERNEL); 701 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
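
filter_set_pred() receives the predicates in postfix order and uses the stack helpers above to wire up left/right/parent. A self-contained toy (userspace C) of the same postfix-to-tree construction, with the folding flags left out:

    #include <stdio.h>

    #define INVALID ((unsigned short)-1)

    struct node {
            char           op;              /* '&', '|' or a leaf letter */
            unsigned short left, right;
    };

    int main(void)
    {
            /* Postfix form of (a && b) || c, as the filter parser would emit it. */
            struct node preds[] = {
                    { 'a', INVALID, INVALID },
                    { 'b', INVALID, INVALID },
                    { '&', INVALID, INVALID },
                    { 'c', INVALID, INVALID },
                    { '|', INVALID, INVALID },
            };
            unsigned short stack[8];
            int top = 0, i;

            for (i = 0; i < 5; i++) {
                    if (preds[i].op == '&' || preds[i].op == '|') {
                            preds[i].right = stack[--top];  /* right child is popped first */
                            preds[i].left  = stack[--top];
                    }
                    stack[top++] = (unsigned short)i;       /* push this node's index */
            }

            /* stack[0] is the root: '|' with left = the '&' node and right = 'c'. */
            printf("root=%c left=%c right=%c\n", preds[stack[0]].op,
                   preds[preds[stack[0]].left].op, preds[preds[stack[0]].right].op);
            return 0;
    }
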
@@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest,
550 return -ENOMEM; 703 return -ENOMEM;
551 } 704 }
552 dest->fn = fn; 705 dest->fn = fn;
706 dest->index = idx;
553 707
554 return 0; 708 if (dest->op == OP_OR || dest->op == OP_AND) {
709 right = __pop_pred_stack(stack);
710 left = __pop_pred_stack(stack);
711 if (!left || !right)
712 return -EINVAL;
713 /*
714 * If both children can be folded
715 * and they are the same op as this op or a leaf,
716 * then this op can be folded.
717 */
718 if (left->index & FILTER_PRED_FOLD &&
719 (left->op == dest->op ||
720 left->left == FILTER_PRED_INVALID) &&
721 right->index & FILTER_PRED_FOLD &&
722 (right->op == dest->op ||
723 right->left == FILTER_PRED_INVALID))
724 dest->index |= FILTER_PRED_FOLD;
725
726 dest->left = left->index & ~FILTER_PRED_FOLD;
727 dest->right = right->index & ~FILTER_PRED_FOLD;
728 left->parent = dest->index & ~FILTER_PRED_FOLD;
729 right->parent = dest->index | FILTER_PRED_IS_RIGHT;
730 } else {
731 /*
732 * Make dest->left invalid to be used as a quick
733 * way to know this is a leaf node.
734 */
735 dest->left = FILTER_PRED_INVALID;
736
737 /* All leafs allow folding the parent ops. */
738 dest->index |= FILTER_PRED_FOLD;
739 }
740
741 return __push_pred_stack(stack, dest);
555} 742}
556 743
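The stack helpers and the left/right/parent bookkeeping above turn the parser's postfix output into an index-linked binary tree. A minimal userspace sketch of that construction follows (illustrative only, not the kernel code; leaf matching and the FILTER_PRED_FOLD bookkeeping are omitted). Each operator consumes the two most recently completed subtrees, so a single surviving stack entry at the end is the root, which is exactly what replace_preds() verifies further down.

/* Sketch only: builds a tree from a postfix predicate list, the way
 * filter_set_pred() does with index-based left/right links. */
#include <stdio.h>

struct node {
	char op;              /* 'A' = AND, 'O' = OR, 'L' = leaf */
	int left, right;      /* indexes into the node array, -1 for leaves */
};

int main(void)
{
	/* postfix for: (a && b) || c  ->  a b A c O */
	const char post[] = { 'L', 'L', 'A', 'L', 'O' };
	struct node nodes[5];
	int stack[5], sp = 0;

	for (int i = 0; i < 5; i++) {
		nodes[i].op = post[i];
		nodes[i].left = nodes[i].right = -1;
		if (post[i] != 'L') {          /* operator: pop two children */
			nodes[i].right = stack[--sp];
			nodes[i].left = stack[--sp];
		}
		stack[sp++] = i;               /* push this subtree's root */
	}
	/* one entry left on the stack: the root of the whole expression */
	printf("root=%d left=%d right=%d\n",
	       stack[sp - 1], nodes[stack[sp - 1]].left,
	       nodes[stack[sp - 1]].right);
	return 0;
}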
557static void filter_disable_preds(struct ftrace_event_call *call) 744static void __free_preds(struct event_filter *filter)
558{ 745{
559 struct event_filter *filter = call->filter;
560 int i; 746 int i;
561 747
562 call->flags &= ~TRACE_EVENT_FL_FILTERED; 748 if (filter->preds) {
749 for (i = 0; i < filter->a_preds; i++)
750 kfree(filter->preds[i].field_name);
751 kfree(filter->preds);
752 filter->preds = NULL;
753 }
754 filter->a_preds = 0;
563 filter->n_preds = 0; 755 filter->n_preds = 0;
564
565 for (i = 0; i < MAX_FILTER_PRED; i++)
566 filter->preds[i]->fn = filter_pred_none;
567} 756}
568 757
569static void __free_preds(struct event_filter *filter) 758static void filter_disable(struct ftrace_event_call *call)
570{ 759{
571 int i; 760 call->flags &= ~TRACE_EVENT_FL_FILTERED;
761}
572 762
763static void __free_filter(struct event_filter *filter)
764{
573 if (!filter) 765 if (!filter)
574 return; 766 return;
575 767
576 for (i = 0; i < MAX_FILTER_PRED; i++) { 768 __free_preds(filter);
577 if (filter->preds[i])
578 filter_free_pred(filter->preds[i]);
579 }
580 kfree(filter->preds);
581 kfree(filter->filter_string); 769 kfree(filter->filter_string);
582 kfree(filter); 770 kfree(filter);
583} 771}
584 772
773/*
774 * Called when destroying the ftrace_event_call.
775 * The call is being freed, so we do not need to worry about
776 * the call being currently used. This is for module code removing
777 * the tracepoints from within it.
778 */
585void destroy_preds(struct ftrace_event_call *call) 779void destroy_preds(struct ftrace_event_call *call)
586{ 780{
587 __free_preds(call->filter); 781 __free_filter(call->filter);
588 call->filter = NULL; 782 call->filter = NULL;
589 call->flags &= ~TRACE_EVENT_FL_FILTERED;
590} 783}
591 784
592static struct event_filter *__alloc_preds(void) 785static struct event_filter *__alloc_filter(void)
593{ 786{
594 struct event_filter *filter; 787 struct event_filter *filter;
788
789 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
790 return filter;
791}
792
793static int __alloc_preds(struct event_filter *filter, int n_preds)
794{
595 struct filter_pred *pred; 795 struct filter_pred *pred;
596 int i; 796 int i;
597 797
598 filter = kzalloc(sizeof(*filter), GFP_KERNEL); 798 if (filter->preds)
599 if (!filter) 799 __free_preds(filter);
600 return ERR_PTR(-ENOMEM);
601 800
602 filter->n_preds = 0; 801 filter->preds =
802 kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
603 803
604 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
605 if (!filter->preds) 804 if (!filter->preds)
606 goto oom; 805 return -ENOMEM;
607 806
608 for (i = 0; i < MAX_FILTER_PRED; i++) { 807 filter->a_preds = n_preds;
609 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 808 filter->n_preds = 0;
610 if (!pred) 809
611 goto oom; 810 for (i = 0; i < n_preds; i++) {
811 pred = &filter->preds[i];
612 pred->fn = filter_pred_none; 812 pred->fn = filter_pred_none;
613 filter->preds[i] = pred;
614 } 813 }
615 814
616 return filter;
617
618oom:
619 __free_preds(filter);
620 return ERR_PTR(-ENOMEM);
621}
622
623static int init_preds(struct ftrace_event_call *call)
624{
625 if (call->filter)
626 return 0;
627
628 call->flags &= ~TRACE_EVENT_FL_FILTERED;
629 call->filter = __alloc_preds();
630 if (IS_ERR(call->filter))
631 return PTR_ERR(call->filter);
632
633 return 0; 815 return 0;
634} 816}
635 817
636static int init_subsystem_preds(struct event_subsystem *system) 818static void filter_free_subsystem_preds(struct event_subsystem *system)
637{ 819{
638 struct ftrace_event_call *call; 820 struct ftrace_event_call *call;
639 int err;
640 821
641 list_for_each_entry(call, &ftrace_events, list) { 822 list_for_each_entry(call, &ftrace_events, list) {
642 if (strcmp(call->class->system, system->name) != 0) 823 if (strcmp(call->class->system, system->name) != 0)
643 continue; 824 continue;
644 825
645 err = init_preds(call); 826 filter_disable(call);
646 if (err) 827 remove_filter_string(call->filter);
647 return err;
648 } 828 }
649
650 return 0;
651} 829}
652 830
653static void filter_free_subsystem_preds(struct event_subsystem *system) 831static void filter_free_subsystem_filters(struct event_subsystem *system)
654{ 832{
655 struct ftrace_event_call *call; 833 struct ftrace_event_call *call;
656 834
657 list_for_each_entry(call, &ftrace_events, list) { 835 list_for_each_entry(call, &ftrace_events, list) {
658 if (strcmp(call->class->system, system->name) != 0) 836 if (strcmp(call->class->system, system->name) != 0)
659 continue; 837 continue;
660 838 __free_filter(call->filter);
661 filter_disable_preds(call); 839 call->filter = NULL;
662 remove_filter_string(call->filter);
663 } 840 }
664} 841}
665 842
@@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
667 struct ftrace_event_call *call, 844 struct ftrace_event_call *call,
668 struct event_filter *filter, 845 struct event_filter *filter,
669 struct filter_pred *pred, 846 struct filter_pred *pred,
847 struct pred_stack *stack,
670 filter_pred_fn_t fn) 848 filter_pred_fn_t fn)
671{ 849{
672 int idx, err; 850 int idx, err;
673 851
674 if (filter->n_preds == MAX_FILTER_PRED) { 852 if (WARN_ON(filter->n_preds == filter->a_preds)) {
675 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 853 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
676 return -ENOSPC; 854 return -ENOSPC;
677 } 855 }
678 856
679 idx = filter->n_preds; 857 idx = filter->n_preds;
680 filter_clear_pred(filter->preds[idx]); 858 filter_clear_pred(&filter->preds[idx]);
681 err = filter_set_pred(filter->preds[idx], pred, fn); 859 err = filter_set_pred(filter, idx, stack, pred, fn);
682 if (err) 860 if (err)
683 return err; 861 return err;
684 862
@@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
763 struct ftrace_event_call *call, 941 struct ftrace_event_call *call,
764 struct event_filter *filter, 942 struct event_filter *filter,
765 struct filter_pred *pred, 943 struct filter_pred *pred,
944 struct pred_stack *stack,
766 bool dry_run) 945 bool dry_run)
767{ 946{
768 struct ftrace_event_field *field; 947 struct ftrace_event_field *field;
@@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
770 unsigned long long val; 949 unsigned long long val;
771 int ret; 950 int ret;
772 951
773 pred->fn = filter_pred_none; 952 fn = pred->fn = filter_pred_none;
774 953
775 if (pred->op == OP_AND) { 954 if (pred->op == OP_AND)
776 pred->pop_n = 2;
777 fn = filter_pred_and;
778 goto add_pred_fn; 955 goto add_pred_fn;
779 } else if (pred->op == OP_OR) { 956 else if (pred->op == OP_OR)
780 pred->pop_n = 2;
781 fn = filter_pred_or;
782 goto add_pred_fn; 957 goto add_pred_fn;
783 }
784 958
785 field = find_event_field(call, pred->field_name); 959 field = find_event_field(call, pred->field_name);
786 if (!field) { 960 if (!field) {
@@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
829 1003
830add_pred_fn: 1004add_pred_fn:
831 if (!dry_run) 1005 if (!dry_run)
832 return filter_add_pred_fn(ps, call, filter, pred, fn); 1006 return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
833 return 0; 1007 return 0;
834} 1008}
835 1009
@@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps)
1187 return 0; 1361 return 0;
1188} 1362}
1189 1363
1364static int count_preds(struct filter_parse_state *ps)
1365{
1366 struct postfix_elt *elt;
1367 int n_preds = 0;
1368
1369 list_for_each_entry(elt, &ps->postfix, list) {
1370 if (elt->op == OP_NONE)
1371 continue;
1372 n_preds++;
1373 }
1374
1375 return n_preds;
1376}
1377
1378/*
1379 * The tree is walked at filtering of an event. If the tree is not correctly
1380 * built, it may cause an infinite loop. Check here that the tree does
1381 * indeed terminate.
1382 */
1383static int check_pred_tree(struct event_filter *filter,
1384 struct filter_pred *root)
1385{
1386 struct filter_pred *preds;
1387 struct filter_pred *pred;
1388 enum move_type move = MOVE_DOWN;
1389 int count = 0;
1390 int done = 0;
1391 int max;
1392
1393 /*
1394 * The max that we can hit a node is three times.
1395 * Once going down, once coming up from left, and
1396 * once coming up from right. This is more than enough
1397 * since leafs are only hit a single time.
1398 */
1399 max = 3 * filter->n_preds;
1400
1401 preds = filter->preds;
1402 if (!preds)
1403 return -EINVAL;
1404 pred = root;
1405
1406 do {
1407 if (WARN_ON(count++ > max))
1408 return -EINVAL;
1409
1410 switch (move) {
1411 case MOVE_DOWN:
1412 if (pred->left != FILTER_PRED_INVALID) {
1413 pred = &preds[pred->left];
1414 continue;
1415 }
1416 /* A leaf at the root is just a leaf in the tree */
1417 if (pred == root)
1418 break;
1419 pred = get_pred_parent(pred, preds,
1420 pred->parent, &move);
1421 continue;
1422 case MOVE_UP_FROM_LEFT:
1423 pred = &preds[pred->right];
1424 move = MOVE_DOWN;
1425 continue;
1426 case MOVE_UP_FROM_RIGHT:
1427 if (pred == root)
1428 break;
1429 pred = get_pred_parent(pred, preds,
1430 pred->parent, &move);
1431 continue;
1432 }
1433 done = 1;
1434 } while (!done);
1435
1436 /* We are fine. */
1437 return 0;
1438}
1439
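check_pred_tree() walks the tree without recursion or an explicit stack by remembering which direction it arrived from; each node is entered at most three times (down, up from the left child, up from the right child), which is where the 3 * n_preds bound comes from. A standalone sketch of that walk, with hypothetical node data:

/* Sketch: traverse a binary tree without recursion or a stack,
 * using parent links and a "direction of arrival" state. */
#include <stdio.h>

struct node {
	int left, right, parent;   /* indexes, -1 means none */
	char name;
};

enum move { DOWN, UP_FROM_LEFT, UP_FROM_RIGHT };

int main(void)
{
	/* tree for (a && b) || c: index 4 = OR root, 2 = AND, 0,1,3 leaves */
	struct node n[] = {
		{ -1, -1,  2, 'a' }, { -1, -1,  2, 'b' },
		{  0,  1,  4, '&' }, { -1, -1,  4, 'c' },
		{  2,  3, -1, '|' },
	};
	int cur = 4, visits = 0;
	enum move move = DOWN;

	for (;;) {
		visits++;
		if (move == DOWN) {
			if (n[cur].left >= 0) {        /* descend left first */
				cur = n[cur].left;
				continue;
			}
			printf("leaf %c\n", n[cur].name);
		} else if (move == UP_FROM_LEFT) {
			cur = n[cur].right;            /* now walk the right side */
			move = DOWN;
			continue;
		}
		/* done with this node: climb back to the parent */
		if (n[cur].parent < 0)
			break;
		move = (n[n[cur].parent].left == cur) ? UP_FROM_LEFT
						      : UP_FROM_RIGHT;
		cur = n[cur].parent;
	}
	printf("visits=%d (bounded by 3 * 5 nodes)\n", visits);
	return 0;
}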
1440static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1441{
1442 struct filter_pred *pred;
1443 enum move_type move = MOVE_DOWN;
1444 int count = 0;
1445 int done = 0;
1446
1447 pred = root;
1448
1449 do {
1450 switch (move) {
1451 case MOVE_DOWN:
1452 if (pred->left != FILTER_PRED_INVALID) {
1453 pred = &preds[pred->left];
1454 continue;
1455 }
1456 /* A leaf at the root is just a leaf in the tree */
1457 if (pred == root)
1458 return 1;
1459 count++;
1460 pred = get_pred_parent(pred, preds,
1461 pred->parent, &move);
1462 continue;
1463 case MOVE_UP_FROM_LEFT:
1464 pred = &preds[pred->right];
1465 move = MOVE_DOWN;
1466 continue;
1467 case MOVE_UP_FROM_RIGHT:
1468 if (pred == root)
1469 break;
1470 pred = get_pred_parent(pred, preds,
1471 pred->parent, &move);
1472 continue;
1473 }
1474 done = 1;
1475 } while (!done);
1476
1477 return count;
1478}
1479
1480static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1481{
1482 struct filter_pred *pred;
1483 enum move_type move = MOVE_DOWN;
1484 int count = 0;
1485 int children;
1486 int done = 0;
1487
1488 /* No need to keep the fold flag */
1489 root->index &= ~FILTER_PRED_FOLD;
1490
1491 /* If the root is a leaf then do nothing */
1492 if (root->left == FILTER_PRED_INVALID)
1493 return 0;
1494
1495 /* count the children */
1496 children = count_leafs(preds, &preds[root->left]);
1497 children += count_leafs(preds, &preds[root->right]);
1498
1499 root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL);
1500 if (!root->ops)
1501 return -ENOMEM;
1502
1503 root->val = children;
1504
1505 pred = root;
1506 do {
1507 switch (move) {
1508 case MOVE_DOWN:
1509 if (pred->left != FILTER_PRED_INVALID) {
1510 pred = &preds[pred->left];
1511 continue;
1512 }
1513 if (WARN_ON(count == children))
1514 return -EINVAL;
1515 pred->index &= ~FILTER_PRED_FOLD;
1516 root->ops[count++] = pred->index;
1517 pred = get_pred_parent(pred, preds,
1518 pred->parent, &move);
1519 continue;
1520 case MOVE_UP_FROM_LEFT:
1521 pred = &preds[pred->right];
1522 move = MOVE_DOWN;
1523 continue;
1524 case MOVE_UP_FROM_RIGHT:
1525 if (pred == root)
1526 break;
1527 pred = get_pred_parent(pred, preds,
1528 pred->parent, &move);
1529 continue;
1530 }
1531 done = 1;
1532 } while (!done);
1533
1534 return 0;
1535}
1536
1537/*
1538 * To optimize the processing of the ops, if we have several "ors" or
1539 * "ands" together, we can put them in an array and process them all
1540 * together speeding up the filter logic.
1541 */
1542static int fold_pred_tree(struct event_filter *filter,
1543 struct filter_pred *root)
1544{
1545 struct filter_pred *preds;
1546 struct filter_pred *pred;
1547 enum move_type move = MOVE_DOWN;
1548 int done = 0;
1549 int err;
1550
1551 preds = filter->preds;
1552 if (!preds)
1553 return -EINVAL;
1554 pred = root;
1555
1556 do {
1557 switch (move) {
1558 case MOVE_DOWN:
1559 if (pred->index & FILTER_PRED_FOLD) {
1560 err = fold_pred(preds, pred);
1561 if (err)
1562 return err;
1563 /* Folded nodes are like leafs */
1564 } else if (pred->left != FILTER_PRED_INVALID) {
1565 pred = &preds[pred->left];
1566 continue;
1567 }
1568
1569 /* A leaf at the root is just a leaf in the tree */
1570 if (pred == root)
1571 break;
1572 pred = get_pred_parent(pred, preds,
1573 pred->parent, &move);
1574 continue;
1575 case MOVE_UP_FROM_LEFT:
1576 pred = &preds[pred->right];
1577 move = MOVE_DOWN;
1578 continue;
1579 case MOVE_UP_FROM_RIGHT:
1580 if (pred == root)
1581 break;
1582 pred = get_pred_parent(pred, preds,
1583 pred->parent, &move);
1584 continue;
1585 }
1586 done = 1;
1587 } while (!done);
1588
1589 return 0;
1590}
1591
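The comment above describes the intent of the fold: a run of identical AND/OR operators collapses into one node carrying a flat array of its leaf indexes (root->ops), so evaluation can scan the array with early exit instead of re-walking that subtree. A rough sketch of the idea for an OR run; the leaf_match() helper and the evaluation strategy are assumptions for illustration, not code from this patch:

/* Sketch: evaluating a folded OR node as a flat array of leaves. */
#include <stdbool.h>
#include <stdio.h>

static bool leaf_match(int leaf_index, int event_value)
{
	/* hypothetical leaf predicate: "field == leaf_index" */
	return event_value == leaf_index;
}

/* OR over a folded run: stop at the first hit. */
static bool eval_folded_or(const int *ops, int n_ops, int event_value)
{
	for (int i = 0; i < n_ops; i++)
		if (leaf_match(ops[i], event_value))
			return true;
	return false;
}

int main(void)
{
	int ops[] = { 3, 7, 42 };   /* leaf indexes collected by fold_pred() */

	printf("%d\n", eval_folded_or(ops, 3, 7));   /* 1: second leaf matches */
	printf("%d\n", eval_folded_or(ops, 3, 9));   /* 0: no leaf matches */
	return 0;
}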
1190static int replace_preds(struct ftrace_event_call *call, 1592static int replace_preds(struct ftrace_event_call *call,
1191 struct event_filter *filter, 1593 struct event_filter *filter,
1192 struct filter_parse_state *ps, 1594 struct filter_parse_state *ps,
@@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call,
1195{ 1597{
1196 char *operand1 = NULL, *operand2 = NULL; 1598 char *operand1 = NULL, *operand2 = NULL;
1197 struct filter_pred *pred; 1599 struct filter_pred *pred;
1600 struct filter_pred *root;
1198 struct postfix_elt *elt; 1601 struct postfix_elt *elt;
1602 struct pred_stack stack = { }; /* init to NULL */
1199 int err; 1603 int err;
1200 int n_preds = 0; 1604 int n_preds = 0;
1201 1605
1606 n_preds = count_preds(ps);
1607 if (n_preds >= MAX_FILTER_PRED) {
1608 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1609 return -ENOSPC;
1610 }
1611
1202 err = check_preds(ps); 1612 err = check_preds(ps);
1203 if (err) 1613 if (err)
1204 return err; 1614 return err;
1205 1615
1616 if (!dry_run) {
1617 err = __alloc_pred_stack(&stack, n_preds);
1618 if (err)
1619 return err;
1620 err = __alloc_preds(filter, n_preds);
1621 if (err)
1622 goto fail;
1623 }
1624
1625 n_preds = 0;
1206 list_for_each_entry(elt, &ps->postfix, list) { 1626 list_for_each_entry(elt, &ps->postfix, list) {
1207 if (elt->op == OP_NONE) { 1627 if (elt->op == OP_NONE) {
1208 if (!operand1) 1628 if (!operand1)
@@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call,
1211 operand2 = elt->operand; 1631 operand2 = elt->operand;
1212 else { 1632 else {
1213 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); 1633 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
1214 return -EINVAL; 1634 err = -EINVAL;
1635 goto fail;
1215 } 1636 }
1216 continue; 1637 continue;
1217 } 1638 }
1218 1639
1219 if (n_preds++ == MAX_FILTER_PRED) { 1640 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
1220 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 1641 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1221 return -ENOSPC; 1642 err = -ENOSPC;
1643 goto fail;
1222 } 1644 }
1223 1645
1224 if (elt->op == OP_AND || elt->op == OP_OR) { 1646 if (elt->op == OP_AND || elt->op == OP_OR) {
@@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call,
1228 1650
1229 if (!operand1 || !operand2) { 1651 if (!operand1 || !operand2) {
1230 parse_error(ps, FILT_ERR_MISSING_FIELD, 0); 1652 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1231 return -EINVAL; 1653 err = -EINVAL;
1654 goto fail;
1232 } 1655 }
1233 1656
1234 pred = create_pred(elt->op, operand1, operand2); 1657 pred = create_pred(elt->op, operand1, operand2);
1235add_pred: 1658add_pred:
1236 if (!pred) 1659 if (!pred) {
1237 return -ENOMEM; 1660 err = -ENOMEM;
1238 err = filter_add_pred(ps, call, filter, pred, dry_run); 1661 goto fail;
1662 }
1663 err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
1239 filter_free_pred(pred); 1664 filter_free_pred(pred);
1240 if (err) 1665 if (err)
1241 return err; 1666 goto fail;
1242 1667
1243 operand1 = operand2 = NULL; 1668 operand1 = operand2 = NULL;
1244 } 1669 }
1245 1670
1246 return 0; 1671 if (!dry_run) {
1672 /* We should have one item left on the stack */
1673 pred = __pop_pred_stack(&stack);
1674 if (!pred)
1675 return -EINVAL;
1676 /* This item is where we start from in matching */
1677 root = pred;
1678 /* Make sure the stack is empty */
1679 pred = __pop_pred_stack(&stack);
1680 if (WARN_ON(pred)) {
1681 err = -EINVAL;
1682 filter->root = NULL;
1683 goto fail;
1684 }
1685 err = check_pred_tree(filter, root);
1686 if (err)
1687 goto fail;
1688
1689 /* Optimize the tree */
1690 err = fold_pred_tree(filter, root);
1691 if (err)
1692 goto fail;
1693
1694 /* We don't set root until we know it works */
1695 barrier();
1696 filter->root = root;
1697 }
1698
1699 err = 0;
1700fail:
1701 __free_pred_stack(&stack);
1702 return err;
1247} 1703}
1248 1704
1705struct filter_list {
1706 struct list_head list;
1707 struct event_filter *filter;
1708};
1709
1249static int replace_system_preds(struct event_subsystem *system, 1710static int replace_system_preds(struct event_subsystem *system,
1250 struct filter_parse_state *ps, 1711 struct filter_parse_state *ps,
1251 char *filter_string) 1712 char *filter_string)
1252{ 1713{
1253 struct ftrace_event_call *call; 1714 struct ftrace_event_call *call;
1715 struct filter_list *filter_item;
1716 struct filter_list *tmp;
1717 LIST_HEAD(filter_list);
1254 bool fail = true; 1718 bool fail = true;
1255 int err; 1719 int err;
1256 1720
1257 list_for_each_entry(call, &ftrace_events, list) { 1721 list_for_each_entry(call, &ftrace_events, list) {
1258 struct event_filter *filter = call->filter;
1259 1722
1260 if (strcmp(call->class->system, system->name) != 0) 1723 if (strcmp(call->class->system, system->name) != 0)
1261 continue; 1724 continue;
1262 1725
1263 /* try to see if the filter can be applied */ 1726 /*
1264 err = replace_preds(call, filter, ps, filter_string, true); 1727 * Try to see if the filter can be applied
1728 * (filter arg is ignored on dry_run)
1729 */
1730 err = replace_preds(call, NULL, ps, filter_string, true);
1265 if (err) 1731 if (err)
1732 goto fail;
1733 }
1734
1735 list_for_each_entry(call, &ftrace_events, list) {
1736 struct event_filter *filter;
1737
1738 if (strcmp(call->class->system, system->name) != 0)
1266 continue; 1739 continue;
1267 1740
1268 /* really apply the filter */ 1741 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
1269 filter_disable_preds(call); 1742 if (!filter_item)
1270 err = replace_preds(call, filter, ps, filter_string, false); 1743 goto fail_mem;
1744
1745 list_add_tail(&filter_item->list, &filter_list);
1746
1747 filter_item->filter = __alloc_filter();
1748 if (!filter_item->filter)
1749 goto fail_mem;
1750 filter = filter_item->filter;
1751
1752 /* Can only fail on no memory */
1753 err = replace_filter_string(filter, filter_string);
1271 if (err) 1754 if (err)
1272 filter_disable_preds(call); 1755 goto fail_mem;
1273 else { 1756
1757 err = replace_preds(call, filter, ps, filter_string, false);
1758 if (err) {
1759 filter_disable(call);
1760 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1761 append_filter_err(ps, filter);
1762 } else
1274 call->flags |= TRACE_EVENT_FL_FILTERED; 1763 call->flags |= TRACE_EVENT_FL_FILTERED;
1275 replace_filter_string(filter, filter_string); 1764 /*
1276 } 1765 * Regardless of if this returned an error, we still
1766 * replace the filter for the call.
1767 */
1768 filter = call->filter;
1769 call->filter = filter_item->filter;
1770 filter_item->filter = filter;
1771
1277 fail = false; 1772 fail = false;
1278 } 1773 }
1279 1774
1280 if (fail) { 1775 if (fail)
1281 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1776 goto fail;
1282 return -EINVAL; 1777
1778 /*
1779 * The calls can still be using the old filters.
1780 * Do a synchronize_sched() to ensure all calls are
1781 * done with them before we free them.
1782 */
1783 synchronize_sched();
1784 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1785 __free_filter(filter_item->filter);
1786 list_del(&filter_item->list);
1787 kfree(filter_item);
1283 } 1788 }
1284 return 0; 1789 return 0;
1790 fail:
1791 /* No call succeeded */
1792 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1793 list_del(&filter_item->list);
1794 kfree(filter_item);
1795 }
1796 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1797 return -EINVAL;
1798 fail_mem:
1799 /* If any call succeeded, we still need to sync */
1800 if (!fail)
1801 synchronize_sched();
1802 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1803 __free_filter(filter_item->filter);
1804 list_del(&filter_item->list);
1805 kfree(filter_item);
1806 }
1807 return -ENOMEM;
1285} 1808}
1286 1809
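Both replace_system_preds() above and apply_event_filter() below follow the same lifecycle: build a new filter on the side, swap it into place, wait out any tracepoint callbacks that may still be reading the old one (synchronize_sched()), and only then free it. A crude userspace analogue of that ordering, illustrative only, using C11 atomics and a thread join to stand in for the grace period:

/* Crude analogue of "swap, wait for readers, then free": the kernel gets
 * the wait from synchronize_sched(); here it is modeled bluntly by
 * joining the reader thread before freeing the old filter. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct filter { char expr[32]; };

static _Atomic(struct filter *) active;   /* what event readers consult */

static void *reader(void *arg)
{
	(void)arg;
	for (int i = 0; i < 3; i++) {
		struct filter *f = atomic_load(&active);
		printf("matching event against: %s\n", f->expr);
	}
	return NULL;
}

int main(void)
{
	struct filter *old = malloc(sizeof(*old));
	struct filter *new = malloc(sizeof(*new));
	pthread_t t;

	strcpy(old->expr, "prev_pid == 0");     /* example filter strings */
	strcpy(new->expr, "common_pid != 0");
	atomic_store(&active, old);

	pthread_create(&t, NULL, reader, NULL);

	atomic_store(&active, new);   /* publish the new filter (the swap) */
	pthread_join(t, NULL);        /* "grace period": no reader is left */
	free(old);                    /* only now is the old filter safe to free */

	free(new);
	return 0;
}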
1287int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1810int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1288{ 1811{
1289 int err;
1290 struct filter_parse_state *ps; 1812 struct filter_parse_state *ps;
1813 struct event_filter *filter;
1814 struct event_filter *tmp;
1815 int err = 0;
1291 1816
1292 mutex_lock(&event_mutex); 1817 mutex_lock(&event_mutex);
1293 1818
1294 err = init_preds(call);
1295 if (err)
1296 goto out_unlock;
1297
1298 if (!strcmp(strstrip(filter_string), "0")) { 1819 if (!strcmp(strstrip(filter_string), "0")) {
1299 filter_disable_preds(call); 1820 filter_disable(call);
1300 remove_filter_string(call->filter); 1821 filter = call->filter;
1822 if (!filter)
1823 goto out_unlock;
1824 call->filter = NULL;
1825 /* Make sure the filter is not being used */
1826 synchronize_sched();
1827 __free_filter(filter);
1301 goto out_unlock; 1828 goto out_unlock;
1302 } 1829 }
1303 1830
@@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1306 if (!ps) 1833 if (!ps)
1307 goto out_unlock; 1834 goto out_unlock;
1308 1835
1309 filter_disable_preds(call); 1836 filter = __alloc_filter();
1310 replace_filter_string(call->filter, filter_string); 1837 if (!filter) {
1838 kfree(ps);
1839 goto out_unlock;
1840 }
1841
1842 replace_filter_string(filter, filter_string);
1311 1843
1312 parse_init(ps, filter_ops, filter_string); 1844 parse_init(ps, filter_ops, filter_string);
1313 err = filter_parse(ps); 1845 err = filter_parse(ps);
1314 if (err) { 1846 if (err) {
1315 append_filter_err(ps, call->filter); 1847 append_filter_err(ps, filter);
1316 goto out; 1848 goto out;
1317 } 1849 }
1318 1850
1319 err = replace_preds(call, call->filter, ps, filter_string, false); 1851 err = replace_preds(call, filter, ps, filter_string, false);
1320 if (err) 1852 if (err) {
1321 append_filter_err(ps, call->filter); 1853 filter_disable(call);
1322 else 1854 append_filter_err(ps, filter);
1855 } else
1323 call->flags |= TRACE_EVENT_FL_FILTERED; 1856 call->flags |= TRACE_EVENT_FL_FILTERED;
1324out: 1857out:
1858 /*
1859 * Always swap the call filter with the new filter
1860 * even if there was an error. If there was an error
1861 * in the filter, we disable the filter and show the error
1862 * string
1863 */
1864 tmp = call->filter;
1865 call->filter = filter;
1866 if (tmp) {
1867 /* Make sure the call is done with the filter */
1868 synchronize_sched();
1869 __free_filter(tmp);
1870 }
1325 filter_opstack_clear(ps); 1871 filter_opstack_clear(ps);
1326 postfix_clear(ps); 1872 postfix_clear(ps);
1327 kfree(ps); 1873 kfree(ps);
@@ -1334,18 +1880,21 @@ out_unlock:
1334int apply_subsystem_event_filter(struct event_subsystem *system, 1880int apply_subsystem_event_filter(struct event_subsystem *system,
1335 char *filter_string) 1881 char *filter_string)
1336{ 1882{
1337 int err;
1338 struct filter_parse_state *ps; 1883 struct filter_parse_state *ps;
1884 struct event_filter *filter;
1885 int err = 0;
1339 1886
1340 mutex_lock(&event_mutex); 1887 mutex_lock(&event_mutex);
1341 1888
1342 err = init_subsystem_preds(system);
1343 if (err)
1344 goto out_unlock;
1345
1346 if (!strcmp(strstrip(filter_string), "0")) { 1889 if (!strcmp(strstrip(filter_string), "0")) {
1347 filter_free_subsystem_preds(system); 1890 filter_free_subsystem_preds(system);
1348 remove_filter_string(system->filter); 1891 remove_filter_string(system->filter);
1892 filter = system->filter;
1893 system->filter = NULL;
1894 /* Ensure all filters are no longer used */
1895 synchronize_sched();
1896 filter_free_subsystem_filters(system);
1897 __free_filter(filter);
1349 goto out_unlock; 1898 goto out_unlock;
1350 } 1899 }
1351 1900
@@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1354 if (!ps) 1903 if (!ps)
1355 goto out_unlock; 1904 goto out_unlock;
1356 1905
1357 replace_filter_string(system->filter, filter_string); 1906 filter = __alloc_filter();
1907 if (!filter)
1908 goto out;
1909
1910 replace_filter_string(filter, filter_string);
1911 /*
1912 * No event actually uses the system filter
1913 * we can free it without synchronize_sched().
1914 */
1915 __free_filter(system->filter);
1916 system->filter = filter;
1358 1917
1359 parse_init(ps, filter_ops, filter_string); 1918 parse_init(ps, filter_ops, filter_string);
1360 err = filter_parse(ps); 1919 err = filter_parse(ps);
@@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event)
1384 struct event_filter *filter = event->filter; 1943 struct event_filter *filter = event->filter;
1385 1944
1386 event->filter = NULL; 1945 event->filter = NULL;
1387 __free_preds(filter); 1946 __free_filter(filter);
1388} 1947}
1389 1948
1390int ftrace_profile_set_filter(struct perf_event *event, int event_id, 1949int ftrace_profile_set_filter(struct perf_event *event, int event_id,
@@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1410 if (event->filter) 1969 if (event->filter)
1411 goto out_unlock; 1970 goto out_unlock;
1412 1971
1413 filter = __alloc_preds(); 1972 filter = __alloc_filter();
1414 if (IS_ERR(filter)) { 1973 if (!filter) {
1415 err = PTR_ERR(filter); 1974 err = PTR_ERR(filter);
1416 goto out_unlock; 1975 goto out_unlock;
1417 } 1976 }
@@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1419 err = -ENOMEM; 1978 err = -ENOMEM;
1420 ps = kzalloc(sizeof(*ps), GFP_KERNEL); 1979 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1421 if (!ps) 1980 if (!ps)
1422 goto free_preds; 1981 goto free_filter;
1423 1982
1424 parse_init(ps, filter_ops, filter_str); 1983 parse_init(ps, filter_ops, filter_str);
1425 err = filter_parse(ps); 1984 err = filter_parse(ps);
@@ -1435,9 +1994,9 @@ free_ps:
1435 postfix_clear(ps); 1994 postfix_clear(ps);
1436 kfree(ps); 1995 kfree(ps);
1437 1996
1438free_preds: 1997free_filter:
1439 if (err) 1998 if (err)
1440 __free_preds(filter); 1999 __free_filter(filter);
1441 2000
1442out_unlock: 2001out_unlock:
1443 mutex_unlock(&event_mutex); 2002 mutex_unlock(&event_mutex);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2dec9bcde8b4..8435b43b1782 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -353,6 +353,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
353 kfree(data); 353 kfree(data);
354} 354}
355 355
356/* Bitfield fetch function */
357struct bitfield_fetch_param {
358 struct fetch_param orig;
359 unsigned char hi_shift;
360 unsigned char low_shift;
361};
362
363#define DEFINE_FETCH_bitfield(type) \
364static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
365 void *data, void *dest) \
366{ \
367 struct bitfield_fetch_param *bprm = data; \
368 type buf = 0; \
369 call_fetch(&bprm->orig, regs, &buf); \
370 if (buf) { \
371 buf <<= bprm->hi_shift; \
372 buf >>= bprm->low_shift; \
373 } \
374 *(type *)dest = buf; \
375}
376DEFINE_BASIC_FETCH_FUNCS(bitfield)
377#define fetch_bitfield_string NULL
378#define fetch_bitfield_string_size NULL
379
380static __kprobes void
381free_bitfield_fetch_param(struct bitfield_fetch_param *data)
382{
383 /*
384 * Don't check the bitfield itself, because this must be the
385 * last fetch function.
386 */
387 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
388 free_deref_fetch_param(data->orig.data);
389 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
390 free_symbol_cache(data->orig.data);
391 kfree(data);
392}
356/* Default (unsigned long) fetch type */ 393/* Default (unsigned long) fetch type */
357#define __DEFAULT_FETCH_TYPE(t) u##t 394#define __DEFAULT_FETCH_TYPE(t) u##t
358#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 395#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
@@ -367,6 +404,7 @@ enum {
367 FETCH_MTD_memory, 404 FETCH_MTD_memory,
368 FETCH_MTD_symbol, 405 FETCH_MTD_symbol,
369 FETCH_MTD_deref, 406 FETCH_MTD_deref,
407 FETCH_MTD_bitfield,
370 FETCH_MTD_END, 408 FETCH_MTD_END,
371}; 409};
372 410
@@ -387,6 +425,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \
387ASSIGN_FETCH_FUNC(memory, ftype), \ 425ASSIGN_FETCH_FUNC(memory, ftype), \
388ASSIGN_FETCH_FUNC(symbol, ftype), \ 426ASSIGN_FETCH_FUNC(symbol, ftype), \
389ASSIGN_FETCH_FUNC(deref, ftype), \ 427ASSIGN_FETCH_FUNC(deref, ftype), \
428ASSIGN_FETCH_FUNC(bitfield, ftype), \
390 } \ 429 } \
391 } 430 }
392 431
@@ -430,9 +469,33 @@ static const struct fetch_type *find_fetch_type(const char *type)
430 if (!type) 469 if (!type)
431 type = DEFAULT_FETCH_TYPE_STR; 470 type = DEFAULT_FETCH_TYPE_STR;
432 471
472 /* Special case: bitfield */
473 if (*type == 'b') {
474 unsigned long bs;
475 type = strchr(type, '/');
476 if (!type)
477 goto fail;
478 type++;
479 if (strict_strtoul(type, 0, &bs))
480 goto fail;
481 switch (bs) {
482 case 8:
483 return find_fetch_type("u8");
484 case 16:
485 return find_fetch_type("u16");
486 case 32:
487 return find_fetch_type("u32");
488 case 64:
489 return find_fetch_type("u64");
490 default:
491 goto fail;
492 }
493 }
494
433 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) 495 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
434 if (strcmp(type, fetch_type_table[i].name) == 0) 496 if (strcmp(type, fetch_type_table[i].name) == 0)
435 return &fetch_type_table[i]; 497 return &fetch_type_table[i];
498fail:
436 return NULL; 499 return NULL;
437} 500}
438 501
@@ -586,7 +649,9 @@ error:
586 649
587static void free_probe_arg(struct probe_arg *arg) 650static void free_probe_arg(struct probe_arg *arg)
588{ 651{
589 if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) 652 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
653 free_bitfield_fetch_param(arg->fetch.data);
654 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
590 free_deref_fetch_param(arg->fetch.data); 655 free_deref_fetch_param(arg->fetch.data);
591 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) 656 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
592 free_symbol_cache(arg->fetch.data); 657 free_symbol_cache(arg->fetch.data);
@@ -767,16 +832,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
767 } 832 }
768 break; 833 break;
769 case '+': /* deref memory */ 834 case '+': /* deref memory */
835 arg++; /* Skip '+', because strict_strtol() rejects it. */
770 case '-': 836 case '-':
771 tmp = strchr(arg, '('); 837 tmp = strchr(arg, '(');
772 if (!tmp) 838 if (!tmp)
773 break; 839 break;
774 *tmp = '\0'; 840 *tmp = '\0';
775 ret = strict_strtol(arg + 1, 0, &offset); 841 ret = strict_strtol(arg, 0, &offset);
776 if (ret) 842 if (ret)
777 break; 843 break;
778 if (arg[0] == '-')
779 offset = -offset;
780 arg = tmp + 1; 844 arg = tmp + 1;
781 tmp = strrchr(arg, ')'); 845 tmp = strrchr(arg, ')');
782 if (tmp) { 846 if (tmp) {
@@ -807,6 +871,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
807 return ret; 871 return ret;
808} 872}
809 873
874#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
875
876/* Bitfield type needs to be parsed into a fetch function */
877static int __parse_bitfield_probe_arg(const char *bf,
878 const struct fetch_type *t,
879 struct fetch_param *f)
880{
881 struct bitfield_fetch_param *bprm;
882 unsigned long bw, bo;
883 char *tail;
884
885 if (*bf != 'b')
886 return 0;
887
888 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
889 if (!bprm)
890 return -ENOMEM;
891 bprm->orig = *f;
892 f->fn = t->fetch[FETCH_MTD_bitfield];
893 f->data = (void *)bprm;
894
895 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
896 if (bw == 0 || *tail != '@')
897 return -EINVAL;
898
899 bf = tail + 1;
900 bo = simple_strtoul(bf, &tail, 0);
901 if (tail == bf || *tail != '/')
902 return -EINVAL;
903
904 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
905 bprm->low_shift = bprm->hi_shift + bo;
906 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
907}
908
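A worked example of the shift arithmetic above, assuming a 32-bit container: for b3@5/32, hi_shift = 32 - (3 + 5) = 24 and low_shift = 24 + 5 = 29, which left-shifts away the bits above the field and then right-aligns it; the final check rejects any width + offset larger than the container. A small sketch with worked values:

/* Sketch of the bitfield fetch math: left-shift drops the high bits,
 * right-shift drops the low bits and right-aligns the field. */
#include <stdint.h>
#include <stdio.h>

static uint32_t fetch_bitfield_u32(uint32_t raw,
				   unsigned int width, unsigned int offset)
{
	unsigned int hi_shift = 32 - (width + offset);   /* 24 for b3@5/32 */
	unsigned int low_shift = hi_shift + offset;      /* 29 for b3@5/32 */

	raw <<= hi_shift;
	raw >>= low_shift;
	return raw;
}

int main(void)
{
	/* bits 5..7 of 0xE0 are all set -> field value 7 */
	printf("%u\n", fetch_bitfield_u32(0xE0, 3, 5));   /* prints 7 */
	/* bits 5..7 of 0x35 are 0,0,1 -> field value 1 */
	printf("%u\n", fetch_bitfield_u32(0x35, 3, 5));   /* prints 1 */
	return 0;
}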
810/* String length checking wrapper */ 909/* String length checking wrapper */
811static int parse_probe_arg(char *arg, struct trace_probe *tp, 910static int parse_probe_arg(char *arg, struct trace_probe *tp,
812 struct probe_arg *parg, int is_return) 911 struct probe_arg *parg, int is_return)
@@ -836,6 +935,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
836 parg->offset = tp->size; 935 parg->offset = tp->size;
837 tp->size += parg->type->size; 936 tp->size += parg->type->size;
838 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); 937 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
938 if (ret >= 0 && t != NULL)
939 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
839 if (ret >= 0) { 940 if (ret >= 0) {
840 parg->fetch_size.fn = get_fetch_size_function(parg->type, 941 parg->fetch_size.fn = get_fetch_size_function(parg->type,
841 parg->fetch.fn); 942 parg->fetch.fn);
@@ -1130,7 +1231,7 @@ static int command_trace_probe(const char *buf)
1130 return ret; 1231 return ret;
1131} 1232}
1132 1233
1133#define WRITE_BUFSIZE 128 1234#define WRITE_BUFSIZE 4096
1134 1235
1135static ssize_t probes_write(struct file *file, const char __user *buffer, 1236static ssize_t probes_write(struct file *file, const char __user *buffer,
1136 size_t count, loff_t *ppos) 1237 size_t count, loff_t *ppos)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 02272baa2206..456be9063c2d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -529,24 +529,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
529 * @entry: The trace entry field from the ring buffer 529 * @entry: The trace entry field from the ring buffer
530 * 530 *
531 * Prints the generic fields of irqs off, in hard or softirq, preempt 531 * Prints the generic fields of irqs off, in hard or softirq, preempt
532 * count and lock depth. 532 * count.
533 */ 533 */
534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) 534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
535{ 535{
536 int hardirq, softirq; 536 char hardsoft_irq;
537 char need_resched;
538 char irqs_off;
539 int hardirq;
540 int softirq;
537 int ret; 541 int ret;
538 542
539 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 543 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
540 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 544 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
541 545
546 irqs_off =
547 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
548 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
549 '.';
550 need_resched =
551 (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
552 hardsoft_irq =
553 (hardirq && softirq) ? 'H' :
554 hardirq ? 'h' :
555 softirq ? 's' :
556 '.';
557
542 if (!trace_seq_printf(s, "%c%c%c", 558 if (!trace_seq_printf(s, "%c%c%c",
543 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 559 irqs_off, need_resched, hardsoft_irq))
544 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
545 'X' : '.',
546 (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
547 'N' : '.',
548 (hardirq && softirq) ? 'H' :
549 hardirq ? 'h' : softirq ? 's' : '.'))
550 return 0; 560 return 0;
551 561
552 if (entry->preempt_count) 562 if (entry->preempt_count)
@@ -554,13 +564,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
554 else 564 else
555 ret = trace_seq_putc(s, '.'); 565 ret = trace_seq_putc(s, '.');
556 566
557 if (!ret) 567 return ret;
558 return 0;
559
560 if (entry->lock_depth < 0)
561 return trace_seq_putc(s, '.');
562
563 return trace_seq_printf(s, "%d", entry->lock_depth);
564} 568}
565 569
566static int 570static int
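The three characters computed above form one field of the trace latency output, for example "dNs" for irqs-off, need-resched, in-softirq. A tiny sketch of the same selection logic; the flag bit values here are placeholders, not the kernel's definitions:

/* Sketch of the three-character latency field built above. */
#include <stdio.h>

#define FLAG_IRQS_OFF       0x01
#define FLAG_IRQS_NOSUPPORT 0x02
#define FLAG_NEED_RESCHED   0x04
#define FLAG_HARDIRQ        0x08
#define FLAG_SOFTIRQ        0x10

static void print_lat_chars(unsigned int flags)
{
	int hardirq = flags & FLAG_HARDIRQ;
	int softirq = flags & FLAG_SOFTIRQ;

	char irqs_off =
		(flags & FLAG_IRQS_OFF) ? 'd' :
		(flags & FLAG_IRQS_NOSUPPORT) ? 'X' :
		'.';
	char need_resched = (flags & FLAG_NEED_RESCHED) ? 'N' : '.';
	char hardsoft_irq =
		(hardirq && softirq) ? 'H' :
		hardirq ? 'h' :
		softirq ? 's' :
		'.';

	printf("%c%c%c\n", irqs_off, need_resched, hardsoft_irq);
}

int main(void)
{
	print_lat_chars(FLAG_IRQS_OFF | FLAG_NEED_RESCHED | FLAG_SOFTIRQ); /* dNs */
	print_lat_chars(FLAG_HARDIRQ | FLAG_SOFTIRQ);                      /* ..H */
	print_lat_chars(0);                                                /* ... */
	return 0;
}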
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8f758d070c43..7e62c0a18456 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
247 ctx_trace = tr; 247 ctx_trace = tr;
248} 248}
249 249
250static void stop_sched_trace(struct trace_array *tr)
251{
252 tracing_stop_sched_switch_record();
253}
254
255static int sched_switch_trace_init(struct trace_array *tr)
256{
257 ctx_trace = tr;
258 tracing_reset_online_cpus(tr);
259 tracing_start_sched_switch_record();
260 return 0;
261}
262
263static void sched_switch_trace_reset(struct trace_array *tr)
264{
265 if (sched_ref)
266 stop_sched_trace(tr);
267}
268
269static void sched_switch_trace_start(struct trace_array *tr)
270{
271 sched_stopped = 0;
272}
273
274static void sched_switch_trace_stop(struct trace_array *tr)
275{
276 sched_stopped = 1;
277}
278
279static struct tracer sched_switch_trace __read_mostly =
280{
281 .name = "sched_switch",
282 .init = sched_switch_trace_init,
283 .reset = sched_switch_trace_reset,
284 .start = sched_switch_trace_start,
285 .stop = sched_switch_trace_stop,
286 .wait_pipe = poll_wait_pipe,
287#ifdef CONFIG_FTRACE_SELFTEST
288 .selftest = trace_selftest_startup_sched_switch,
289#endif
290};
291
292__init static int init_sched_switch_trace(void)
293{
294 return register_tracer(&sched_switch_trace);
295}
296device_initcall(init_sched_switch_trace);
297
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5c9fe08d2093..ee7b5a0bb9f8 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[];
60 60
61static struct syscall_metadata **syscalls_metadata; 61static struct syscall_metadata **syscalls_metadata;
62 62
63#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
64static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
65{
66 /*
67 * Only compare after the "sys" prefix. Archs that use
68 * syscall wrappers may have syscalls symbols aliases prefixed
69 * with "SyS" instead of "sys", leading to an unwanted
70 * mismatch.
71 */
72 return !strcmp(sym + 3, name + 3);
73}
74#endif
75
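A minimal sketch of the prefix-skipping comparison introduced above, assuming the usual "sys_"/"SyS_" naming of syscall symbols:

/* Sketch: match "SyS_"-wrapped symbol names against "sys_" metadata names
 * by comparing only past the 3-character prefix on both sides. */
#include <stdio.h>
#include <string.h>

static int syscall_match_sym_name(const char *sym, const char *name)
{
	return !strcmp(sym + 3, name + 3);
}

int main(void)
{
	printf("%d\n", syscall_match_sym_name("SyS_read", "sys_read"));  /* 1 */
	printf("%d\n", syscall_match_sym_name("sys_read", "sys_write")); /* 0 */
	return 0;
}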
63static __init struct syscall_metadata * 76static __init struct syscall_metadata *
64find_syscall_meta(unsigned long syscall) 77find_syscall_meta(unsigned long syscall)
65{ 78{
@@ -72,14 +85,11 @@ find_syscall_meta(unsigned long syscall)
72 stop = __stop_syscalls_metadata; 85 stop = __stop_syscalls_metadata;
73 kallsyms_lookup(syscall, NULL, NULL, NULL, str); 86 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
74 87
88 if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
89 return NULL;
90
75 for ( ; start < stop; start++) { 91 for ( ; start < stop; start++) {
76 /* 92 if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
77 * Only compare after the "sys" prefix. Archs that use
78 * syscall wrappers may have syscalls symbols aliases prefixed
79 * with "SyS" instead of "sys", leading to an unwanted
80 * mismatch.
81 */
82 if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
83 return *start; 93 return *start;
84 } 94 }
85 return NULL; 95 return NULL;
@@ -359,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
359 int num; 369 int num;
360 370
361 num = ((struct syscall_metadata *)call->data)->syscall_nr; 371 num = ((struct syscall_metadata *)call->data)->syscall_nr;
362 if (num < 0 || num >= NR_syscalls) 372 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
363 return -ENOSYS; 373 return -ENOSYS;
364 mutex_lock(&syscall_trace_lock); 374 mutex_lock(&syscall_trace_lock);
365 if (!sys_refcount_enter) 375 if (!sys_refcount_enter)
@@ -377,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
377 int num; 387 int num;
378 388
379 num = ((struct syscall_metadata *)call->data)->syscall_nr; 389 num = ((struct syscall_metadata *)call->data)->syscall_nr;
380 if (num < 0 || num >= NR_syscalls) 390 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
381 return; 391 return;
382 mutex_lock(&syscall_trace_lock); 392 mutex_lock(&syscall_trace_lock);
383 sys_refcount_enter--; 393 sys_refcount_enter--;
@@ -393,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
393 int num; 403 int num;
394 404
395 num = ((struct syscall_metadata *)call->data)->syscall_nr; 405 num = ((struct syscall_metadata *)call->data)->syscall_nr;
396 if (num < 0 || num >= NR_syscalls) 406 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
397 return -ENOSYS; 407 return -ENOSYS;
398 mutex_lock(&syscall_trace_lock); 408 mutex_lock(&syscall_trace_lock);
399 if (!sys_refcount_exit) 409 if (!sys_refcount_exit)
@@ -411,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
411 int num; 421 int num;
412 422
413 num = ((struct syscall_metadata *)call->data)->syscall_nr; 423 num = ((struct syscall_metadata *)call->data)->syscall_nr;
414 if (num < 0 || num >= NR_syscalls) 424 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
415 return; 425 return;
416 mutex_lock(&syscall_trace_lock); 426 mutex_lock(&syscall_trace_lock);
417 sys_refcount_exit--; 427 sys_refcount_exit--;
@@ -424,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
424int init_syscall_trace(struct ftrace_event_call *call) 434int init_syscall_trace(struct ftrace_event_call *call)
425{ 435{
426 int id; 436 int id;
437 int num;
438
439 num = ((struct syscall_metadata *)call->data)->syscall_nr;
440 if (num < 0 || num >= NR_syscalls) {
441 pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
442 ((struct syscall_metadata *)call->data)->name);
443 return -ENOSYS;
444 }
427 445
428 if (set_syscall_print_fmt(call) < 0) 446 if (set_syscall_print_fmt(call) < 0)
429 return -ENOMEM; 447 return -ENOMEM;
@@ -438,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call)
438 return id; 456 return id;
439} 457}
440 458
441unsigned long __init arch_syscall_addr(int nr) 459unsigned long __init __weak arch_syscall_addr(int nr)
442{ 460{
443 return (unsigned long)sys_call_table[nr]; 461 return (unsigned long)sys_call_table[nr];
444} 462}
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 419209893d87..51c6e89e8619 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
189 struct group_info *group_info; 189 struct group_info *group_info;
190 int retval; 190 int retval;
191 191
192 if (!capable(CAP_SETGID)) 192 if (!nsown_capable(CAP_SETGID))
193 return -EPERM; 193 return -EPERM;
194 if ((unsigned)gidsetsize > NGROUPS_MAX) 194 if ((unsigned)gidsetsize > NGROUPS_MAX)
195 return -EINVAL; 195 return -EINVAL;
diff --git a/kernel/user.c b/kernel/user.c
index 5c598ca781df..9e03e9c1df8d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,9 +17,13 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19 19
20/*
21 * userns count is 1 for root user, 1 for init_uts_ns,
22 * and 1 for... ?
23 */
20struct user_namespace init_user_ns = { 24struct user_namespace init_user_ns = {
21 .kref = { 25 .kref = {
22 .refcount = ATOMIC_INIT(2), 26 .refcount = ATOMIC_INIT(3),
23 }, 27 },
24 .creator = &root_user, 28 .creator = &root_user,
25}; 29};
@@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep;
47 */ 51 */
48static DEFINE_SPINLOCK(uidhash_lock); 52static DEFINE_SPINLOCK(uidhash_lock);
49 53
50/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ 54/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
51struct user_struct root_user = { 55struct user_struct root_user = {
52 .__count = ATOMIC_INIT(2), 56 .__count = ATOMIC_INIT(2),
53 .processes = ATOMIC_INIT(1), 57 .processes = ATOMIC_INIT(1),
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 8a82b4b8ea52..44646179eaba 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/user_namespace.h>
17 18
18static struct uts_namespace *create_uts_ns(void) 19static struct uts_namespace *create_uts_ns(void)
19{ 20{
@@ -30,7 +31,8 @@ static struct uts_namespace *create_uts_ns(void)
30 * @old_ns: namespace to clone 31 * @old_ns: namespace to clone
31 * Return NULL on error (failure to kmalloc), new ns otherwise 32 * Return NULL on error (failure to kmalloc), new ns otherwise
32 */ 33 */
33static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) 34static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
35 struct uts_namespace *old_ns)
34{ 36{
35 struct uts_namespace *ns; 37 struct uts_namespace *ns;
36 38
@@ -40,6 +42,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
40 42
41 down_read(&uts_sem); 43 down_read(&uts_sem);
42 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 44 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
45 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
43 up_read(&uts_sem); 46 up_read(&uts_sem);
44 return ns; 47 return ns;
45} 48}
@@ -50,8 +53,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
50 * utsname of this process won't be seen by parent, and vice 53 * utsname of this process won't be seen by parent, and vice
51 * versa. 54 * versa.
52 */ 55 */
53struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) 56struct uts_namespace *copy_utsname(unsigned long flags,
57 struct task_struct *tsk)
54{ 58{
59 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
55 struct uts_namespace *new_ns; 60 struct uts_namespace *new_ns;
56 61
57 BUG_ON(!old_ns); 62 BUG_ON(!old_ns);
@@ -60,7 +65,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol
60 if (!(flags & CLONE_NEWUTS)) 65 if (!(flags & CLONE_NEWUTS))
61 return old_ns; 66 return old_ns;
62 67
63 new_ns = clone_uts_ns(old_ns); 68 new_ns = clone_uts_ns(tsk, old_ns);
64 69
65 put_uts_ns(old_ns); 70 put_uts_ns(old_ns);
66 return new_ns; 71 return new_ns;
@@ -71,5 +76,6 @@ void free_uts_ns(struct kref *kref)
71 struct uts_namespace *ns; 76 struct uts_namespace *ns;
72 77
73 ns = container_of(kref, struct uts_namespace, kref); 78 ns = container_of(kref, struct uts_namespace, kref);
79 put_user_ns(ns->user_ns);
74 kfree(ns); 80 kfree(ns);
75} 81}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 18bb15776c57..140dce750450 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
48 * Should we panic when a soft-lockup or hard-lockup occurs: 48 * Should we panic when a soft-lockup or hard-lockup occurs:
49 */ 49 */
50#ifdef CONFIG_HARDLOCKUP_DETECTOR 50#ifdef CONFIG_HARDLOCKUP_DETECTOR
51static int hardlockup_panic; 51static int hardlockup_panic =
52 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
52 53
53static int __init hardlockup_panic_setup(char *str) 54static int __init hardlockup_panic_setup(char *str)
54{ 55{
55 if (!strncmp(str, "panic", 5)) 56 if (!strncmp(str, "panic", 5))
56 hardlockup_panic = 1; 57 hardlockup_panic = 1;
58 else if (!strncmp(str, "nopanic", 7))
59 hardlockup_panic = 0;
57 else if (!strncmp(str, "0", 1)) 60 else if (!strncmp(str, "0", 1))
58 watchdog_enabled = 0; 61 watchdog_enabled = 0;
59 return 1; 62 return 1;
@@ -415,19 +418,22 @@ static int watchdog_prepare_cpu(int cpu)
415static int watchdog_enable(int cpu) 418static int watchdog_enable(int cpu)
416{ 419{
417 struct task_struct *p = per_cpu(softlockup_watchdog, cpu); 420 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
418 int err; 421 int err = 0;
419 422
420 /* enable the perf event */ 423 /* enable the perf event */
421 err = watchdog_nmi_enable(cpu); 424 err = watchdog_nmi_enable(cpu);
422 if (err) 425
423 return err; 426 /* Regardless of err above, fall through and start softlockup */
424 427
425 /* create the watchdog thread */ 428 /* create the watchdog thread */
426 if (!p) { 429 if (!p) {
427 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 430 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
428 if (IS_ERR(p)) { 431 if (IS_ERR(p)) {
429 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 432 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
430 return PTR_ERR(p); 433 if (!err)
434 /* if hardlockup hasn't already set this */
435 err = PTR_ERR(p);
436 goto out;
431 } 437 }
432 kthread_bind(p, cpu); 438 kthread_bind(p, cpu);
433 per_cpu(watchdog_touch_ts, cpu) = 0; 439 per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -435,7 +441,8 @@ static int watchdog_enable(int cpu)
435 wake_up_process(p); 441 wake_up_process(p);
436 } 442 }
437 443
438 return 0; 444out:
445 return err;
439} 446}
440 447
441static void watchdog_disable(int cpu) 448static void watchdog_disable(int cpu)
@@ -547,7 +554,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
547 break; 554 break;
548#endif /* CONFIG_HOTPLUG_CPU */ 555#endif /* CONFIG_HOTPLUG_CPU */
549 } 556 }
550 return notifier_from_errno(err); 557
558 /*
559 * hardlockup and softlockup are not important enough
560 * to block cpu bring up. Just always succeed and
561 * rely on printk output to flag problems.
562 */
563 return NOTIFY_OK;
551} 564}
552 565
553static struct notifier_block __cpuinitdata cpu_nfb = { 566static struct notifier_block __cpuinitdata cpu_nfb = {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee6578b578ad..04ef830690ec 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -251,10 +251,12 @@ struct workqueue_struct *system_wq __read_mostly;
251struct workqueue_struct *system_long_wq __read_mostly; 251struct workqueue_struct *system_long_wq __read_mostly;
252struct workqueue_struct *system_nrt_wq __read_mostly; 252struct workqueue_struct *system_nrt_wq __read_mostly;
253struct workqueue_struct *system_unbound_wq __read_mostly; 253struct workqueue_struct *system_unbound_wq __read_mostly;
254struct workqueue_struct *system_freezable_wq __read_mostly;
254EXPORT_SYMBOL_GPL(system_wq); 255EXPORT_SYMBOL_GPL(system_wq);
255EXPORT_SYMBOL_GPL(system_long_wq); 256EXPORT_SYMBOL_GPL(system_long_wq);
256EXPORT_SYMBOL_GPL(system_nrt_wq); 257EXPORT_SYMBOL_GPL(system_nrt_wq);
257EXPORT_SYMBOL_GPL(system_unbound_wq); 258EXPORT_SYMBOL_GPL(system_unbound_wq);
259EXPORT_SYMBOL_GPL(system_freezable_wq);
258 260
259#define CREATE_TRACE_POINTS 261#define CREATE_TRACE_POINTS
260#include <trace/events/workqueue.h> 262#include <trace/events/workqueue.h>
@@ -316,6 +318,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
316 318
317static struct debug_obj_descr work_debug_descr; 319static struct debug_obj_descr work_debug_descr;
318 320
321static void *work_debug_hint(void *addr)
322{
323 return ((struct work_struct *) addr)->func;
324}
325
319/* 326/*
320 * fixup_init is called when: 327 * fixup_init is called when:
321 * - an active object is initialized 328 * - an active object is initialized
@@ -387,6 +394,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
387 394
388static struct debug_obj_descr work_debug_descr = { 395static struct debug_obj_descr work_debug_descr = {
389 .name = "work_struct", 396 .name = "work_struct",
397 .debug_hint = work_debug_hint,
390 .fixup_init = work_fixup_init, 398 .fixup_init = work_fixup_init,
391 .fixup_activate = work_fixup_activate, 399 .fixup_activate = work_fixup_activate,
392 .fixup_free = work_fixup_free, 400 .fixup_free = work_fixup_free,
@@ -1358,8 +1366,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1358 worker->id = id; 1366 worker->id = id;
1359 1367
1360 if (!on_unbound_cpu) 1368 if (!on_unbound_cpu)
1361 worker->task = kthread_create(worker_thread, worker, 1369 worker->task = kthread_create_on_node(worker_thread,
1362 "kworker/%u:%d", gcwq->cpu, id); 1370 worker,
1371 cpu_to_node(gcwq->cpu),
1372 "kworker/%u:%d", gcwq->cpu, id);
1363 else 1373 else
1364 worker->task = kthread_create(worker_thread, worker, 1374 worker->task = kthread_create(worker_thread, worker,
1365 "kworker/u:%d", id); 1375 "kworker/u:%d", id);
@@ -3775,8 +3785,10 @@ static int __init init_workqueues(void)
3775 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); 3785 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3776 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, 3786 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3777 WQ_UNBOUND_MAX_ACTIVE); 3787 WQ_UNBOUND_MAX_ACTIVE);
3788 system_freezable_wq = alloc_workqueue("events_freezable",
3789 WQ_FREEZABLE, 0);
3778 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || 3790 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3779 !system_unbound_wq); 3791 !system_unbound_wq || !system_freezable_wq);
3780 return 0; 3792 return 0;
3781} 3793}
3782early_initcall(init_workqueues); 3794early_initcall(init_workqueues);